fschwartzer committed on
Commit 827d04a
1 Parent(s): c8077b9

Update app.py

Files changed (1)
app.py +11 -6
app.py CHANGED
@@ -3,15 +3,20 @@ import pandas as pd
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
-# Load the tokenizer and quantized model
-model_name = "meta-llama/Meta-Llama-3.1-8B"
+# Load the tokenizer
+model_name = "your-llama-model"  # Replace with the LLaMA model name
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
-# Use bitsandbytes to load the model in 8-bit precision
-model = AutoModelForCausalLM.from_pretrained(model_name, load_in_8bit=True, device_map='auto')
+# Load the model
+model = AutoModelForCausalLM.from_pretrained(model_name)
 
-# Move model to the appropriate device (GPU/CPU)
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Apply dynamic quantization for CPU
+model = torch.quantization.quantize_dynamic(
+    model, {torch.nn.Linear}, dtype=torch.qint8
+)
+
+# Move model to CPU
+device = torch.device("cpu")
 model = model.to(device)
 
 # Set the padding token to the end-of-sequence token
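
For reference, a minimal sketch of how the updated loading path might be exercised end to end on CPU. The model name is the commit's own placeholder; the pad-token assignment and the generation call at the bottom are illustrative assumptions (the diff shows only the comment introducing the padding step), not code from this commit.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "your-llama-model"  # placeholder from the commit; substitute a real LLaMA checkpoint

# Load the tokenizer and the full-precision model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Dynamic quantization converts nn.Linear weights to int8 at load time and
# quantizes activations on the fly, so inference runs on CPU without bitsandbytes
model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

device = torch.device("cpu")
model = model.to(device)

# Assumed implementation of the padding step referenced in the diff context:
# LLaMA tokenizers ship without a pad token, so EOS is commonly reused
tokenizer.pad_token = tokenizer.eos_token

# Illustrative generation call
inputs = tokenizer("Hello, how are you?", return_tensors="pt").to(device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

The swap from load_in_8bit=True to torch.quantization.quantize_dynamic is presumably motivated by the runtime: bitsandbytes 8-bit loading requires a CUDA GPU, while dynamic quantization targets CPU-only hosts such as the default Hugging Face Spaces hardware.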