fschwartzer committed on
Commit 827d04a
1 Parent(s): c8077b9

Update app.py

Files changed (1)
app.py +11 -6
app.py CHANGED
@@ -3,15 +3,20 @@ import pandas as pd
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
-# Load the tokenizer and quantized model
-model_name = "meta-llama/Meta-Llama-3.1-8B"
+# Load the tokenizer
+model_name = "your-llama-model"  # Replace with the LLaMA model name
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
-# Use bitsandbytes to load the model in 8-bit precision
-model = AutoModelForCausalLM.from_pretrained(model_name, load_in_8bit=True, device_map='auto')
+# Load the model
+model = AutoModelForCausalLM.from_pretrained(model_name)
 
-# Move model to the appropriate device (GPU/CPU)
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Apply dynamic quantization for CPU
+model = torch.quantization.quantize_dynamic(
+    model, {torch.nn.Linear}, dtype=torch.qint8
+)
+
+# Move model to CPU
+device = torch.device("cpu")
 model = model.to(device)
 
 # Set the padding token to the end-of-sequence token
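
For reference, a minimal sketch of how the updated loading path might be exercised end to end on CPU. The model name is the commit's own placeholder; the pad-token assignment and the generation call at the bottom are illustrative assumptions (the diff shows only the comment introducing the padding step), not code from this commit.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "your-llama-model"  # placeholder from the commit; substitute a real LLaMA checkpoint

# Load the tokenizer and the full-precision model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Dynamic quantization converts nn.Linear weights to int8 at load time and
# quantizes activations on the fly, so inference runs on CPU without bitsandbytes
model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

device = torch.device("cpu")
model = model.to(device)

# Assumed implementation of the padding step referenced in the diff context:
# LLaMA tokenizers ship without a pad token, so EOS is commonly reused
tokenizer.pad_token = tokenizer.eos_token

# Illustrative generation call
inputs = tokenizer("Hello, how are you?", return_tensors="pt").to(device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

The swap from load_in_8bit=True to torch.quantization.quantize_dynamic is presumably motivated by the runtime: bitsandbytes 8-bit loading requires a CUDA GPU, while dynamic quantization targets CPU-only hosts such as the default Hugging Face Spaces hardware.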