tangzhy committed
Commit: a9764a0
Parent: 54becda

Update app.py

Files changed (1): app.py (+14, -2)
app.py CHANGED
@@ -12,6 +12,9 @@ from transformers import (
     TextIteratorStreamer,
 )
 
+import subprocess
+subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+
 DESCRIPTION = """\
 # ORLM LLaMA-3-8B
 
@@ -24,18 +27,26 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4")
+# quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+
 model_id = "CardinalOperations/ORLM-LLaMA-3-8B"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     device_map="auto",
-    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+    attn_implementation="flash_attention_2",
+    # quantization_config=quantization_config,
 )
 model.config.sliding_window = 4096
 model.eval()
 
 
-@spaces.GPU(duration=100)
+@spaces.GPU(duration=120)
 def generate(
     message: str,
     chat_history: list[tuple[str, str]],
@@ -63,6 +74,7 @@ def generate(
         temperature=temperature,
         num_beams=1,
         repetition_penalty=repetition_penalty,
+        eos_token_id=[tok.eos_token_id],
     )
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
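
A note on the new import-time install: FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE tells flash-attn's build to skip compiling the CUDA kernels, a common workaround on Spaces. Worth knowing: subprocess.run's env= argument replaces the child's entire environment rather than extending it, and the committed call ignores the install's exit status. A minimal, more defensive variant:

```python
import os
import subprocess

# Merge into the current environment instead of replacing it (env= replaces
# the child's whole environment, which would otherwise drop PATH and friends),
# and let a failed install raise instead of continuing without flash-attn.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
    check=True,
)
```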
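On the new (currently unused) 4-bit config: a back-of-envelope, weights-only estimate of why NF4 is attractive for an ~8B-parameter checkpoint, ignoring quantization constants, the KV cache, and activations:

```python
# Rough weights-only footprint at different precisions (8e9 parameters assumed).
params = 8e9
for name, bytes_per_param in [("bf16", 2.0), ("int8", 1.0), ("nf4", 0.5)]:
    print(f"{name}: {params * bytes_per_param / 2**30:.1f} GiB")
# bf16: 14.9 GiB   int8: 7.5 GiB   nf4: 3.7 GiB
```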
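The added eos_token_id=[tok.eos_token_id] references tok, while the visible hunks only define tokenizer, so tok is presumably bound elsewhere in app.py. Since generate() accepts a list of stop ids, a common (hypothetical here) extension for Llama-3-style chat checkpoints is to also stop on the turn terminator <|eot_id|>:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("CardinalOperations/ORLM-LLaMA-3-8B")
# Stop on either the plain EOS token or the Llama-3 chat turn terminator.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]
# e.g. model.generate(..., eos_token_id=terminators)
```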
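Net effect, reconstructed from the + lines above: because quantization_config stays commented out in from_pretrained, the model now loads unquantized with FlashAttention-2, which plausibly explains raising the ZeroGPU window from duration=100 to duration=120. After this commit the setup section reads roughly:

```python
import subprocess
subprocess.run('pip install flash-attn --no-build-isolation',
               env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

# Prepared 4-bit NF4 config; defined but not passed to from_pretrained below.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

model_id = "CardinalOperations/ORLM-LLaMA-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    attn_implementation="flash_attention_2",
    # quantization_config=quantization_config,
)
```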