tangzhy committed
Commit bda33ad
1 Parent(s): b2b7f7a

Update app.py

Files changed (1): app.py (+5 -5)
app.py CHANGED
@@ -29,21 +29,21 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 #     bnb_4bit_compute_dtype=torch.bfloat16,
 #     bnb_4bit_use_double_quant=True,
 #     bnb_4bit_quant_type= "nf4")
-# quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+quantization_config = BitsAndBytesConfig(load_in_8bit=True)
 
 model_id = "CardinalOperations/ORLM-LLaMA-3-8B"
 tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     device_map="auto",
-    torch_dtype=torch.bfloat16,
-    attn_implementation="flash_attention_2",
-    # quantization_config=quantization_config,
+    # torch_dtype=torch.bfloat16,
+    # attn_implementation="flash_attention_2",
+    quantization_config=quantization_config,
 )
 model.eval()
 
 
-@spaces.GPU(duration=100)
+@spaces.GPU(duration=120)
 def generate(
     message: str,
     chat_history: list[tuple[str, str]],
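
In plain terms, this commit switches the Space from bf16 weights with FlashAttention-2 to 8-bit bitsandbytes quantization, and raises the ZeroGPU allocation for generate from 100 to 120 seconds. For reference, here is a minimal sketch of the loading path as it stands after this commit, assuming transformers, accelerate, and bitsandbytes are installed (the spaces import is the Hugging Face Spaces ZeroGPU helper):

    import spaces
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

    # 8-bit weight quantization via bitsandbytes; roughly halves GPU memory
    # versus bf16, at the cost of some dequantization overhead per forward pass.
    quantization_config = BitsAndBytesConfig(load_in_8bit=True)

    model_id = "CardinalOperations/ORLM-LLaMA-3-8B"
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",  # let accelerate place layers on the available GPU
        quantization_config=quantization_config,
    )
    model.eval()

    @spaces.GPU(duration=120)  # request a 120 s ZeroGPU slot per call
    def generate(message: str, chat_history: list[tuple[str, str]]):
        ...

A plausible motivation (not stated in the commit message) is fitting the 8B model within the Space's GPU memory budget, with the longer GPU slot compensating for slower 8-bit inference.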