vilarin committed
Commit: 1d4c579
Parent: 4d71d31

Update app.py

Files changed (1): app.py (+9, -2)
app.py CHANGED

@@ -1,3 +1,9 @@
+import subprocess
+subprocess.run(
+    'pip install flash-attn --no-build-isolation',
+    env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
+    shell=True
+)
 import os
 import time
 import spaces
@@ -37,6 +43,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL)
 model = AutoModelForCausalLM.from_pretrained(
     MODEL,
     torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
     device_map="auto",
     ignore_mismatched_sizes=True)
 
@@ -44,7 +51,7 @@ model = AutoModelForCausalLM.from_pretrained(
 def stream_chat(
     message: str,
     history: list,
-    temperature: float = 0.35,
+    temperature: float = 0.3,
     max_new_tokens: int = 1024,
     top_p: float = 1.0,
     top_k: int = 20,
@@ -101,7 +108,7 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
     minimum=0,
     maximum=1,
     step=0.1,
-    value=0.35,
+    value=0.3,
     label="Temperature",
     render=False,
     ),
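A note on the install step: passing env={...} to subprocess.run replaces the child process's entire environment instead of extending it, so the pip command above runs with only FLASH_ATTENTION_SKIP_CUDA_BUILD set (it usually still works because POSIX shells fall back to a default PATH when none is inherited). A variant that preserves the inherited environment, offered here as a sketch rather than as part of the commit, would merge the flag into os.environ:

import os
import subprocess

# Sketch (not part of this commit): merge the flag into the inherited
# environment so pip still sees PATH, CUDA_HOME, and the rest.
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': 'TRUE'},
    shell=True,
)

FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE tells the flash-attn build to skip compiling its CUDA extension, a common workaround on Hugging Face Spaces where building from source can exceed the startup budget. Relatedly, attn_implementation="flash_attention_2" makes from_pretrained fail outright if flash-attn did not install, or on GPUs older than Ampere; a guarded variant, again only a sketch and not what the commit does, could fall back to PyTorch's built-in scaled-dot-product attention:

# Sketch (assumption, not in the commit): prefer FlashAttention-2 and
# fall back to the transformers "sdpa" implementation if it is missing.
try:
    import flash_attn  # noqa: F401
    ATTN_IMPL = "flash_attention_2"
except ImportError:
    ATTN_IMPL = "sdpa"

model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    torch_dtype=torch.bfloat16,
    attn_implementation=ATTN_IMPL,
    device_map="auto",
    ignore_mismatched_sizes=True)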