vilarin committed
Commit 3eed0af
1 Parent(s): 93fdc72

Update app.py

Files changed (1)
  1. app.py +60 -25
app.py CHANGED
@@ -2,22 +2,18 @@ import os
 import time
 #import spaces
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import gradio as gr
+from threading import Thread
 
-MODEL_LIST = ["openbmb/MiniCPM-1B-sft-bf16", "openbmb/MiniCPM-S-1B-sft"]
+MODEL_LIST = ["HuggingFaceTB/SmolLM-1.7B-Instruct", "HuggingFaceTB/SmolLM-135M-Instruct", "HuggingFaceTB/SmolLM-360M-Instruct"]
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
-MODEL_ID = os.environ.get("MODEL_ID", None)
-MODEL_NAME = MODEL_ID.split("/")[-1]
 
-TITLE = "<h1><center>MiniCPM-S-1B-chat</center></h1>"
+TITLE = "<h1><center>SmolLM-Instruct</center></h1>"
 
-DESCRIPTION = f"""
-<h3>MODEL NOW: <a href="https://hf.co/{MODEL_ID}">{MODEL_NAME}</a></h3>
-"""
 PLACEHOLDER = """
 <center>
-<p>MiniCPM is an End-Size LLM with only 1.2B parameters excluding embeddings.</p>
+<p>SmolLM is a series of state-of-the-art small language models available in three sizes: 135M, 360M, and 1.7B parameters.</p>
 </center>
 """
 
@@ -34,13 +30,22 @@ h3 {
 }
 """
 
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.bfloat16,
-    device_map='auto',
-    low_cpu_mem_usage=True,
-    trust_remote_code=True)
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
+# pip install transformers
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+device = "cpu"  # "cuda" for GPU usage or "cpu" for CPU usage
+
+tokenizer0 = AutoTokenizer.from_pretrained(MODEL_LIST[0])
+model0 = AutoModelForCausalLM.from_pretrained(MODEL_LIST[0]).to(device)
+
+tokenizer1 = AutoTokenizer.from_pretrained(MODEL_LIST[1])
+model1 = AutoModelForCausalLM.from_pretrained(MODEL_LIST[1]).to(device)
+
+tokenizer2 = AutoTokenizer.from_pretrained(MODEL_LIST[2])
+model2 = AutoModelForCausalLM.from_pretrained(MODEL_LIST[2]).to(device)
+
+messages = [{"role": "user", "content": "List the steps to bake a chocolate cake from scratch."}]
+
 
 #@spaces.GPU()
 def stream_chat(
@@ -50,7 +55,8 @@ def stream_chat(
     max_new_tokens: int = 1024,
     top_p: float = 1.0,
     top_k: int = 20,
-    penalty: float = 1.2
+    penalty: float = 1.2,
+    choice: str = "1.7B"
 ):
     print(f'message: {message}')
     print(f'history: {history}')
@@ -61,26 +67,49 @@
             {"role": "user", "content": prompt},
             {"role": "assistant", "content": answer},
         ])
+
+    conversation.append({"role": "user", "content": message})
+
+    if choice == "1.7B":
+        tokenizer = tokenizer0
+        model = model0
+    elif choice == "135M":
+        model = model1
+        tokenizer = tokenizer1
+    else:
+        model = model2
+        tokenizer = tokenizer2
+
+    input_text = tokenizer.apply_chat_template(conversation, tokenize=False)
+    inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
+    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
 
-    torch.manual_seed(0)
-    resp, history = model.chat(
-        tokenizer,
-        query = message,
-        history = conversation,
-        max_length = max_new_tokens,
+    generate_kwargs = dict(
+        input_ids = inputs,
+        max_new_tokens = max_new_tokens,
         do_sample = False if temperature == 0 else True,
         top_p = top_p,
         top_k = top_k,
         temperature = temperature,
+        streamer = streamer,
     )
-    return resp
 
+    with torch.no_grad():
+        thread = Thread(target=model.generate, kwargs=generate_kwargs)
+        thread.start()
+
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text
+        yield buffer
+
+
+    #print(tokenizer.decode(outputs[0]))
 
 chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
 
 with gr.Blocks(css=CSS, theme="soft") as demo:
     gr.HTML(TITLE)
-    gr.HTML(DESCRIPTION)
     gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
     gr.ChatInterface(
         fn=stream_chat,
@@ -128,6 +157,12 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
                 label="Repetition penalty",
                 render=False,
            ),
+            gr.Radio(
+                ["135M", "360M", "1.7B"],
+                value="1.7B",
+                label="Load Model",
+                render=False,
+            ),
         ],
         examples=[
            ["Help me study vocabulary: write a sentence for me to fill in the blank, and I'll try to pick the correct option."],