ffreemt committed on
Commit 9314f7d
1 Parent(s): 896d8df

Update refactor

Files changed (3)
  1. .flake8 +21 -0
  2. .gitignore +1 -0
  3. app.py +125 -29
.flake8 ADDED
@@ -0,0 +1,21 @@
+ [flake8]
+ ignore =
+     # E203 whitespace before ':'
+     E203
+     D203
+     # line too long
+     E501
+ per-file-ignores =
+     # imported but unused
+     # __init__.py: F401
+     test_*.py: F401
+ exclude =
+     .git
+     __pycache__
+     docs/source/conf.py
+     old
+     build
+     dist
+     .venv
+     pad*.py app-.py
+ max-complexity = 25
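
With the per-file-ignores rule above, an unused import in a module matching test_*.py is not flagged, while the same import elsewhere still is. A hypothetical file for illustration (not part of the commit):

    # test_example.py -- F401 (imported but unused) is suppressed here by per-file-ignores
    import os


    def test_nothing():
        assert True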
.gitignore ADDED
@@ -0,0 +1 @@
+ call-activate.bat
app.py CHANGED
@@ -1,14 +1,19 @@
  """Run codes"""
  # import gradio

  # gradio.load("models/WizardLM/WizardCoder-15B-V1.0").launch()
  import os
  import time
  from types import SimpleNamespace

  import gradio as gr
  from about_time import about_time
- from ctransformers import AutoConfig, AutoModelForCausalLM
  from huggingface_hub import hf_hub_download
  from loguru import logger

@@ -24,6 +29,11 @@ ns = SimpleNamespace(
      generator=[],
  )


  def predict(prompt, bot):
      # logger.debug(f"{prompt=}, {bot=}, {timeout=}")
@@ -33,7 +43,12 @@ def predict(prompt, bot):
      with about_time() as atime:  # type: ignore
          try:
              # user_prompt = prompt
-             generator = generate(llm, generation_config, system_prompt, prompt.strip())
              print(assistant_prefix, end=" ", flush=True)

              response = ""
@@ -67,7 +82,21 @@ def predict_api(prompt):
      ns.response = ""
      try:
          # user_prompt = prompt
-         generator = generate(llm, generation_config, system_prompt, prompt.strip())
          print(assistant_prefix, end=" ", flush=True)

          response = ""
@@ -98,6 +127,50 @@ def download_quant(destination_folder: str, repo_id: str, model_filename: str):
      )


  logger.info("start dl")
  _ = """full url: https://huggingface.co/TheBloke/mpt-30B-chat-GGML/blob/main/mpt-30b-chat.ggmlv0.q4_1.bin"""

@@ -109,16 +182,16 @@ mpt-30b-chat.ggmlv0.q5_0.bin q5_0 5 20.60 GB 23.10 GB
  mpt-30b-chat.ggmlv0.q5_1.bin q5_1 5 22.47 GB 24.97 GB
  mpt-30b-chat.ggmlv0.q8_0.bin q8_0 8 31.83 GB 34.33 GB
  """
- model_filename = "mpt-30b-chat.ggmlv0.q4_1.bin"
- model_filename = "WizardCoder-15B-1.0.ggmlv3.q4_0.bin"  # 10.7G
- model_filename = "WizardCoder-15B-1.0.ggmlv3.q4_1.bin"  # 11.9G
- destination_folder = "models"

- repo_id = "TheBloke/mpt-30B-chat-GGML"
- if "WizardCoder" in model_filename:
-     repo_id = "TheBloke/WizardCoder-15B-1.0-GGML"
-
- download_quant(destination_folder, repo_id, model_filename)

  logger.info("done dl")

@@ -131,16 +204,40 @@ logger.info("done dl")
  # )

  # https://huggingface.co/spaces/matthoffner/wizardcoder-ggml/blob/main/main.py
- if "WizardCoder" in model_filename:
-     llm = AutoModelForCausalLM.from_pretrained("TheBloke/WizardCoder-15B-1.0-GGML",
-         model_file="",
-         model_type="starcoder",
-         threads=8)
-
- default_system_prompt = "A conversation between a user and an LLM-based AI assistant named Local Assistant. Local Assistant gives helpful and honest answers."

- user_prefix = "[user]: "
- assistant_prefix = "[assistant]: "

  css = """
  .importantButton {
@@ -157,7 +254,7 @@ css = """

  with gr.Blocks(
      # title="mpt-30b-chat-ggml",
-     title=f"{model_filename}",
      theme=gr.themes.Soft(text_size="sm", spacing_size="sm"),
      css=css,
  ) as block:
@@ -166,7 +263,7 @@ with gr.Blocks(
      # """<center><a href="https://huggingface.co/spaces/mikeee/mpt-30b-chat?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate"></a> and spin a CPU UPGRADE to avoid the queue</center>"""
      # )
      gr.Markdown(
-         f"""<h4><center>{model_filename}</center></h4>

      Most examples are meant for another model. You probably should try
      some coder-related prompts.
@@ -177,17 +274,17 @@ with gr.Blocks(
          """,
          elem_classes="xsmall",
      )
-     conversation = Chat()
      chatbot = gr.Chatbot(scroll_to_output=True).style(height=700)  # 500
      buff = gr.Textbox(show_label=False)
      with gr.Row():
-         with gr.Column(scale=1):
              msg = gr.Textbox(
                  label="Chat Message Box",
                  placeholder="Ask me anything (press Enter or click Submit to send)",
                  show_label=False,
              ).style(container=False)
-         with gr.Column(scale=0.1):
              with gr.Row():
                  submit = gr.Button("Submit", elem_classes="xsmall")
                  stop = gr.Button("Stop", visible=False)
@@ -212,7 +309,7 @@ with gr.Blocks(
          examples=[
              ["js 判断一个数是不是质数"],
              ["js 实现python 的 range(10)"],
-             ["js 实现python 的 [*(range(10)]"],
              ["Explain the plot of Cinderella in a sentence."],
              [
                  "How long does it take to become proficient in French, and what are the best methods for retaining information?"
@@ -244,7 +341,7 @@ with gr.Blocks(

      # with gr.Row():
      with gr.Accordion("Disclaimer", open=False):
-         _ = "-".join(model_filename.split("-")[:2])
          gr.Markdown(
              f"Disclaimer: {_} can produce factually incorrect output, and should not be relied on to produce "
              "factually accurate information. {_} was trained on various public datasets; while great efforts "
@@ -292,4 +389,3 @@ with gr.Blocks(
  # concurrency_count=5, max_size=20
  # max_size=36, concurrency_count=14
  block.queue(concurrency_count=5, max_size=20).launch(debug=True)
-
 
  """Run codes"""
+ # pylint: disable=line-too-long, broad-exception-caught, invalid-name, missing-function-docstring, too-many-instance-attributes, missing-class-docstring
  # import gradio

  # gradio.load("models/WizardLM/WizardCoder-15B-V1.0").launch()
+
  import os
  import time
+ from dataclasses import asdict, dataclass
  from types import SimpleNamespace

  import gradio as gr
  from about_time import about_time
+
+ # from ctransformers import AutoConfig, AutoModelForCausalLM
+ from ctransformers import AutoModelForCausalLM
  from huggingface_hub import hf_hub_download
  from loguru import logger

      generator=[],
  )

+ default_system_prompt = "A conversation between a user and an LLM-based AI assistant named Local Assistant. Local Assistant gives helpful and honest answers."
+
+ user_prefix = "[user]: "
+ assistant_prefix = "[assistant]: "
+

  def predict(prompt, bot):
      # logger.debug(f"{prompt=}, {bot=}, {timeout=}")
 
      with about_time() as atime:  # type: ignore
          try:
              # user_prompt = prompt
+             generator = generate(
+                 LLM,
+                 GENERATION_CONFIG,
+                 system_prompt=default_system_prompt,
+                 user_prompt=prompt.strip(),
+             )
              print(assistant_prefix, end=" ", flush=True)

              response = ""
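
GENERATION_CONFIG (defined further down in this commit) is built with stream=True, so generate() here yields the reply token by token. The loop that drains the generator lives in an unchanged part of predict() and does not appear in the diff; a minimal sketch of how such a stream is typically consumed:

    response = ""
    for word in generator:
        print(word, end="", flush=True)  # echo each token as it arrives
        response += word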
 
      ns.response = ""
      try:
          # user_prompt = prompt
+         _ = GenerationConfig(
+             temperature=0.2,
+             top_k=0,
+             top_p=0.9,
+             repetition_penalty=1.0,
+             max_new_tokens=512,  # adjust as needed
+             seed=42,
+             reset=False,  # reset history (cache)
+             stream=False,  # streaming per word/token
+             threads=os.cpu_count() // 2,  # type: ignore # adjust for your CPU
+             stop=["<|im_end|>", "|<"],
+         )
+         generator = generate(
+             LLM, _, system_prompt=default_system_prompt, user_prompt=prompt.strip()
+         )
          print(assistant_prefix, end=" ", flush=True)

          response = ""
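
This per-request config uses stream=False, so the ctransformers call wrapped by generate() should return the whole completion as one string rather than a token iterator; the name generator is kept only for symmetry with predict(). Roughly, and purely as an illustration:

    text = generate(LLM, _, user_prompt=prompt.strip())  # a plain str when stream=False
    ns.response = text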
 
      )


+ @dataclass
+ class GenerationConfig:
+     temperature: float
+     top_k: int
+     top_p: float
+     repetition_penalty: float
+     max_new_tokens: int
+     seed: int
+     reset: bool
+     stream: bool
+     threads: int
+     stop: list[str]
+
+
+ def format_prompt(system_prompt: str, user_prompt: str):
+     """Format prompt based on: https://huggingface.co/spaces/mosaicml/mpt-30b-chat/blob/main/app.py.
+
+     May need to be modified for WizardCoder: TODO
+     """
+
+     system_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
+     user_prompt = f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
+     assistant_prompt = "<|im_start|>assistant\n"
+
+     return f"{system_prompt}{user_prompt}{assistant_prompt}"
+
+
+ def generate(
+     llm: AutoModelForCausalLM,
+     generation_config: GenerationConfig,
+     system_prompt: str = default_system_prompt,
+     user_prompt: str = "",
+ ):
+     """Run model inference, will return a Generator if streaming is true"""
+     # if not user_prompt.strip():
+     return llm(
+         format_prompt(
+             system_prompt,
+             user_prompt,
+         ),
+         **asdict(generation_config),
+     )
+
+
  logger.info("start dl")
  _ = """full url: https://huggingface.co/TheBloke/mpt-30B-chat-GGML/blob/main/mpt-30b-chat.ggmlv0.q4_1.bin"""

  mpt-30b-chat.ggmlv0.q5_1.bin q5_1 5 22.47 GB 24.97 GB
  mpt-30b-chat.ggmlv0.q8_0.bin q8_0 8 31.83 GB 34.33 GB
  """
+ MODEL_FILENAME = "mpt-30b-chat.ggmlv0.q4_1.bin"
+ MODEL_FILENAME = "WizardCoder-15B-1.0.ggmlv3.q4_0.bin"  # 10.7G
+ MODEL_FILENAME = "WizardCoder-15B-1.0.ggmlv3.q4_1.bin"  # 11.9G
+ DESTINATION_FOLDER = "models"
+
+ REPO_ID = "TheBloke/mpt-30B-chat-GGML"
+ if "WizardCoder" in MODEL_FILENAME:
+     REPO_ID = "TheBloke/WizardCoder-15B-1.0-GGML"

+ download_quant(DESTINATION_FOLDER, REPO_ID, MODEL_FILENAME)

  logger.info("done dl")

  # )

  # https://huggingface.co/spaces/matthoffner/wizardcoder-ggml/blob/main/main.py
+ _ = """
+ llm = AutoModelForCausalLM.from_pretrained(
+     "TheBloke/WizardCoder-15B-1.0-GGML",
+     model_file="",
+     model_type="starcoder",
+     threads=8
+ )
+ # """
+ if "WizardCoder" in MODEL_FILENAME:
+     LLM = AutoModelForCausalLM.from_pretrained(
+         "TheBloke/WizardCoder-15B-1.0-GGML",
+         model_file=MODEL_FILENAME,
+         model_type="starcoder",
+         threads=os.cpu_count() // 2,  # type: ignore
+     )
+ LLM = AutoModelForCausalLM.from_pretrained(
+     "TheBloke/WizardCoder-15B-1.0-GGML",
+     model_file="",
+     model_type="starcoder",
+     threads=os.cpu_count() // 2  # type: ignore
+ )

+ GENERATION_CONFIG = GenerationConfig(
+     temperature=0.2,
+     top_k=0,
+     top_p=0.9,
+     repetition_penalty=1.0,
+     max_new_tokens=512,  # adjust as needed
+     seed=42,
+     reset=False,  # reset history (cache)
+     stream=True,  # streaming per word/token
+     threads=os.cpu_count() // 2,  # type: ignore # adjust for your CPU
+     stop=["<|im_end|>", "|<"],
+ )
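
The object returned by ctransformers' AutoModelForCausalLM.from_pretrained is itself callable, which is what the generate() helper above relies on: it passes the formatted prompt plus the GenerationConfig fields as keyword arguments. A rough, illustrative call (not part of the commit):

    text = LLM(
        format_prompt(default_system_prompt, "Write hello world in Python"),
        max_new_tokens=64,
        stream=False,  # return the whole completion at once
    )
    print(text)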

  css = """
  .importantButton {
 

  with gr.Blocks(
      # title="mpt-30b-chat-ggml",
+     title=f"{MODEL_FILENAME}",
      theme=gr.themes.Soft(text_size="sm", spacing_size="sm"),
      css=css,
  ) as block:
 
      # """<center><a href="https://huggingface.co/spaces/mikeee/mpt-30b-chat?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate"></a> and spin a CPU UPGRADE to avoid the queue</center>"""
      # )
      gr.Markdown(
+         f"""<h4><center>{MODEL_FILENAME}</center></h4>

      Most examples are meant for another model. You probably should try
      some coder-related prompts.
 
          """,
          elem_classes="xsmall",
      )
+
      chatbot = gr.Chatbot(scroll_to_output=True).style(height=700)  # 500
      buff = gr.Textbox(show_label=False)
      with gr.Row():
+         with gr.Column(scale=4):
              msg = gr.Textbox(
                  label="Chat Message Box",
                  placeholder="Ask me anything (press Enter or click Submit to send)",
                  show_label=False,
              ).style(container=False)
+         with gr.Column(scale=1):
              with gr.Row():
                  submit = gr.Button("Submit", elem_classes="xsmall")
                  stop = gr.Button("Stop", visible=False)
 
          examples=[
              ["js 判断一个数是不是质数"],
              ["js 实现python 的 range(10)"],
+             ["js 实现python 的 [*(range(10)]"],
              ["Explain the plot of Cinderella in a sentence."],
              [
                  "How long does it take to become proficient in French, and what are the best methods for retaining information?"
 

      # with gr.Row():
      with gr.Accordion("Disclaimer", open=False):
+         _ = "-".join(MODEL_FILENAME.split("-")[:2])
          gr.Markdown(
              f"Disclaimer: {_} can produce factually incorrect output, and should not be relied on to produce "
              "factually accurate information. {_} was trained on various public datasets; while great efforts "
 
  # concurrency_count=5, max_size=20
  # max_size=36, concurrency_count=14
  block.queue(concurrency_count=5, max_size=20).launch(debug=True)