Spaces: CMLL
Running on Zero

CMLL committed
Commit d71ad7e · 1 parent: 220ce3a

Update app.py

Files changed (1): app.py (+38, −38)
app.py CHANGED
@@ -5,22 +5,22 @@ from typing import Iterator
 import gradio as gr
 import spaces
 import torch
-from transformers import pipeline, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 DESCRIPTION = """\
-# ZhongJing 2 1.8B Merge
-This Space demonstrates model [CMLL/ZhongJing-2-1_8b-merge](https://huggingface.co/CMLL/ZhongJing-2-1_8b-merge) for text generation. Feel free to play with it, or duplicate to run generations without a queue! If you want to run your own service, you can also [deploy the model on Inference Endpoints](https://huggingface.co/inference-endpoints).
+# ZhongJing-2-1_8b-merge
+This Space demonstrates model [ZhongJing-2-1_8b-merge](https://huggingface.co/CMLL/ZhongJing-2-1_8b-merge) by CMLL, a powerful model for TCM-related applications. Feel free to play with it, or duplicate to run generations without a queue!
 """
 
 LICENSE = """
 <p/>
 ---
-As a derivative work of [CMLL/ZhongJing-2-1_8b-merge](https://huggingface.co/CMLL/ZhongJing-2-1_8b-merge),
-this demo is governed by the original [license](https://huggingface.co/CMLL/ZhongJing-2-1_8b-merge/LICENSE).
+As a derivative work of [ZhongJing-2-1_8b-merge](https://huggingface.co/CMLL/ZhongJing-2-1_8b-merge) by CMLL,
+this demo is governed by the original [license](https://huggingface.co/CMLL/ZhongJing-2-1_8b-merge/blob/main/LICENSE.txt) and [acceptable use policy](https://huggingface.co/CMLL/ZhongJing-2-1_8b-merge/blob/main/USE_POLICY.md).
 """
 
 if not torch.cuda.is_available():
@@ -28,7 +28,7 @@ if not torch.cuda.is_available():
 
 if torch.cuda.is_available():
     model_id = "CMLL/ZhongJing-2-1_8b-merge"
-    pipe = pipeline("text-generation", model=model_id)
+    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     tokenizer.use_default_system_prompt = False
 
@@ -36,50 +36,50 @@ if torch.cuda.is_available():
 def generate(
     message: str,
     chat_history: list[tuple[str, str]],
-    system_prompt: str = "You are a helpful TCM medical assistant named 仲景中医大语言模型, created by 医哲未来.",
+    system_prompt: str,
     max_new_tokens: int = 1024,
     temperature: float = 0.6,
     top_p: float = 0.9,
     top_k: int = 50,
     repetition_penalty: float = 1.2,
 ) -> Iterator[str]:
-    conversation = [{"role": "system", "content": system_prompt}]
+    conversation = []
+    if system_prompt:
+        conversation.append({"role": "system", "content": system_prompt})
     for user, assistant in chat_history:
         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
     conversation.append({"role": "user", "content": message})
 
-    input_text = "\n".join([f"{entry['role']}: {entry['content']}" for entry in conversation])
+    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
+    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+    input_ids = input_ids.to(model.device)
 
-    generate_kwargs = {
-        "max_new_tokens": max_new_tokens,
-        "do_sample": True,
-        "top_p": top_p,
-        "top_k": top_k,
-        "temperature": temperature,
-        "repetition_penalty": repetition_penalty,
-    }
-
-    # Function to run the generation
-    def run_generation():
-        try:
-            results = pipe(input_text, **generate_kwargs)
-            return results
-        except Exception as e:
-            return [f"Error in generation: {e}"]
-
-    # Run generation in a separate thread and wait for it to finish
+    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+    generate_kwargs = dict(
+        {"input_ids": input_ids},
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
+        num_beams=1,
+        repetition_penalty=repetition_penalty,
+    )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+
     outputs = []
-    generation_thread = Thread(target=lambda: outputs.extend(run_generation()))
-    generation_thread.start()
-    generation_thread.join()
-
-    for output in outputs:
-        yield output['generated_text'] if isinstance(output, dict) else output
+    for text in streamer:
+        outputs.append(text)
+        yield "".join(outputs)
 
 
 chat_interface = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
-        gr.Textbox(label="System prompt", lines=6, value="You are a helpful TCM medical assistant named 仲景中医大语言模型, created by 医哲未来."),
+        gr.Textbox(label="System prompt", lines=6),
         gr.Slider(
             label="Max new tokens",
             minimum=1,
@@ -118,11 +118,11 @@ chat_interface = gr.ChatInterface(
     ],
     stop_btn=None,
     examples=[
-        ["Hello there! How are you doing?"],
-        ["Can you explain briefly to me what is the Python programming language?"],
-        ["Explain the plot of Cinderella in a sentence."],
-        ["How many hours does it take a man to eat a Helicopter?"],
-        ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
+        ["你是谁?"],
+        ["你能简要解释一下什么是中医吗?"],
+        ["简述《黄帝内经》的主要内容。"],
+        ["中医如何治疗失眠?"],
+        ["写一篇关于‘AI在中医研究中的应用’的100字文章。"],
    ],
 )
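
For readers tracing the change: the old code ran `pipeline(...)` to completion on a worker thread and only then yielded results, so nothing actually streamed. The new code uses the standard Transformers streaming pattern instead: `model.generate` runs on a background thread while a `TextIteratorStreamer` yields decoded text as it is produced. A minimal, self-contained sketch of that pattern follows; the tiny model id is a placeholder for illustration, not the Space's model.

```python
# Minimal sketch of the threaded streaming pattern adopted in this commit.
# "sshleifer/tiny-gpt2" is a tiny placeholder model for illustration only;
# the Space itself loads CMLL/ZhongJing-2-1_8b-merge.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "sshleifer/tiny-gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

input_ids = tokenizer("Hello", return_tensors="pt").input_ids.to(model.device)

# skip_prompt=True drops the echoed prompt; iterating the streamer blocks until tokens arrive.
streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

# model.generate blocks, so it runs on a worker thread while the main thread consumes the stream.
thread = Thread(
    target=model.generate,
    kwargs=dict(input_ids=input_ids, streamer=streamer, max_new_tokens=32, do_sample=True),
)
thread.start()

pieces = []
for text in streamer:  # yields decoded chunks as they are generated
    pieces.append(text)
print("".join(pieces))
```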
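The commit also swaps the hand-rolled `"role: content"` prompt for `tokenizer.apply_chat_template`, then left-truncates the token ids to the last `MAX_INPUT_TOKEN_LENGTH` positions so a long history keeps its most recent turns. A sketch of just that step, assuming the tokenizer defines a chat template (the Qwen checkpoint below is a placeholder that does):

```python
# Sketch of the prompt-building and truncation step in the new generate().
# The model id is a placeholder with a chat template, for illustration only.
from transformers import AutoTokenizer

MAX_INPUT_TOKEN_LENGTH = 4096

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-1.8B-Chat")
conversation = [
    {"role": "system", "content": "You are a helpful TCM assistant."},
    {"role": "user", "content": "How does TCM treat insomnia?"},
]

# apply_chat_template renders the messages into the model's expected prompt
# format and returns the token ids directly when return_tensors="pt".
input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")

# Left-truncate so the newest turns survive when the history is too long.
if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
    input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
```

One caveat worth noting: the diff calls `apply_chat_template` without `add_generation_prompt=True`, which chat-tuned checkpoints usually need so the rendered prompt ends with the assistant turn marker.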
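Finally, `gr.ChatInterface` treats a generator `fn` as a stream: each yielded string replaces the displayed assistant message, which is why the diff's `generate` yields the accumulated `"".join(outputs)` rather than per-chunk deltas. A stripped-down sketch of that contract, with a hypothetical echo function standing in for the model call:

```python
# Minimal sketch of gr.ChatInterface consuming a streaming generator.
# stream_echo is a hypothetical stand-in for the model call.
import time
from typing import Iterator

import gradio as gr


def stream_echo(message: str, chat_history: list[tuple[str, str]]) -> Iterator[str]:
    partial = ""
    for ch in message:
        partial += ch
        time.sleep(0.02)
        yield partial  # each yield replaces the shown reply, so yield the running total


demo = gr.ChatInterface(fn=stream_echo, examples=["Who are you?", "How does TCM treat insomnia?"])

if __name__ == "__main__":
    demo.launch()
```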