MS-YUN committed
Commit a16c971 • 1 Parent(s): bd9aba8

Add application file3

Files changed (3)
  1. .gitignore +1 -0
  2. app.py +158 -0
  3. requirements.txt +3 -0
.gitignore ADDED
@@ -0,0 +1 @@
+*.ipynb
app.py ADDED
@@ -0,0 +1,158 @@
+import json
+import time
+
+import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_name_or_path = "TheBloke/Llama-2-7b-Chat-GPTQ"
+
+# Load the GPTQ-quantized model and its tokenizer.
+model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
+                                             device_map="auto",
+                                             trust_remote_code=False,
+                                             revision="main")
+
+tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
+
+
+def predict(message, chatbot, temperature=0.9, max_new_tokens=256, top_p=0.6, repetition_penalty=1.0):
+    system_message = "\nYou are a helpful, respectful and honest Assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Your answers should be socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something incorrect. If you don't know the answer to a question, please don't share false information."
+    input_system = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n "
+
+    # Replay the conversation history in the Llama-2 chat format,
+    # accumulating every past (user, assistant) turn.
+    input_history = input_system
+    for user_turn, bot_turn in chatbot:
+        input_history += str(user_turn) + " [/INST] " + str(bot_turn) + " </s><s> [INST] "
+
+    input_prompt = input_history + str(message) + " [/INST] "
+
+    inputs = tokenizer.encode(input_prompt, return_tensors="pt").to('cuda')
+
+    # Clamp the temperature away from zero to avoid degenerate sampling.
+    temperature = max(float(temperature), 1e-2)
+    top_p = float(top_p)
+
+    generate_kwargs = dict(
+        input_ids=inputs,
+        temperature=temperature,
+        top_p=top_p,
+        max_new_tokens=max_new_tokens,
+        repetition_penalty=repetition_penalty,
+    )
+
+    outputs = model.generate(**generate_kwargs)
+    generated_included_full_text = tokenizer.decode(outputs[0])
+    print("generated_included_full_text:", generated_included_full_text)
+
+    # Keep only the newly generated answer: the text after the last
+    # "[/INST] " marker, truncated at the end-of-sequence token.
+    generated_text = generated_included_full_text.split('[/INST] ')[-1]
+    if '</s>' in generated_text:
+        generated_text = generated_text.split('</s>')[0]
+
+    # Wrap each output line in a token dict, mimicking a JSON API response.
+    tokens = generated_text.split('\n')
+    token_list = [{"id": idx + 1, "text": token} for idx, token in enumerate(tokens)]
+    response = json.loads(json.dumps({"data": {"token": token_list}}, indent=4))
+    token_list = response.get('data', {}).get('token', [])
+
+    # Stream the answer back to the UI character by character.
+    partial_message = ""
+    for token_entry in token_list:
+        if token_entry:
+            try:
+                token_text = token_entry.get('text', None)
+
+                if token_text:
+                    for char in token_text:
+                        partial_message += char
+                        yield partial_message
+                        time.sleep(0.01)
+                else:
+                    gr.Warning(f"The key 'text' does not exist or is None in this token entry: {token_entry}")
+
+            except KeyError as e:
+                gr.Warning(f"KeyError: {e} occurred for token entry: {token_entry}")
+                continue
+
+
+title = "TheBloke/Llama-2-7b-Chat-GPTQ model chatbot"
+
+description = """
+This is the TheBloke/Llama-2-7b-Chat-GPTQ model.
+"""
+css = """.toast-wrap { display: none !important } """
+examples = [
+    ['Hello there! How are you doing?'],
+    ['Can you explain to me briefly what is Python programming language?'],
+    ['Explain the plot of Cinderella in a sentence.'],
+    ['How many hours does it take a man to eat a Helicopter?'],
+    ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
+]
+
+
+def vote(data: gr.LikeData):
+    # Log like/dislike feedback from the chatbot UI.
+    if data.liked:
+        print("You upvoted this response: " + data.value)
+    else:
+        print("You downvoted this response: " + data.value)
+
+
+additional_inputs = [
+    gr.Slider(
+        label="Temperature",
+        value=0.9,
+        minimum=0.0,
+        maximum=1.0,
+        step=0.05,
+        interactive=True,
+        info="Higher values produce more diverse outputs",
+    ),
+    gr.Slider(
+        label="Max new tokens",
+        value=256,
+        minimum=0,
+        maximum=4096,
+        step=64,
+        interactive=True,
+        info="The maximum number of new tokens",
+    ),
+    gr.Slider(
+        label="Top-p (nucleus sampling)",
+        value=0.6,
+        minimum=0.0,
+        maximum=1.0,
+        step=0.05,
+        interactive=True,
+        info="Higher values sample more low-probability tokens",
+    ),
+    gr.Slider(
+        label="Repetition penalty",
+        value=1.2,
+        minimum=1.0,
+        maximum=2.0,
+        step=0.05,
+        interactive=True,
+        info="Penalize repeated tokens",
+    ),
+]
+
+chatbot_stream = gr.Chatbot(avatar_images=('user.png', 'bot2.png'), bubble_full_width=False)
+chat_interface_stream = gr.ChatInterface(predict,
+                                         title=title,
+                                         description=description,
+                                         chatbot=chatbot_stream,
+                                         css=css,
+                                         examples=examples,
+                                         cache_examples=False,
+                                         additional_inputs=additional_inputs)
+
+with gr.Blocks() as demo:
+
+    with gr.Tab("Streaming"):
+        chatbot_stream.like(vote, None, None)
+        chat_interface_stream.render()
+
+demo.queue(concurrency_count=75, max_size=100).launch(debug=True)
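
For reference, here is a minimal sketch of the prompt string that predict assembles, assuming a hypothetical system prompt and one prior (user, assistant) exchange in the chatbot history; the <<SYS>> block and the [INST] ... [/INST] markers follow the Llama-2 chat convention:

# A minimal sketch of the prompt format predict builds.
# system_message, history, and message are hypothetical stand-ins.
system_message = "You are a helpful assistant."
history = [("Hi!", "Hello! How can I help?")]
message = "Tell me a joke."

prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n "
for user_turn, bot_turn in history:
    prompt += f"{user_turn} [/INST] {bot_turn} </s><s> [INST] "
prompt += f"{message} [/INST] "

print(prompt)
# [INST] <<SYS>>
# You are a helpful assistant.
# <</SYS>>
#
#  Hi! [/INST] Hello! How can I help? </s><s> [INST] Tell me a joke. [/INST]
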
requirements.txt ADDED
@@ -0,0 +1,3 @@
+torch
+transformers
+gradio
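
Note: the pinned packages above may not be sufficient on their own. Loading TheBloke's GPTQ checkpoints through transformers typically also requires the optimum and auto-gptq packages; that extra requirement is an assumption based on common GPTQ setups, not something this commit states. A quick sanity check, with the two package names as assumptions:

# Check for the GPTQ backend packages (optimum and auto-gptq are assumed here).
import importlib.util

for module, pip_name in [("optimum", "optimum"), ("auto_gptq", "auto-gptq")]:
    if importlib.util.find_spec(module) is None:
        print(f"Missing GPTQ dependency: install with `pip install {pip_name}`")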