import time

import gradio as gr
import librosa
import numpy as np  # used by transcribe() below
from transformers import pipeline

########################Llama model###############################
# from transformers import AutoModelForCausalLM, AutoTokenizer

# model_name_or_path = "TheBloke/llama2_7b_chat_uncensored-GPTQ"
# # To use a different branch, change revision
# # For example: revision="main"
# model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
#                                              device_map="auto",
#                                              trust_remote_code=True,
#                                              revision="main",
#                                              #quantization_config=QuantizationConfig(disable_exllama=True)
#                                              )

# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
# Llama_pipe = pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     max_new_tokens=40,
#     do_sample=True,
#     temperature=0.7,
#     top_p=0.95,
#     top_k=40,
#     repetition_penalty=1.1
# )
# history="""User: Hello, Rally?
# Rally: I'm happy to see you again. What do you want to talk about today?
# User: Let's talk about food.
# Rally: Sure.
# User: I'm hungry right now. Do you know any Vietnamese food?"""

# prompt_template = f"""<|im_start|>system
# Write one sentence to continue the conversation<|im_end|>
# {history}
# Rally:"""
# print(Llama_pipe(prompt_template)[0]['generated_text'])

# def RallyRespone(chat_history, message):
#     chat_history += "User: " + message + "\n"
#     t_chat = Llama_pipe(prompt_template)[0]['generated_text']
#     res = t_chat[t_chat.rfind("Rally: "):]
#     return res
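
# NOTE: `respond()` in the Gradio section below calls RallyRespone, but the
# Llama-backed definition above is commented out, so clicking an option button
# would raise a NameError. A minimal placeholder (an assumption, not the original
# behaviour) keeps the UI usable until the Llama pipeline is re-enabled:
def RallyRespone(chat_history, message):
    # Echo-style fallback reply; replace with the Llama_pipe-based version above.
    return "Rally: I heard you say \"" + message + "\"."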

########################ASR model###############################

from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor

# Speech2Text encoder-decoder ASR model; moved to the GPU (assumes CUDA is available)
model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr").to("cuda")
processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr", do_upper_case=True)
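
# `transcribe()` in the Gradio section below uses an ASR pipeline named `transcriber`,
# which is never defined in this file. A minimal sketch, assuming a small Whisper
# checkpoint (swap in whichever model the Space actually uses):
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")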

def RallyListen(audio):
    # `audio` is a 1-D float waveform sampled at 16 kHz; the processor extracts filter-bank features
    features = processor(audio, sampling_rate=16000, padding=True, return_tensors="pt")
    input_features = features.input_features.to("cuda")
    attention_mask = features.attention_mask.to("cuda")

    # Generate token IDs and decode them back to (upper-cased) text
    gen_tokens = model.generate(input_features=input_features, attention_mask=attention_mask)
    ret = processor.batch_decode(gen_tokens, skip_special_tokens=True)
    return ret
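
# Quick sanity check for the ASR model (a sketch; assumes a local file "sample.wav"
# exists — librosa resamples it to 16 kHz on load):
# waveform, _ = librosa.load("sample.wav", sr=16000)
# print(RallyListen(waveform))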



########################Gradio UI###############################
    
# Speech-to-text chat demo: record or upload audio, show the transcription on the two
# option buttons, then click a button to send that text to the chatbot.
def add_file(files):
    return files.name

def print_like_dislike(x: gr.LikeData):
    print(x.index, x.value, x.liked)

def upfile(files):
    # librosa.load returns (waveform, sample_rate); resample to the 16 kHz the ASR model expects
    waveform, _ = librosa.load(files, sr=16000)
    text = RallyListen(waveform)
    # show the same transcription on both option buttons
    return [text[0], text[0]]

def transcribe(audio):
    # gr.Audio (microphone) yields a (sample_rate, waveform) tuple
    sr, y = audio
    y = y.astype(np.float32)
    y /= np.max(np.abs(y))

    # run the ASR pipeline once and reuse the text for both option buttons
    text = transcriber({"sampling_rate": sr, "raw": y})["text"]
    return text, text


# def recommand(text):
#     ret = "answer for"

#     return ret + text

def add_text(history, text):
    history = history + [(text, None)]
    return history, gr.Textbox(value="", interactive=False)

# def bot(history):
#     response = "**That's cool!**"
#     history[-1][1] = ""
#     for character in response:
#         history[-1][1] += character
#         time.sleep(0.05)
#         yield history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(
        [],
        elem_id="chatbot",
        bubble_full_width=False,
    )
    file_output = gr.File()

    def respond(message, chat_history):
        # `message` is the text of the clicked option button; RallyRespone produces the bot reply
        bot_message = RallyRespone(chat_history, message)

        chat_history.append((message, bot_message))
        time.sleep(2)
        print(chat_history[-1])
        return chat_history[-1][-1], chat_history

    with gr.Row():
        with gr.Column():
            audio_speech = gr.Audio(sources=["microphone"])
            submit = gr.Button("Submit")
            send = gr.Button("Send")
            btn = gr.UploadButton("📁", file_types=["audio"])

        with gr.Column():
            opt1 = gr.Button("1: ")
            opt2 = gr.Button("2: ")

    #submit.click(translate, inputs=audio_speech, outputs=[opt1, opt2])
    # output is opt1 value, opt2 value [ , ]

    file_msg = btn.upload(add_file, btn, file_output)
    submit.click(upfile, inputs=file_output, outputs=[opt1, opt2])
    send.click(transcribe, inputs=audio_speech, outputs=[opt1, opt2])
    opt1.click(respond, [opt1, chatbot], [opt1, chatbot])

    opt2.click(respond, [opt2, chatbot], [opt2, chatbot])

    #opt2.click(recommand, inputs=opt2)
    # The click event could instead call the bot directly, building the history from the clicked option's value

    chatbot.like(print_like_dislike, None, None)

if __name__ == "__main__":
    demo.queue()
    demo.launch(debug=True)