File size: 3,393 Bytes
202030f
d51e19d
 
 
 
 
 
 
 
3721320
d51e19d
1b34aa5
d51e19d
3721320
ed73c38
1b34aa5
61b53d6
 
 
1b34aa5
3721320
1b34aa5
d51e19d
3721320
d51e19d
 
3721320
d51e19d
 
 
 
 
 
86b5bcb
d51e19d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5ba294c
d51e19d
 
 
 
61b53d6
 
 
202030f
d51e19d
 
 
 
86b5bcb
 
70ca8ee
 
61b53d6
3721320
1b34aa5
ed73c38
86b5bcb
1b34aa5
3721320
61b53d6
d51e19d
ed73c38
1b34aa5
d51e19d
 
61b53d6
d51e19d
61b53d6
 
6ee0077
27afca3
 
 
 
 
 
61b53d6
 
d51e19d
 
b4920d8
ed73c38
d51e19d
27afca3
 
 
c8b7fcf
ed73c38
 
f76ff96
27afca3
 
d51e19d
2b54ce7
 
61b53d6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import time
import torch
import spaces
import numpy as np
import gradio as gr
from gtts import gTTS
from transformers import pipeline
from huggingface_hub import InferenceClient

# Model names
ASR_MODEL_NAME = "openai/whisper-small"
LLM_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

# Initial system prompt: one [INST] turn that sets the persona, followed by
# the assistant's canned greeting. The literal must open with exactly three
# quotes; a fourth quote would become a stray leading '"' character in the
# prompt text sent to the model.
system_prompt = """<s>[INST] You are Friday, a helpful and conversational AI assistant, and you respond with one to two sentences. [/INST] Hello there! I'm Friday, how can I help you?</s>"""

# Global variables for history
# instruct_history: raw prompt text handed to the LLM (starts as the system prompt).
# formatted_history: human-readable transcript shown in the UI.
instruct_history = system_prompt
formatted_history = ""

# Inference client used for LLM text generation
client = InferenceClient(LLM_MODEL_NAME)

# Pick where the ASR pipeline runs: CUDA device 0 when available, else the CPU
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Speech-to-text pipeline built on the Whisper checkpoint named above
pipe = pipeline(
    "automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    device=device,
)

def generate(instruct_history, temperature=0.1, max_new_tokens=128, top_p=0.95, repetition_penalty=1.0):
    """Send the running prompt to the LLM client and return its completion.

    Args:
        instruct_history: Full prompt text passed to ``client.text_generation``.
        temperature: Sampling temperature; values below 0.01 are raised to 0.01.
        max_new_tokens: Forwarded unchanged to the client.
        top_p: Nucleus-sampling value, coerced to ``float``.
        repetition_penalty: Forwarded unchanged to the client.

    Returns:
        Whatever ``client.text_generation`` returns for a non-streaming,
        no-details request (the prompt itself is not echoed back).
    """
    # Clamp the temperature to a small positive floor before sampling.
    sampling_temperature = max(float(temperature), 1e-2)
    nucleus_p = float(top_p)

    # Fixed seed keeps sampled output repeatable for an identical prompt.
    return client.text_generation(
        instruct_history,
        temperature=sampling_temperature,
        max_new_tokens=max_new_tokens,
        top_p=nucleus_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
        stream=False,
        details=False,
        return_full_text=False,
    )

@spaces.GPU(duration=60)
def transcribe(audio, past_history):
    """Run one voice turn: speech -> text -> LLM reply -> spoken reply.

    Args:
        audio: ``(sample_rate, samples)`` tuple from the audio input, or
            ``None`` when nothing was recorded. ``samples`` is a NumPy array;
            a 2-D array is treated as ``(n, channels)`` and mixed to mono.
        past_history: Current content of the transcription textbox, i.e. the
            transcript of every earlier turn.

    Returns:
        ``(mp3_path, transcript)``: path of the MP3 file holding the spoken
        reply, and the transcript extended with this turn.

    Raises:
        gr.Error: If no audio (or an empty recording) was supplied.
    """
    global instruct_history, formatted_history

    if audio is None:
        raise gr.Error("No audio received. Please record a message first.")

    # NOTE(review): the purpose of this pause is not evident from the code; kept as-is.
    time.sleep(1)
    sr, y = audio
    if y.size == 0:
        raise gr.Error("The recording is empty. Please record a message first.")

    y = y.astype(np.float32)
    # The ASR pipeline expects a single channel; average multi-channel input.
    if y.ndim > 1:
        y = y.mean(axis=1)
    # Peak-normalise, but leave an all-zero (silent) signal untouched instead
    # of dividing by zero and feeding NaNs to the model.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    transcribed_user_audio = pipe({"sampling_rate": sr, "raw": y})["text"]

    # Build the new state locally and commit it only after the LLM call
    # succeeds, so a failed request does not leave a dangling user turn.
    prompt = instruct_history + f"<s>[INST] {transcribed_user_audio} [/INST] "

    # Generate LLM response
    llm_response = generate(prompt)

    instruct_history = prompt + f" {llm_response}</s>"

    # The textbox already holds the whole transcript, so start from it rather
    # than appending it to the global copy (which duplicated every earlier
    # turn on each call).
    formatted_history = (
        (past_history or "")
        + f"😃 Human: {transcribed_user_audio}\n\n"
        + f"🤖 Friday: {llm_response}\n\n"
    )

    # Convert AI response to audio
    audio_response = gTTS(llm_response)
    audio_response.save("response.mp3")

    print("Formatted History: ", formatted_history)

    # Return the full conversation history
    return "response.mp3", formatted_history

def clear_history(formatted_history):
    """Reset the conversation state and return an empty transcript.

    Args:
        formatted_history: Current textbox content passed in by the UI
            callback; its value is not used.

    Returns:
        An empty string, used to blank the transcription textbox.
    """
    # Without ``global`` the assignment would only create a local variable and
    # the LLM prompt would keep the previous conversation.
    global instruct_history
    instruct_history = system_prompt

    # The parameter shadows the module-level ``formatted_history`` and a name
    # cannot be both a parameter and ``global``, so reset it via the module
    # namespace instead.
    globals()["formatted_history"] = ""
    return ""

# UI layout: microphone input and spoken reply side by side, action buttons,
# and a textbox showing the running transcript.
with gr.Blocks() as demo:
    # Heading; the <center> element is now properly closed.
    gr.HTML("<center><h1>Friday: AI Virtual Assistant 🤖</h1></center>")

    with gr.Row():
        audio_input = gr.Audio(label="Human", sources="microphone")
        output_audio = gr.Audio(label="Friday", type="filepath", interactive=False, autoplay=True, elem_classes="audio")

    with gr.Row():
        send_btn = gr.Button("🚀 Send")
        clear_btn = gr.Button("🗑️ Clear")

    # Textbox to display the full conversation history
    transcription_box = gr.Textbox(label="Transcription", lines=10, placeholder="Conversation History...")

    # Send: transcribe the recording, then update both the reply audio and the transcript.
    send_btn.click(fn=transcribe, inputs=[audio_input, transcription_box], outputs=[output_audio, transcription_box])
    # Clear: blank the transcript textbox.
    clear_btn.click(fn=clear_history, inputs=[transcription_box], outputs=[transcription_box])

# Script entry point: enable the request queue, then start the web server.
if __name__ == "__main__":
    demo.queue().launch()