# friday / app.py
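"""Friday: an AI voice assistant Space.

Flow: microphone audio -> Whisper ASR -> Mistral-7B-Instruct (via the
Hugging Face Inference API) -> gTTS speech synthesis.
"""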
import time
import torch
import spaces
import numpy as np
import gradio as gr
from gtts import gTTS
from transformers import pipeline
from huggingface_hub import InferenceClient
# Model names
ASR_MODEL_NAME = "openai/whisper-small"
LLM_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
# Initial system prompt
system_prompt = """<s>[INST] You are Friday, a helpful and conversational AI assistant, and you respond with one to two sentences. [/INST] Hello there! I'm Friday, how can I help you?</s>"""
# Global variables for history
instruct_history = system_prompt
formatted_history = ""
# Create inference client for text generation
client = InferenceClient(LLM_MODEL_NAME)
# Set device for ASR pipeline
device = 0 if torch.cuda.is_available() else "cpu"
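# transformers pipelines accept an integer GPU index (0 -> cuda:0) or the
# string "cpu", hence the mixed types above.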
# ASR pipeline
pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    device=device,
)
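
# Text generation: wraps InferenceClient.text_generation, clamping very small
# temperatures to 1e-2 so sampling stays well-defined.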
def generate(instruct_history, temperature=0.1, max_new_tokens=128, top_p=0.95, repetition_penalty=1.0):
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    output = client.text_generation(
        instruct_history, **generate_kwargs, stream=False, details=False, return_full_text=False)

    return output
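
# Main interaction: decode and normalize the recording, transcribe it with
# Whisper, query Mistral for a reply, then synthesize the reply with gTTS.
# @spaces.GPU requests a ZeroGPU slot for up to 60 seconds per call.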
@spaces.GPU(duration=60)
def transcribe(audio, past_history):
    global instruct_history, formatted_history

    # Brief pause so the browser has finished delivering the recording
    # before we decode it (see commit: "delay user audio decoding").
    time.sleep(1)

    sr, y = audio
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak  # normalize to [-1, 1]; skip silent clips to avoid dividing by zero

    transcribed_user_audio = pipe({"sampling_rate": sr, "raw": y})["text"]

    # Sync the global with the textbox contents so history isn't duplicated on every click
    formatted_history = past_history
    formatted_history += f"😃 Human: {transcribed_user_audio}\n\n"
    instruct_history += f"<s>[INST] {transcribed_user_audio} [/INST] "

    # Generate LLM response
    llm_response = generate(instruct_history)
    instruct_history += f" {llm_response}</s>"
    formatted_history += f"🤖 Friday: {llm_response}\n\n"

    # Convert AI response to audio
    audio_response = gTTS(llm_response)
    audio_response.save("response.mp3")

    print("Formatted History: ", formatted_history)

    # Return the synthesized reply and the full conversation history
    return "response.mp3", formatted_history
def clear_history(_textbox):
    global instruct_history, formatted_history
    instruct_history = system_prompt
    formatted_history = ""
    return formatted_history
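
# UI: microphone in, synthesized speech out, plus a running transcript.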
with gr.Blocks() as demo:
    gr.HTML("<center><h1>Friday: AI Virtual Assistant 🤖</h1></center>")

    with gr.Row():
        audio_input = gr.Audio(label="Human", sources="microphone")
        output_audio = gr.Audio(label="Friday", type="filepath", interactive=False, autoplay=True, elem_classes="audio")

    with gr.Row():
        send_btn = gr.Button("🚀 Send")
        clear_btn = gr.Button("🗑️ Clear")

    # Textbox to display the full conversation history
    transcription_box = gr.Textbox(label="Transcription", lines=10, placeholder="Conversation History...")

    send_btn.click(fn=transcribe, inputs=[audio_input, transcription_box], outputs=[output_audio, transcription_box])
    clear_btn.click(fn=clear_history, inputs=[transcription_box], outputs=[transcription_box])
if __name__ == "__main__":
    demo.queue()
    demo.launch()