File size: 2,753 Bytes
9383317 6f66464 dc3d621 6f66464 7a88549 6f66464 7a88549 704d4b2 7a88549 6f66464 e6b60d2 7a88549 6f66464 8f92d6d 6f66464 4e38726 7a88549 cfbb13d dc3d621 6f66464 0ad0f14 6f66464 ccf8b8e 0f52bd7 cfbb13d 6f66464 7df555c ccf8b8e 6f66464 8f92d6d ccf8b8e 6f66464 7b91730 a05b0ba cfbb13d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
"""
Voice assistant demo: speech is transcribed with Canary, answered by Phi-3, and
spoken back with VITS, all wired together in a Gradio Blocks interface.

Resources:
Canary 1B: https://huggingface.co/nvidia/canary-1b
Phi-3-Mini-4K-Instruct: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct
VITS TTS: https://huggingface.co/docs/transformers/en/model_doc/vits
Blocks and Event Listeners, Gradio Guide: https://www.gradio.app/guides/blocks-and-event-listeners
"""
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import scipy
import numpy as np
########################################## Phi 3
# Seed torch's RNG before anything is loaded.
torch.random.manual_seed(0)

# Single source of truth for the Phi-3 checkpoint used by model and tokenizer.
PHI_CHECKPOINT = "microsoft/Phi-3-mini-4k-instruct"

model = AutoModelForCausalLM.from_pretrained(
    PHI_CHECKPOINT, torch_dtype="auto", trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(PHI_CHECKPOINT)

# Text-generation pipeline that `phi` calls for every question.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Keyword arguments forwarded to every `pipe` call: sampling is disabled,
# replies are capped at 64 new tokens, and the prompt is not echoed back.
generation_args = dict(
    max_new_tokens=64,
    return_full_text=False,
    temperature=0.0,
    do_sample=False,
)
def phi(user_question):
    """Send a question to the Phi-3 text-generation pipeline.

    Args:
        user_question: Text placed in the ``user`` turn of the chat.

    Returns:
        The raw result of calling the module-level ``pipe`` with a
        two-message chat (a fixed system turn followed by the user turn)
        and ``generation_args``. Callers in this file read the reply from
        ``result[0]['generated_text']``.
    """
    system_turn = {"role": "system", "content": "What can I do for you today"}
    user_turn = {"role": "user", "content": user_question}
    return pipe([system_turn, user_turn], **generation_args)
########################################## Canary
from nemo.collections.asr.models import EncDecMultiTaskModel
# Load the pretrained Canary checkpoint used for speech-to-text.
CANARY_CHECKPOINT = 'nvidia/canary-1b'
canary_model = EncDecMultiTaskModel.from_pretrained(CANARY_CHECKPOINT)

# Update the decoding params: set the beam size to 1, then re-apply the
# decoding strategy so the model picks up the modified config.
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)
########################################## VITS
from transformers import VitsTokenizer, VitsModel, set_seed
# Single source of truth for the VITS checkpoint used by tokenizer and model.
VITS_CHECKPOINT = "facebook/mms-tts-eng"

# NOTE: this rebinds the module-level name ``tokenizer`` that was first used
# for the Phi-3 tokenizer. The Phi-3 ``pipe`` was built earlier and holds its
# own reference, so from here on ``tokenizer`` means the VITS tokenizer.
tokenizer = VitsTokenizer.from_pretrained(VITS_CHECKPOINT)
vits_model = VitsModel.from_pretrained(VITS_CHECKPOINT)

set_seed(555)  # make deterministic
########################################## Main
import gradio as gr
def _speak(text):
    """Synthesize speech for *text* with the module-level VITS model.

    Args:
        text: The text to read aloud.

    Returns:
        A ``(sampling_rate, waveform)`` tuple, where ``waveform`` is the
        first item of the model's ``waveform`` output converted to a NumPy
        array. Both handlers return this tuple to the ``gr.Audio`` output.
    """
    # ``tokenizer`` is the VITS tokenizer here: the module-level name is
    # rebound after the Phi-3 pipeline has been constructed.
    encoded = tokenizer(text=text, return_tensors="pt")
    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        synthesized = vits_model(**encoded)
    return (vits_model.config.sampling_rate, synthesized.waveform[0].numpy())


def fromvoice(input):
    """Answer a spoken question with synthesized speech.

    Pipeline: Canary transcribes the audio file, ``phi`` generates a reply,
    and VITS reads the reply aloud.

    Args:
        input: Path to the recorded/uploaded audio file, as delivered by the
            ``gr.Audio(type="filepath")`` component. ``None`` when the user
            has not provided any audio.

    Returns:
        A ``(sampling_rate, waveform)`` tuple for the ``gr.Audio`` output.

    Raises:
        gr.Error: If no audio was provided or nothing could be transcribed.
    """
    if not input:
        # Without this guard a missing recording surfaces as an opaque
        # failure deep inside the ASR model.
        raise gr.Error("Please record or upload some audio first.")

    # Pass a list of paths, as in the Canary model card; a bare string may be
    # interpreted as a manifest path by some NeMo releases (TODO confirm
    # against the installed version).
    transcripts = canary_model.transcribe([input], batch_size=16)
    if not transcripts:
        raise gr.Error("Could not transcribe the audio.")

    # Depending on the NeMo release, each result is either a plain string or
    # an object carrying the transcript in ``.text``; accept both.
    first = transcripts[0]
    question = getattr(first, "text", first)
    if not question:
        raise gr.Error("Could not transcribe the audio.")

    resp = phi(question)
    return _speak(resp[0]['generated_text'])


def fromtext(input):
    """Answer a typed question with synthesized speech.

    Args:
        input: The question text from the ``gr.Textbox`` component.

    Returns:
        A ``(sampling_rate, waveform)`` tuple for the ``gr.Audio`` output.

    Raises:
        gr.Error: If the text box is empty or contains only whitespace.
    """
    if not input or not input.strip():
        # Avoid spending a full LLM generation on an empty prompt.
        raise gr.Error("Please type a question first.")

    resp = phi(input)
    return _speak(resp[0]['generated_text'])
# Assemble the interface: one audio input, one text input, a shared audio
# output, and a button per input mode.
with gr.Blocks() as Alexa:
    audio_file = gr.Audio(type="filepath")
    text = gr.Textbox()
    output = gr.Audio()
    b1 = gr.Button("From Speech")
    b2 = gr.Button("From Text")

    # Event listeners are registered while the Blocks context is still open.
    b1.click(fromvoice, inputs=audio_file, outputs=output)
    b2.click(fromtext, inputs=text, outputs=output)

Alexa.launch()
|