"""
Resources:

Canary 1B: https://huggingface.co/nvidia/canary-1b
Phi-3-Mini-4K-Instruct: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct
VITS TTS: https://huggingface.co/docs/transformers/en/model_doc/vits
Blocks and Event Listeners, Gradio Guide: https://www.gradio.app/guides/blocks-and-event-listeners
"""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
########################################## Phi 3
torch.random.manual_seed(0)

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct", 
    torch_dtype="auto", 
    trust_remote_code=True, 
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
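# Chat-style message lists can be passed straight to this pipeline; recent
# transformers releases apply the model's chat template automatically.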

# Greedy decoding: with do_sample=False the temperature setting is ignored,
# and 64 new tokens keeps replies short enough to synthesize quickly.
generation_args = {
    "max_new_tokens": 64,
    "return_full_text": False,
    "temperature": 0.0,
    "do_sample": False,
}

def phi(user_question):
    messages = [{"role": "system", "content": "What can I do for you today"},
                {"role": "user", "content": user_question}]

    output = pipe(messages, **generation_args)
    return output
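
# Example (hypothetical): phi("What is the capital of France?") would return a list
# like [{'generated_text': ' Paris is the capital of France.'}], the standard
# transformers text-generation pipeline output shape.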
    
########################################## Canary
from nemo.collections.asr.models import EncDecMultiTaskModel

# load model
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')

# update decode params
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)
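# beam_size=1 is effectively greedy decoding, which keeps transcription latency low for an interactive demo.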

########################################## VITS
from transformers import VitsTokenizer, VitsModel, set_seed

vits_tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")  # separate name so the Phi-3 tokenizer above is not shadowed
vits_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
set_seed(555)  # make deterministic
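# MMS-TTS VITS uses a stochastic duration predictor, so the audio varies between runs unless the seed is fixed.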

########################################## Main
import gradio as gr

def fromvoice(audio_path):
    # Speech -> text: transcribe the recorded clip with Canary
    query = canary_model.transcribe(audio_path, batch_size=16)
    # Text -> text: generate a reply with Phi-3
    resp = phi(query[0])
    # Text -> speech: synthesize the reply with VITS
    voice = vits_tokenizer(text=resp[0]['generated_text'], return_tensors="pt")
    with torch.no_grad():
        v = vits_model(**voice)
    output = v.waveform[0].numpy()

    return (vits_model.config.sampling_rate, output)

def fromtext(user_text):
    # Text -> text: generate a reply with Phi-3
    resp = phi(user_text)
    # Text -> speech: synthesize the reply with VITS
    voice = vits_tokenizer(text=resp[0]['generated_text'], return_tensors="pt")
    with torch.no_grad():
        v = vits_model(**voice)
    output = v.waveform[0].numpy()

    return (vits_model.config.sampling_rate, output)
  

Alexa = gr.Blocks()

with Alexa:
    audio_file = gr.Audio(type="filepath")
    text = gr.Textbox()
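    # gr.Audio accepts the (sampling_rate, numpy waveform) tuple returned by both handlers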
    output = gr.Audio()

    b1 = gr.Button("From Speech")
    b2 = gr.Button("From Text")

    b1.click(fromvoice, inputs=audio_file, outputs=output)
    b2.click(fromtext, inputs=text, outputs=output)


Alexa.launch()