File size: 2,815 Bytes
da685d1
ad81fab
2dd5f48
da685d1
 
 
 
 
 
 
 
 
 
 
5e9f227
da685d1
 
5e9f227
be5f920
 
 
 
 
da685d1
cf1be34
da685d1
 
 
 
be5f920
 
 
 
a50abbe
da685d1
180aa71
 
 
 
 
 
da685d1
 
 
 
 
 
 
a9e9adc
be5f920
adc53e2
 
da685d1
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import gradio as gr
import librosa 
from transformers import AutoFeatureExtractor, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline


def load_and_fix_data(input_file, model_sampling_rate):
    """Load an audio file as a mono waveform at the model's sampling rate.

    Args:
        input_file: Path (or file-like object) handed to ``librosa.load``.
        model_sampling_rate: Sampling rate, in Hz, that the returned
            waveform must have.

    Returns:
        The waveform returned by ``librosa.load``: a one-dimensional array
        resampled to ``model_sampling_rate``.
    """
    # Ask librosa for the target rate and a mono down-mix directly. The
    # previous version loaded at librosa's default rate and resampled a
    # second time, and its multi-channel branch could never run because
    # ``librosa.load`` already returns mono audio by default. It also called
    # ``librosa.resample`` with positional rates, which newer librosa
    # releases reject.
    speech, _ = librosa.load(input_file, sr=model_sampling_rate, mono=True)
    return speech


# Hub checkpoint shared by the feature extractor and the ASR pipeline.
ASR_CHECKPOINT = "jonatasgrosman/wav2vec2-large-xlsr-53-spanish"

# The feature extractor is loaded so its configured sampling rate can be read;
# incoming audio is resampled to this rate before transcription.
feature_extractor = AutoFeatureExtractor.from_pretrained(ASR_CHECKPOINT)
sampling_rate = feature_extractor.sampling_rate

# Speech-recognition pipeline used to transcribe the Spanish audio.
asr = pipeline(task="automatic-speech-recognition", model=ASR_CHECKPOINT)



# Hub checkpoint of the Spanish -> Nahuatl sequence-to-sequence model.
TRANSLATION_CHECKPOINT = 'hackathon-pln-es/t5-small-spanish-nahuatl'

# Tokenizer and model are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(TRANSLATION_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(TRANSLATION_CHECKPOINT)

# Line break used when formatting the result text shown to the user.
new_line = "\n"

def predict_and_ctc_lm_decode(input_file):
    """Transcribe Spanish audio and translate the transcription to Nahuatl.

    Args:
        input_file: Path of the audio file to process. ``None`` or an empty
            value is treated as "no audio provided".

    Returns:
        A string containing the Spanish transcription followed by the
        Nahuatl translation, or a short notice when no audio was provided.
    """
    # Guard against a missing input (for example, the form being submitted
    # before anything was recorded) instead of failing inside the audio loader.
    if not input_file:
        return "No audio was provided. Please record an audio clip and try again."

    speech = load_and_fix_data(input_file, sampling_rate)

    # The audio is transcribed in 5-second chunks with a 1-second stride;
    # the pipeline result is a mapping whose "text" entry is the transcript.
    transcribed_text = asr(speech, chunk_length_s=5, stride_length_s=1)["text"]

    # The translation model is prompted with a task prefix.
    prompt = 'translate Spanish to Nahuatl: ' + transcribed_text
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids
    generated = model.generate(input_ids, max_length=512)
    translation = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

    return f"Spanish Audio Transcription: {transcribed_text} {new_line} Nahuatl Translation :{translation}"

# Text shown on the demo page: a short usage note plus links to the two
# pre-trained models the demo relies on.
description = """ This is a Gradio demo of Spanish Audio Transcriptions to Nahuatl Translation. To use this, simply provide an audio input (audio recording or via microphone), which will subsequently be transcribed and translated to the Nahuatl language.

Pre-trained model used for Spanish ASR: [jonatasgrosman/wav2vec2-large-xlsr-53-spanish](https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-spanish)

Pre-trained model used for translating Spanish audio transcription to the Nahuatl language: [hackathon-pln-es/t5-small-spanish-nahuatl](https://huggingface.co/hackathon-pln-es/t5-small-spanish-nahuatl)
"""

# Build the interface (microphone recording in, text out) and start serving it.
gr.Interface(
    predict_and_ctc_lm_decode,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", label="Record your audio")
    ],
    outputs=[gr.outputs.Textbox()],
    examples=[["audio1.wav"], ["travel.wav"]],
    title="Spanish-Audio-Transcriptions-to-Nahuatl-Translation",
    # Reuse the module-level text rather than a second, shorter copy of it,
    # so the model links it contains are actually displayed.
    # NOTE(review): presumably the installed Gradio renders Markdown links in
    # the description - confirm on the deployed page.
    description=description,
    layout="horizontal",
    theme="huggingface",
).launch(enable_queue=True, cache_examples=True)