DrishtiSharma committed on
Commit
11b23f3
1 Parent(s): dc433d9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -10
app.py CHANGED
@@ -3,6 +3,8 @@ import librosa
3
  from transformers import AutoFeatureExtractor, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
4
 
5
 
 
 
6
  def load_and_fix_data(input_file, model_sampling_rate):
7
  speech, sample_rate = librosa.load(input_file)
8
  if len(speech.shape) > 1:
@@ -17,21 +19,90 @@ sampling_rate = feature_extractor.sampling_rate
17
 
18
  asr = pipeline("automatic-speech-recognition", model="jonatasgrosman/wav2vec2-large-xlsr-53-spanish")
19
 
 
 
 
 
20
 
 
 
 
 
 
 
21
 
22
# Spanish -> Nahuatl translation stack: seq2seq model plus matching tokenizer.
t5_checkpoint = 'hackathon-pln-es/t5-small-spanish-nahuatl'
model = AutoModelForSeq2SeqLM.from_pretrained(t5_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(t5_checkpoint)

new_line = '\n'  # separator used when assembling the display string


def predict_and_ctc_lm_decode(input_file):
    """Transcribe a Spanish audio file and translate the transcription to Nahuatl.

    Returns one display string containing both the Spanish transcription and
    the Nahuatl translation.
    """
    audio = load_and_fix_data(input_file, sampling_rate)
    transcribed_text = asr(audio, chunk_length_s=5, stride_length_s=1)["text"]
    encoded = tokenizer('translate Spanish to Nahuatl: ' + transcribed_text, return_tensors='pt')
    generated_ids = model.generate(encoded.input_ids, max_length=512)
    translation = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return f"Spanish Audio Transcription: {transcribed_text} {new_line} Nahuatl Translation :{translation}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
 
37
  gr.Interface(
@@ -39,7 +110,7 @@ gr.Interface(
39
  inputs=[
40
  gr.inputs.Audio(source="microphone", type="filepath", label="Record your audio")
41
  ],
42
- outputs=[gr.outputs.Textbox()],
43
  examples=[["audio1.wav"], ["travel.wav"]],
44
  title="Generate-Gender-Neutralized-Audios",
45
  description = "This is a Gradio demo for generating gender neutralized audios. To use it, simply provide an audio input (via microphone or audio recording), which will then be transcribed and gender-neutralized using a pre-trained models. Finally, with the help of Coqui's TTS model, gender neutralised audio is generated.",
 
3
  from transformers import AutoFeatureExtractor, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
4
 
5
 
6
+
7
+
8
  def load_and_fix_data(input_file, model_sampling_rate):
9
  speech, sample_rate = librosa.load(input_file)
10
  if len(speech.shape) > 1:
 
19
 
20
  asr = pipeline("automatic-speech-recognition", model="jonatasgrosman/wav2vec2-large-xlsr-53-spanish")
21
 
22
# Gender-neutralizer seq2seq model (Spanish). `prefix` is prepended to every
# input sentence before tokenization; it is empty for this checkpoint.
prefix = ''
model_checkpoint = "hackathon-pln-es/es_text_neutralizer"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
26
 
27
def postproc(input_sentence, preds):
    """Post-process the neutralizer model's decoded output.

    Normalizes contractions ("De el"/"de el" -> "Del"/"del"), capitalizes the
    first letter, tidies spacing around '.' and ',', and re-capitalizes words
    that were capitalized in the source sentence (proper nouns).

    Args:
        input_sentence: original source sentence, used as the capitalization
            reference.
        preds: raw decoded model prediction to clean up.

    Returns:
        The cleaned prediction. On any error the best-effort result so far is
        returned — this helper must never break the demo pipeline.
    """
    try:
        # NOTE(review): the original chained a trailing .replace(' ', ' ')
        # here, which is a no-op as written (possibly a garbled double-space
        # collapse); it is omitted since it cannot change the string.
        preds = preds.replace('De el', 'Del').replace('de el', 'del')
        if preds[0].islower():
            preds = preds.capitalize()
        preds = preds.replace(' . ', '. ').replace(' , ', ', ')

        # Restore capitalization of proper nouns: any word capitalized in the
        # input (other than the leading word) is re-capitalized in the output.
        words = input_sentence.split(' ')
        first_word = words[0]  # hoisted: loop-invariant in the original
        prev_letter = ''
        for word in words:
            if word:
                if word[0].isupper():
                    if word.lower() in preds and word != first_word:
                        if prev_letter == '.':
                            # Word starts a new sentence: anchor on '. ' so
                            # unrelated occurrences are left alone.
                            preds = preds.replace('. ' + word.lower() + ' ', '. ' + word + ' ')
                        elif word[-1] == '.':
                            preds = preds.replace(word.lower(), word)
                        else:
                            preds = preds.replace(word.lower() + ' ', word + ' ')
                prev_letter = word[-1]
        preds = preds.strip()  # drop the trailing space left by the replaces
    except Exception:  # was a bare `except:`; keep best-effort, don't mask SystemExit etc.
        pass
    return preds
52
+
53
# Coqui TTS model id used for speech synthesis (Spanish Tacotron2-DDC).
model_name = "es/mai/tacotron2-DDC"
54
 
55
def predict_and_ctc_lm_decode(input_file, speaker_idx: str = None):
    """Full pipeline: Spanish ASR -> text gender-neutralization -> TTS audio.

    Args:
        input_file: path to the input audio file (as supplied by Gradio).
        speaker_idx: optional speaker id forwarded to the TTS synthesizer.

    Returns:
        Path to a temporary .wav file containing the synthesized,
        gender-neutralized speech.
    """
    # 1) Transcribe the (resampled, mono) audio with the Spanish wav2vec2 model.
    speech = load_and_fix_data(input_file, sampling_rate)
    transcribed_text = asr(speech, chunk_length_s=5, stride_length_s=1)["text"]

    # 2) Neutralize the transcription with the seq2seq neutralizer.
    inputs = tokenizer([prefix + transcribed_text], return_tensors="pt", padding=True)
    with torch.no_grad():
        # NOTE(review): `first_generation` and `device` are not defined in this
        # function — they must be module-level globals defined elsewhere in the
        # file; confirm they exist, otherwise this raises NameError.
        if first_generation:
            # Greedy decoding (sampling disabled so batching cannot affect output).
            output_sequence = model.generate(
                input_ids=inputs["input_ids"].to(device),
                attention_mask=inputs["attention_mask"].to(device),
                do_sample=False,
            )
        else:
            # Beam search with a repetition penalty.
            output_sequence = model.generate(
                input_ids=inputs["input_ids"].to(device),
                attention_mask=inputs["attention_mask"].to(device),
                do_sample=False,
                num_beams=2,
                repetition_penalty=2.5,
                # length_penalty=1.0,
                early_stopping=True,
            )
    preds = postproc(
        transcribed_text,
        preds=tokenizer.decode(output_sequence[0], skip_special_tokens=True, clean_up_tokenization_spaces=True),
    )

    # 3) Enforce the TTS character limit. BUGFIX: the original truncated into a
    # separate `text` variable but still synthesized the untruncated `preds`
    # (and the follow-up print referenced `text`, a NameError when no
    # truncation occurred). Truncate `preds` in place instead.
    if len(preds) > MAX_TXT_LEN:
        preds = preds[:MAX_TXT_LEN]
        print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
    print(preds, model_name)

    # 4) Download the TTS model (and its default vocoder, if any) and build the
    # synthesizer. `manager` / `Synthesizer` come from Coqui TTS, set up
    # elsewhere in the file.
    model_path, config_path, model_item = manager.download_model(f"tts_models/{model_name}")
    vocoder_name: Optional[str] = model_item["default_vocoder"]
    vocoder_path = None
    vocoder_config_path = None
    if vocoder_name is not None:
        vocoder_path, vocoder_config_path, _ = manager.download_model(vocoder_name)
    synthesizer = Synthesizer(
        model_path, config_path, None, None, vocoder_path, vocoder_config_path,
    )
    if synthesizer is None:  # defensive; constructor normally never returns None
        raise NameError("model not found")

    # 5) Synthesize, write the waveform to a temp .wav, hand Gradio the path.
    wavs = synthesizer.tts(preds, speaker_idx)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        synthesizer.save_wav(wavs, fp)
    return fp.name
104
+
105
+
106
 
107
 
108
  gr.Interface(
 
110
  inputs=[
111
  gr.inputs.Audio(source="microphone", type="filepath", label="Record your audio")
112
  ],
113
+ outputs=gr.outputs.Audio(label="Output"),
114
  examples=[["audio1.wav"], ["travel.wav"]],
115
  title="Generate-Gender-Neutralized-Audios",
116
  description = "This is a Gradio demo for generating gender neutralized audios. To use it, simply provide an audio input (via microphone or audio recording), which will then be transcribed and gender-neutralized using a pre-trained models. Finally, with the help of Coqui's TTS model, gender neutralised audio is generated.",