import gradio as gr
import subprocess
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch
import librosa
import tempfile
from neon_tts_plugin_coqui import CoquiTTS
from gtts import gTTS
# variables
language_input_audio = 'en'
language_output_audio = 'ch'
dict_lang = {
    'en': 'eng_Latn',
    'es': 'spa_Latn',
    'fr': 'fra_Latn',
    'de': 'deu_Latn',
    'pl': 'pol_Latn',
    'uk': 'ukr_Cyrl',
    'ro': 'ron_Latn',
    'hu': 'hun_Latn',
    'bg': 'bul_Cyrl',
    'nl': 'nld_Latn',
    'fi': 'fin_Latn',
    'sl': 'slv_Latn',
    'lv': 'lvs_Latn',
    'ga': 'gle_Latn',
    'ch': 'zho_Hant',
    'ru': 'rus_Cyrl'
}
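# NOTE: the values above are FLORES-200 codes consumed by facebook/nllb-200-distilled-600M
# (e.g. 'eng_Latn' = English in Latin script, 'zho_Hant' = Chinese in Traditional script);
# the keys are the short codes used by the TTS back ends and the Gradio radio buttons.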
# functions
# stores the language selected for the input audio
def radio_lang_input(lang):
    return lang

# stores the language selected for the output audio
def radio_input(lang):
    return lang
##
# convert input video file to text, audio, and video
def video_load(video, language_input_audio, language_output_audio):
    # convert video to 720p
    subprocess.run(f'ffmpeg -y -i "{video}" -vf scale=720:-2 video720p.mp4', shell=True)
    # convert video to audio
    subprocess.run('ffmpeg -y -i video720p.mp4 -vn -ar 16000 -ac 2 -ab 192K -f wav sound_from_input_video.wav', shell=True)
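    # NOTE on the ffmpeg flags above: -y overwrites existing outputs without asking,
    # scale=720:-2 resizes to a 720 px width while keeping the aspect ratio (-2 rounds
    # the height to an even number, as H.264 requires), -vn drops the video stream,
    # and -ar 16000 resamples the audio to the 16 kHz rate Wav2Vec2 expects.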
    # convert audio to text (speech recognition)
    # load the Wav2Vec2 checkpoint matching the selected input language
    asr_checkpoints = {
        'en': "facebook/wav2vec2-base-960h",
        'ru': "jonatasgrosman/wav2vec2-large-xlsr-53-russian",
    }
    processor = Wav2Vec2Processor.from_pretrained(asr_checkpoints[language_input_audio])
    model = Wav2Vec2ForCTC.from_pretrained(asr_checkpoints[language_input_audio])
    audio, rate = librosa.load('sound_from_input_video.wav', sr=16000)
    input_values = processor(audio, sampling_rate=rate, return_tensors="pt", padding="longest").input_values
    # retrieve logits (no gradient tracking needed at inference time)
    with torch.no_grad():
        logits = model(input_values).logits
    # take argmax and decode
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
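    # NOTE: Wav2Vec2ForCTC emits one logit vector per ~20 ms audio frame;
    # argmax picks the most likely token for each frame, and batch_decode applies
    # CTC collapsing (merging repeats and dropping blanks) to yield the transcript.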
    # translate the transcribed text into the target language
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
    tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
    device = 0 if torch.cuda.is_available() else -1
    translation_pipeline = pipeline("translation", model=model, tokenizer=tokenizer, src_lang=dict_lang[language_input_audio], tgt_lang=dict_lang[language_output_audio], max_length=2000000, device=device)
    result = translation_pipeline(transcription)
    text_translations = result[0]['translation_text']
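    # NOTE: NLLB-200 is a single many-to-many model; the translation pipeline
    # tags the input with src_lang and forces tgt_lang as the first generated
    # token, so one checkpoint serves every language pair in dict_lang.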
    # convert text to audio (text-to-speech)
    # Russian
    if language_output_audio == 'ru':
        tts = gTTS(text_translations, lang='ru')
        tts.save('ru.mp3')
        audio = 'ru.mp3'
    # European languages covered by the Coqui TTS plugin
    if language_output_audio in ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']:
        coquiTTS = CoquiTTS()
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            coquiTTS.get_tts(text_translations, fp, speaker={"language": language_output_audio})
        audio = fp.name
    # Chinese
    if language_output_audio == 'ch':
        tts = gTTS(text_translations, lang='zh-CN')
        tts.save('china.mp3')
        audio = 'china.mp3'
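    # NOTE: gTTS calls Google Translate's online TTS service and writes MP3,
    # while the Neon Coqui TTS plugin synthesizes speech locally and writes WAV;
    # gr.Audio (type="filepath") and the Wav2Lip script handle both formats.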
    # lip-sync the generated audio onto the video with Wav2Lip
    subprocess.run(f'python inference.py --checkpoint_path wav2lip_gan.pth --face video720p.mp4 --audio "{audio}" --nosmooth --pads 0 20 0 0', shell=True)
    video = 'results/result_voice.mp4'
    return text_translations, audio, video
##
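# NOTE on the Wav2Lip calls: wav2lip_gan.pth is the GAN-trained checkpoint,
# --nosmooth disables temporal smoothing of detected face boxes (safer when the
# head moves), and --pads 0 20 0 0 pads the face box by 20 px at the bottom so
# the chin is included in the generated mouth region.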
# create a video from user-supplied audio
def audio_to_video_custom(audio):
    subprocess.run(f'python inference.py --checkpoint_path wav2lip_gan.pth --face video720p.mp4 --audio "{audio}" --nosmooth --pads 0 20 0 0', shell=True)
    video = 'results/result_voice.mp4'
    return video
##
# create audio from custom (user-edited) translation text
def text_to_audio_custom(text_translations, language_output_audio):
    # Russian
    if language_output_audio == 'ru':
        tts = gTTS(text_translations, lang='ru')
        tts.save('ru.mp3')
        audio = 'ru.mp3'
    # European languages covered by the Coqui TTS plugin
    if language_output_audio in ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']:
        coquiTTS = CoquiTTS()
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            coquiTTS.get_tts(text_translations, fp, speaker={"language": language_output_audio})
        audio = fp.name
    # Chinese
    if language_output_audio == 'ch':
        tts = gTTS(text_translations, lang='zh-CN')
        tts.save('china.mp3')
        audio = 'china.mp3'
    return audio
##### blocks
with gr.Blocks(title="Speak video in any language") as demo:
    # state variables
    var = gr.State('en')
    var_lang = gr.State('ch')
    # markdown text
    gr.Markdown("Service for translating videos into other languages with support for the speaker's facial expressions")
    gr.Markdown("The uploaded video should show only a face, preferably without sudden head movements.")
    with gr.Row():
        with gr.Column():
            # radio button to select the input language
            radio_input_lang_video = gr.Radio(['en', 'ru'], value="en", label='Select input video language')
            # video input
            seed = gr.Video(label="Input Video")
            # radio button to select the output language
            radio = gr.Radio(['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga', 'ch', 'ru'], value="ch", label='Choose the language you want to speak')
            # main button
            btn_1 = gr.Button("1. Generate video with translated audio")
        with gr.Column():
            # text output
            translations_text = gr.Text(label="Generated Translations Text", interactive=True)
            # button to convert the (editable) text to audio
            btn_3 = gr.Button("Generate custom translations to speech")
            # audio output
            translations_audio = gr.Audio(label="Generated Translations Audio", interactive=True, type="filepath")
            # button to convert the custom audio to video
            btn_2 = gr.Button("Generate video with custom audio")
            # video output
            video_output = gr.Video(interactive=False, label="Generated Translations Video")
    # update the input language state
    radio_input_lang_video.change(fn=radio_lang_input, inputs=radio_input_lang_video, outputs=var)
    # update the output language state
    radio.change(fn=radio_input, inputs=radio, outputs=var_lang)
    # main button: full video -> text -> audio -> video pipeline
    btn_1.click(video_load, inputs=[seed, var, var_lang], outputs=[translations_text, translations_audio, video_output])
    # custom audio -> video
    btn_2.click(audio_to_video_custom, inputs=[translations_audio], outputs=[video_output])
    # custom text -> audio
    btn_3.click(text_to_audio_custom, inputs=[translations_text, var_lang], outputs=[translations_audio])

demo.launch(show_api=False)
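# NOTE: show_api=False hides the auto-generated API page; since Wav2Lip
# inference can take a while per request, enabling demo.queue() before
# launch() is an option worth considering for a hosted deployment.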