Spaces:

FunAudioLLM
/

CosyVoice-300M

Running on Zero

App Files Files Community

CosyVoice-300M / css /custom.py

wenmeng.zwm

chinese to english

4905c07 about 2 months ago

raw

history blame contribute delete

No virus

3.92 kB

	import spaces
	import random

	import gradio as gr
	from css.utils import *


	# Custom voice generation
	def custom():
	    """Build the custom voice-generation panel and wire up its callbacks.

	    Creates the Gradio components for recording a prompt, entering the
	    prompt transcript and the text to synthesize, choosing a random seed,
	    and playing back the synthesized audio, then attaches the click
	    handlers defined below. Returns nothing.

	    NOTE(review): presumably meant to be called inside an active
	    ``gr.Blocks`` context; the caller is not visible here -- confirm.
	    """

	    def random_seed():
	        """Return a random integer seed in the range [1, 100000000]."""
	        return random.randint(1, 100000000)

	    @spaces.GPU
	    def generate_audio(_recorded_audio, _prompt_input_textbox, _language_radio,
	                       _synthetic_input_textbox, _seed):
	        """Synthesize speech for the given text using the recorded prompt.

	        Args:
	            _recorded_audio: File path of the recorded prompt audio
	                (the Audio component uses ``type='filepath'``); ``None``
	                when nothing has been recorded.
	            _prompt_input_textbox: Text the user read while recording. When
	                empty, cross-lingual inference is used.
	            _language_radio: ``'same'`` or ``'cross'``; ``'cross'`` forces
	                cross-lingual inference.
	            _synthetic_input_textbox: Text to synthesize.
	            _seed: Value passed to ``set_all_random_seed``.

	        Returns:
	            A ``(target_sr, audio_data)`` tuple. When the synthesis text or
	            the recording is missing, a warning is shown and
	            ``(target_sr, default_data)`` is returned instead.
	        """
	        import time

	        # perf_counter is monotonic, so the intervals below cannot be skewed
	        # by wall-clock adjustments.
	        t1 = time.perf_counter()
	        print(_recorded_audio, _prompt_input_textbox, _language_radio, _synthetic_input_textbox, _seed)
	        if _synthetic_input_textbox == '':
	            gr.Warning('The synthesis text is empty, did you forget to input the synthesis text?')
	            return (target_sr, default_data)
	        # The Audio component yields None when nothing was recorded; bail out
	        # with a warning instead of letting load_wav fail on a missing path.
	        if not _recorded_audio:
	            gr.Warning('No recording found, did you forget to record the prompt audio?')
	            return (target_sr, default_data)
	        set_all_random_seed(_seed)
	        if use_instruct(_synthetic_input_textbox):
	            model = cosyvoice_instruct
	        else:
	            model = cosyvoice
	        prompt_speech_16k = postprocess(load_wav(_recorded_audio, prompt_sr))
	        t2 = time.perf_counter()
	        # Without a transcript of the prompt, zero-shot inference has nothing
	        # to pass as prompt text, so cross-lingual inference is used instead.
	        if _language_radio == 'cross' or _prompt_input_textbox == '':
	            output = model.inference_cross_lingual(_synthetic_input_textbox, prompt_speech_16k)
	        else:
	            output = model.inference_zero_shot(_synthetic_input_textbox, _prompt_input_textbox, prompt_speech_16k)

	        t3 = time.perf_counter()
	        audio_data = postprocess(output['tts_speech']).numpy().flatten()
	        t4 = time.perf_counter()
	        print(f'load and preprocess time: {t2-t1}s')
	        print(f'inference time: {t3-t2}s')
	        print(f'postprocess time: {t4-t3}s')
	        return (target_sr, audio_data)

	    # NOTE(review): this block was recovered from text whose indentation had
	    # been lost; the container nesting below is a reconstruction -- confirm
	    # against the rendered layout.
	    with gr.Column():
	        with gr.Row():
	            with gr.Column(scale=1, min_width=400):
	                with gr.Group():
	                    recorded_audio = gr.Audio(sources=['microphone'],
	                                              label="Record Audio File",
	                                              type='filepath')
	                    gr.Text("Please click to record and read the text on the right (Chinese or English) to complete the input",
	                            max_lines=1,
	                            container=False,
	                            interactive=False)
	            with gr.Column(scale=10):
	                prompt_input_textbox = gr.Textbox(label="Input Text for Recording")
	                gr.Examples(
	                    label="Example Recording Texts",
	                    examples=example_prompt_text,
	                    inputs=[prompt_input_textbox])

	        with gr.Column():
	            language_radio = gr.Radio(choices=[('Same Language', 'same'), ('Cross Language', 'cross')],
	                                      value='same',
	                                      label="Input Synthesis Text")
	            synthetic_input_textbox = gr.Textbox(show_label=False)
	            gr.Examples(
	                label="Example Texts",
	                examples=example_tts_text,
	                inputs=[synthetic_input_textbox])

	        with gr.Accordion(label="Random Seed"):
	            with gr.Row():
	                with gr.Column(scale=1, min_width=180):
	                    seed_button = gr.Button(value="\U0001F3B2 Shuffle Randomly",
	                                            elem_classes="full-height")
	                with gr.Column(scale=10):
	                    seed = gr.Number(show_label=False,
	                                     value=0,
	                                     container=False,
	                                     elem_classes="full-height")
	        with gr.Column():
	            generate_button = gr.Button("Generate Audio", variant="primary", size="lg")

	        with gr.Column():
	            output_audio = gr.Audio(label="Synthesized Audio")

	    # Event wiring: the shuffle button fills the seed field; the generate
	    # button runs synthesis and sends the result to the output player.
	    seed_button.click(fn=random_seed, outputs=[seed])
	    generate_button.click(
	        fn=generate_audio,
	        inputs=[recorded_audio, prompt_input_textbox, language_radio, synthetic_input_textbox, seed],
	        outputs=[output_audio])