gorkemgoknar committed
Commit 310916c
Parent: 42db371

add metrics text

Files changed (1):
  app.py (+27 -3)
app.py CHANGED
@@ -5,6 +5,7 @@ import random
 from zipfile import ZipFile
 import uuid
 
+import time
 import torch
 import torchaudio
 # By using XTTS you agree to CPML license https://coqui.ai/cpml
@@ -59,6 +60,7 @@ DEVICE_ASSERT_DETECTED=0
 DEVICE_ASSERT_PROMPT=None
 DEVICE_ASSERT_LANG=None
 
+
 def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree,):
     if agree == True:
         supported_languages=["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn"]
@@ -165,9 +167,18 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
             global DEVICE_ASSERT_LANG
             #It will likely never come here as we restart space on first unrecoverable error now
             print(f"Unrecoverable exception caused by language:{DEVICE_ASSERT_LANG} prompt:{DEVICE_ASSERT_PROMPT}")
+
+
+        metrics_text= ""
 
+        t_latent=time.time()
         gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)
+        latent_calculation_time = time.time() - t_latent
+        metrics_text=f"Embedding calculation time: {latent_calculation_time:.2f} seconds\n"
+
         wav_chunks = []
+
+        t_inference=time.time()
 
         chunks = model.inference_stream(
             prompt,
@@ -175,14 +186,24 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
             gpt_cond_latent,
             speaker_embedding,)
         try:
+            first_chunk=True
             for i, chunk in enumerate(chunks):
+                if first_chunk:
+                    first_chunk_time = time.time() - t_inference
+                    metrics_text+=f"Streaming: First chunk actual latency: {first_chunk_time:.2f} seconds\n"
+                    first_chunk=False
+
+
                 wav_chunks.append(chunk)
                 print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+
                 out_file = f'{i}.wav'
                 write(out_file, 24000, chunk.detach().cpu().numpy().squeeze())
                 audio = AudioSegment.from_file(out_file)
                 audio.export(out_file, format='wav')
-                yield (None, out_file, None)
+
+                yield (None, out_file, metrics_text, None)
+
         except RuntimeError as e :
             if "device-side assert" in str(e):
                 # cannot do anything on cuda device side error, need to restart
@@ -212,6 +233,7 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
                 audio="output.wav",
             ),
             "sil.wav",
+            metrics_text,
             speaker_wav,
         )
     else:
@@ -220,6 +242,7 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
             None,
             None,
             None,
+            None,
         )
 
 
@@ -439,10 +462,11 @@ gr.Interface(
     outputs=[
         gr.Video(label="Waveform Visual"),
         gr.Audio(label="Synthesised Audio", streaming=True, autoplay=True),
+        gr.Text(label="Metrics"),
         gr.Audio(label="Reference Audio Used"),
     ],
     title=title,
     description=description,
     article=article,
-    #examples=examples,
-).queue().launch(debug=True,show_api=False)
+    examples=examples,
+).queue().launch(debug=True,show_api=True)
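For readers skimming the diff: the change turns predict into a streaming generator that accumulates a metrics string and yields it next to every audio chunk. Below is a minimal, self-contained sketch of that pattern. The XTTS calls are replaced by hypothetical stand-ins (fake_inference_stream and a sleep for the latent step), so only the timing and yield structure mirrors the commit, not the model code itself.

import time

# Hypothetical stand-in for model.inference_stream(): yields "audio chunks"
# after a small delay, so the timing logic below has something to measure.
def fake_inference_stream(n_chunks=3, delay=0.1):
    for i in range(n_chunks):
        time.sleep(delay)
        yield f"chunk-{i}"

def predict_with_metrics():
    metrics_text = ""

    # 1) Time the speaker-embedding step (get_conditioning_latents in the Space).
    t_latent = time.time()
    time.sleep(0.05)  # stand-in for the latent computation
    metrics_text += f"Embedding calculation time: {time.time() - t_latent:.2f} seconds\n"

    # 2) Time the latency until the first streamed chunk arrives.
    t_inference = time.time()
    first_chunk = True
    for chunk in fake_inference_stream():
        if first_chunk:
            metrics_text += f"Streaming: First chunk actual latency: {time.time() - t_inference:.2f} seconds\n"
            first_chunk = False
        # One tuple slot per Gradio output component:
        # (Video, Audio, Text, Audio) after this commit.
        yield (None, chunk, metrics_text, None)

for out in predict_with_metrics():
    print(out)

Because the yielded tuple now has four slots, one per output component (Video, Audio, Text, Audio), every exit path must match that arity. That is why the commit also threads metrics_text into the error-path return and pads the agree == False return with a fourth None.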