whisperspeech

Paused

Tonic committed on Jan 20

Commit

decaf77

•

1 Parent(s): 9c3ab74

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -35,23 +35,16 @@ def whisper_speech_demo(text, lang, speaker_audio, mix_lang, mix_text):
     resample_audio = resampler(newsr=24000)
     audio_data_resampled = next(resample_audio([{'sample_rate': 24000, 'samples': audio_data.cpu()}]))['samples_24k']
-    # Normalize audio
     audio_np = audio_data_resampled.cpu().numpy()
     audio_np = audio_np / np.max(np.abs(audio_np))
-    # Ensure audio data is in the correct format
     audio_np = np.asarray(audio_np, dtype=np.float32)
-    # Create stereo audio by duplicating the mono channel
     audio_stereo = np.stack((audio_np, audio_np), axis=-1)
-    # Debugging: Inspect the shape and dtype of the audio array
-    print("Audio Array Shape:", audio_stereo.shape)
-    print("Audio Array Dtype:", audio_stereo.dtype)
-    # Save to a temporary WAV file as stereo
     with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
-        # Write the stereo data with a sample rate of 24000 Hz
         sf.write(tmp_file.name, audio_stereo, 24000, format='WAV', subtype='PCM_16')
     return tmp_file.name

     resample_audio = resampler(newsr=24000)
     audio_data_resampled = next(resample_audio([{'sample_rate': 24000, 'samples': audio_data.cpu()}]))['samples_24k']
     audio_np = audio_data_resampled.cpu().numpy()
     audio_np = audio_np / np.max(np.abs(audio_np))
     audio_np = np.asarray(audio_np, dtype=np.float32)
     audio_stereo = np.stack((audio_np, audio_np), axis=-1)
+    audio_stereo = audio_stereo.reshape(-1, 2)
+    # print("Audio Array Shape:", audio_stereo.shape)
+    # print("Audio Array Dtype:", audio_stereo.dtype)
     with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
         sf.write(tmp_file.name, audio_stereo, 24000, format='WAV', subtype='PCM_16')
     return tmp_file.name