audio-transcriber / voice_recognition.py
mboushaba's picture
Create voice_recognition.py
4c18362 verified
raw
history blame contribute delete
No virus
3.95 kB
import os
import speech_recognition as sr
from deep_translator import GoogleTranslator
import ffmpeg
import random
import string
# Constants
# Default inputs referenced by the script entry point at the bottom of this file.
# Sample recording; it is not a ".wav" path, so the pipeline converts it first.
AUDIO_FILE_PATH = "audio/test-ph-3.m4a"
# Language of the recording, forwarded to the recognizer's ``language`` argument.
# NOTE(review): "fil-PH" looks like a locale-style code — confirm that the
# recognition backend actually selected accepts this form.
SOURCE_LANG = "fil-PH"
# Language the transcript is translated into when translation is requested.
TARGET_LANG = "en"
def convert_audio_to_wav(input_audio_path, output_wav_path):
    """
    Transcode an audio file (for example M4A) into a WAV file through ffmpeg.

    The output is written to ``output_wav_path`` with the ``pcm_s16le`` audio
    codec at a 44100 Hz sample rate.

    Returns the output WAV path on success. On any error a message is printed
    and None is returned instead of raising.
    """
    try:
        source_stream = ffmpeg.input(input_audio_path)
        wav_stream = source_stream.output(
            output_wav_path,
            acodec='pcm_s16le',
            ar=44100,
        )
        wav_stream.run()
        print(f"Audio successfully converted to WAV: {output_wav_path}")
        return output_wav_path
    except (ffmpeg.Error, Exception) as exc:
        # ffmpeg failures and every other error are reported the same way.
        print(f"Error converting {input_audio_path} to WAV: {exc}")
        return None
def recognize_speech_from_wav(model, wav_file_path, source_lang):
    """
    Transcribe a WAV file with the selected speech recognition backend.

    Args:
        model: Backend name, compared case-insensitively; "whisper" or "google".
        wav_file_path: Path of the audio file opened with ``sr.AudioFile``.
        source_lang: Value forwarded unchanged as the ``language`` argument of
            the chosen backend.

    Returns:
        The recognized text, or None when the model name is invalid, the audio
        could not be understood, or recognition raised an error (a message is
        printed in each of those cases).

    Errors raised while opening ``wav_file_path`` are not caught here and
    propagate to the caller.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(wav_file_path) as source:
        try:
            # Deliberately no adjust_for_ambient_noise() call: it reads up to
            # the first second of the source to calibrate the energy threshold,
            # so record() would start after that audio and the beginning of the
            # speech would be missing from the transcript. The threshold is only
            # used by listen(), which this function does not call.
            audio_data = recognizer.record(source)

            backend = model.lower()
            if backend == "whisper":
                return recognizer.recognize_whisper(audio_data, language=source_lang)
            if backend == "google":
                return recognizer.recognize_google(audio_data, language=source_lang)

            print(f"Invalid model name: {model}")
            return None
        except sr.UnknownValueError:
            print("Could not understand the audio.")
            return None
        except Exception as e:
            # Covers sr.RequestError as well as any other failure; both were
            # reported with this same message.
            print(f"Could not request results from the service; {e}")
            return None
def translate_text(text, target_lang):
    """
    Translate ``text`` into ``target_lang`` with Google Translator.

    The source language is auto-detected. Returns the translated text, or
    None (after printing the error) when the translation fails.
    """
    try:
        translator = GoogleTranslator(source='auto', target=target_lang)
        translated = translator.translate(text)
        return translated
    except Exception as exc:
        print(f"Error translating text: {exc}")
        return None
def process_audio_recognition(model="whisper", audio_path=None, source_lang="en", target_lang="en", translate=False):
    """
    Main function to handle audio recognition and optional translation.

    A path that does not end with ".wav" is first converted to a temporary,
    randomly named WAV file in the current working directory; that file is
    deleted again before this function returns.

    Args:
        model: Recognition backend name passed to recognize_speech_from_wav.
        audio_path: Path of the audio file to transcribe.
        source_lang: Language passed to the recognizer.
        target_lang: Language to translate into when ``translate`` is true.
        translate: When true, translate the recognized text.

    Returns:
        The translated text when translation was requested and succeeded,
        otherwise the recognized text. None when no usable audio file is
        available or recognition produced nothing.
    """
    wav_file = audio_path
    temp_wav = None  # set only when this call creates a WAV that must be removed

    if wav_file and not wav_file.endswith(".wav"):
        prefix = ''.join(random.choices(string.ascii_letters, k=5))
        temp_wav = prefix + "converted_audio.wav"
        wav_file = convert_audio_to_wav(audio_path, temp_wav)

    try:
        if not wav_file:
            print(f"Failed to process the audio file: {audio_path}")
            return None

        text = recognize_speech_from_wav(model, wav_file, source_lang)
        if text and translate:
            translated_text = translate_text(text, target_lang)
            # A failed translation falls back to the untranslated transcript.
            if translated_text:
                text = translated_text
        return text
    finally:
        # Always clean up the converted file so repeated calls do not pile up
        # WAV files in the working directory.
        if temp_wav is not None:
            try:
                os.remove(temp_wav)
            except FileNotFoundError:
                pass  # the conversion failed before producing a file
            except OSError as e:
                print(f"Error removing temporary WAV file {temp_wav}: {e}")
if __name__ == '__main__':
    # Keyword arguments are required: the first positional parameter of
    # process_audio_recognition is ``model``, so passing the path, source
    # language and target language positionally would bind them to ``model``,
    # ``audio_path`` and ``source_lang`` respectively.
    # NOTE(review): the default "whisper" backend is used here — confirm it
    # accepts SOURCE_LANG in its current form.
    result = process_audio_recognition(
        audio_path=AUDIO_FILE_PATH,
        source_lang=SOURCE_LANG,
        target_lang=TARGET_LANG,
        translate=True,
    )
    # Show the outcome; the function itself only returns the text.
    print(result)