|
|
|
import os
import tempfile

import gradio as gr
import librosa
import soundfile as sf
import speech_recognition as sr
from Levenshtein import distance as lev_distance, ratio
|
|
|
def analyze_speech(file_info, *, top_db=32, min_pause_seconds=0.5):
    """Transcribe a recording and measure simple fluency statistics.

    Args:
        file_info: ``(sample_rate, samples)`` pair, the shape produced by
            ``gr.Audio(type="numpy")``.
        top_db: Threshold (dB below peak) under which audio is treated as
            silence; forwarded to ``librosa.effects.split``.
        min_pause_seconds: Minimum length of a silent gap between two
            voiced stretches for it to count as a pause.

    Returns:
        Tuple ``(text, num_pauses, duration, word_count)`` where ``text`` is
        the recognized transcript, ``num_pauses`` the number of qualifying
        silent gaps, ``duration`` the clip length in seconds and
        ``word_count`` the number of whitespace-separated words in ``text``.

    Raises:
        ValueError: If ``file_info`` is ``None`` (no audio supplied).
        speech_recognition.UnknownValueError: Propagated from
            ``recognize_google`` when the speech is unintelligible.
        speech_recognition.RequestError: Propagated from
            ``recognize_google`` when the recognition request fails.
    """
    if file_info is None:
        raise ValueError(
            "No audio provided: expected a (sample_rate, samples) pair."
        )
    sample_rate, samples = file_info

    recognizer = sr.Recognizer()

    # A temporary directory is used instead of NamedTemporaryFile because a
    # still-open named temp file cannot be reopened by name on Windows.
    with tempfile.TemporaryDirectory() as workdir:
        wav_path = os.path.join(workdir, "speech.wav")

        # Write with the recording's own sample rate; a hard-coded rate would
        # distort the measured duration and pause lengths.
        sf.write(wav_path, data=samples, samplerate=sample_rate, format="WAV")

        y, sr_lib = librosa.load(wav_path, sr=None)
        duration = librosa.get_duration(y=y, sr=sr_lib)

        # librosa.effects.split returns the NON-silent intervals, so a pause
        # is the gap between the end of one interval and the start of the
        # next. Leading and trailing silence are not counted as pauses.
        voiced = librosa.effects.split(y, top_db=top_db)
        num_pauses = sum(
            1
            for (_, prev_end), (next_start, _) in zip(voiced[:-1], voiced[1:])
            if (next_start - prev_end) / sr_lib > min_pause_seconds
        )

        with sr.AudioFile(wav_path) as source:
            audio_data = recognizer.record(source)

    # The audio has been read into memory, so the file is no longer needed.
    text = recognizer.recognize_google(audio_data)

    return text, num_pauses, duration, len(text.split())
|
|
|
def calculate_wer(reference, hypothesis):
    """Return the word error rate of *hypothesis* against *reference*.

    Both strings are split on whitespace and compared word by word. The
    result is the word-level edit distance (insertions, deletions and
    substitutions each cost 1) divided by the number of reference words.

    Args:
        reference: The text that was expected.
        hypothesis: The text that was actually recognized.

    Returns:
        The word error rate as a float; ``float('inf')`` when the reference
        contains no words. The value can exceed 1.0 when the hypothesis is
        much longer than the reference.
    """
    ref_words = reference.split()
    hyp_words = hypothesis.split()

    if not ref_words:
        return float('inf')

    # Word-level Levenshtein distance computed row by row. This is done here
    # rather than via Levenshtein.distance because only newer releases of
    # that library accept lists of words; older ones accept strings only.
    previous = list(range(len(hyp_words) + 1))
    for i, ref_word in enumerate(ref_words, start=1):
        current = [i]
        for j, hyp_word in enumerate(hyp_words, start=1):
            substitution = previous[j - 1] + (ref_word != hyp_word)
            deletion = previous[j] + 1
            insertion = current[j - 1] + 1
            current.append(min(substitution, deletion, insertion))
        previous = current

    return previous[-1] / len(ref_words)
|
|
|
def _normalize_for_comparison(text):
    """Lowercase *text*, replace punctuation (except ') with spaces, collapse whitespace."""
    kept = "".join(
        ch if ch.isalnum() or ch.isspace() or ch == "'" else " "
        for ch in text.lower()
    )
    return " ".join(kept.split())


def pronunciation_correction(expected_text, file_info):
    """Compare a recording against the expected text and report feedback.

    Args:
        expected_text: The text the speaker was supposed to read.
        file_info: ``(sample_rate, samples)`` audio pair, or ``None`` when no
            audio was supplied.

    Returns:
        Tuple ``(feedback, description)``: a short verdict string and a
        string with the word error rate, pause count and words per minute.
        When the inputs are missing or speech recognition fails, ``feedback``
        carries an explanatory message and ``description`` is empty.
    """
    if not expected_text or not expected_text.strip():
        return "Please enter the text you expect to hear.", ""
    if file_info is None:
        return "Please upload or record an audio file.", ""

    try:
        user_spoken_text, num_pauses, duration, total_words = analyze_speech(file_info)
    except sr.UnknownValueError:
        return "Could not understand the audio. Please try a clearer recording.", ""
    except sr.RequestError as err:
        return f"Speech recognition service is unavailable: {err}", ""

    # Compare on normalized text so that punctuation in the expected text
    # is not counted as an error against the transcript.
    expected_norm = _normalize_for_comparison(expected_text)
    spoken_norm = _normalize_for_comparison(user_spoken_text)

    wer = calculate_wer(expected_norm, spoken_norm)
    wpm = total_words / (duration / 60) if duration > 0 else 0
    similarity = ratio(expected_norm, spoken_norm)

    if similarity >= 0.9:
        feedback = "Excellent pronunciation!"
    elif similarity >= 0.7:
        feedback = "Good pronunciation!"
    elif similarity >= 0.5:
        feedback = "Needs improvement."
    else:
        feedback = "Poor pronunciation, try to focus more on clarity."

    description = f"WER: {wer:.2f}, Fluency: {num_pauses} pauses, {wpm:.0f} WPM"

    return feedback, description
|
|
|
# Build the Gradio UI: a text box and an audio input feed
# pronunciation_correction, whose two return values fill the output boxes.
# NOTE(review): the nesting below is reconstructed; the original indentation
# was lost, so confirm which components belong inside the Row.
with gr.Blocks() as app:
    with gr.Row():
        text_input = gr.Textbox(label="Enter or paste your text here")
        audio_input = gr.Audio(label="Upload Audio File", type="numpy")

    check_pronunciation_button = gr.Button("Check Pronunciation")
    pronunciation_feedback = gr.Textbox(label="Pronunciation Feedback")
    pronunciation_details = gr.Textbox(label="Detailed Metrics")

    # Wire the button: (text, audio) in, (feedback, metrics) out.
    check_pronunciation_button.click(
        fn=pronunciation_correction,
        inputs=[text_input, audio_input],
        outputs=[pronunciation_feedback, pronunciation_details],
    )

# Start the web server as soon as the module runs.
app.launch(debug=True)
|
|