Spaces:

MK-316
/

speechfeedback

App Files Files Community

MK-316 committed on Jun 4

Commit

a0a375f

•

1 Parent(s): acfb622

Create app.py

Files changed (1) hide show

app.py +67 -0

app.py ADDED Viewed

	@@ -0,0 +1,67 @@

+#@markdown Language Application: WER (word error rate), Fluency (number of pauses), WPM (words per minute)
+import gradio as gr
+import speech_recognition as sr
+from Levenshtein import distance as lev_distance, ratio
+import tempfile
+import soundfile as sf
+import librosa
+def analyze_speech(file_info):
+    r = sr.Recognizer()
+    with tempfile.NamedTemporaryFile(delete=True, suffix=".wav") as tmpfile:
+        # Write the sound file to the temporary file
+        sf.write(tmpfile.name, data=file_info[1], samplerate=44100, format='WAV')
+        tmpfile.seek(0)
+        # Load audio for pause analysis and speech rate
+        y, sr_lib = librosa.load(tmpfile.name, sr=None)  # Load the file with the original sampling rate
+        duration = librosa.get_duration(y=y, sr=sr_lib)
+        # Detect pauses
+        pause_frames = librosa.effects.split(y, top_db=32)
+        pauses = [(start, end) for start, end in pause_frames if (end - start) / sr_lib > 0.5]
+        num_pauses = len(pauses)
+        with sr.AudioFile(tmpfile.name) as source:
+            audio_data = r.record(source)
+        text = r.recognize_google(audio_data)
+        return text, num_pauses, duration, len(text.split())
+def calculate_wer(reference, hypothesis):
+    ref_words = reference.split()
+    hyp_words = hypothesis.split()
+    edit_distance = lev_distance(ref_words, hyp_words)
+    wer = edit_distance / len(ref_words) if ref_words else float('inf')  # Avoid division by zero
+    return wer
+def pronunciation_correction(expected_text, file_info):
+    user_spoken_text, num_pauses, duration, total_words = analyze_speech(file_info)
+    wer = calculate_wer(expected_text.lower(), user_spoken_text.lower())
+    wpm = total_words / (duration / 60) if duration > 0 else 0
+    similarity = ratio(expected_text.lower(), user_spoken_text.lower())
+    feedback = "Excellent pronunciation!" if similarity >= 0.9 else \
+               "Good pronunciation!" if similarity >= 0.7 else \
+               "Needs improvement." if similarity >= 0.5 else \
+               "Poor pronunciation, try to focus more on clarity."
+    description = f"WER: {wer:.2f}, Fluency: {num_pauses} pauses, {wpm:.0f} WPM"
+    return feedback, description
+with gr.Blocks() as app:  # Build the Gradio interface for the pronunciation checker.
+    with gr.Row():
+        text_input = gr.Textbox(label="Enter or paste your text here")  # reference text, first argument of the callback
+    audio_input = gr.Audio(label="Upload Audio File", type="numpy")  # second callback argument; analyze_speech reads the samples from index 1
+    check_pronunciation_button = gr.Button("Check Pronunciation")
+    pronunciation_feedback = gr.Textbox(label="Pronunciation Feedback")  # receives the first returned string (feedback)
+    pronunciation_details = gr.Textbox(label="Detailed Metrics")  # receives the second returned string (description)
+    check_pronunciation_button.click(  # wire the button to pronunciation_correction
+        pronunciation_correction,
+        inputs=[text_input, audio_input],  # order matches (expected_text, file_info)
+        outputs=[pronunciation_feedback, pronunciation_details]  # order matches the returned (feedback, description)
+    )
+app.launch(debug=True)  # runs at module level (no __main__ guard), so importing this file starts the app