MK-316 committed on
Commit
a0a375f
1 Parent(s): acfb622

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -0
app.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #@markdown Language Application: WER, Fluency (in N of pauses), WPM (Words per minute)
2
+ import gradio as gr
3
+ import speech_recognition as sr
4
+ from Levenshtein import distance as lev_distance, ratio
5
+ import tempfile
6
+ import soundfile as sf
7
+ import librosa
8
+
9
def analyze_speech(file_info):
    """Transcribe a recording and measure its pauses, length and word count.

    Args:
        file_info: Indexable pair whose first item is the sample rate and
            whose second item is the sample array (the value a
            ``gr.Audio(type="numpy")`` input passes to its handler).

    Returns:
        A tuple ``(text, num_pauses, duration, word_count)``:
        the transcript returned by ``recognize_google``, the number of
        silent gaps longer than 0.5 s between stretches of sound, the clip
        length in seconds, and the number of whitespace-separated words in
        the transcript.

    Raises:
        Whatever ``recognize_google`` raises is propagated unchanged
        (per the SpeechRecognition docs: ``sr.UnknownValueError`` when the
        speech is unintelligible, ``sr.RequestError`` when the service
        cannot be reached).
    """
    sample_rate = int(file_info[0])
    samples = file_info[1]
    recognizer = sr.Recognizer()

    with tempfile.NamedTemporaryFile(delete=True, suffix=".wav") as tmpfile:
        # Write the clip with its real sample rate. A hard-coded 44100 would
        # stretch or shrink the measured duration (and so the WPM) of any
        # recording captured at a different rate.
        sf.write(tmpfile.name, data=samples, samplerate=sample_rate, format='WAV')

        # Reload at the native rate for duration and pause analysis.
        y, sr_lib = librosa.load(tmpfile.name, sr=None)
        duration = librosa.get_duration(y=y, sr=sr_lib)

        # librosa.effects.split yields the NON-silent intervals, so a pause
        # is the gap between the end of one interval and the start of the
        # next. Leading and trailing silence is not counted as a pause.
        voiced = librosa.effects.split(y, top_db=32)
        num_pauses = sum(
            1
            for (_, prev_end), (next_start, _) in zip(voiced[:-1], voiced[1:])
            if (next_start - prev_end) / sr_lib > 0.5
        )

        # The temporary file must still exist while it is read back here.
        with sr.AudioFile(tmpfile.name) as source:
            audio_data = recognizer.record(source)

    text = recognizer.recognize_google(audio_data)
    return text, num_pauses, duration, len(text.split())
30
+
31
def calculate_wer(reference, hypothesis):
    """Return the word error rate of *hypothesis* against *reference*.

    Both strings are split on whitespace; the result is the word-level
    edit distance (insertions, deletions and substitutions each cost 1)
    divided by the number of reference words.

    Args:
        reference: The text that was supposed to be spoken.
        hypothesis: The text that was actually recognized.

    Returns:
        The WER as a float (it can exceed 1.0 when the hypothesis is much
        longer than the reference), or ``float('inf')`` when the reference
        contains no words.
    """
    ref_words = reference.split()
    hyp_words = hypothesis.split()
    if not ref_words:
        return float('inf')  # Avoid division by zero

    # Word-level Levenshtein distance, computed row by row. Done locally so
    # the result does not depend on whether the installed Levenshtein
    # package accepts lists of words (some releases only accept strings).
    previous = list(range(len(hyp_words) + 1))
    for i, ref_word in enumerate(ref_words, start=1):
        current = [i]
        for j, hyp_word in enumerate(hyp_words, start=1):
            substitution = previous[j - 1] + (ref_word != hyp_word)
            deletion = previous[j] + 1
            insertion = current[j - 1] + 1
            current.append(min(substitution, deletion, insertion))
        previous = current

    return previous[-1] / len(ref_words)
37
+
38
def _normalize_text(text):
    """Lower-case *text* and replace punctuation with spaces for comparison."""
    lowered = text.lower().replace("\u2019", "'")
    # Keep letters, digits and apostrophes (so contractions survive);
    # everything else becomes a word separator.
    cleaned = "".join(ch if ch.isalnum() or ch == "'" else " " for ch in lowered)
    return " ".join(cleaned.split())


def pronunciation_correction(expected_text, file_info):
    """Score a recording against the text the speaker was meant to read.

    Args:
        expected_text: The reference text typed or pasted by the user.
        file_info: The audio value handed over by the audio input, or
            ``None`` when nothing was uploaded.

    Returns:
        A ``(feedback, description)`` pair of strings: a qualitative
        verdict based on character-level similarity, and a summary line
        with WER, pause count and words per minute. When the input is
        unusable or recognition fails, ``feedback`` explains the problem
        and ``description`` is empty.
    """
    # Compare on normalized text: the typed reference usually carries
    # punctuation, which would otherwise count as pronunciation errors
    # whenever the transcript does not contain the same punctuation.
    expected = _normalize_text(expected_text or "")
    if not expected:
        return "Please enter the text you are going to read.", ""
    if file_info is None:
        return "Please upload or record an audio file first.", ""

    try:
        user_spoken_text, num_pauses, duration, total_words = analyze_speech(file_info)
    except sr.UnknownValueError:
        return "Could not understand the audio, please try again.", ""
    except sr.RequestError as err:
        return f"Speech recognition service is unavailable: {err}", ""

    spoken = _normalize_text(user_spoken_text)
    wer = calculate_wer(expected, spoken)
    wpm = total_words / (duration / 60) if duration > 0 else 0
    similarity = ratio(expected, spoken)

    if similarity >= 0.9:
        feedback = "Excellent pronunciation!"
    elif similarity >= 0.7:
        feedback = "Good pronunciation!"
    elif similarity >= 0.5:
        feedback = "Needs improvement."
    else:
        feedback = "Poor pronunciation, try to focus more on clarity."

    description = f"WER: {wer:.2f}, Fluency: {num_pauses} pauses, {wpm:.0f} WPM"

    return feedback, description
52
+
53
# Gradio UI: a text box for the reference text, an audio input delivered as
# numpy data, a button, and two read-only text boxes for the results.
# Clicking the button calls pronunciation_correction(text, audio) and writes
# its two return values to the feedback and details boxes. The app is then
# launched with debug=True.
# NOTE(review): indentation was lost in this listing, so which widgets sit
# inside gr.Row() (versus directly under gr.Blocks()) cannot be determined
# from here - confirm against the real app.py before reformatting.
+ with gr.Blocks() as app:
54
+ with gr.Row():
55
+ text_input = gr.Textbox(label="Enter or paste your text here")
56
+ audio_input = gr.Audio(label="Upload Audio File", type="numpy")
57
+ check_pronunciation_button = gr.Button("Check Pronunciation")
58
+ pronunciation_feedback = gr.Textbox(label="Pronunciation Feedback")
59
+ pronunciation_details = gr.Textbox(label="Detailed Metrics")
60
+
61
+ check_pronunciation_button.click(
62
+ pronunciation_correction,
63
+ inputs=[text_input, audio_input],
64
+ outputs=[pronunciation_feedback, pronunciation_details]
65
+ )
66
+
67
+ app.launch(debug=True)