File size: 2,805 Bytes
a0a375f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#@markdown Language Application: WER (word error rate), Fluency (number of pauses), WPM (words per minute)
import gradio as gr
import speech_recognition as sr
from Levenshtein import distance as lev_distance, ratio
import tempfile
import soundfile as sf
import librosa

def _count_pauses(speech_intervals, sample_rate, min_pause_seconds):
    """Count silent gaps longer than *min_pause_seconds* between speech intervals.

    *speech_intervals* is a sequence of ``(start_sample, end_sample)`` pairs
    describing the NON-silent regions, in time order. Silence before the first
    interval and after the last one is not counted as a pause.
    """
    min_gap_samples = min_pause_seconds * sample_rate
    return sum(
        1
        for (_, prev_end), (next_start, _) in zip(speech_intervals[:-1], speech_intervals[1:])
        if next_start - prev_end > min_gap_samples
    )


def analyze_speech(file_info, top_db=32, min_pause_seconds=0.5):
    """Transcribe a recording and measure its pauses and duration.

    Args:
        file_info: ``(sample_rate, samples)`` pair, as delivered by the
            ``gr.Audio(type="numpy")`` input wired up in this file.
        top_db: Threshold (in dB below the reference level) passed to
            ``librosa.effects.split`` to decide what counts as silence.
        min_pause_seconds: Minimum length of a silent gap between two speech
            segments for it to be counted as a pause.

    Returns:
        A tuple ``(text, num_pauses, duration, word_count)`` where *text* is
        the transcript from Google's recognizer, *num_pauses* the number of
        silent gaps, *duration* the clip length in seconds and *word_count*
        the number of whitespace-separated words in *text*.

    Raises:
        Whatever ``Recognizer.recognize_google`` raises; per the
        SpeechRecognition library this includes ``sr.UnknownValueError``
        (speech not understood) and ``sr.RequestError`` (service failure).
    """
    # Use the recording's real sample rate; writing a fixed 44100 Hz header
    # would distort the duration (and hence WPM) of clips at any other rate.
    sample_rate, samples = file_info[0], file_info[1]
    recognizer = sr.Recognizer()

    # NOTE: the file is reopened by name while still open, which the tempfile
    # documentation says works on Unix but not on Windows.
    with tempfile.NamedTemporaryFile(delete=True, suffix=".wav") as tmpfile:
        sf.write(tmpfile.name, data=samples, samplerate=sample_rate, format='WAV')

        # Reload at the file's native rate for duration and pause analysis.
        y, sr_lib = librosa.load(tmpfile.name, sr=None)
        duration = librosa.get_duration(y=y, sr=sr_lib)

        # librosa.effects.split returns the NON-silent intervals, so pauses
        # are the gaps between consecutive intervals, not the intervals.
        speech_intervals = librosa.effects.split(y, top_db=top_db)
        num_pauses = _count_pauses(speech_intervals, sr_lib, min_pause_seconds)

        with sr.AudioFile(tmpfile.name) as source:
            audio_data = recognizer.record(source)

    # The audio is already in memory, so the temp file can be released before
    # the (potentially slow) network recognition call.
    text = recognizer.recognize_google(audio_data)

    return text, num_pauses, duration, len(text.split())

def calculate_wer(reference, hypothesis):
    """Return the word error rate of *hypothesis* measured against *reference*.

    Both strings are split on whitespace and the resulting token lists are
    handed to ``lev_distance``; the distance is then divided by the number of
    reference tokens. An empty reference yields ``float('inf')`` instead of
    dividing by zero.

    NOTE(review): this relies on the installed Levenshtein package accepting
    lists of words (not only strings) -- confirm against the pinned version.
    """
    expected_tokens = reference.split()
    spoken_tokens = hypothesis.split()
    word_edits = lev_distance(expected_tokens, spoken_tokens)

    # Guard: no reference words means the ratio is undefined.
    if not expected_tokens:
        return float('inf')
    return word_edits / len(expected_tokens)

def pronunciation_correction(expected_text, file_info):
    """Score a recording against the text the speaker was asked to read.

    Args:
        expected_text: The reference sentence(s) typed or pasted by the user.
        file_info: The audio value from the UI, passed on to
            ``analyze_speech``; ``None`` when no audio was supplied.

    Returns:
        A ``(feedback, description)`` pair of strings. *feedback* is a short
        verdict chosen from the similarity ratio of the two texts;
        *description* reports WER, pause count and words per minute. When the
        input is unusable (no audio, no reference words, or speech that could
        not be recognized) *feedback* explains the problem and *description*
        is empty.
    """
    import string  # local import: only needed for punctuation stripping

    strip_punctuation = str.maketrans("", "", string.punctuation)

    def normalize(text):
        # Lower-case, drop ASCII punctuation and collapse whitespace so that
        # "Hello, world." in the reference matches "hello world" from the
        # recognizer instead of being counted as word errors.
        return " ".join(text.lower().translate(strip_punctuation).split())

    if file_info is None:
        return "No audio provided. Please upload or record audio first.", ""

    expected_norm = normalize(expected_text or "")
    if not expected_norm:
        return "No reference text provided. Please enter the expected text.", ""

    try:
        user_spoken_text, num_pauses, duration, total_words = analyze_speech(file_info)
    except sr.UnknownValueError:
        # The recognizer could not make out any speech; report that to the
        # user rather than surfacing a traceback.
        return "Could not understand the audio. Please speak clearly and try again.", ""

    spoken_norm = normalize(user_spoken_text)
    wer = calculate_wer(expected_norm, spoken_norm)
    wpm = total_words / (duration / 60) if duration > 0 else 0
    similarity = ratio(expected_norm, spoken_norm)

    # Thresholds are checked from strictest to most lenient.
    verdicts = (
        (0.9, "Excellent pronunciation!"),
        (0.7, "Good pronunciation!"),
        (0.5, "Needs improvement."),
    )
    feedback = next(
        (message for threshold, message in verdicts if similarity >= threshold),
        "Poor pronunciation, try to focus more on clarity.",
    )

    description = f"WER: {wer:.2f}, Fluency: {num_pauses} pauses, {wpm:.0f} WPM"

    return feedback, description

# Build the Gradio interface: a reference-text box, an audio input, a button,
# and two read-only text boxes that receive the results. Components appear in
# the order they are created inside the Blocks context.
with gr.Blocks() as app:
    # Reference text the speaker is expected to read aloud.
    with gr.Row():
        text_input = gr.Textbox(label="Enter or paste your text here")
    # type="numpy": the value is handed to pronunciation_correction as
    # file_info, and analyze_speech reads the sample data from file_info[1].
    audio_input = gr.Audio(label="Upload Audio File", type="numpy")
    check_pronunciation_button = gr.Button("Check Pronunciation")
    # Output boxes for the short verdict and the metric summary, respectively.
    pronunciation_feedback = gr.Textbox(label="Pronunciation Feedback")
    pronunciation_details = gr.Textbox(label="Detailed Metrics")

    # On click, call pronunciation_correction(text, audio) and route its
    # (feedback, description) return values to the two output boxes in order.
    check_pronunciation_button.click(
        pronunciation_correction,
        inputs=[text_input, audio_input],
        outputs=[pronunciation_feedback, pronunciation_details]
    )

# Launch the app with Gradio's debug mode enabled. This runs at module level,
# so it also executes if the file is imported.
app.launch(debug=True)