Spaces:

MK-316
/

speechfeedback

App Files Files Community

speechfeedback / app.py

MK-316's picture

Create app.py

a0a375f verified 5 months ago

2.81 kB

	#@markdown Language Application: WER, Fluency (number of pauses), WPM (words per minute)
	import gradio as gr
	import speech_recognition as sr
	from Levenshtein import distance as lev_distance, ratio
	import tempfile
	import soundfile as sf
	import librosa

	def analyze_speech(file_info, *, top_db=32, min_pause=0.5):
	    """Transcribe a recording and measure duration, pauses and word count.

	    Args:
	        file_info: ``(sample_rate, samples)`` pair, the form Gradio's
	            ``Audio(type="numpy")`` component passes to its handler.
	        top_db: Silence threshold handed to ``librosa.effects.split``.
	        min_pause: Shortest silent gap, in seconds, that counts as a pause.

	    Returns:
	        ``(text, num_pauses, duration, word_count)`` where ``text`` is the
	        recognised transcript (``""`` when nothing intelligible was heard),
	        ``num_pauses`` is the number of silent gaps longer than
	        ``min_pause`` between stretches of speech, ``duration`` is the
	        clip length in seconds and ``word_count`` is ``len(text.split())``.

	    Raises:
	        sr.RequestError: propagated unchanged when the recognition request
	            itself fails.
	    """
	    sample_rate, samples = file_info[0], file_info[1]
	    recognizer = sr.Recognizer()

	    # A temporary directory is used rather than an open NamedTemporaryFile:
	    # reopening a still-open temp file by name is not portable.
	    with tempfile.TemporaryDirectory() as tmpdir:
	        wav_path = f"{tmpdir}/speech.wav"

	        # Write with the recording's own sample rate; a fixed 44100 would
	        # distort the duration (and so the pauses and WPM) of other rates.
	        sf.write(wav_path, data=samples, samplerate=sample_rate, format='WAV')

	        # Load audio for pause analysis and speech rate (original rate kept).
	        y, sr_lib = librosa.load(wav_path, sr=None)
	        duration = librosa.get_duration(y=y, sr=sr_lib)

	        # effects.split yields the NON-silent intervals, so a pause is the
	        # gap between the end of one interval and the start of the next.
	        # Leading and trailing silence is not counted as a pause.
	        speech_intervals = librosa.effects.split(y, top_db=top_db)
	        pauses = [
	            (prev_end, next_start)
	            for (_, prev_end), (next_start, _) in zip(
	                speech_intervals[:-1], speech_intervals[1:]
	            )
	            if (next_start - prev_end) / sr_lib > min_pause
	        ]
	        num_pauses = len(pauses)

	        with sr.AudioFile(wav_path) as source:
	            audio_data = recognizer.record(source)

	    try:
	        text = recognizer.recognize_google(audio_data)
	    except sr.UnknownValueError:
	        # Nothing intelligible was recognised: report an empty transcript
	        # so the caller can still score the attempt.
	        text = ""

	    return text, num_pauses, duration, len(text.split())

	def calculate_wer(reference, hypothesis):
	    """Return the word error rate of *hypothesis* measured against *reference*.

	    Both strings are split on whitespace and the edit distance between the
	    two word lists is divided by the number of reference words. A reference
	    with no words gives ``float('inf')`` so that no division by zero occurs.
	    """
	    expected_tokens = reference.split()
	    if not expected_tokens:
	        return float('inf')

	    spoken_tokens = hypothesis.split()
	    word_edits = lev_distance(expected_tokens, spoken_tokens)
	    return word_edits / len(expected_tokens)

	def _normalize_text(text):
	    """Lower-case *text* and reduce it to single-space-separated words."""
	    lowered = text.lower().replace("\u2019", "'")
	    # Keep letters, digits and apostrophes (contractions); everything else
	    # becomes a word break so punctuation never counts as a word error.
	    kept = "".join(ch if ch.isalnum() or ch == "'" else " " for ch in lowered)
	    return " ".join(kept.split())


	def pronunciation_correction(expected_text, file_info):
	    """Score a spoken recording against the text the speaker meant to read.

	    Args:
	        expected_text: Reference text typed or pasted by the user.
	        file_info: Audio value forwarded to ``analyze_speech``; ``None``
	            when no audio was supplied.

	    Returns:
	        ``(feedback, description)``: a short verdict chosen from the
	        similarity ratio, and a metrics line with WER, pause count and
	        words per minute. When the audio or the reference text is missing,
	        ``feedback`` is a prompt asking for it and ``description`` is ``""``.
	    """
	    # Guard the two inputs first so a premature click gives guidance
	    # instead of an exception or a meaningless "WER: inf".
	    if file_info is None:
	        return "Please record or upload an audio file first.", ""

	    reference = _normalize_text(expected_text or "")
	    if not reference:
	        return "Please enter the text you are going to read.", ""

	    user_spoken_text, num_pauses, duration, total_words = analyze_speech(file_info)
	    hypothesis = _normalize_text(user_spoken_text)

	    # Compare normalised text on both sides: punctuation or casing that
	    # differs between the pasted reference and the transcript would
	    # otherwise be scored as pronunciation errors.
	    wer = calculate_wer(reference, hypothesis)
	    wpm = total_words / (duration / 60) if duration > 0 else 0
	    similarity = ratio(reference, hypothesis)

	    if similarity >= 0.9:
	        feedback = "Excellent pronunciation!"
	    elif similarity >= 0.7:
	        feedback = "Good pronunciation!"
	    elif similarity >= 0.5:
	        feedback = "Needs improvement."
	    else:
	        feedback = "Poor pronunciation, try to focus more on clarity."

	    description = f"WER: {wer:.2f}, Fluency: {num_pauses} pauses, {wpm:.0f} WPM"

	    return feedback, description

	# Gradio UI: a reference-text box and an audio input are passed to
	# pronunciation_correction, whose two return values fill the two output
	# text boxes.
	# NOTE(review): nested indentation was flattened in this copy of the file,
	# so which components sit inside the Row cannot be determined from here --
	# confirm the layout against the original before editing it.
	with gr.Blocks() as app:
	with gr.Row():
	text_input = gr.Textbox(label="Enter or paste your text here")
	# type="numpy" is what lets analyze_speech index the value as file_info[1].
	audio_input = gr.Audio(label="Upload Audio File", type="numpy")
	check_pronunciation_button = gr.Button("Check Pronunciation")
	pronunciation_feedback = gr.Textbox(label="Pronunciation Feedback")
	pronunciation_details = gr.Textbox(label="Detailed Metrics")

	# Wire the button: inputs map positionally to (expected_text, file_info),
	# outputs to the returned (feedback, description) pair.
	check_pronunciation_button.click(
	pronunciation_correction,
	inputs=[text_input, audio_input],
	outputs=[pronunciation_feedback, pronunciation_details]
	)

	# Start the app when this file is executed.
	app.launch(debug=True)