Spaces:
Runtime error
Runtime error
File size: 5,245 Bytes
4519e61 97d1106 4519e61 dc2de00 4519e61 de695ce 4519e61 97d1106 4519e61 97d1106 4519e61 97d1106 4519e61 4cfc376 4519e61 97d1106 4519e61 ddf065d 4519e61 57e04eb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
import os
import wave
import nltk
import torch
import torch
import openai
import whisper
import textstat
import datetime
import requests
import subprocess
import contextlib
import numpy as np
import gradio as gr
from pyannote.audio import Audio
from pyannote.core import Segment
from sklearn.cluster import AgglomerativeClustering
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
# Speaker-embedding model used by segment_embedding(); explicitly placed on
# the CPU device.
embedding_model = PretrainedSpeakerEmbedding(
"speechbrain/spkrec-ecapa-voxceleb",
device=torch.device("cpu"))
# Download the VADER lexicon before the sentiment analyzer is constructed.
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()
# Whisper 'base' checkpoint, used for transcription.
model = whisper.load_model('base')
# pyannote Audio helper; segment_embedding() uses it to crop clips from a file.
audio = Audio()
# The key is read from the OPEN_AI_API_KEY environment variable; indexing
# os.environ raises KeyError at import time when the variable is not set.
openai.api_key = os.environ['OPEN_AI_API_KEY']
# Sample recordings that are downloaded at start-up and offered as examples.
example_files = [
"https://pdf.bluetickconsultants.com/e-commerce-call.mp3",
"https://pdf.bluetickconsultants.com/customer_support.mp3",
"https://pdf.bluetickconsultants.com/product_refund.mp3",
]
# Example rows handed to gr.Interface(examples=...); each row is
# [local file name, number of speakers] and is appended by the download loop.
file_names = []
def download_file(url, save_name, timeout=30):
    """Download *url* to *save_name* unless that file already exists.

    Args:
        url: Address of the file to fetch.
        save_name: Local path the response body is written to.
        timeout: Seconds to wait on the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    if os.path.exists(save_name):
        # Already cached locally; nothing to do.
        return

    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of storing an error page under the target name:
    # the existence check above would otherwise keep the bad file forever.
    response.raise_for_status()

    # The context manager guarantees the handle is flushed and closed.
    with open(save_name, "wb") as out_file:
        out_file.write(response.content)
# Cache every sample recording next to the app and register it as an example
# row of the form [file name, speaker count].
for url in example_files:
    save_name = url.rsplit("/", 1)[-1]
    download_file(url, save_name)
    file_names.append([save_name, 2])
def segment_embedding(segment, duration, audio_file):
    """Return the speaker embedding for a single transcript segment.

    Args:
        segment: Mapping with "start" and "end" times of the segment.
        duration: Total length of the audio, used as an upper bound for the
            segment end.
        audio_file: Path of the audio file the segment belongs to.

    Returns:
        Whatever ``embedding_model`` produces for the cropped clip.
    """
    # Whisper may report an end time past the end of the audio for the last
    # segment, so the clip is clamped to the real duration.
    window = Segment(segment["start"], min(duration, segment["end"]))
    samples, _sample_rate = audio.crop(audio_file, window)
    # Average over dim 0 (presumably the channel axis -- confirm) and keep it
    # as a size-1 dimension.
    mono = samples.mean(dim=0, keepdim=True)
    # Prepend a leading dimension of size one before calling the model.
    return embedding_model(mono[None])
# System prompt sent with every summary request (text kept verbatim).
_SUMMARY_SYSTEM_PROMPT = (
    "You will be provided with a conversation. Your task is to give a summary and mention all the main details in bullet points.\n"
    "Replace speaker 1 and speaker 2 with sales excutive or comapny name and customer name if available.\n"
)

# Width of the embedding rows allocated for clustering.
# NOTE(review): presumably the output size of the ECAPA model -- confirm.
_EMBEDDING_DIM = 192


def _ensure_wav(audio_file):
    """Return the path of a WAV version of *audio_file*, converting if needed."""
    if audio_file.lower().endswith(".wav"):
        return audio_file
    # splitext keeps dotted stems intact ("my.call.mp3" -> "my.call.wav").
    stem = os.path.splitext(os.path.basename(audio_file))[0]
    wav_path = stem + ".wav"
    # check=True surfaces a failed conversion here instead of as a confusing
    # wave/transcription error further down.
    subprocess.run(['ffmpeg', '-y', '-i', audio_file, wav_path], check=True)
    return wav_path


def _wav_duration(wav_path):
    """Return the length of the WAV file in seconds."""
    with contextlib.closing(wave.open(wav_path, 'r')) as wav:
        return wav.getnframes() / float(wav.getframerate())


def _label_speakers(segments, duration, wav_path, number_of_speakers):
    """Add a "speaker" entry to every segment by clustering voice embeddings."""
    # Never request more clusters than there are segments, nor fewer than one.
    n_clusters = min(max(int(number_of_speakers), 1), len(segments))
    if n_clusters < 2:
        # Nothing to separate: a single cluster (or a single segment) means
        # every segment belongs to the same speaker.
        labels = [0] * len(segments)
    else:
        embeddings = np.zeros(shape=(len(segments), _EMBEDDING_DIM))
        for row, segment in enumerate(segments):
            embeddings[row] = segment_embedding(segment, duration, wav_path)
        # Replace NaNs so the clustering does not reject the matrix.
        embeddings = np.nan_to_num(embeddings)
        labels = AgglomerativeClustering(n_clusters).fit(embeddings).labels_

    for segment, label in zip(segments, labels):
        segment["speaker"] = 'SPEAKER ' + str(label + 1)


def _format_timestamp(secs):
    """Return *secs* rounded to whole seconds as a ``datetime.timedelta``."""
    return datetime.timedelta(seconds=round(secs))


def _build_transcript(segments):
    """Join the segments into text, with a header line at each speaker change."""
    parts = []
    previous_speaker = None
    for segment in segments:
        if segment["speaker"] != previous_speaker:
            parts.append(
                "\n" + segment["speaker"] + ' '
                + str(_format_timestamp(segment["start"])) + '\n'
            )
            previous_speaker = segment["speaker"]
        # Strip leading whitespace only; slicing off the first character would
        # eat a letter whenever the text does not start with a space.
        parts.append(segment["text"].lstrip() + ' ')
    return "".join(parts)


def speech_to_text_and_sentiment(audio_file, number_of_speakers=2):
    """Transcribe a call recording and build a sentiment/readability/summary report.

    Args:
        audio_file: Path of the recording. Anything that is not a ``.wav``
            file is converted with ffmpeg into the working directory first.
        number_of_speakers: How many speakers to separate the transcript
            into. ``None`` falls back to 2.

    Returns:
        A ``(call_summary, transcript)`` tuple of strings. When no speech is
        found, the summary is a short notice and the transcript is empty.

    Raises:
        ValueError: If no audio file path was supplied.
        subprocess.CalledProcessError: If the ffmpeg conversion fails.
    """
    if not audio_file:
        raise ValueError("No audio file was provided.")
    if number_of_speakers is None:
        # The UI passes None when the number field is left empty.
        number_of_speakers = 2

    audio_file = _ensure_wav(audio_file)
    segments = model.transcribe(audio_file)["segments"]
    if not segments:
        # Without segments there is nothing to cluster, score or summarise.
        return "No speech was detected in the audio.", ""

    _label_speakers(
        segments, _wav_duration(audio_file), audio_file, number_of_speakers)
    conv = _build_transcript(segments)

    sentiment_scores = sid.polarity_scores(conv)
    readability_score = textstat.flesch_reading_ease(conv)
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": _SUMMARY_SYSTEM_PROMPT},
            {"role": "user", "content": conv},
        ],
        temperature=0,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    rule = "-------------------------------------\n"
    call_summary = "".join([
        "Sentiment Analysis:\n", rule,
        f"Positive: {sentiment_scores['pos']} | Negative: {sentiment_scores['neg']} | Neutral: {sentiment_scores['neu']}\n\n",
        "Readability/ Clarity of speach:\n", rule,
        f"Readability Score (Flesch-Kincaid): {readability_score}\n\n",
        "Call Summary:\n", rule,
        response["choices"][0]["message"]["content"],
    ])
    return call_summary, conv
# Gradio front end: an audio upload plus a speaker count go in, the analysis
# report and the transcript come out.
demo = gr.Interface(
    title="Bluetick Sales Call Evaluator",
    description="Upload a sales call audio file and get a transcription of the call along with sentiment analysis",
    fn=speech_to_text_and_sentiment,
    inputs=[
        gr.Audio(label="Select audio file", type="filepath"),
        # gr.Number takes `value`, `minimum` and `maximum`; the earlier
        # `default`/`type`/`min`/`max` keywords are not parameters of the
        # component, so the initial value of 2 was never applied.
        # precision=0 makes the field deliver whole numbers.
        # NOTE(review): minimum/maximum need a Gradio release whose Number
        # component supports them -- confirm against the pinned version.
        gr.Number(label="Select number of speakers (1-5)",
                  value=2, minimum=1, maximum=5, precision=0),
    ],
    outputs=[
        gr.Textbox(label="Analysis & Summary"),
        gr.Textbox(label="Transcript"),
    ],
    # Rows of [file name, speaker count] built while downloading the samples.
    examples=file_names,
    theme=gr.themes.Soft().set(
        body_text_color="black"
    ),
    css=" .gradio-container {background-color: white !important;} .prose h1{color: black !important;} p {color: black !important;}",
)
demo.launch(debug=True)
|