Spaces:

akashkumarbtc
/

bluetick-sales-call-evaluator

Runtime error

File size: 5,245 Bytes

import os
import wave
import nltk
import torch
import torch
import openai
import whisper
import textstat
import datetime
import requests
import subprocess
import contextlib
import numpy as np
import gradio as gr
from pyannote.audio import Audio
from pyannote.core import Segment
from sklearn.cluster import AgglomerativeClustering
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding


# Speaker-embedding model (ECAPA trained on VoxCeleb), pinned to the CPU.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb", device=torch.device("cpu")
)

# The VADER lexicon has to be downloaded before the analyzer is constructed.
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# Whisper "base" checkpoint for transcription and a pyannote Audio helper
# used to crop clips out of the recording.
model = whisper.load_model('base')
audio = Audio()

# Raises KeyError at import time if the OPEN_AI_API_KEY variable is not set.
openai.api_key = os.environ['OPEN_AI_API_KEY']

# Remote sample recordings offered as ready-made examples in the UI.
example_files = [
    "https://pdf.bluetickconsultants.com/e-commerce-call.mp3",
    "https://pdf.bluetickconsultants.com/customer_support.mp3",
    "https://pdf.bluetickconsultants.com/product_refund.mp3",
]

# Filled below with [local_file_name, number_of_speakers] example rows.
file_names = list()


def download_file(url, save_name):
    """Download ``url`` to ``save_name`` unless that file already exists.

    Args:
        url: HTTP(S) address of the file to fetch.
        save_name: Local path the response body is written to.

    Raises:
        requests.HTTPError: If the server answers with an error status.
            Raising here prevents an error page from being stored under
            ``save_name`` and then treated as a valid cached download.
        requests.RequestException: On connection problems or timeout.
    """
    if os.path.exists(save_name):
        # Already downloaded on a previous run; nothing to do.
        return

    # A timeout keeps an unresponsive server from blocking forever.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    # The context manager guarantees the handle is flushed and closed.
    with open(save_name, 'wb') as out_file:
        out_file.write(response.content)


# Fetch every example recording and register it as an example row
# ([file name, number of speakers]) for the interface.
for example_url in example_files:
    local_name = str(example_url).split("/")[-1]
    download_file(example_url, local_name)
    file_names.append([local_name, 2])


def segment_embedding(segment, duration, audio_file):
    """Return the speaker embedding for one transcript segment.

    Args:
        segment: Mapping with ``"start"`` and ``"end"`` timestamps.
        duration: Total length of the recording; used as an upper bound
            for the segment end.
        audio_file: Path of the audio file the clip is cropped from.

    Returns:
        Whatever ``embedding_model`` returns for the cropped clip.
    """
    # Whisper overshoots the end timestamp in the last segment, so the end
    # is clamped to the recording's duration.
    window = Segment(segment["start"], min(duration, segment["end"]))
    cropped, _sample_rate = audio.crop(audio_file, window)
    # Average along dim 0 while keeping that dimension (presumably a
    # down-mix of the channels to mono -- confirm against pyannote's layout).
    mono = cropped.mean(dim=0, keepdim=True)
    # The model is fed the clip with one extra leading dimension.
    return embedding_model(mono.unsqueeze(0))


# System prompt sent to the chat model. Built from explicit pieces so the
# exact whitespace of the prompt cannot be altered by editors or formatters.
_SUMMARY_SYSTEM_PROMPT = (
    "You will be provided with a conversation. Your task is to give a summary and mention all the main details in bullet points. \n"
    "        Replace speaker 1 and speaker 2 with sales excutive or comapny name and customer name if available.\n"
    "        "
)


def _ensure_wav(audio_file):
    """Return a path to a WAV version of ``audio_file``, converting if needed."""
    stem, extension = os.path.splitext(os.path.basename(audio_file))
    if extension.lower() == ".wav":
        return audio_file
    # The converted file is written to the working directory.
    wav_path = stem + ".wav"
    # check=True: a failed conversion must not fall through to a stale or
    # missing .wav file.
    subprocess.run(['ffmpeg', '-y', '-i', audio_file, wav_path], check=True)
    return wav_path


def _wav_duration(wav_path):
    """Return the length of the WAV file, computed as frames / frame rate."""
    with wave.open(wav_path, 'rb') as wav_file:
        return wav_file.getnframes() / float(wav_file.getframerate())


def _label_speakers(segments, duration, wav_path, speaker_count):
    """Add a ``"speaker"`` label to every segment by clustering embeddings."""
    # There cannot be more clusters than segments, nor fewer than one.
    n_clusters = max(1, min(speaker_count, len(segments)))
    if n_clusters < 2:
        # A single cluster needs no embeddings; everything is speaker 1.
        labels = [0] * len(segments)
    else:
        # 192 is the embedding width the buffer is allocated with
        # (assumed to match the embedding model's output size).
        embeddings = np.zeros(shape=(len(segments), 192))
        for index, segment in enumerate(segments):
            embeddings[index] = segment_embedding(segment, duration, wav_path)
        # Replace NaNs so that clustering does not fail on them.
        embeddings = np.nan_to_num(embeddings)
        labels = AgglomerativeClustering(n_clusters).fit(embeddings).labels_

    for segment, label in zip(segments, labels):
        segment["speaker"] = 'SPEAKER ' + str(label + 1)


def _format_transcript(segments):
    """Render labelled segments as text with a header at each speaker change."""
    parts = []
    previous_speaker = None
    for segment in segments:
        speaker = segment["speaker"]
        if speaker != previous_speaker:
            stamp = datetime.timedelta(seconds=round(segment["start"]))
            parts.append("\n" + speaker + ' ' + str(stamp) + '\n')
            previous_speaker = speaker
        text = segment["text"]
        # Drop only a leading space; never a real first character.
        if text.startswith(" "):
            text = text[1:]
        parts.append(text + ' ')
    return "".join(parts)


def _summarize_conversation(conv):
    """Ask the chat model for a bullet-point summary of the transcript."""
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": _SUMMARY_SYSTEM_PROMPT},
            {"role": "user", "content": conv},
        ],
        temperature=0,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response["choices"][0]["message"]["content"]


def _build_report(sentiment_scores, readability_score, summary_text):
    """Assemble the sentiment, readability and summary sections into one text."""
    rule = "-------------------------------------\n"
    return "".join([
        "Sentiment Analysis:\n" + rule,
        f"Positive: {sentiment_scores['pos']} | Negative: {sentiment_scores['neg']} | Neutral: {sentiment_scores['neu']}\n\n",
        "Readability/ Clarity of speach:\n" + rule,
        f"Readability Score (Flesch-Kincaid): {readability_score}\n\n",
        "Call Summary:\n" + rule,
        summary_text,
    ])


def speech_to_text_and_sentiment(audio_file, number_of_speakers=2):
    """Transcribe a call, label its speakers and produce an analysis report.

    The audio is converted to WAV when necessary, transcribed with Whisper,
    split between speakers by clustering per-segment embeddings, and then
    scored for sentiment and readability and summarised by the chat model.

    Args:
        audio_file: Path of the recording to analyse.
        number_of_speakers: Expected number of speakers. ``None`` is treated
            as 2; the value is clamped to the range 1..number of segments.

    Returns:
        A ``(call_summary, conv)`` tuple: the analysis report text and the
        speaker-labelled transcript.

    Raises:
        ValueError: If no audio file path is given.
        subprocess.CalledProcessError: If the ffmpeg conversion fails.
    """
    if not audio_file:
        raise ValueError("No audio file was provided.")
    speaker_count = 2 if number_of_speakers is None else int(number_of_speakers)

    audio_file = _ensure_wav(audio_file)

    segments = model.transcribe(audio_file)["segments"]
    if not segments:
        # Nothing was transcribed, so there is nothing to cluster or summarise.
        return "No speech was detected in the audio file.", ""

    duration = _wav_duration(audio_file)
    _label_speakers(segments, duration, audio_file, speaker_count)
    conv = _format_transcript(segments)

    sentiment_scores = sid.polarity_scores(conv)
    summary_text = _summarize_conversation(conv)
    readability_score = textstat.flesch_reading_ease(conv)

    call_summary = _build_report(sentiment_scores, readability_score, summary_text)
    return call_summary, conv


# Web UI: an audio upload plus a speaker count go in; the analysis report and
# the speaker-labelled transcript come out.
demo = gr.Interface(
    title="Bluetick Sales Call Evaluator",
    description="Upload a sales call audio file and get a transcription of the call along with sentiment analysis",
    fn=speech_to_text_and_sentiment,
    inputs=[
        gr.Audio(label="Select audio file", type="filepath"),
        # gr.Number takes `value`, `minimum` and `maximum`; the previous
        # `default`, `type`, `min` and `max` keywords are not Number
        # parameters, so the initial value and the bounds were never applied.
        # precision=0 makes the field deliver whole numbers.
        gr.Number(label="Select number of speakers (1-5)",
                  value=2, precision=0, minimum=1, maximum=5)
    ],
    outputs=[
        gr.Textbox(label="Analysis & Summary"),
        gr.Textbox(label="Transcript"),
    ],
    # Rows of [audio file, number of speakers] collected at start-up.
    examples=file_names,
    theme=gr.themes.Soft().set(
        body_text_color="black"
    ),
    css=" .gradio-container {background-color: white !important;} .prose  h1{color: black !important;} p {color: black !important;}",
)

demo.launch(debug=True)