import gradio as gr
import torch
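# charts, icon and timestamp are local helper modules bundled with this app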
from charts import spider_chart
from icon import generate_icon
from transformers import pipeline
from timestamp import format_timestamp

MODEL_NAME = "openai/whisper-medium"
BATCH_SIZE = 8

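# Run on the first GPU if one is available, otherwise fall back to CPU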
device = 0 if torch.cuda.is_available() else "cpu"

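# Whisper ASR pipeline; chunk_length_s=30 splits long audio into 30 s chunks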
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

# Text classifier used for emotion analysis of the transcription (spider chart)
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=None)

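# Transcribe the uploaded audio file, optionally prefix each chunk with its
# timestamps, and build an emotion spider chart from the transcribed text.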
def transcribe(file, task, return_timestamps):
    outputs = pipe(file, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
    text = outputs["text"]
    timestamps = outputs["chunks"]  # Each chunk is {"timestamp": (start, end), "text": "..."}

    # If timestamps are requested, format each chunk as "[start -> end] text"
    if return_timestamps:
        spider_text = [chunk["text"] for chunk in timestamps]  # Plain text for the spider chart, without timestamps
        timestamps = [
            f"[{format_timestamp(chunk['timestamp'][0])} -> {format_timestamp(chunk['timestamp'][1])}] {chunk['text']}"
            for chunk in timestamps
        ]
    else:
        timestamps = [chunk["text"] for chunk in timestamps]
        spider_text = timestamps

    # Render the transcription as a scrollable HTML block
    text = "<br>".join(str(feature) for feature in timestamps)
    text = f"<h4>Transcription</h4><div style='overflow-y: scroll; height: 250px;'>{text}</div>"

    # Classify the emotions in the transcription and draw the spider chart
    spider_text = "\n".join(str(feature) for feature in spider_text)
    fig = spider_chart(classifier, spider_text)

    return file, text, fig


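# Gradio input components: audio upload, task selector and timestamp toggle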
inputs = [gr.Audio(source="upload", label="Audio file", type="filepath"),
          gr.Radio(["transcribe"], label="Task", value="transcribe"),
          gr.Checkbox(value=True, label="Return timestamps")]

outputs = [gr.Audio(label="Processed Audio", type="filepath"),
           gr.HTML(label="text"),
           gr.Plot(label="fig")]

title = "Whisper Demo: Transcribe Audio"

MODEL_NAME1 = "jpdiazpardo/whisper-tiny-metal"

description = (
    "Transcribe long-form audio inputs with the click of a button! The demo uses the"
    f" checkpoint [{MODEL_NAME1}](https://huggingface.co/{MODEL_NAME1}) and 🤗 Transformers to transcribe audio files"
    " of arbitrary length. Check out some of the 'cool' examples below."
)

examples = [["When a Demon Defiles a Witch.wav","transcribe",True],
            ["Immaculate Misconception.wav","transcribe", True]]


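# Social icons and author links rendered below the demo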
linkedin = generate_icon("linkedin")
github = generate_icon("github")

article = (
    "<div style='text-align: center; max-width:800px; margin:10px auto;'>"
    f"<p>{linkedin} <a href='https://www.linkedin.com/in/juanpablodiazp/' target='_blank'>Juan Pablo Díaz Pardo</a><br>"
    f"{github} <a href='https://github.com/jpdiazpardo' target='_blank'>jpdiazpardo</a></p>"
    "</div>"
)


title = "Scream: Fine-Tuned Whisper model for automatic gutural speech recognition 🤟🤟🤟"

demo = gr.Interface(
    fn=transcribe,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    article=article,
    examples=examples,
    cache_examples=True,
    allow_flagging="never",
)

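# Queue requests (up to three processed concurrently) and launch the app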
demo.queue(concurrency_count=3)
demo.launch(debug=True)