"""Streamlit app: download a YouTube video's audio track, transcribe it in
30-second chunks with Whisper, and generate a headline for each chunk with a
T5 headline model. Requires ffmpeg on PATH (used by both pydub and Whisper)."""

import os
import shutil

import streamlit as st
import torch
import whisper
from pydub import AudioSegment
from transformers import T5ForConditionalGeneration, T5Tokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


@st.cache_resource
def load_models():
    """Load both models once per session instead of on every Streamlit rerun."""
    t5 = T5ForConditionalGeneration.from_pretrained(
        "Michau/t5-base-en-generate-headline"
    ).to(device)
    t5_tok = T5Tokenizer.from_pretrained("Michau/t5-base-en-generate-headline")
    asr = whisper.load_model("base")
    return t5, t5_tok, asr


t5_model, tokenizer, whisper_model = load_models()
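# Note (an aside, not in the original script): whisper.load_model also accepts
# larger checkpoints such as "small", "medium", or "large", which trade
# decoding speed for transcription accuracy.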

st.title("Audio Analysis")

# User inputs: the video to analyse and the name of the output HTML file
st.subheader("Enter YouTube link and file name:")
url = st.text_input("YouTube link")
name = st.text_input("Output file name")

# Download, segment, transcribe, and headline the audio
if st.button("Process"):
    if os.path.exists("audio.mp3"):
        os.remove("audio.mp3")

    # Fetch the thumbnail and the audio-only stream (yt-dlp format 140 is
    # YouTube's m4a/AAC audio track); quote the URL so shell metacharacters
    # in it cannot break the command.
    os.system(f'yt-dlp --write-thumbnail --skip-download -o logo.png "{url}"')
    os.system(f'yt-dlp -f 140 -o audio.mp3 "{url}"')

    # os.system blocks until the download finishes, so if the file is still
    # missing the download failed; bail out instead of spinning forever.
    if not os.path.exists("audio.mp3"):
        st.error("Audio download failed; check the YouTube link.")
        st.stop()

    # Start from a clean segments directory
    if os.path.exists("segments"):
        shutil.rmtree("segments")

    # Split the audio into 30-second chunks; pydub's extended slice
    # (audio[::step]) yields consecutive chunks of `step` milliseconds,
    # which matches Whisper's fixed 30-second input window.
    audio = AudioSegment.from_file("audio.mp3")
    segment_length = 30 * 1000  # milliseconds

    os.makedirs("segments", exist_ok=True)

    for i, segment in enumerate(audio[::segment_length]):
        segment.export(f"segments/{i}.mp3", format="mp3")

    html_output = ""
    num_segments = len(os.listdir("segments"))
    headings = []
    original_texts = []
    dataForWeb = {}

    for i in range(num_segments):
        st.write(f"Processing segment {i+1}/{num_segments}")
        # Whisper expects exactly 30 seconds of audio, so pad or trim each clip
        audio = whisper.load_audio(f"segments/{i}.mp3")
        audio = whisper.pad_or_trim(audio)
        mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
        _, probs = whisper_model.detect_language(mel)  # language probabilities (currently unused)
        options = whisper.DecodingOptions(fp16=False)  # fp16=False keeps CPU decoding safe
        result = whisper.decode(whisper_model, mel, options)

        # Generate a headline for the transcript; the "headline: " prefix is
        # the task prompt this T5 checkpoint was fine-tuned on.
        text = "headline: " + result.text
        encoding = tokenizer.encode_plus(
            text, max_length=256, truncation=True, return_tensors="pt"
        )
        input_ids = encoding["input_ids"].to(device)
        attention_masks = encoding["attention_mask"].to(device)
        beam_outputs = t5_model.generate(
            input_ids=input_ids,
            attention_mask=attention_masks,
            max_length=64,
            num_beams=3,
            early_stopping=True,
        )
        # skip_special_tokens strips the <pad>/</s> markers from the output
        generated_heading = tokenizer.decode(beam_outputs[0], skip_special_tokens=True)
        headings.append(generated_heading)
        original_texts.append(result.text)
        dataForWeb[i] = {
            "heading": generated_heading,
            "text": result.text,
        }

        # Append this segment as a heading + paragraph pair of HTML
        html_output += f"\n<h3>{generated_heading}</h3>\n<p>{result.text}</p>"
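
        # A hedged aside (not in the original): transcripts are interpolated
        # into HTML verbatim. If a transcript ever contains "<" or "&",
        # escaping keeps the markup valid, e.g.:
        #
        #     from html import escape
        #     html_output += f"\n<h3>{escape(generated_heading)}</h3>\n<p>{escape(result.text)}</p>"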

    # Write the assembled HTML fragments to the user-supplied file name
    with open(name, "w") as f:
        f.write(html_output)

    st.success("Audio processing completed!")

    # Display results
    st.subheader("Generated Headings and Text:")
    for i, heading in enumerate(headings):
        st.write(f"Segment {i+1}:")
        st.write("Heading:", heading)
        st.write("Text:", original_texts[i])
        st.write("-----------")
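
    # The dataForWeb dict is assembled above but never consumed. A minimal
    # sketch of one way to surface it (an addition, not part of the original
    # script): offer the structured results as a JSON download.
    import json

    st.download_button(
        "Download segments as JSON",
        data=json.dumps(dataForWeb, indent=2),
        file_name="segments.json",
        mime="application/json",
    )

# To run the app (assuming this file is saved as app.py):
#
#     pip install streamlit pydub transformers sentencepiece torch openai-whisper yt-dlp
#     streamlit run app.py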