jtlonsako committed on
Commit
635f416
1 Parent(s): ab34adc

Added a batch_size input, allowed for multiple outputs, and edited the details screen

Files changed (1)
app.py +13 -13
app.py CHANGED
@@ -8,7 +8,7 @@ import time
 import gc
 import gradio as gr
 import librosa
-from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM, AutoModelForSeq2SeqLM, AutoTokenizer, AutoProcessor
+from transformers import Wav2Vec2ForCTC, AutoProcessor
 from huggingface_hub import hf_hub_download
 from torchaudio.models.decoder import ctc_decoder
 from numba import cuda
@@ -47,7 +47,7 @@ beam_search_decoder = ctc_decoder(
     tokens=token_file,
     lm=lm_file,
     nbest=1,
-    beam_size=500,
+    beam_size=400,
     beam_size_token=50,
     lm_weight=float(decoding_config["lmweight"]),
     word_score=float(decoding_config["wordscore"]),
@@ -67,7 +67,7 @@ def preprocessAudio(audioFile):
     os.system(f"ffmpeg -y -i {audioFile.name} -ar 16000 ./audioToConvert.wav")
 
 #Transcribe!!!
-def Transcribe(file):
+def Transcribe(file, batch_size):
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     start_time = time.time()
     model.load_adapter("amh")
@@ -75,7 +75,6 @@ def Transcribe(file):
 
     preprocessAudio(file)
     block_size = 30
-    batch_size = 8 # or whatever number you choose
 
     transcripts = []
     speech_segments = []
@@ -94,9 +93,7 @@ def Transcribe(file):
     encoding_start = 0
     encoding_end = 0
     sbv_file = open("subtitle.sbv", "w")
-
-    # Define batch size
-    batch_size = 11
+    transcription_file = open("transcription.txt", "w")
 
     # Create an empty list to hold batches
     batch = []
@@ -122,7 +119,6 @@ def Transcribe(file):
         # Transcribe each segment in the batch
         for i in range(batch_size):
            transcription = " ".join(beam_search_result[i][0].words).strip()
-            print(transcription)
            transcripts.append(transcription)
 
         encoding_end = encoding_start + block_size
@@ -176,15 +172,19 @@ def Transcribe(file):
 
     # Join all transcripts into a single transcript
     transcript = ' '.join(transcripts)
+    transcription_file.write(f"{transcript}")
     sbv_file.close()
+    transcription_file.close()
 
     end_time = time.time()
     print(f"The script ran for {end_time - start_time} seconds.")
-    return("./subtitle.sbv")
+    return(["./subtitle.sbv", "./transcription.txt"])
 
-demo = gr.Interface(fn=Transcribe, inputs=gr.File(label="Upload an audio file of Amharic content"), outputs=gr.File(label="Download .sbv transcription"),
-                    title="Amharic Audio Transcription",
-                    description="This application uses Meta MMS and a custom kenLM model to transcribe Amharic Audio files of arbitrary length into .sbv files. Upload an Amharic audio file and get your transcription! \n(Note: This is only a rough implementation of Meta's MMS for audio transcription, you should manually edit files after transcription has completed.)"
-                    )
+demo = gr.Interface(fn=Transcribe, inputs=[gr.File(label="Upload an audio file of Amharic content"), gr.Slider(0, 25, value=4, step=1, label="batch size", info="Approximately .5GB per batch")],
+                    outputs=gr.File(label="Download .sbv transcription", file_count="multiple"),
+                    title="Amharic Audio Transcription",
+                    description="This application uses Meta MMS and an Amharic kenLM model to transcribe Amharic Audio files of arbitrary length into .sbv and .txt files. Upload an Amharic audio file and get your transcription! \n(Note: Transcription quality is quite low, you should review and edit transcriptions before making them publicly available)"
+                    )
+
 demo.launch()
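
The hunks above only show fragments of the batching logic inside Transcribe. As a rough illustration of how the batch_size value coming from the new slider might drive batched decoding, here is a minimal sketch; it is not the app's exact code: decode_in_batches is a hypothetical helper, and processor, model, and beam_search_decoder stand in for the objects configured earlier in app.py.

import torch

# Hypothetical helper, not part of app.py: group speech segments into batches
# of `batch_size`, run them through the Wav2Vec2ForCTC model, and decode the
# emissions with the torchaudio beam-search decoder.
def decode_in_batches(speech_segments, batch_size, processor, model, beam_search_decoder):
    transcripts = []
    for start in range(0, len(speech_segments), batch_size):
        batch = speech_segments[start:start + batch_size]
        inputs = processor(batch, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = inputs.to(model.device)
        with torch.no_grad():
            logits = model(**inputs).logits  # (batch, frames, vocab)
        # torchaudio's ctc_decoder expects float32 emissions on the CPU and returns
        # one list of hypotheses per batch item; .words holds the decoded words.
        results = beam_search_decoder(logits.cpu().float())
        for hypotheses in results:
            transcripts.append(" ".join(hypotheses[0].words).strip())
    return transcripts

With this shape, the slider value flows straight from the Gradio input into the decode loop, so GPU memory use scales with the chosen batch size, which is what the "Approximately .5GB per batch" hint on the slider refers to.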