jlonsako committed on
Commit
a08d5e7
1 Parent(s): d031248

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +122 -1
app.py CHANGED
@@ -67,7 +67,7 @@ def preprocessAudio(audioFile):
67
  if isinstance(audioFile, str): # If audioFile is a string (filepath)
68
  os.system(f"ffmpeg -y -i {audioFile} -ar 16000 ./audioToConvert.wav")
69
  else: # If audioFile is an object with a name attribute
70
- os.system(f"ffmpeg -y -i {audioFile.name} -ar 16000 ./{audioFile.name}.wav")
71
 
72
  #Transcribe!!!
73
  def Transcribe(file):
@@ -191,6 +191,127 @@ def Transcribe(file):
191
  error_log.write(f"Exception occurred: {e}")
192
  error_log.close()
193
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  demo = gr.Blocks()
195
 
196
  with demo:
 
67
  if isinstance(audioFile, str): # If audioFile is a string (filepath)
68
  os.system(f"ffmpeg -y -i {audioFile} -ar 16000 ./audioToConvert.wav")
69
  else: # If audioFile is an object with a name attribute
70
+ os.system(f"ffmpeg -y -i {audioFile.name} -ar 16000 ./audioToConvert.wav")
71
 
72
  #Transcribe!!!
73
  def Transcribe(file):
 
191
  error_log.write(f"Exception occurred: {e}")
192
  error_log.close()
193
 
194
+
195
#Transcribe!!!
def TranscribeMic(file):
    """Transcribe a microphone recording into Amharic text with timestamps.

    Resamples the recording via preprocessAudio (which writes
    ./audioToConvert.wav), streams it in 30-second blocks through the MMS
    model with the "amh" adapter, beam-search decodes each block, and writes
    two output files: a .sbv subtitle file and a plain-text transcription.

    Parameters:
        file: str path or file-like object with a .name attribute; forwarded
            to preprocessAudio for ffmpeg resampling to 16 kHz.

    Returns:
        list[str]: paths to the .sbv subtitle file and the .txt transcription,
        or None when an exception occurred (the error text is written to
        error_log.txt instead of being raised — preserved best-effort style).
    """
    try:
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        start_time = time.time()
        model.load_adapter("amh")
        processor.tokenizer.set_target_lang("amh")

        preprocessAudio(file)
        block_size = 30   # seconds of audio per streamed block
        batch_size = 1    # blocks decoded per forward pass

        transcripts = []

        # Stream the preprocessed 16 kHz file in block_size-second chunks
        # (frame_length == hop_length == 16000 samples -> 1 s frames).
        stream = librosa.stream(
            "./audioToConvert.wav",
            block_length=block_size,
            frame_length=16000,
            hop_length=16000
        )

        model.to(device)
        print(f"Model loaded to {device}: Entering transcription phase")

        def _decode_batch(batch):
            """Run one forward pass over `batch`; return one decoded string per segment."""
            input_values = processor(batch, sampling_rate=16_000,
                                     return_tensors="pt", padding=True)
            input_values = input_values.to(device)
            with torch.no_grad():
                logits = model(**input_values).logits
            if len(logits.shape) == 1:
                logits = logits.unsqueeze(0)
            beam_search_result = beam_search_decoder(logits.to("cpu"))
            texts = [" ".join(beam_search_result[i][0].words).strip()
                     for i in range(len(batch))]
            # Free GPU/host memory before the next batch.
            del input_values
            del logits
            torch.cuda.empty_cache()
            gc.collect()
            return texts

        def _emit(transcription, sbv_file, encoding_start):
            """Write one subtitle cue; return the next cue's start time."""
            encoding_end = encoding_start + block_size
            sbv_file.write(f"{format_time(encoding_start)},{format_time(encoding_end)}\n")
            sbv_file.write(f"{transcription}\n\n")
            return encoding_end

        encoding_start = 0
        # Context managers guarantee the output files are closed even when
        # decoding raises mid-loop (the originals leaked on that path).
        with open("microphone_subtitle.sbv", "w") as sbv_file, \
             open("microphone_transcription.txt", "w") as transcription_file:
            batch = []
            for speech_segment in stream:
                if len(speech_segment.shape) > 1:
                    # Down-mix stereo to mono by summing channels.
                    speech_segment = speech_segment[:, 0] + speech_segment[:, 1]
                batch.append(speech_segment)

                if len(batch) == batch_size:
                    # BUGFIX: write one cue per decoded segment. The original
                    # wrote a single cue per batch using only the last
                    # segment's text (identical output at batch_size == 1,
                    # but silently dropped segments for batch_size > 1).
                    for transcription in _decode_batch(batch):
                        transcripts.append(transcription)
                        encoding_start = _emit(transcription, sbv_file, encoding_start)
                    batch = []

            if batch:  # decode the final, possibly short, batch
                for transcription in _decode_batch(batch):
                    print(transcription)
                    transcripts.append(transcription)
                    encoding_start = _emit(transcription, sbv_file, encoding_start)

            # Join all block transcripts into a single transcript.
            transcription_file.write(f"{' '.join(transcripts)}")

        end_time = time.time()
        print(f"The script ran for {end_time - start_time} seconds.")
        return([f"./microphone_subtitle.sbv", f"./microphone_transcription.txt"])
    except Exception as e:
        # Best-effort error logging; the Gradio caller receives None.
        with open("error_log.txt", "w") as error_log:
            error_log.write(f"Exception occurred: {e}")
315
  demo = gr.Blocks()
316
 
317
  with demo: