jlonsako committed on
Commit
4e9010c
1 Parent(s): 7e652b2

Halving precision to speed up inference

Files changed (1)
  1. app.py +4 -11
app.py CHANGED
@@ -24,7 +24,7 @@ def format_time(seconds):
 
 #Convert Video/Audio into 16K wav file
 def preprocessAudio(audioFile):
-    os.system(f"ffmpeg -y -i {audioFile.name} -ar 16000 ./audio.wav")
+    os.system(f"ffmpeg -y -i {audioFile.name} -ar 16000 ./audioToConvert.wav")
 
 #Transcribe!!!
 def Transcribe(file):
@@ -33,18 +33,18 @@ def Transcribe(file):
     model.load_adapter("amh")
 
     preprocessAudio(file)
-    #os.system(f"ffmpeg -y -i ./July3_2023_Sermon.mov -ar 16000 ./audio.wav")
     block_size = 30 #30 second chunks of audio
 
     transcripts = []
     stream = librosa.stream(
-        "./audio.wav",
+        "./audioToConvert.wav",
         block_length=block_size,
         frame_length=16000,
         hop_length=16000
     )
 
     model.to(device)
+    model.half()
     print(f"Model loaded to {device}: Entering transcription phase")
 
     #Code for timestamping
@@ -55,12 +55,11 @@ def Transcribe(file):
         if len(speech_segment.shape) > 1:
             speech_segment = speech_segment[:,0] + speech_segment[:,1]
         input_values = processor(speech_segment, sampling_rate=16_000, return_tensors="pt").input_values.to(device)
+        input_values = input_values.half()
         with torch.no_grad():
             logits = model(input_values).logits
         if len(logits.shape) == 1:
-            print("test")
             logits = logits.unsqueeze(0)
-        #predicted_ids = torch.argmax(logits, dim=-1)
         transcription = processor.batch_decode(logits.cpu().numpy()).text
         transcripts.append(transcription[0])
@@ -77,7 +76,6 @@ def Transcribe(file):
         # Freeing up memory
         del input_values
         del logits
-        #del predicted_ids
         del transcription
         torch.cuda.empty_cache()
         gc.collect()
@@ -92,11 +90,6 @@ def Transcribe(file):
     return("./subtitle.sbv")
 
 demo = gr.Interface(fn=Transcribe, inputs=gr.File(), outputs="file")
-#with gr.Blocks() as demo:
-    #file_output = gr.Textbox()
-    #upload_button = gr.UploadButton("Click to Upload a sermon",
-    #                                file_types=["video", "audio"], file_count="multiple")
-    #upload_button.upload(Transcribe, upload_button, file_output)
 demo.launch()
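For reference, below is a minimal, self-contained sketch of the half-precision pattern this commit applies. The checkpoint name (facebook/mms-1b-all), the omission of the "amh" adapter step, and the greedy argmax decode are illustrative assumptions, not app.py's actual setup; the app loads its own model and processor and decodes logits through the processor directly.

# Sketch of the fp16 inference pattern from this commit (assumptions noted above).
import torch
from transformers import AutoProcessor, Wav2Vec2ForCTC

device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "facebook/mms-1b-all"  # assumed checkpoint; not shown in this diff
processor = AutoProcessor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id).to(device)

if device == "cuda":
    model.half()  # fp16 weights: roughly half the memory, faster GPU matmuls

def transcribe(speech, sampling_rate=16_000):
    # speech: 1-D float numpy array at 16 kHz
    input_values = processor(speech, sampling_rate=sampling_rate,
                             return_tensors="pt").input_values.to(device)
    if device == "cuda":
        input_values = input_values.half()  # input dtype must match the model
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)  # greedy CTC decoding
    return processor.batch_decode(predicted_ids)[0]

Gating .half() on CUDA in the sketch is deliberate: fp16 mostly pays off on GPU, while many CPU ops lack fast half-precision kernels, so the unconditional model.half() in this commit implicitly assumes a CUDA device. Note also that the librosa.stream call reads the wav in 30-second blocks (block_length=30 frames of frame_length=16000 samples, i.e. one second each at 16 kHz), which bounds the activation memory of each forward pass.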