jlonsako committed on
Commit
4e9010c
1 Parent(s): 7e652b2

Halving precision to speed up inference

Files changed (1)
  1. app.py +4 -11
app.py CHANGED
@@ -24,7 +24,7 @@ def format_time(seconds):
 
 #Convert Video/Audio into 16K wav file
 def preprocessAudio(audioFile):
-    os.system(f"ffmpeg -y -i {audioFile.name} -ar 16000 ./audio.wav")
+    os.system(f"ffmpeg -y -i {audioFile.name} -ar 16000 ./audioToConvert.wav")
 
 #Transcribe!!!
 def Transcribe(file):
@@ -33,18 +33,18 @@ def Transcribe(file):
     model.load_adapter("amh")
 
     preprocessAudio(file)
-    #os.system(f"ffmpeg -y -i ./July3_2023_Sermon.mov -ar 16000 ./audio.wav")
     block_size = 30 #30 second chunks of audio
 
     transcripts = []
     stream = librosa.stream(
-        "./audio.wav",
+        "./audioToConvert.wav",
         block_length=block_size,
         frame_length=16000,
         hop_length=16000
     )
 
     model.to(device)
+    model.half()
     print(f"Model loaded to {device}: Entering transcription phase")
 
     #Code for timestamping
@@ -55,12 +55,11 @@ def Transcribe(file):
         if len(speech_segment.shape) > 1:
             speech_segment = speech_segment[:,0] + speech_segment[:,1]
         input_values = processor(speech_segment, sampling_rate=16_000, return_tensors="pt").input_values.to(device)
+        input_values = input_values.half()
         with torch.no_grad():
             logits = model(input_values).logits
         if len(logits.shape) == 1:
-            print("test")
             logits = logits.unsqueeze(0)
-        #predicted_ids = torch.argmax(logits, dim=-1)
         transcription = processor.batch_decode(logits.cpu().numpy()).text
         transcripts.append(transcription[0])
@@ -77,7 +76,6 @@ def Transcribe(file):
         # Freeing up memory
         del input_values
         del logits
-        #del predicted_ids
         del transcription
         torch.cuda.empty_cache()
         gc.collect()
@@ -92,11 +90,6 @@ def Transcribe(file):
     return("./subtitle.sbv")
 
 demo = gr.Interface(fn=Transcribe, inputs=gr.File(), outputs="file")
-#with gr.Blocks() as demo:
-    #file_output = gr.Textbox()
-    #upload_button = gr.UploadButton("Click to Upload a sermon",
-    #                                file_types=["video", "audio"], file_count="multiple")
-    #upload_button.upload(Transcribe, upload_button, file_output)
 demo.launch()
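For reference, below is a minimal, self-contained sketch of the half-precision pattern this commit applies. The checkpoint name (facebook/mms-1b-all), the omission of the "amh" adapter step, and the greedy argmax decode are illustrative assumptions, not app.py's actual setup; the app loads its own model and processor and decodes logits through the processor directly.

# Sketch of the fp16 inference pattern from this commit (assumptions noted above).
import torch
from transformers import AutoProcessor, Wav2Vec2ForCTC

device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "facebook/mms-1b-all"  # assumed checkpoint; not shown in this diff
processor = AutoProcessor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id).to(device)

if device == "cuda":
    model.half()  # fp16 weights: roughly half the memory, faster GPU matmuls

def transcribe(speech, sampling_rate=16_000):
    # speech: 1-D float numpy array at 16 kHz
    input_values = processor(speech, sampling_rate=sampling_rate,
                             return_tensors="pt").input_values.to(device)
    if device == "cuda":
        input_values = input_values.half()  # input dtype must match the model
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)  # greedy CTC decoding
    return processor.batch_decode(predicted_ids)[0]

Gating .half() on CUDA in the sketch is deliberate: fp16 mostly pays off on GPU, while many CPU ops lack fast half-precision kernels, so the unconditional model.half() in this commit implicitly assumes a CUDA device. Note also that the librosa.stream call reads the wav in 30-second blocks (block_length=30 frames of frame_length=16000 samples, i.e. one second each at 16 kHz), which bounds the activation memory of each forward pass.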