import gradio as gr
import numpy as np
import torch
from peft import PeftModel, PeftConfig
from transformers import (
    AutomaticSpeechRecognitionPipeline,
    WhisperForConditionalGeneration,
    WhisperProcessor,
    WhisperTokenizer,
)

peft_model_id = "mfidabel/Modelo_1_Whisper_Large_V3"
language = "guarani"
task = "transcribe"

# Load the base Whisper model in 8-bit and attach the fine-tuned LoRA adapter.
peft_config = PeftConfig.from_pretrained(peft_model_id)
model = WhisperForConditionalGeneration.from_pretrained(
    peft_config.base_model_name_or_path, load_in_8bit=True, device_map="auto"
)
model = PeftModel.from_pretrained(model, peft_model_id)

tokenizer = WhisperTokenizer.from_pretrained(
    peft_config.base_model_name_or_path, language=language, task=task
)
processor = WhisperProcessor.from_pretrained(
    peft_config.base_model_name_or_path, language=language, task=task
)
feature_extractor = processor.feature_extractor

# Guarani is not among Whisper's built-in language tokens, so the English
# prompt tokens are used to drive the fine-tuned adapter.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task=task)

pipeline = AutomaticSpeechRecognitionPipeline(
    model=model, tokenizer=tokenizer, feature_extractor=feature_extractor
)


def transcribe(audio):
    if audio is None:
        return "Wait for the recording to finish uploading to the server! Try again in a few seconds."
    sr, y = audio
    # Normalize the raw samples to [-1, 1]; guard against all-zero (silent) input.
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak
    with torch.cuda.amp.autocast():
        result = pipeline(
            {"sampling_rate": sr, "raw": y},
            generate_kwargs={"forced_decoder_ids": forced_decoder_ids},
            max_new_tokens=255,
        )
    return result["text"]


gr.Interface(fn=transcribe, inputs="microphone", outputs="text").launch(share=True)
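
# A minimal sketch of the same model load on newer library versions (an
# assumption about your environment, not part of the original app): recent
# transformers releases deprecate passing load_in_8bit=True directly to
# from_pretrained in favor of an explicit quantization config, and Gradio 4.x
# replaces the "microphone" input shortcut with an explicit Audio component.
#
#     from transformers import BitsAndBytesConfig
#
#     # 8-bit load via the quantization-config API (assumes bitsandbytes is installed)
#     model = WhisperForConditionalGeneration.from_pretrained(
#         peft_config.base_model_name_or_path,
#         quantization_config=BitsAndBytesConfig(load_in_8bit=True),
#         device_map="auto",
#     )
#
#     # Gradio 4.x equivalent of inputs="microphone"; type="numpy" keeps the
#     # (sampling_rate, samples) tuple that transcribe() expects
#     gr.Interface(
#         fn=transcribe,
#         inputs=gr.Audio(sources=["microphone"], type="numpy"),
#         outputs="text",
#     ).launch(share=True)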