"""Model-server bootstrap: loads a YAML config and (conditionally) local HF pipelines.

Reads a config file selected via ``--config`` (or ``config.gradio.yaml`` when
imported rather than run directly), derives deployment settings from it, and
defines ``load_pipes`` which instantiates local Hugging Face / diffusers /
asteroid models according to the requested deployment level.
"""
import argparse
import logging
import random
import uuid
import numpy as np
from transformers import pipeline
from diffusers import DiffusionPipeline, StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
from diffusers.utils import load_image, export_to_video
from transformers import (
    SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5ForSpeechToSpeech,
    BlipProcessor, BlipForConditionalGeneration,
    TrOCRProcessor, VisionEncoderDecoderModel,
    ViTImageProcessor, AutoTokenizer, AutoImageProcessor, TimesformerForVideoClassification,
    MaskFormerFeatureExtractor, MaskFormerForInstanceSegmentation,
    DPTForDepthEstimation, DPTFeatureExtractor
)
from datasets import load_dataset
from PIL import Image
from torchvision import transforms
import torch
import torchaudio
from speechbrain.pretrained import WaveformEnhancement
import joblib
from huggingface_hub import hf_hub_url, cached_download
from controlnet_aux import OpenposeDetector, MLSDdetector, HEDdetector, CannyDetector, MidasDetector
import warnings
import time
from espnet2.bin.tts_inference import Text2Speech
import soundfile as sf
from asteroid.models import BaseModel
import traceback
import os
import yaml

# Silence all library warnings globally (heavy ML imports above are noisy).
warnings.filterwarnings("ignore")


def setup_logger():
    """Configure and return the module logger with a stream handler at INFO level.

    NOTE(review): each call adds another StreamHandler to the same named
    logger, so calling this more than once would duplicate log lines; the
    module calls it exactly once below.
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    handler = logging.StreamHandler()
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    return logger


logger = setup_logger()


def load_config(config_path: str):
    """Parse the YAML file at *config_path* and return its contents.

    NOTE(review): ``yaml.load`` with ``FullLoader`` can construct non-scalar
    Python objects for some YAML tags; if config files can ever come from an
    untrusted source, prefer ``yaml.safe_load``.
    """
    with open(config_path, "r") as file:
        return yaml.load(file, Loader=yaml.FullLoader)


def parse_args():
    """Parse command-line arguments; only ``--config`` (path to the YAML config)."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, default="config.yaml")
    return parser.parse_args()


args = parse_args()

# Ensure the config is always set when not running as the main script
# (e.g. when imported by a Gradio front-end, use the Gradio-specific config).
if __name__ != "__main__":
    args.config = "config.gradio.yaml"

config = load_config(args.config)

# Deployment level for locally hosted pipelines; forced to "none" when
# inference is delegated entirely to the Hugging Face Inference API.
local_deployment = config["local_deployment"]
if config["inference_mode"] == "huggingface":
    local_deployment = "none"

# requests-style proxy mapping, or None when no proxy is configured.
PROXY = {"https": config["proxy"]} if config["proxy"] else None

# Wall-clock start time, presumably used later to report pipeline load time.
start = time.time()

# Path prefix prepended to model ids in the from_pretrained calls below.
# Empty string means ids resolve against the Hugging Face Hub; a local
# directory prefix (e.g. "models/") would make them resolve on disk instead.
local_models = ""


def load_pipes(local_deployment):
    """Instantiate local model pipelines according to *local_deployment*.

    ``local_deployment`` (shadows the module-level variable of the same name)
    selects how much is hosted locally; the visible branch handles "full".
    Returns/continues beyond this excerpt — the ``other_pipes`` dict below is
    still open at the end of this chunk.
    """
    standard_pipes = {}
    other_pipes = {}
    controlnet_sd_pipes = {}
    if local_deployment in ["full"]:
        other_pipes = {
            "damo-vilab/text-to-video-ms-1.7b": {
                "model": DiffusionPipeline.from_pretrained(f"{local_models}damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16"),
                "device": "cuda:0"
            },
            # NOTE(review): this id is NOT prefixed with local_models, unlike
            # the surrounding entries — confirm whether that is intentional.
            "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k": {
                "model": BaseModel.from_pretrained("JorisCos/DCCRNet_Libri1Mix_enhsingle_16k"),
                "device": "cuda:0"
            },
            "microsoft/speecht5_vc": {
                "processor": SpeechT5Processor.from_pretrained(f"{local_models}microsoft/speecht5_vc"),
                "model": SpeechT5ForSpeechToSpeech.from_pretrained(f"{local_models}microsoft/speecht5_vc"),
                "vocoder": SpeechT5HifiGan.from_pretrained(f"{local_models}microsoft/speecht5_hifigan"),
                # NOTE(review): a dataset id is being prefixed with the model
                # path prefix here; with a non-empty local_models this would
                # point load_dataset at a model directory — verify.
                "embeddings_dataset": load_dataset(f"{local_models}Matthijs/cmu-arctic-xvectors", split="validation"),
                "device": "cuda:0"
            },
            "facebook/maskformer-swin-base-coco": {
                "feature_extractor": MaskFormerFeatureExtractor.from_pretrained(f"{local_models}facebook/maskformer-swin-base-coco"),
                "model": MaskFormerForInstanceSegmentation.from_pretrained(f"{local_models}facebook/maskformer-swin-base-coco"),
                "device": "cuda:0"
            },
            "Intel/dpt-hybrid-midas": {
                # low_cpu_mem_usage avoids a full-size CPU copy during load.
                "model": DPTForDepthEstimation.from_pretrained(f"{local_models}Intel/dpt-hybrid-midas", low_cpu_mem_usage=True),
                "feature_extractor": DPTFeatureExtractor.from_pretrained(f"{local_models}Intel/dpt-hybrid-midas"),
                "device": "cuda:0"
            }