model:
  vit_model: "clip_L"

  qformer_num_query_token: 16
  qformer_cross_attention_freq: 1

  sd_train_text_encoder: False
  sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5"

  load_finetuned: False
  load_pretrained: True

  # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/"
  pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion-openimage.tar.gz"

  controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-hed"

preprocess:
  vis_processor:
    train:
      name: "blip_diffusion_inp_image_eval"
    eval:
      name: "blip_diffusion_inp_image_eval"
  text_processor:
    train:
      name: "blip_caption"
    eval:
      name: "blip_caption"
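
# A minimal loading sketch, kept as comments so the YAML stays valid. It
# assumes this file is read as a plain OmegaConf config (the mechanism LAVIS
# uses for its configs); the filename "blip_diffusion_base.yaml" is a
# hypothetical placeholder, not the file's actual path in the repo.
#
#   from omegaconf import OmegaConf
#
#   cfg = OmegaConf.load("blip_diffusion_base.yaml")  # hypothetical filename
#   print(cfg.model.vit_model)                     # "clip_L"
#   print(cfg.preprocess.vis_processor.train.name) # "blip_diffusion_inp_image_eval"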