"""Gradio app that generates one image per prompt with a FLUX.1 pipeline.

The pipeline is built once at import time and shared by every request.
``generate_images`` is the Gradio callback and runs under ``spaces.GPU``.
"""

import gradio as gr
import torch
import spaces
from diffusers import FluxPipeline, DiffusionPipeline

# NOTE: an earlier revision also built a "normal" pipeline and a second one
# optimized with torch.compile() and autoquant for a side-by-side comparison.
# That code was already disabled (commented out) and has been dropped here.

# Allow TensorFloat32 for faster matrix multiplications.
torch.backends.cuda.matmul.allow_tf32 = True
# Let cuDNN benchmark and auto-tune its algorithms for the GPU.
torch.backends.cudnn.benchmark = True

# The former bare `torch.compile(backend="inductor", mode="max-autotune")`
# call was removed: without a module or function argument it only returns a
# decorator, which was discarded, so nothing was ever compiled.

pipe = FluxPipeline.from_pretrained(
    "sayakpaul/FLUX.1-merged",
    torch_dtype=torch.bfloat16,
)

# Keep sub-models on the CPU and move each to the GPU only while it runs.
pipe.enable_model_cpu_offload()

# `pipe.enable_xformers_memory_efficient_attention()` was removed: it needs
# the optional xformers package and swaps in generic xFormers attention
# processors, whereas the Flux transformer uses its own processors built on
# PyTorch 2 scaled-dot-product attention.

# `torch.quantization.quantize_dynamic(pipe, {torch.nn.Linear}, ...)` was
# removed: it expects an nn.Module (it calls `model.eval()` and walks the
# submodules), but `pipe` is a pipeline object holding several models, so the
# call failed at startup. Dynamic int8 quantization is also aimed at CPU
# inference rather than a bfloat16 GPU pipeline.

# Name used by the request handler below.
pipeline_optimized = pipe


@spaces.GPU(duration=120)
def generate_images(prompt, guidance_scale, num_inference_steps):
    """Generate a single image for ``prompt`` with the shared pipeline.

    Args:
        prompt: Text prompt forwarded to the pipeline.
        guidance_scale: Guidance scale forwarded to the pipeline unchanged.
        num_inference_steps: Number of inference steps; converted with
            ``int()`` before it is passed to the pipeline.

    Returns:
        The first entry of the pipeline result's ``images`` list.
    """
    result = pipeline_optimized(
        prompt=prompt,
        guidance_scale=guidance_scale,
        num_inference_steps=int(num_inference_steps),
    )
    return result.images[0]


# NOTE(review): the title and description below still advertise a
# normal-vs-optimized comparison using torchao and torch.compile(), but only
# one pipeline and one output image exist. The strings are left unchanged
# because they are user-facing; update them once the wording is agreed.
demo = gr.Interface(
    fn=generate_images,
    inputs=[
        gr.Textbox(
            lines=2,
            placeholder="Enter your prompt here...",
            label="Prompt",
        ),
        gr.Slider(1.0, 10.0, step=0.5, value=3.5, label="Guidance Scale"),
        gr.Slider(10, 100, step=1, value=50, label="Number of Inference Steps"),
    ],
    outputs=[
        gr.Image(type="pil", label="Optimized FluxPipeline"),
    ],
    title="FluxPipeline Comparison",
    description="Compare images generated by the normal FluxPipeline and the optimized one using torchao and torch.compile().",
)

# Start the web server only when run as a script, so importing this module
# (for tooling or reuse of `demo`) does not launch it.
if __name__ == "__main__":
    demo.launch()