import gradio as gr
import torch
import spaces
from diffusers import FluxPipeline, DiffusionPipeline
 

# # # normal FluxPipeline
# pipeline_normal = FluxPipeline.from_pretrained(
#     "sayakpaul/FLUX.1-merged",
#     torch_dtype=torch.bfloat16
# ).to("cuda")
# pipeline_normal.transformer.to(memory_format=torch.channels_last)
# pipeline_normal.transformer = torch.compile(pipeline_normal.transformer, mode="max-autotune", fullgraph=True)

# Global backend switches, set once before the pipeline is built.
torch.backends.cuda.matmul.allow_tf32 = True  # Enable TensorFloat32 for faster matrix operations
torch.backends.cudnn.benchmark = True         # Let cuDNN auto-tune its algorithm selection

# NOTE: no bare ``torch.compile(backend=..., mode=...)`` call here. Without a
# module or callable argument it only returns a decorator; if that result is
# discarded nothing gets compiled. To compile for real, wrap a module
# explicitly, e.g. ``pipe.transformer = torch.compile(pipe.transformer)``.

# Load the merged FLUX.1 checkpoint in bfloat16.
pipe = FluxPipeline.from_pretrained(
    "sayakpaul/FLUX.1-merged",
    torch_dtype=torch.bfloat16,
)

# Offload to CPU if necessary
pipe.enable_model_cpu_offload()

# Use xformers for memory-efficient attention
# NOTE(review): requires the xformers package at runtime; confirm that the
# installed diffusers version supports it for the FLUX attention processors.
pipe.enable_xformers_memory_efficient_attention()

# NOTE: ``torch.quantization.quantize_dynamic`` is deliberately not applied to
# ``pipe``. It expects a ``torch.nn.Module`` (it deep-copies its argument and
# calls ``.eval()`` on it), while ``pipe`` is a diffusers pipeline object, so
# the call fails at start-up. Dynamic qint8 quantization also targets CPU
# execution, which does not fit this bfloat16 GPU-offloaded setup.
# # optimized FluxPipeline
# pipeline_optimized = FluxPipeline.from_pretrained(
#     "camenduru/FLUX.1-dev-diffusers",
#     torch_dtype=torch.bfloat16
# ).to("cuda")
# pipeline_optimized.transformer.to(memory_format=torch.channels_last)
# pipeline_optimized.transformer = torch.compile(
#     pipeline_optimized.transformer,
#     mode="max-autotune",
#     fullgraph=True
# )
# # wrap the autoquant call in a try-except block to handle unsupported layers
# for name, layer in pipeline_optimized.transformer.named_children():
#     try:
#         # apply autoquant to each layer
#         pipeline_optimized.transformer._modules[name] = autoquant(layer, error_on_unseen=False)
#         print(f"Successfully quantized {name}")
#     except AttributeError as e:
#         print(f"Skipping layer {name} due to error: {e}")
#     except Exception as e:
#         print(f"Unexpected error while quantizing {name}: {e}")

# pipeline_optimized.transformer = autoquant(
#     pipeline_optimized.transformer,
#     error_on_unseen=False
# )
# generate_images looks the pipeline up under this name, so alias the
# configured pipeline object to it.
pipeline_optimized = pipe

@spaces.GPU(duration=120)
def generate_images(prompt, guidance_scale, num_inference_steps):
    """Render one image for *prompt* with the module-level pipeline.

    Args:
        prompt: Text prompt forwarded to the pipeline.
        guidance_scale: Guidance strength forwarded unchanged to the pipeline.
        num_inference_steps: Step count; coerced to ``int`` before use.

    Returns:
        The first entry of the pipeline result's ``images`` collection.

    NOTE(review): ``spaces.GPU(duration=120)`` presumably reserves a GPU for
    up to 120 seconds per call on the hosting platform -- confirm.
    """
    # The pipeline is given an integer step count regardless of the input type.
    step_count = int(num_inference_steps)

    result = pipeline_optimized(
        prompt=prompt,
        guidance_scale=guidance_scale,
        num_inference_steps=step_count,
    )

    # Only the first generated image is returned.
    return result.images[0]

# Build the Gradio UI: a prompt box and two sliders feed generate_images, and
# the single image it returns is shown in the output panel.
demo = gr.Interface(
    fn=generate_images,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your prompt here...", label="Prompt"),
        gr.Slider(1.0, 10.0, step=0.5, value=3.5, label="Guidance Scale"),
        gr.Slider(10, 100, step=1, value=50, label="Number of Inference Steps"),
    ],
    outputs=[
        gr.Image(type="pil", label="Optimized FluxPipeline"),
    ],
    # The app renders a single image from a single pipeline, so the UI copy
    # describes exactly that rather than advertising a comparison.
    title="FLUX.1-merged Image Generator",
    description="Generate an image from a text prompt with the sayakpaul/FLUX.1-merged FluxPipeline.",
)

# Start the web server at import/run time.
demo.launch()