In [None]:
!pip install git+https://github.com/huggingface/diffusers.git
!pip install git+https://github.com/huggingface/accelerate
!pip install --upgrade transformers

In [None]:

!pip install datasets


!pip install torchvision
!sudo apt -qq install git-lfs
!git config --global credential.helper store
!pip install tqdm
!pip install bitsandbytes
!pip install torch
!pip install torchvision

In [None]:
from dataclasses import dataclass
from datasets import load_dataset
from torchvision import transforms
from accelerate.state import AcceleratorState
import math
import os
import numpy as np
import accelerate
from accelerate import Accelerator
from tqdm.auto import tqdm
from pathlib import Path
from accelerate import notebook_launcher
import torch.nn.functional as F
from diffusers.optimization import get_cosine_schedule_with_warmup
import torch
from PIL import Image
from diffusers import UNet2DModel
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
from diffusers.optimization import get_scheduler
from huggingface_hub import create_repo, upload_folder, upload_file
import bitsandbytes as bnb
from transformers.utils import ContextManagers
from huggingface_hub import snapshot_download


@dataclass
class TrainingConfig:
    """Settings for the tattoo Stable Diffusion training run.

    The attributes carry no type annotations, so ``@dataclass`` does not turn
    them into fields: they are plain class attributes shared by every instance
    and cannot be overridden through constructor arguments.
    """

    pretrained_model_name_or_path = "runwayml/stable-diffusion-v1-5"
    validation_prompts = ["a dragon on a white background"," a fiery skull", "a skull", "a face", "a snake and skull"]
    image_size = 512  # the generated image resolution
    train_batch_size = 2
    eval_batch_size = 2  # how many images to sample during evaluation
    num_epochs = 50
    gradient_accumulation_steps = 1
    lr_scheduler = "constant"
    learning_rate = 1e-5
    lr_warmup_steps = 500
    save_image_epochs = 1
    save_model_epochs = 1
    # Hub access token. Read from the HF_TOKEN environment variable when it is
    # set so the secret does not have to be pasted into the notebook; otherwise
    # the placeholder below is kept and must be replaced before logging in.
    token = os.environ.get("HF_TOKEN", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
    num_processes = 1
    mixed_precision = "fp16"  # `no` for float32, `fp16` for automatic mixed precision
    output_dir = "tattoo-diffusion"  # the model name locally and on the HF Hub

    push_to_hub = True  # whether to upload the saved model to the HF Hub
    hub_private_repo = False
    overwrite_output_dir = True  # overwrite the old model when re-running the notebook
    seed = 0


config = TrainingConfig()

In [None]:
# Mirror the Hub repository into the local output directory as regular files
# (symlinks disabled). train_loop later scans this directory for checkpoint-*
# folders to resume from.
snapshot_download(
    repo_id="TejasNavada/tattoo-diffusion",
    local_dir=config.output_dir,
    local_dir_use_symlinks=False,
)

In [None]:


def make_grid(images, rows, cols):
    """Paste equally sized images onto one RGB canvas, filling row by row.

    Args:
        images: Sequence of image objects; the first one's ``size`` defines the
            tile size used for every cell.
        rows: Number of grid rows.
        cols: Number of grid columns.

    Returns:
        A new RGB image of size ``(cols * w, rows * h)``.
    """
    tile_w, tile_h = images[0].size
    canvas = Image.new("RGB", size=(cols * tile_w, rows * tile_h))
    for index, tile in enumerate(images):
        row, col = divmod(index, cols)
        canvas.paste(tile, box=(col * tile_w, row * tile_h))
    return canvas


def evaluate(vae, text_encoder, tokenizer, unet, config, accelerator, epoch):
    """Generate one validation image per prompt, log them and save a grid.

    A Stable Diffusion pipeline is assembled from the (unwrapped) training
    modules, one image is sampled for each entry of
    ``config.validation_prompts``, the images are written to any tensorboard
    tracker and a one-row grid is saved as
    ``<config.output_dir>/samples/<epoch>.png``.

    Args:
        vae: Autoencoder passed to the pipeline.
        text_encoder: Text encoder passed to the pipeline.
        tokenizer: Tokenizer passed to the pipeline.
        unet: UNet being trained.
        config: Object providing ``pretrained_model_name_or_path``,
            ``validation_prompts``, ``seed`` and ``output_dir``.
        accelerator: The ``Accelerator`` driving training.
        epoch: Epoch index, used as the logging step and in the file name.

    Returns:
        The list of generated images (empty when there are no prompts).
    """
    pipeline = StableDiffusionPipeline.from_pretrained(
        config.pretrained_model_name_or_path,
        vae=accelerator.unwrap_model(vae),
        text_encoder=accelerator.unwrap_model(text_encoder),
        tokenizer=tokenizer,
        unet=accelerator.unwrap_model(unet),
        safety_checker=None,
        torch_dtype=torch.float16,
    )

    pipeline = pipeline.to(accelerator.device)
    pipeline.set_progress_bar_config(disable=True)

    # Re-seeded on every call so the samples of different epochs are comparable.
    generator = torch.Generator(device=accelerator.device).manual_seed(config.seed)

    images = []
    for prompt in config.validation_prompts:
        with torch.autocast("cuda"):
            # Pass the seeded generator; previously it was created but unused.
            image = pipeline(prompt, num_inference_steps=20, generator=generator).images[0]
        images.append(image)

    if images:
        for tracker in accelerator.trackers:
            if tracker.name == "tensorboard":
                np_images = np.stack([np.asarray(img) for img in images])
                tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")

    # Drop the pipeline wrapper and release cached GPU memory before training resumes.
    del pipeline
    torch.cuda.empty_cache()

    if images:
        image_grid = make_grid(images, rows=1, cols=len(images))

        test_dir = os.path.join(config.output_dir, "samples")
        os.makedirs(test_dir, exist_ok=True)
        image_grid.save(f"{test_dir}/{epoch:04d}.png")

    return images





In [None]:


# Training data: the dataset name is stored on the config so other cells
# (e.g. the model card) can refer to it.
config.dataset_name = "Drozdik/tattoo_v3"
dataset = load_dataset(config.dataset_name, split="train")

# Tokenizer that ships with the base model.
tokenizer = CLIPTokenizer.from_pretrained(
    config.pretrained_model_name_or_path,
    subfolder="tokenizer",
)

# Image pipeline: resize to a square of config.image_size, random horizontal
# flip, convert to a tensor, then normalize with mean 0.5 and std 0.5.
preprocess = transforms.Compose([
    transforms.Resize((config.image_size, config.image_size)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5]),
])

def tokenize_captions(examples):
    """Tokenize the ``"text"`` captions of a batch with the notebook's tokenizer.

    Captions are padded and truncated to ``tokenizer.model_max_length``.

    Args:
        examples: Mapping with a ``"text"`` entry holding the captions.

    Returns:
        The ``input_ids`` of the tokenizer output, requested as PyTorch tensors.
    """
    encoded = tokenizer(
        examples["text"],
        max_length=tokenizer.model_max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    return encoded.input_ids



def transform(examples):
    """Add model inputs to a batch of dataset examples.

    Every image under ``"image"`` is converted to RGB and run through
    ``preprocess``; the captions are tokenized with ``tokenize_captions``.

    Args:
        examples: Batch mapping with ``"image"`` and ``"text"`` entries.

    Returns:
        The same mapping, extended in place with ``"pixel_values"`` and
        ``"input_ids"``.
    """
    pixel_values = []
    for picture in examples["image"]:
        pixel_values.append(preprocess(picture.convert("RGB")))
    examples["pixel_values"] = pixel_values
    examples["input_ids"] = tokenize_captions(examples)
    return examples

In [None]:
def collate_fn(examples):
    """Stack per-example tensors into one training batch.

    Args:
        examples: List of dicts, each with ``"pixel_values"`` and
            ``"input_ids"`` tensors of matching shapes across examples.

    Returns:
        Dict with ``"pixel_values"`` (stacked, contiguous, float32) and
        ``"input_ids"`` (stacked, dtype unchanged).
    """
    image_batch = (
        torch.stack([sample["pixel_values"] for sample in examples])
        .to(memory_format=torch.contiguous_format)
        .float()
    )
    token_batch = torch.stack([sample["input_ids"] for sample in examples])
    return {"pixel_values": image_batch, "input_ids": token_batch}


In [None]:
def save_model_card(args,repo_id: str,images=None,repo_folder=None):
    """Write a ``README.md`` model card (and optional sample grid) to a folder.

    All settings are read from ``args`` (the function previously ignored this
    parameter and read the notebook-global ``config`` instead).

    Args:
        args: Object providing ``pretrained_model_name_or_path``,
            ``dataset_name``, ``validation_prompts``, ``num_epochs``,
            ``learning_rate``, ``train_batch_size``, ``image_size`` and
            ``mixed_precision``.
        repo_id: Hub repository id shown in the card.
        images: Optional validation images; when non-empty they are saved as a
            one-row grid ``val_imgs_grid.png`` and linked from the card.
        repo_folder: Directory to write into. Must be provided despite the
            ``None`` default, since it is joined into file paths.
    """
    img_str = ""
    if images is not None and len(images) > 0:
        # One column per image actually supplied, so the grid always fits them.
        image_grid = make_grid(images, 1, len(images))
        image_grid.save(os.path.join(repo_folder, "val_imgs_grid.png"))
        img_str += "![val_imgs_grid](./val_imgs_grid.png)\n"
    yaml = f"""
---
license: creativeml-openrail-m
base_model: {args.pretrained_model_name_or_path}
datasets:
- {args.dataset_name}
tags:
- stable-diffusion
- stable-diffusion-diffusers
- text-to-image
- diffusers
inference: true
---
 """
    model_card = f"""
# Text-to-image finetuning - {repo_id}

This pipeline was finetuned from **{args.pretrained_model_name_or_path}** on the **{args.dataset_name}** dataset. Below are some example images generated with the finetuned pipeline using the following prompts: {args.validation_prompts}: \n
{img_str}

## Pipeline usage

You can use the pipeline like so:

```python
from diffusers import DiffusionPipeline
import torch

pipeline = DiffusionPipeline.from_pretrained("{repo_id}", torch_dtype=torch.float16)
prompt = "{args.validation_prompts[0]}"
image = pipeline(prompt).images[0]
image.save("my_image.png")
```

## Training info

These are the key hyperparameters used during training:

* Epochs: {args.num_epochs}
* Learning rate: {args.learning_rate}
* Batch size: {args.train_batch_size}
* Image resolution: {args.image_size}
* Mixed-precision: {args.mixed_precision}

"""
    # Explicit encoding so prompts with non-ASCII characters are written
    # identically on every platform.
    with open(os.path.join(repo_folder, "README.md"), "w", encoding="utf-8") as f:
        f.write(yaml + model_card)



In [None]:
def deepspeed_zero_init_disabled_context_manager():
    """Build the context-manager list used while loading the frozen models.

    Returns:
        ``[]`` when accelerate's state is not initialized or no DeepSpeed
        plugin is configured; otherwise a one-element list holding the
        plugin's ``zero3_init_context_manager(enable=False)``.
    """
    if not accelerate.state.is_initialized():
        return []

    plugin = AcceleratorState().deepspeed_plugin
    if plugin is None:
        return []

    return [plugin.zero3_init_context_manager(enable=False)]

In [None]:
def _latest_checkpoint_dir(output_dir):
    """Return the name of the highest-numbered ``checkpoint-<step>`` entry, or None."""
    if not os.path.isdir(output_dir):
        return None
    candidates = []
    for name in os.listdir(output_dir):
        parts = name.split("-")
        # Skip entries without a numeric step suffix instead of crashing on them.
        if name.startswith("checkpoint") and len(parts) > 1 and parts[1].isdigit():
            candidates.append((int(parts[1]), name))
    if not candidates:
        return None
    return max(candidates)[1]


def train_loop(config, unet, vae, noise_scheduler, optimizer, train_dataloader, lr_scheduler):
    """Train the UNet with accelerate, resuming from the newest local checkpoint.

    After each epoch that matches ``config.save_image_epochs`` (and after the
    last epoch) the main process samples validation images, saves the
    accelerator state to ``checkpoint-<global_step>``, rewrites the model card
    and uploads checkpoint, samples and README to the Hub. When training
    finishes the full pipeline is saved to ``config.output_dir``.

    NOTE: ``text_encoder`` and ``tokenizer`` are not parameters; they are read
    from the notebook's global namespace.

    Args:
        config: Training configuration object.
        unet: The UNet to optimize.
        vae: Frozen autoencoder used to encode images into latents.
        noise_scheduler: Scheduler providing ``add_noise`` and
            ``config.num_train_timesteps``.
        optimizer: Optimizer over the UNet parameters.
        train_dataloader: Yields batches with ``"pixel_values"`` and
            ``"input_ids"``.
        lr_scheduler: Learning-rate scheduler stepped once per batch.
    """
    repo_id = "TejasNavada/tattoo-diffusion"

    accelerator = Accelerator(
        mixed_precision=config.mixed_precision,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        log_with="tensorboard",
        project_dir=os.path.join(config.output_dir, "logs"),
    )
    # Keep the freshly built scheduler state so it can be re-applied after the
    # checkpoint restore below (presumably the restore would otherwise replace
    # it with the checkpointed schedule -- confirm).
    state_dict = lr_scheduler.state_dict()
    print(state_dict)
    if accelerator.is_main_process:
        os.makedirs(config.output_dir, exist_ok=True)
        accelerator.init_trackers("train_example")

    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        unet, optimizer, train_dataloader, lr_scheduler
    )

    # The frozen models are only used for inference, so they run in half precision.
    text_encoder.to(accelerator.device, dtype=torch.float16)
    vae.to(accelerator.device, dtype=torch.float16)
    global_step = 0

    # Resume from the newest checkpoint if there is one; otherwise start fresh
    # (previously a missing checkpoint crashed in os.path.join(dir, None)).
    path = _latest_checkpoint_dir(config.output_dir)
    if path is None:
        accelerator.print("No checkpoint found, starting training from scratch")
    else:
        accelerator.print(f"Resuming from checkpoint {path}")
        accelerator.load_state(os.path.join(config.output_dir, path))
        global_step = int(path.split("-")[1])

    start_epoch = global_step // len(train_dataloader)

    lr_scheduler.load_state_dict(state_dict)
    print(lr_scheduler.get_last_lr())

    for epoch in range(start_epoch, config.num_epochs):
        unet.train()

        progress_bar = tqdm(total=len(train_dataloader), disable=not accelerator.is_local_main_process)
        progress_bar.set_description(f"Epoch {epoch}")

        for batch in train_dataloader:
            # Convert images to latent space
            latents = vae.encode(batch["pixel_values"].to(torch.float16)).latent_dist.sample()
            latents = latents * vae.config.scaling_factor

            # Sample the noise to add to the latents
            noise = torch.randn_like(latents)

            bsz = latents.shape[0]

            # Sample a random timestep for each image
            timesteps = torch.randint(
                0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device
            ).long()
            # Add noise to the latents according to the noise magnitude at each timestep
            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
            # Get the text embedding for conditioning
            encoder_hidden_states = text_encoder(batch["input_ids"])[0]
            # Predict the noise residual and compute loss
            with accelerator.accumulate(unet):
                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample

                loss = F.mse_loss(model_pred.float(), noise.float(), reduction="mean")

                # Backpropagate
                accelerator.backward(loss)
                # Clip only on steps where gradients are actually synchronized,
                # i.e. when the optimizer really steps under accumulation.
                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(unet.parameters(), 1.0)

                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            progress_bar.update(1)
            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "step": global_step}
            progress_bar.set_postfix(**logs)
            accelerator.log(logs, step=global_step)
            global_step += 1

        progress_bar.close()

        if accelerator.is_main_process:
            if (epoch + 1) % config.save_image_epochs == 0 or epoch == config.num_epochs - 1:
                images = evaluate(vae, text_encoder, tokenizer, unet, config, accelerator, epoch)
                save_path = os.path.join(config.output_dir, f"checkpoint-{global_step}")
                accelerator.save_state(save_path)
                save_model_card(config, repo_id, images, repo_folder=config.output_dir)
                upload_folder(
                    repo_id=repo_id,
                    folder_path=save_path,
                    path_in_repo=f"checkpoint-{global_step}",
                    commit_message="Latest Checkpoint",
                    ignore_patterns=["step_*", "epoch_*"],
                )
                upload_folder(
                    repo_id=repo_id,
                    folder_path=os.path.join(config.output_dir, "samples"),
                    path_in_repo="samples",
                    commit_message="new samples",
                    ignore_patterns=["step_*", "epoch_*"],
                )
                upload_file(
                    path_or_fileobj=os.path.join(config.output_dir, "README.md"),
                    path_in_repo="README.md",
                    repo_id=repo_id,
                )

    # Assemble the final pipeline from the trained UNet and the frozen modules.
    unet = accelerator.unwrap_model(unet)
    pipeline = StableDiffusionPipeline.from_pretrained(
        config.pretrained_model_name_or_path,
        text_encoder=text_encoder,
        vae=vae,
        unet=unet,
    )
    pipeline.save_pretrained(config.output_dir)
    accelerator.end_training()











In [None]:
# Sanity check: display the first validation prompt as this cell's output.
config.validation_prompts[0]

In [None]:
# Log in to the Hugging Face Hub with the token from the config and also store
# it as a git credential. The package is imported directly instead of through
# the attribute that transformers.utils.hub happens to expose.
import huggingface_hub

huggingface_hub.login(
    config.token,
    add_to_git_credential=True,
    new_session=True,
    write_permission=True,
)

In [None]:
# Data: attach the preprocessing function to the dataset and batch it.
dataset.set_transform(transform)
train_dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=config.train_batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)

# Noise schedule of the base model.
noise_scheduler = DDPMScheduler.from_pretrained(
    config.pretrained_model_name_or_path,
    subfolder="scheduler",
)

# Pretrained text encoder and VAE, loaded with DeepSpeed zero.Init disabled
# whenever a DeepSpeed plugin is active.
with ContextManagers(deepspeed_zero_init_disabled_context_manager()):
    text_encoder = CLIPTextModel.from_pretrained(
        config.pretrained_model_name_or_path,
        subfolder="text_encoder",
    )
    vae = AutoencoderKL.from_pretrained(
        config.pretrained_model_name_or_path,
        subfolder="vae",
    )

# Only the UNet is optimized; the VAE and text encoder stay frozen.
vae.requires_grad_(False)
text_encoder.requires_grad_(False)

# The UNet is built through its constructor (not from_pretrained), overriding
# only sample_size and cross_attention_dim.
# image_size // 8 presumably mirrors the VAE's latent downsampling -- confirm.
unet = UNet2DConditionModel(
    sample_size=config.image_size // 8,
    cross_attention_dim=768,
)

optimizer = bnb.optim.AdamW8bit(
    unet.parameters(),
    lr=config.learning_rate,
)
lr_scheduler = get_scheduler(
    config.lr_scheduler,
    optimizer=optimizer,
    num_warmup_steps=config.lr_warmup_steps,
    num_training_steps=len(train_dataloader) * config.num_epochs,
)

# Positional arguments for train_loop, in its parameter order.
args = (config, unet, vae, noise_scheduler, optimizer, train_dataloader, lr_scheduler)




In [None]:
# Start training from inside the notebook. The process count comes from the
# config (currently 1) instead of a separate hard-coded literal.
notebook_launcher(train_loop, args, num_processes=config.num_processes)