IrohXu committed
Commit 9729d10
1 Parent(s): 3c12e7b

update the code

demo.py CHANGED
@@ -8,7 +8,6 @@ def preprocess_image(image):
     image = image.convert("RGB")
     image = transforms.CenterCrop((image.size[1] // 64 * 64, image.size[0] // 64 * 64))(image)
     image = transforms.ToTensor()(image)
-    image = image * 2 - 1
     image = image.unsqueeze(0).to("cuda")
     return image
 
@@ -24,7 +23,7 @@ pipe = StableDiffusion3InpaintPipeline.from_pretrained(
     torch_dtype=torch.float16,
 ).to("cuda")
 
-prompt = "Face of a cat, high resolution, sitting on a park bench"
+prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
 source_image = load_image(
     "./overture-creations-5sI6fQgYIuo.png"
 )
@@ -38,10 +37,10 @@ mask = preprocess_mask(
 image = pipe(
     prompt=prompt,
     image=source,
-    mask_image=1-mask,
+    mask_image=mask,
     height=1024,
     width=1024,
-    num_inference_steps=28,
+    num_inference_steps=50,
     guidance_scale=7.0,
     strength=0.6,
 ).images[0]
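For context, here is a minimal sketch of what demo.py looks like after this commit, assembled from the hunks above. Everything outside those hunks (the imports, the `preprocess_mask` helper, the mask filename, and the final `save` call) is not shown in the diff, so those parts are assumptions made for illustration rather than the actual file contents:

```python
import torch
from torchvision import transforms
from diffusers.utils import load_image

# Assumed import path: the pipeline class lives in the repo-local file changed below.
from pipeline_stable_diffusion_3_inpaint import StableDiffusion3InpaintPipeline


def preprocess_image(image):
    # The image now stays in [0, 1]; the pipeline's VaeImageProcessor handles normalization.
    image = image.convert("RGB")
    image = transforms.CenterCrop((image.size[1] // 64 * 64, image.size[0] // 64 * 64))(image)
    image = transforms.ToTensor()(image)
    image = image.unsqueeze(0).to("cuda")
    return image


def preprocess_mask(mask):
    # Assumed helper (not shown in the diff): single-channel mask in [0, 1],
    # white = region to repaint, black = region to keep.
    mask = mask.convert("L")
    mask = transforms.CenterCrop((mask.size[1] // 64 * 64, mask.size[0] // 64 * 64))(mask)
    mask = transforms.ToTensor()(mask)
    mask = mask.to("cuda")
    return mask


pipe = StableDiffusion3InpaintPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers",
    torch_dtype=torch.float16,
).to("cuda")

prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
source_image = load_image("./overture-creations-5sI6fQgYIuo.png")
mask_image = load_image("./overture-creations-5sI6fQgYIuo_mask.png")  # assumed mask file

source = preprocess_image(source_image)
mask = preprocess_mask(mask_image)

image = pipe(
    prompt=prompt,
    image=source,
    mask_image=mask,  # the mask is passed directly now, no longer inverted as 1 - mask
    height=1024,
    width=1024,
    num_inference_steps=50,
    guidance_scale=7.0,
    strength=0.6,
).images[0]

image.save("overture-creations-5sI6fQgYIuo_output.jpg")
```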
overture-creations-5sI6fQgYIuo_output.jpg CHANGED
pipeline_stable_diffusion_3_inpaint.py CHANGED
@@ -1,3 +1,5 @@
+# Copyright 2024 Stability AI and The HuggingFace Team and IrohXu. All rights reserved.
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -13,7 +15,6 @@
 import inspect
 from typing import Callable, Dict, List, Optional, Union
 
-import PIL.Image
 import torch
 from transformers import (
     CLIPTextModelWithProjection,
@@ -22,6 +23,7 @@ from transformers import (
     T5TokenizerFast,
 )
 
+from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
 from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
 from diffusers.models.autoencoders import AutoencoderKL
 from diffusers.models.transformers import SD3Transformer2DModel
@@ -50,21 +52,20 @@ EXAMPLE_DOC_STRING = """
     Examples:
         ```py
         >>> import torch
-
-        >>> from diffusers import AutoPipelineForImage2Image
+        >>> from diffusers import StableDiffusion3InpaintPipeline
         >>> from diffusers.utils import load_image
 
-        >>> device = "cuda"
-        >>> model_id_or_path = "stabilityai/stable-diffusion-3-medium-diffusers"
-        >>> pipe = AutoPipelineForImage2Image.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
-        >>> pipe = pipe.to(device)
-
-        >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
-        >>> init_image = load_image(url).resize((512, 512))
-
-        >>> prompt = "cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k"
-
-        >>> images = pipe(prompt=prompt, image=init_image, strength=0.95, guidance_scale=7.5).images[0]
+        >>> pipe = StableDiffusion3InpaintPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
+        ... )
+        >>> pipe.to("cuda")
+        >>> prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+        >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+        >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+        >>> source = load_image(img_url)
+        >>> mask = load_image(mask_url)
+        >>> image = pipe(prompt=prompt, image=source, mask_image=mask).images[0]
+        >>> image.save("sd3_inpainting.png")
        ```
 """
 
@@ -211,7 +212,11 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
             vae_scale_factor=self.vae_scale_factor, vae_latent_channels=self.vae.config.latent_channels
         )
         self.mask_processor = VaeImageProcessor(
-            vae_scale_factor=self.vae_scale_factor, vae_latent_channels=self.vae.config.latent_channels, do_normalize=False, do_binarize=True, do_convert_grayscale=True
+            vae_scale_factor=self.vae_scale_factor,
+            vae_latent_channels=self.vae.config.latent_channels,
+            do_normalize=False,
+            do_binarize=True,
+            do_convert_grayscale=True,
         )
         self.tokenizer_max_length = self.tokenizer.model_max_length
         self.default_sample_size = self.transformer.config.sample_size
@@ -499,6 +504,7 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
 
         return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
 
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.check_inputs
     def check_inputs(
         self,
         prompt,
@@ -588,6 +594,7 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
         if max_sequence_length is not None and max_sequence_length > 512:
             raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
 
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.get_timesteps
     def get_timesteps(self, num_inference_steps, strength, device):
         # get the original timestep using init_timestep
         init_timestep = min(num_inference_steps * strength, num_inference_steps)
@@ -599,59 +606,93 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
 
         return timesteps, num_inference_steps - t_start
 
-    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
-        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
-            raise ValueError(
-                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
-            )
-
-        image = image.to(device=device, dtype=dtype)
-
-        batch_size = batch_size * num_images_per_prompt
-        if image.shape[1] == self.vae.config.latent_channels:
-            init_latents = image
-        else:
-            if isinstance(generator, list) and len(generator) != batch_size:
-                raise ValueError(
-                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-                )
-
-            elif isinstance(generator, list):
-                init_latents = [
-                    retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
-                    for i in range(batch_size)
-                ]
-                init_latents = torch.cat(init_latents, dim=0)
-            else:
-                init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
-
-            init_latents = (init_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
-
-        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
-            # expand init_latents for batch_size
-            additional_image_per_prompt = batch_size // init_latents.shape[0]
-            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
-        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
-            raise ValueError(
-                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
-            )
-        else:
-            init_latents = torch.cat([init_latents], dim=0)
-
-        shape = init_latents.shape
-        init_latents_orig = init_latents
-        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-
-        # get latents
-        init_latents = self.scheduler.scale_noise(init_latents, timestep, noise)
-        latents = init_latents.to(device=device, dtype=dtype)
-
-        return latents, init_latents_orig, noise
-
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+        image=None,
+        timestep=None,
+        is_strength_max=True,
+        return_noise=False,
+        return_image_latents=False,
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if (image is None or timestep is None) and not is_strength_max:
+            raise ValueError(
+                "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
+                "However, either the image or the noise timestep has not been provided."
+            )
+
+        if return_image_latents or (latents is None and not is_strength_max):
+            image = image.to(device=device, dtype=dtype)
+
+            if image.shape[1] == 16:
+                image_latents = image
+            else:
+                image_latents = self._encode_vae_image(image=image, generator=generator)
+            image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
+
+        if latents is None:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            # if strength is 1. then initialise the latents to noise, else initial to image + noise
+            latents = noise if is_strength_max else self.scheduler.scale_noise(image_latents, timestep, noise)
+        else:
+            noise = latents.to(device)
+            latents = noise
+
+        outputs = (latents,)
+
+        if return_noise:
+            outputs += (noise,)
+
+        if return_image_latents:
+            outputs += (image_latents,)
+
+        return outputs
+
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
+        return image_latents
+
     def prepare_mask_latents(
-        self, mask, masked_image, batch_size, num_images_per_prompt, height, width, dtype, device, generator
+        self,
+        mask,
+        masked_image,
+        batch_size,
+        num_images_per_prompt,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        do_classifier_free_guidance,
     ):
         # resize the mask to latents shape as we concatenate the mask to the latents
         # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
@@ -660,16 +701,18 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
             mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
         )
         mask = mask.to(device=device, dtype=dtype)
 
         batch_size = batch_size * num_images_per_prompt
 
         masked_image = masked_image.to(device=device, dtype=dtype)
 
-        if masked_image.shape[1] == 4:
+        if masked_image.shape[1] == 16:
             masked_image_latents = masked_image
         else:
             masked_image_latents = retrieve_latents(self.vae.encode(masked_image), generator=generator)
 
+        masked_image_latents = (masked_image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
+
         # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
         if mask.shape[0] < batch_size:
             if not batch_size % mask.shape[0] == 0:
@@ -688,15 +731,15 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
                 )
             masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1)
 
-        # mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
-        # masked_image_latents = (
-        #     torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
-        # )
+        mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+        masked_image_latents = (
+            torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+        )
 
         # aligning device to prevent device errors when concating it with the latent model input
         masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
         return mask, masked_image_latents
 
     @property
     def guidance_scale(self):
         return self._guidance_scale
@@ -727,11 +770,12 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
         prompt: Union[str, List[str]] = None,
         prompt_2: Optional[Union[str, List[str]]] = None,
         prompt_3: Optional[Union[str, List[str]]] = None,
-        height: int = None,
-        width: int = None,
         image: PipelineImageInput = None,
         mask_image: PipelineImageInput = None,
         masked_image_latents: PipelineImageInput = None,
+        height: int = None,
+        width: int = None,
+        padding_mask_crop: Optional[int] = None,
         strength: float = 0.6,
         num_inference_steps: int = 50,
         timesteps: List[int] = None,
@@ -740,13 +784,12 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
         negative_prompt_2: Optional[Union[str, List[str]]] = None,
         negative_prompt_3: Optional[Union[str, List[str]]] = None,
         num_images_per_prompt: Optional[int] = 1,
-        add_predicted_noise: Optional[bool] = False,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         clip_skip: Optional[int] = None,
@@ -767,10 +810,39 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
             prompt_3 (`str` or `List[str]`, *optional*):
                 The prompt or prompts to be sent to `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
                 will be used instead
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
+                numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list
+                or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
+                list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image
+                latents as `image`, but if passing latents directly it is not encoded again.
+            mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+                `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask
+                are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a
+                single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one
+                color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B,
+                H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W,
+                1)`, or `(H, W)`.
+            mask_image_latent (`torch.Tensor`, `List[torch.Tensor]`):
+                `Tensor` representing an image batch to mask `image` generated by VAE. If not provided, the mask
+                latents tensor will ge generated by `mask_image`.
             height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image. This is set to 1024 by default for the best results.
             width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            padding_mask_crop (`int`, *optional*, defaults to `None`):
+                The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
+                image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
+                with the same aspect ration of the image and contains all masked area, and then expand that area based
+                on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
+                resizing to the original image size for inpainting. This is useful when the masked area is small while
+                the image is large and contain information irrelevant for inpainting, such as background.
+            strength (`float`, *optional*, defaults to 1.0):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
@@ -796,9 +868,6 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
                 `text_encoder_3`. If not defined, `negative_prompt` is used instead
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
-            add_predicted_noise (`bool`, *optional*, defaults to True):
-                Use predicted noise instead of random noise when constructing noisy versions of the original image in
-                the reverse diffusion process
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
@@ -845,6 +914,12 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
                 `tuple`. When returning a tuple, the first element is a list with the generated images.
         """
 
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        height = height or self.transformer.config.sample_size * self.vae_scale_factor
+        width = width or self.transformer.config.sample_size * self.vae_scale_factor
+
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(
             prompt,
@@ -903,34 +978,70 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
             pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
 
-        # 3. Preprocess image
-        image = self.image_processor.preprocess(image, height, width)
-
-        # 4. Prepare timesteps
+        # 3. Prepare timesteps
         timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
         timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+        # check that number of inference steps is not < 1 - as this doesn't make sense
+        if num_inference_steps < 1:
+            raise ValueError(
+                f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
+                f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
+            )
         latent_timestep = timesteps[:1].repeat(batch_size * num_inference_steps)
 
+        # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
+        is_strength_max = strength == 1.0
+
+        # 4. Preprocess mask and image
+        if padding_mask_crop is not None:
+            crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop)
+            resize_mode = "fill"
+        else:
+            crops_coords = None
+            resize_mode = "default"
+
+        original_image = image
+        init_image = self.image_processor.preprocess(
+            image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode
+        )
+        init_image = init_image.to(dtype=torch.float32)
+
         # 5. Prepare latent variables
-        if latents is None:
-            latents, init_latents_orig, noise = self.prepare_latents(
-                image,
-                latent_timestep,
-                batch_size,
-                num_images_per_prompt,
-                prompt_embeds.dtype,
-                device,
-                generator,
-            )
-
-        # 5.1. Prepare masked latent variables
-        mask_condition = self.mask_processor.preprocess(mask_image, height, width)
-
+        num_channels_latents = self.vae.config.latent_channels
+        num_channels_transformer = self.transformer.config.in_channels
+        return_image_latents = num_channels_transformer == 16
+
+        latents_outputs = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+            image=init_image,
+            timestep=latent_timestep,
+            is_strength_max=is_strength_max,
+            return_noise=True,
+            return_image_latents=return_image_latents,
+        )
+
+        if return_image_latents:
+            latents, noise, image_latents = latents_outputs
+        else:
+            latents, noise = latents_outputs
+
+        # 6. Prepare mask latent variables
+        mask_condition = self.mask_processor.preprocess(
+            mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords
+        )
+
         if masked_image_latents is None:
-            masked_image = image * (mask_condition < 0.5)
+            masked_image = init_image * (mask_condition < 0.5)
         else:
            masked_image = masked_image_latents
 
         mask, masked_image_latents = self.prepare_mask_latents(
             mask_condition,
             masked_image,
@@ -940,10 +1051,32 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
             width,
             prompt_embeds.dtype,
             device,
-            generator
+            generator,
+            self.do_classifier_free_guidance,
         )
 
-        # 6. Denoising loop
+        # match the inpainting pipeline and will be updated with input + mask inpainting model later
+        if num_channels_transformer == 33:
+            # default case for runwayml/stable-diffusion-inpainting
+            num_channels_mask = mask.shape[1]
+            num_channels_masked_image = masked_image_latents.shape[1]
+            if (
+                num_channels_latents + num_channels_mask + num_channels_masked_image
+                != self.transformer.config.in_channels
+            ):
+                raise ValueError(
+                    f"Incorrect configuration settings! The config of `pipeline.transformer`: {self.transformer.config} expects"
+                    f" {self.transformer.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                    f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+                    f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                    " `pipeline.transformer` or your `mask_image` or `image` input."
+                )
+        elif num_channels_transformer != 16:
+            raise ValueError(
+                f"The transformer {self.transformer.__class__} should have 16 input channels or 33 input channels, not {self.transformer.config.in_channels}."
+            )
+
+        # 7. Denoising loop
         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
         self._num_timesteps = len(timesteps)
         with self.progress_bar(total=num_inference_steps) as progress_bar:
@@ -955,7 +1088,10 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
                 latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
                 # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                 timestep = t.expand(latent_model_input.shape[0])
-
+
+                if num_channels_transformer == 33:
+                    latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
+
                 noise_pred = self.transformer(
                     hidden_states=latent_model_input,
                     timestep=timestep,
@@ -972,6 +1108,20 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
                 # compute the previous noisy sample x_t -> x_t-1
                 latents_dtype = latents.dtype
                 latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+                if num_channels_transformer == 16:
+                    init_latents_proper = image_latents
+                    if self.do_classifier_free_guidance:
+                        init_mask, _ = mask.chunk(2)
+                    else:
+                        init_mask = mask
+
+                    if i < len(timesteps) - 1:
+                        noise_timestep = timesteps[i + 1]
+                        init_latents_proper = self.scheduler.scale_noise(
+                            init_latents_proper, torch.tensor([noise_timestep]), noise
+                        )
+
+                    latents = (1 - init_mask) * init_latents_proper + init_mask * latents
 
                 if latents.dtype != latents_dtype:
                     if torch.backends.mps.is_available():
@@ -990,15 +1140,8 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
                     negative_pooled_prompt_embeds = callback_outputs.pop(
                         "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
                     )
-
-                if add_predicted_noise:
-                    init_latents_proper = self.scheduler.scale_noise(
-                        init_latents_orig, torch.tensor([t]), noise_pred_uncond
-                    )
-                else:
-                    init_latents_proper = self.scheduler.scale_noise(init_latents_orig, torch.tensor([t]), noise)
-
-                latents = (init_latents_proper * mask) + (latents * (1 - mask))
+                    mask = callback_outputs.pop("mask", mask)
+                    masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents)
 
                 # call the callback, if provided
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -1007,16 +1150,19 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
 
                 if XLA_AVAILABLE:
                     xm.mark_step()
 
-        latents = (init_latents_orig * mask) + (latents * (1 - mask))
-
-        if output_type == "latent":
-            image = latents
-
-        else:
-            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
-
-            image = self.vae.decode(latents, return_dict=False)[0]
-            image = self.image_processor.postprocess(image, output_type=output_type)
+        if not output_type == "latent":
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                0
+            ]
+        else:
+            image = latents
+
+        do_denormalize = [True] * image.shape[0]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+        if padding_mask_crop is not None:
+            image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image]
 
         # Offload all models
         self.maybe_free_model_hooks()
@@ -1024,4 +1170,4 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
         if not return_dict:
             return (image,)
 
-        return StableDiffusion3PipelineOutput(images=image)
+        return StableDiffusion3PipelineOutput(images=image)
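The functional core of the rewritten denoising loop is that the pipeline now follows the standard diffusers inpainting mask convention (white = repaint, black = preserve) and, for 16-channel transformers, blends the re-noised original-image latents back in at every step with `latents = (1 - init_mask) * init_latents_proper + init_mask * latents`, rather than compositing once after the loop with the opposite convention. That is also why demo.py above now passes `mask_image=mask` instead of `1 - mask`. A small self-contained sketch of that blending rule on dummy tensors (illustration only, not code from this commit):

```python
import torch

# Dummy stand-ins for one denoising step: freshly denoised latents and the
# original-image latents re-noised to the same timestep.
latents = torch.randn(1, 16, 128, 128)
init_latents_proper = torch.randn(1, 16, 128, 128)

# Mask in latent space: 1 (white) marks the region to repaint, 0 preserves the image.
init_mask = torch.zeros(1, 1, 128, 128)
init_mask[..., 32:96, 32:96] = 1.0

# The blending rule applied at each step in the loop above.
blended = (1 - init_mask) * init_latents_proper + init_mask * latents

# Outside the mask the original-image latents survive; inside it the new latents do.
assert torch.equal(blended[..., 0, 0], init_latents_proper[..., 0, 0])
assert torch.equal(blended[..., 64, 64], latents[..., 64, 64])
```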