@@ -555,7 +555,7 @@ def __call__(
555555 self ,
556556 prompt : Union [str , List [str ]],
557557 negative_prompt : Optional [Union [str , List [str ]]] = None ,
558- init_image : Union [torch .FloatTensor , PIL .Image .Image ] = None ,
558+ image : Union [torch .FloatTensor , PIL .Image .Image ] = None ,
559559 mask_image : Union [torch .FloatTensor , PIL .Image .Image ] = None ,
560560 height : int = 512 ,
561561 width : int = 512 ,
@@ -583,11 +583,11 @@ def __call__(
583583 negative_prompt (`str` or `List[str]`, *optional*):
584584 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
585585 if `guidance_scale` is less than `1`).
586- init_image (`torch.FloatTensor` or `PIL.Image.Image`):
586+ image (`torch.FloatTensor` or `PIL.Image.Image`):
587587 `Image`, or tensor representing an image batch, that will be used as the starting point for the
588588 process.
589589 mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
590- `Image`, or tensor representing an image batch, to mask `init_image `. White pixels in the mask will be
590+ `Image`, or tensor representing an image batch, to mask `image `. White pixels in the mask will be
591591 replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
592592 PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
593593 contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -605,11 +605,11 @@ def __call__(
605605 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
606606 usually at the expense of lower image quality.
607607 strength (`float`, *optional*, defaults to 0.8):
608- Conceptually, indicates how much to transform the reference `init_image `. Must be between 0 and 1.
609- `init_image ` will be used as a starting point, adding more noise to it the larger the `strength`. The
608+ Conceptually, indicates how much to transform the reference `image `. Must be between 0 and 1.
609+ `image ` will be used as a starting point, adding more noise to it the larger the `strength`. The
610610 number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
611611 noise will be maximum and the denoising process will run for the full number of iterations specified in
612- `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image `.
612+ `num_inference_steps`. A value of 1, therefore, essentially ignores `image `.
613613 num_images_per_prompt (`int`, *optional*, defaults to 1):
614614 The number of images to generate per prompt.
615615 eta (`float`, *optional*, defaults to 0.0):
@@ -648,6 +648,9 @@ def __call__(
648648 list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
649649 (nsfw) content, according to the `safety_checker`.
650650 """
651+ message = "Please use `image` instead of `init_image`."
652+ init_image = deprecate ("init_image" , "0.12.0" , message , take_from = kwargs )
653+ image = init_image or image
651654
652655 if isinstance (prompt , str ):
653656 batch_size = 1
@@ -714,7 +717,7 @@ def __call__(
714717 mask = None
715718 noise = None
716719
717- if init_image is None :
720+ if image is None :
718721 # get the initial random noise unless the user supplied it
719722
720723 # Unlike in other pipelines, latents need to be generated in the target device
@@ -753,11 +756,11 @@ def __call__(
753756 # scale the initial noise by the standard deviation required by the scheduler
754757 latents = latents * self .scheduler .init_noise_sigma
755758 else :
756- if isinstance (init_image , PIL .Image .Image ):
757- init_image = preprocess_image (init_image )
759+ if isinstance (image , PIL .Image .Image ):
760+ image = preprocess_image (image )
758761 # encode the init image into latents and scale the latents
759- init_image = init_image .to (device = self .device , dtype = latents_dtype )
760- init_latent_dist = self .vae .encode (init_image ).latent_dist
762+ image = image .to (device = self .device , dtype = latents_dtype )
763+ init_latent_dist = self .vae .encode (image ).latent_dist
761764 init_latents = init_latent_dist .sample (generator = generator )
762765 init_latents = 0.18215 * init_latents
763766 init_latents = torch .cat ([init_latents ] * batch_size * num_images_per_prompt , dim = 0 )
@@ -772,7 +775,7 @@ def __call__(
772775
773776 # check sizes
774777 if not mask .shape == init_latents .shape :
775- raise ValueError ("The mask and init_image should be the same size!" )
778+ raise ValueError ("The mask and image should be the same size!" )
776779
777780 # get the original timestep using init_timestep
778781 offset = self .scheduler .config .get ("steps_offset" , 0 )
@@ -961,7 +964,7 @@ def text2img(
961964
962965 def img2img (
963966 self ,
964- init_image : Union [torch .FloatTensor , PIL .Image .Image ],
967+ image : Union [torch .FloatTensor , PIL .Image .Image ],
965968 prompt : Union [str , List [str ]],
966969 negative_prompt : Optional [Union [str , List [str ]]] = None ,
967970 strength : float = 0.8 ,
@@ -980,7 +983,7 @@ def img2img(
980983 r"""
981984 Function for image-to-image generation.
982985 Args:
983- init_image (`torch.FloatTensor` or `PIL.Image.Image`):
986+ image (`torch.FloatTensor` or `PIL.Image.Image`):
984987 `Image`, or tensor representing an image batch, that will be used as the starting point for the
985988 process.
986989 prompt (`str` or `List[str]`):
@@ -989,11 +992,11 @@ def img2img(
989992 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
990993 if `guidance_scale` is less than `1`).
991994 strength (`float`, *optional*, defaults to 0.8):
992- Conceptually, indicates how much to transform the reference `init_image `. Must be between 0 and 1.
993- `init_image ` will be used as a starting point, adding more noise to it the larger the `strength`. The
995+ Conceptually, indicates how much to transform the reference `image `. Must be between 0 and 1.
996+ `image ` will be used as a starting point, adding more noise to it the larger the `strength`. The
994997 number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
995998 noise will be maximum and the denoising process will run for the full number of iterations specified in
996- `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image `.
999+ `num_inference_steps`. A value of 1, therefore, essentially ignores `image `.
9971000 num_inference_steps (`int`, *optional*, defaults to 50):
9981001 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
9991002 expense of slower inference. This parameter will be modulated by `strength`.
@@ -1035,7 +1038,7 @@ def img2img(
10351038 return self .__call__ (
10361039 prompt = prompt ,
10371040 negative_prompt = negative_prompt ,
1038- init_image = init_image ,
1041+ image = image ,
10391042 num_inference_steps = num_inference_steps ,
10401043 guidance_scale = guidance_scale ,
10411044 strength = strength ,
@@ -1052,7 +1055,7 @@ def img2img(
10521055
10531056 def inpaint (
10541057 self ,
1055- init_image : Union [torch .FloatTensor , PIL .Image .Image ],
1058+ image : Union [torch .FloatTensor , PIL .Image .Image ],
10561059 mask_image : Union [torch .FloatTensor , PIL .Image .Image ],
10571060 prompt : Union [str , List [str ]],
10581061 negative_prompt : Optional [Union [str , List [str ]]] = None ,
@@ -1072,11 +1075,11 @@ def inpaint(
10721075 r"""
10731076 Function for inpaint.
10741077 Args:
1075- init_image (`torch.FloatTensor` or `PIL.Image.Image`):
1078+ image (`torch.FloatTensor` or `PIL.Image.Image`):
10761079 `Image`, or tensor representing an image batch, that will be used as the starting point for the
10771080 process. This is the image whose masked region will be inpainted.
10781081 mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
1079- `Image`, or tensor representing an image batch, to mask `init_image `. White pixels in the mask will be
1082+ `Image`, or tensor representing an image batch, to mask `image `. White pixels in the mask will be
10801083 replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
10811084 PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
10821085 contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -1088,7 +1091,7 @@ def inpaint(
10881091 strength (`float`, *optional*, defaults to 0.8):
10891092 Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
10901093 is 1, the denoising process will be run on the masked area for the full number of iterations specified
1091- in `num_inference_steps`. `init_image ` will be used as a reference for the masked area, adding more
1094+ in `num_inference_steps`. `image ` will be used as a reference for the masked area, adding more
10921095 noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
10931096 num_inference_steps (`int`, *optional*, defaults to 50):
10941097 The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
@@ -1131,7 +1134,7 @@ def inpaint(
11311134 return self .__call__ (
11321135 prompt = prompt ,
11331136 negative_prompt = negative_prompt ,
1134- init_image = init_image ,
1137+ image = image ,
11351138 mask_image = mask_image ,
11361139 num_inference_steps = num_inference_steps ,
11371140 guidance_scale = guidance_scale ,
0 commit comments