from typing import Callable, List, Optional, Union

import PIL.Image
import torch
from transformers import (
    CLIPImageProcessor,
    CLIPTextModelWithProjection,
    CLIPTokenizer,
    CLIPVisionModelWithProjection,
    XLMRobertaTokenizer,
)

from ...models import PriorTransformer, UNet2DConditionModel, VQModel
from ...schedulers import DDIMScheduler, DDPMScheduler, UnCLIPScheduler
from ...utils import replace_example_docstring
from ..pipeline_utils import DiffusionPipeline
from .pipeline_kandinsky import KandinskyPipeline
from .pipeline_kandinsky_img2img import KandinskyImg2ImgPipeline
from .pipeline_kandinsky_inpaint import KandinskyInpaintPipeline
from .pipeline_kandinsky_prior import KandinskyPriorPipeline
from .text_encoder import MultilingualCLIP


TEXT2IMAGE_EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        from diffusers import AutoPipelineForText2Image
        import torch

        pipe = AutoPipelineForText2Image.from_pretrained(
            "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16
        )
        pipe.enable_model_cpu_offload()

        prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k"

        image = pipe(prompt=prompt, num_inference_steps=25).images[0]
        ```
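
        The prior and decoder stages accept separate step counts and guidance scales, as documented on `__call__`.
        A variant of the call above (values are illustrative only):

        ```py
        image = pipe(
            prompt=prompt,
            num_inference_steps=50,
            guidance_scale=4.0,
            prior_num_inference_steps=25,
            prior_guidance_scale=4.0,
        ).images[0]
        ```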
"""

IMAGE2IMAGE_EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        from diffusers import AutoPipelineForImage2Image
        import torch
        import requests
        from io import BytesIO
        from PIL import Image

        pipe = AutoPipelineForImage2Image.from_pretrained(
            "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16
        )
        pipe.enable_model_cpu_offload()

        prompt = "A fantasy landscape, Cinematic lighting"
        negative_prompt = "low quality, bad quality"

        url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"

        response = requests.get(url)
        original_image = Image.open(BytesIO(response.content)).convert("RGB")
        original_image.thumbnail((768, 768))

        image = pipe(prompt=prompt, image=original_image, num_inference_steps=25).images[0]
        ```
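
        The `strength` argument controls how strongly the input image is transformed (it defaults to 0.3). A variant
        of the call above with an explicit, illustrative value:

        ```py
        image = pipe(prompt=prompt, image=original_image, strength=0.3, num_inference_steps=25).images[0]
        ```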
"""

INPAINT_EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        from diffusers import AutoPipelineForInpainting
        from diffusers.utils import load_image
        import torch
        import numpy as np

        pipe = AutoPipelineForInpainting.from_pretrained(
            "kandinsky-community/kandinsky-2-1-inpaint", torch_dtype=torch.float16
        )
        pipe.enable_model_cpu_offload()

        prompt = "A fantasy landscape, Cinematic lighting"
        negative_prompt = "low quality, bad quality"

        original_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png"
        )

        mask = np.zeros((768, 768), dtype=np.float32)
        # Let's mask out an area above the cat's head
        mask[:250, 250:-250] = 1

        image = pipe(prompt=prompt, image=original_image, mask_image=mask, num_inference_steps=25).images[0]
        ```
"""


class KandinskyCombinedPipeline(DiffusionPipeline):
    """
    Combined Pipeline for text-to-image generation using Kandinsky

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
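
    A combined pipeline simply chains [`KandinskyPriorPipeline`] and [`KandinskyPipeline`]. As a rough sketch of what
    one call does (using the `prior_pipe` and `decoder_pipe` sub-pipelines that `__init__` creates; prompt
    broadcasting and most arguments are omitted here):

    ```py
    image_embeds, negative_image_embeds = prior_pipe(prompt, return_dict=False)
    images = decoder_pipe(prompt, image_embeds=image_embeds, negative_image_embeds=negative_image_embeds).images
    ```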

    Args:
        text_encoder ([`MultilingualCLIP`]):
            Frozen text-encoder.
        tokenizer ([`XLMRobertaTokenizer`]):
            Tokenizer of class [`XLMRobertaTokenizer`].
        scheduler (Union[`DDIMScheduler`, `DDPMScheduler`]):
            A scheduler to be used in combination with `unet` to generate image latents.
        unet ([`UNet2DConditionModel`]):
            Conditional U-Net architecture to denoise the image embedding.
        movq ([`VQModel`]):
            MoVQ Decoder to generate the image from the latents.
        prior_prior ([`PriorTransformer`]):
            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        prior_image_encoder ([`CLIPVisionModelWithProjection`]):
            Frozen image-encoder.
        prior_text_encoder ([`CLIPTextModelWithProjection`]):
            Frozen text-encoder.
        prior_tokenizer (`CLIPTokenizer`):
             Tokenizer of class
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        prior_scheduler ([`UnCLIPScheduler`]):
            A scheduler to be used in combination with `prior` to generate image embedding.
    """

    _load_connected_pipes = True
    model_cpu_offload_seq = "text_encoder->unet->movq->prior_prior->prior_image_encoder->prior_text_encoder"
    _exclude_from_cpu_offload = ["prior_prior"]

    def __init__(
        self,
        text_encoder: MultilingualCLIP,
        tokenizer: XLMRobertaTokenizer,
        unet: UNet2DConditionModel,
        scheduler: Union[DDIMScheduler, DDPMScheduler],
        movq: VQModel,
        prior_prior: PriorTransformer,
        prior_image_encoder: CLIPVisionModelWithProjection,
        prior_text_encoder: CLIPTextModelWithProjection,
        prior_tokenizer: CLIPTokenizer,
        prior_scheduler: UnCLIPScheduler,
        prior_image_processor: CLIPImageProcessor,
    ):
        super().__init__()

        self.register_modules(
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            scheduler=scheduler,
            movq=movq,
            prior_prior=prior_prior,
            prior_image_encoder=prior_image_encoder,
            prior_text_encoder=prior_text_encoder,
            prior_tokenizer=prior_tokenizer,
            prior_scheduler=prior_scheduler,
            prior_image_processor=prior_image_processor,
        )
        # The two sub-pipelines share the registered modules; no weights are duplicated.
        self.prior_pipe = KandinskyPriorPipeline(
            prior=prior_prior,
            image_encoder=prior_image_encoder,
            text_encoder=prior_text_encoder,
            tokenizer=prior_tokenizer,
            scheduler=prior_scheduler,
            image_processor=prior_image_processor,
        )
        self.decoder_pipe = KandinskyPipeline(
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            scheduler=scheduler,
            movq=movq,
        )

    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)

    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
        r"""
        Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗
        Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a
        GPU only when their specific submodule's `forward` method is called. Offloading happens on a submodule basis.
        Memory savings are higher than using `enable_model_cpu_offload`, but performance is lower.
        """
        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)

    def progress_bar(self, iterable=None, total=None):
        self.prior_pipe.progress_bar(iterable=iterable, total=total)
        self.decoder_pipe.progress_bar(iterable=iterable, total=total)
        self.decoder_pipe.enable_model_cpu_offload()

    def set_progress_bar_config(self, **kwargs):
        self.prior_pipe.set_progress_bar_config(**kwargs)
        self.decoder_pipe.set_progress_bar_config(**kwargs)

    @torch.no_grad()
    @replace_example_docstring(TEXT2IMAGE_EXAMPLE_DOC_STRING)
    def __call__(
        self,
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_inference_steps: int = 100,
        guidance_scale: float = 4.0,
        num_images_per_prompt: int = 1,
        height: int = 512,
        width: int = 512,
        prior_guidance_scale: float = 4.0,
        prior_num_inference_steps: int = 25,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        output_type: Optional[str] = "pil",
        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
        callback_steps: int = 1,
        return_dict: bool = True,
    ):
        """
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            height (`int`, *optional*, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width in pixels of the generated image.
            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. A higher guidance scale encourages the model to generate images closely linked
                to the text `prompt`, usually at the expense of lower image quality.
            prior_num_inference_steps (`int`, *optional*, defaults to 25):
                The number of denoising steps in the prior. More denoising steps usually lead to a higher quality
                image at the expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. A higher guidance scale encourages the model to generate images closely linked
                to the text `prompt`, usually at the expense of lower image quality.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
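                For example, `generator=torch.Generator("cpu").manual_seed(0)` (seed value chosen arbitrarily) makes
                both the prior and the decoder stage reproducible across runs.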
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            callback (`Callable`, *optional*):
                A function called every `callback_steps` steps during inference, with the
                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`
        """
        # Stage 1: map the text prompt to CLIP image embeddings with the prior.
        prior_outputs = self.prior_pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            num_images_per_prompt=num_images_per_prompt,
            num_inference_steps=prior_num_inference_steps,
            generator=generator,
            latents=latents,
            guidance_scale=prior_guidance_scale,
            output_type="pt",
            return_dict=False,
        )
        image_embeds = prior_outputs[0]
        negative_image_embeds = prior_outputs[1]

        prompt = [prompt] if not isinstance(prompt, (list, tuple)) else prompt

        # Broadcast the prompts so one prompt can serve several image embeddings.
        if len(prompt) < image_embeds.shape[0] and image_embeds.shape[0] % len(prompt) == 0:
            prompt = image_embeds.shape[0] // len(prompt) * prompt

        # Stage 2: decode the image embeddings into pixels with the MoVQ decoder.
        outputs = self.decoder_pipe(
            prompt=prompt,
            image_embeds=image_embeds,
            negative_image_embeds=negative_image_embeds,
            width=width,
            height=height,
            num_inference_steps=num_inference_steps,
            generator=generator,
            guidance_scale=guidance_scale,
            output_type=output_type,
            callback=callback,
            callback_steps=callback_steps,
            return_dict=return_dict,
        )
        self.maybe_free_model_hooks()

        return outputs


class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
    """
    Combined Pipeline for image-to-image generation using Kandinsky

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        text_encoder ([`MultilingualCLIP`]):
            Frozen text-encoder.
        tokenizer ([`XLMRobertaTokenizer`]):
            Tokenizer of class [`XLMRobertaTokenizer`].
        scheduler (Union[`DDIMScheduler`, `DDPMScheduler`]):
            A scheduler to be used in combination with `unet` to generate image latents.
        unet ([`UNet2DConditionModel`]):
            Conditional U-Net architecture to denoise the image embedding.
        movq ([`VQModel`]):
            MoVQ Decoder to generate the image from the latents.
        prior_prior ([`PriorTransformer`]):
            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        prior_image_encoder ([`CLIPVisionModelWithProjection`]):
            Frozen image-encoder.
        prior_text_encoder ([`CLIPTextModelWithProjection`]):
            Frozen text-encoder.
        prior_tokenizer (`CLIPTokenizer`):
             Tokenizer of class
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        prior_scheduler ([`UnCLIPScheduler`]):
            A scheduler to be used in combination with `prior` to generate image embedding.
    """

    _load_connected_pipes = True
    model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->prior_prior->text_encoder->unet->movq"
    _exclude_from_cpu_offload = ["prior_prior"]

    def __init__(
        self,
        text_encoder: MultilingualCLIP,
        tokenizer: XLMRobertaTokenizer,
        unet: UNet2DConditionModel,
        scheduler: Union[DDIMScheduler, DDPMScheduler],
        movq: VQModel,
        prior_prior: PriorTransformer,
        prior_image_encoder: CLIPVisionModelWithProjection,
        prior_text_encoder: CLIPTextModelWithProjection,
        prior_tokenizer: CLIPTokenizer,
        prior_scheduler: UnCLIPScheduler,
        prior_image_processor: CLIPImageProcessor,
    ):
        super().__init__()

        self.register_modules(
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            scheduler=scheduler,
            movq=movq,
            prior_prior=prior_prior,
            prior_image_encoder=prior_image_encoder,
            prior_text_encoder=prior_text_encoder,
            prior_tokenizer=prior_tokenizer,
            prior_scheduler=prior_scheduler,
            prior_image_processor=prior_image_processor,
        )
        self.prior_pipe = KandinskyPriorPipeline(
            prior=prior_prior,
            image_encoder=prior_image_encoder,
            text_encoder=prior_text_encoder,
            tokenizer=prior_tokenizer,
            scheduler=prior_scheduler,
            image_processor=prior_image_processor,
        )
        self.decoder_pipe = KandinskyImg2ImgPipeline(
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            scheduler=scheduler,
            movq=movq,
        )

    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)

    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
        r"""
        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
        `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
        Note that offloading happens on a submodule basis. Memory savings are higher than with
        `enable_model_cpu_offload`, but performance is lower.
        """
        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)

    def progress_bar(self, iterable=None, total=None):
        self.prior_pipe.progress_bar(iterable=iterable, total=total)
        self.decoder_pipe.progress_bar(iterable=iterable, total=total)
        self.decoder_pipe.enable_model_cpu_offload()

    def set_progress_bar_config(self, **kwargs):
        self.prior_pipe.set_progress_bar_config(**kwargs)
        self.decoder_pipe.set_progress_bar_config(**kwargs)

    @torch.no_grad()
    @replace_example_docstring(IMAGE2IMAGE_EXAMPLE_DOC_STRING)
    def __call__(
        self,
        prompt: Union[str, List[str]],
        image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_inference_steps: int = 100,
        guidance_scale: float = 4.0,
        num_images_per_prompt: int = 1,
        strength: float = 0.3,
        height: int = 512,
        width: int = 512,
        prior_guidance_scale: float = 4.0,
        prior_num_inference_steps: int = 25,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        output_type: Optional[str] = "pil",
        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
        callback_steps: int = 1,
        return_dict: bool = True,
    ):
        """
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
                again.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            height (`int`, *optional*, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width in pixels of the generated image.
            strength (`float`, *optional*, defaults to 0.3):
                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
                be maximum and the denoising process will run for the full number of iterations specified in
                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
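                As a rough worked example: `strength=0.3` with `num_inference_steps=25` runs roughly
                `0.3 * 25 ≈ 7` denoising steps on the noised version of `image`.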
            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. A higher guidance scale encourages the model to generate images closely linked
                to the text `prompt`, usually at the expense of lower image quality.
            prior_num_inference_steps (`int`, *optional*, defaults to 25):
                The number of denoising steps in the prior. More denoising steps usually lead to a higher quality
                image at the expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. A higher guidance scale encourages the model to generate images closely linked
                to the text `prompt`, usually at the expense of lower image quality.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            callback (`Callable`, *optional*):
                A function called every `callback_steps` steps during inference, with the
                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`
        """
        prior_outputs = self.prior_pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            num_images_per_prompt=num_images_per_prompt,
            num_inference_steps=prior_num_inference_steps,
            generator=generator,
            latents=latents,
            guidance_scale=prior_guidance_scale,
            output_type="pt",
            return_dict=False,
        )
        image_embeds = prior_outputs[0]
        negative_image_embeds = prior_outputs[1]

        prompt = [prompt] if not isinstance(prompt, (list, tuple)) else prompt
        image = [image] if isinstance(image, PIL.Image.Image) else image

        # Broadcast prompts and images to match the number of generated embeddings.
        if len(prompt) < image_embeds.shape[0] and image_embeds.shape[0] % len(prompt) == 0:
            prompt = image_embeds.shape[0] // len(prompt) * prompt

        if (
            isinstance(image, (list, tuple))
            and len(image) < image_embeds.shape[0]
            and image_embeds.shape[0] % len(image) == 0
        ):
            image = image_embeds.shape[0] // len(image) * image

        outputs = self.decoder_pipe(
            prompt=prompt,
            image=image,
            image_embeds=image_embeds,
            negative_image_embeds=negative_image_embeds,
            strength=strength,
            width=width,
            height=height,
            num_inference_steps=num_inference_steps,
            generator=generator,
            guidance_scale=guidance_scale,
            output_type=output_type,
            callback=callback,
            callback_steps=callback_steps,
            return_dict=return_dict,
        )
        self.maybe_free_model_hooks()

        return outputs


class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
    """
    Combined Pipeline for inpainting generation using Kandinsky

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        text_encoder ([`MultilingualCLIP`]):
            Frozen text-encoder.
        tokenizer ([`XLMRobertaTokenizer`]):
            Tokenizer of class [`XLMRobertaTokenizer`].
        scheduler (Union[`DDIMScheduler`, `DDPMScheduler`]):
            A scheduler to be used in combination with `unet` to generate image latents.
        unet ([`UNet2DConditionModel`]):
            Conditional U-Net architecture to denoise the image embedding.
        movq ([`VQModel`]):
            MoVQ Decoder to generate the image from the latents.
        prior_prior ([`PriorTransformer`]):
            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        prior_image_encoder ([`CLIPVisionModelWithProjection`]):
            Frozen image-encoder.
        prior_text_encoder ([`CLIPTextModelWithProjection`]):
            Frozen text-encoder.
        prior_tokenizer (`CLIPTokenizer`):
             Tokenizer of class
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        prior_scheduler ([`UnCLIPScheduler`]):
            A scheduler to be used in combination with `prior` to generate image embedding.
    """

    _load_connected_pipes = True
    model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->prior_prior->text_encoder->unet->movq"
    _exclude_from_cpu_offload = ["prior_prior"]

    def __init__(
        self,
        text_encoder: MultilingualCLIP,
        tokenizer: XLMRobertaTokenizer,
        unet: UNet2DConditionModel,
        scheduler: Union[DDIMScheduler, DDPMScheduler],
        movq: VQModel,
        prior_prior: PriorTransformer,
        prior_image_encoder: CLIPVisionModelWithProjection,
        prior_text_encoder: CLIPTextModelWithProjection,
        prior_tokenizer: CLIPTokenizer,
        prior_scheduler: UnCLIPScheduler,
        prior_image_processor: CLIPImageProcessor,
    ):
        super().__init__()

        self.register_modules(
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            scheduler=scheduler,
            movq=movq,
            prior_prior=prior_prior,
            prior_image_encoder=prior_image_encoder,
            prior_text_encoder=prior_text_encoder,
            prior_tokenizer=prior_tokenizer,
            prior_scheduler=prior_scheduler,
            prior_image_processor=prior_image_processor,
        )
        self.prior_pipe = KandinskyPriorPipeline(
            prior=prior_prior,
            image_encoder=prior_image_encoder,
            text_encoder=prior_text_encoder,
            tokenizer=prior_tokenizer,
            scheduler=prior_scheduler,
            image_processor=prior_image_processor,
        )
        self.decoder_pipe = KandinskyInpaintPipeline(
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            scheduler=scheduler,
            movq=movq,
        )

    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)

    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
        r"""
        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
        `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
        Note that offloading happens on a submodule basis. Memory savings are higher than with
        `enable_model_cpu_offload`, but performance is lower.
        """
        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)

    def progress_bar(self, iterable=None, total=None):
        self.prior_pipe.progress_bar(iterable=iterable, total=total)
        self.decoder_pipe.progress_bar(iterable=iterable, total=total)
        self.decoder_pipe.enable_model_cpu_offload()

    def set_progress_bar_config(self, **kwargs):
        self.prior_pipe.set_progress_bar_config(**kwargs)
        self.decoder_pipe.set_progress_bar_config(**kwargs)

    @torch.no_grad()
    @replace_example_docstring(INPAINT_EXAMPLE_DOC_STRING)
    def __call__(
        self,
        prompt: Union[str, List[str]],
        image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
        mask_image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_inference_steps: int = 100,
        guidance_scale: float = 4.0,
        num_images_per_prompt: int = 1,
        height: int = 512,
        width: int = 512,
        prior_guidance_scale: float = 4.0,
        prior_num_inference_steps: int = 25,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        output_type: Optional[str] = "pil",
        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
        callback_steps: int = 1,
        return_dict: bool = True,
    ):
        """
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
                again.
            mask_image (`np.array`):
                Tensor representing an image batch, to mask `image`. White pixels in the mask will be repainted, while
                black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single
                channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3,
                so the expected shape would be `(B, H, W, 1)`.
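                For example, `mask = np.zeros((768, 768), dtype=np.float32)` followed by `mask[:250, 250:-250] = 1`
                repaints only the marked strip at the top of the image (values are illustrative).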
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            height (`int`, *optional*, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width in pixels of the generated image.
            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. A higher guidance scale encourages the model to generate images closely linked
                to the text `prompt`, usually at the expense of lower image quality.
            prior_num_inference_steps (`int`, *optional*, defaults to 25):
                The number of denoising steps in the prior. More denoising steps usually lead to a higher quality
                image at the expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. A higher guidance scale encourages the model to generate images closely linked
                to the text `prompt`, usually at the expense of lower image quality.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            callback (`Callable`, *optional*):
                A function called every `callback_steps` steps during inference, with the
                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`
        """
        prior_outputs = self.prior_pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            num_images_per_prompt=num_images_per_prompt,
            num_inference_steps=prior_num_inference_steps,
            generator=generator,
            latents=latents,
            guidance_scale=prior_guidance_scale,
            output_type="pt",
            return_dict=False,
        )
        image_embeds = prior_outputs[0]
        negative_image_embeds = prior_outputs[1]

        prompt = [prompt] if not isinstance(prompt, (list, tuple)) else prompt
        image = [image] if isinstance(image, PIL.Image.Image) else image
        mask_image = [mask_image] if isinstance(mask_image, PIL.Image.Image) else mask_image

        # Broadcast prompts, images, and masks to match the number of generated embeddings.
        if len(prompt) < image_embeds.shape[0] and image_embeds.shape[0] % len(prompt) == 0:
            prompt = image_embeds.shape[0] // len(prompt) * prompt

        if (
            isinstance(image, (list, tuple))
            and len(image) < image_embeds.shape[0]
            and image_embeds.shape[0] % len(image) == 0
        ):
            image = image_embeds.shape[0] // len(image) * image

        if (
            isinstance(mask_image, (list, tuple))
            and len(mask_image) < image_embeds.shape[0]
            and image_embeds.shape[0] % len(mask_image) == 0
        ):
            mask_image = image_embeds.shape[0] // len(mask_image) * mask_image

        outputs = self.decoder_pipe(
            prompt=prompt,
            image=image,
            mask_image=mask_image,
            image_embeds=image_embeds,
            negative_image_embeds=negative_image_embeds,
            width=width,
            height=height,
            num_inference_steps=num_inference_steps,
            generator=generator,
            guidance_scale=guidance_scale,
            output_type=output_type,
            callback=callback,
            callback_steps=callback_steps,
            return_dict=return_dict,
        )
        self.maybe_free_model_hooks()

        return outputs