from typing import Callable, List, Optional, Union

import torch
from transformers import XLMRobertaTokenizer

from ...models import UNet2DConditionModel, VQModel
from ...schedulers import DDIMScheduler, DDPMScheduler
from ...utils import is_torch_xla_available, logging, replace_example_docstring
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
from .text_encoder import MultilingualCLIP


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> from diffusers import KandinskyPipeline, KandinskyPriorPipeline
        >>> import torch

        >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior")
        >>> pipe_prior.to("cuda")

        >>> prompt = "red cat, 4k photo"
        >>> out = pipe_prior(prompt)
        >>> image_emb = out.image_embeds
        >>> negative_image_emb = out.negative_image_embeds

        >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1")
        >>> pipe.to("cuda")

        >>> image = pipe(
        ...     prompt,
        ...     image_embeds=image_emb,
        ...     negative_image_embeds=negative_image_emb,
        ...     height=768,
        ...     width=768,
        ...     num_inference_steps=100,
        ... ).images

        >>> image[0].save("cat.png")
        ```
"""


def get_new_h_w(h, w, scale_factor=8):
    new_h = h // scale_factor**2
    if h % scale_factor**2 != 0:
        new_h += 1
    new_w = w // scale_factor**2
    if w % scale_factor**2 != 0:
        new_w += 1
    return new_h * scale_factor, new_w * scale_factor
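

# Worked example of the rounding above (editor's note, for clarity): with the
# default scale_factor=8 the latent side is h // 64 rounded up, times 8, so
# get_new_h_w(768, 768) == (96, 96) -- a 96x96 latent that MoVQ decodes back to
# 768x768 -- while get_new_h_w(500, 500) == (64, 64), i.e. a 512x512 output.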
<!555    c            !           e Zd ZdZdZdedededee	e
f   def
 fdZd	 Z	 ddZ ej                           ee      	 	 	 	 	 	 	 	 	 	 	 	 ddeeee   f   deej*                  eej*                     f   deej*                  eej*                     f   deeeee   f      dedededededeeej2                  eej2                     f      deej*                     dee   deeeeej*                  gd
f      dedefd              Z xZS )KandinskyPipelinea1  
    Pipeline for text-to-image generation using Kandinsky.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        text_encoder ([`MultilingualCLIP`]):
            Frozen text-encoder.
        tokenizer ([`XLMRobertaTokenizer`]):
            Tokenizer of class [`XLMRobertaTokenizer`].
        scheduler (Union[`DDIMScheduler`, `DDPMScheduler`]):
            A scheduler to be used in combination with `unet` to generate image latents.
        unet ([`UNet2DConditionModel`]):
            Conditional U-Net architecture to denoise the image embedding.
        movq ([`VQModel`]):
            MoVQ Decoder to generate the image from the latents.
    ztext_encoder->unet->movqtext_encoder	tokenizerunet	schedulermovqc                     t         |           | j                  |||||       dt        | j                  j
                  j                        dz
  z  | _        y )N)r"   r#   r$   r%   r&   r   r   )super__init__register_moduleslenr&   configblock_out_channelsmovq_scale_factor)selfr"   r#   r$   r%   r&   	__class__s         r   r)   zKandinskyPipeline.__init__k   s\     	% 	 	
 "#s499+;+;+N+N'ORS'S!Tr   c                     |t        ||||      }n;|j                  |k7  rt        d|j                   d|       |j                  |      }||j                  z  }|S )N)	generatordevicedtypezUnexpected latents shape, got z, expected )r   shape
ValueErrortoinit_noise_sigma)r/   r5   r4   r3   r2   latentsr%   s          r   prepare_latentsz!KandinskyPipeline.prepare_latents   sg    ?"5IfTYZG}}% #A'--P[\a[b!cddjj(GI666r   Nc                 h   t        |t              rt        |      nd}| j                  |dddddd      }|j                  }| j                  |dd      j                  }	|	j
                  d	   |j
                  d	   k\  rt        j                  ||	      sj| j                  j                  |	d d | j                  j                  dz
  d	f         }
t        j                  d
| j                  j                   d|
        |j                  |      }|j                  j                  |      }| j                  ||      \  }}|j                  |d      }|j                  |d      }|j                  |d      }|r|dg|z  }nt!        |      t!        |      ur$t#        dt!        |       dt!        |       d      t        |t$              r|g}n1|t        |      k7  r!t'        d| dt        |       d| d| d	      |}| j                  |dddddd      }|j                  j                  |      }|j                  j                  |      }| j                  ||      \  }}|j
                  d   }|j)                  d|      }|j+                  ||z  |      }|j
                  d   }|j)                  d|d      }|j+                  ||z  |d	      }|j                  |d      }t        j,                  ||g      }t        j,                  ||g      }t        j,                  ||g      }|||fS )Nr   
max_lengthTM   pt)padding
truncationr<   return_attention_maskadd_special_tokensreturn_tensorslongest)r?   rC   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: )	input_idsattention_maskr   dim z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)r?   r<   r@   rA   rB   rC   )
isinstancelistr+   r#   rF   r5   torchequalbatch_decodemodel_max_lengthloggerwarningr7   rG   r"   repeat_interleavetype	TypeErrorstrr6   repeatviewcat)r/   promptr3   num_images_per_promptdo_classifier_free_guidancenegative_prompt
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_text	text_maskprompt_embedstext_encoder_hidden_statesuncond_tokensuncond_inputuncond_text_input_idsuncond_text_masknegative_prompt_embeds!uncond_text_encoder_hidden_statesseq_lens                        r   _encode_promptz KandinskyPipeline._encode_prompt   s    %/vt$<S[!
nn "&# % 
 %....SW.Xbb  $(<(<R(@@UcetIu>>66q$..JiJilmJmprJrGr7stLNNNN334Il^M
 (**62..11&9	484E4E$Y 5F 5
11 &778MST7U%?%Q%QRgmn%Q%o"//0E1/M	&&!#z 1fT/%::UVZ[jVkUl mV~Q(  OS1!0 1s?33 )/)::J3K_J` ax/
| <33  !0>>$&*#'# * L %1$:$:$=$=f$E!+::==fEHLHYHY/@P IZ IE"$E -2215G%;%B%B1F[%\"%;%@%@NcAcel%m"7==a@G0Q0X0XYZ\qst0u-0Q0V0V22GR1-  0AABW]^A_ "II'=}&MNM).4UWq3r)s&		#3Y"?@I8)CCr   r[   image_embedsnegative_image_embedsr^   heightwidthnum_inference_stepsguidance_scaler\   r2   r9   output_typecallbackcallback_stepsreturn_dictc                 P   t        |t              rd}n3t        |t              rt        |      }nt	        dt        |             | j                  }||	z  }|dkD  }| j                  |||	||      \  }}}t        |t              rt        j                  |d      }t        |t              rt        j                  |d      }|rZ|j                  |	d      }|j                  |	d      }t        j                  ||gd      j                  |j                  |      }| j                  j                  ||       | j                  j                  }| j                   j"                  j$                  }t'        ||| j(                        \  }}| j+                  ||||f|j                  ||
|| j                        }t-        | j/                  |            D ]  \  }}|rt        j                  |gdz        n|}||d	}| j!                  ||||d
      d   }|ro|j1                  |j2                  d   d      \  }}|j5                  d      \  }}|j5                  d      \  }} ||||z
  z  z   }t        j                  || gd      }t7        | j                  j"                  d      r"| j                  j"                  j8                  dv s#|j1                  |j2                  d   d      \  }}| j                  j;                  ||||
      j<                  }|,||z  dk(  r$|t?        | j                  dd      z  }! ||!||       t@        swtC        jD                           | jF                  jI                  |d      d   }"| jK                          |dvrt	        d|       |dv rX|"dz  dz   }"|"jM                  dd      }"|"jO                         jQ                  dddd      jS                         jU                         }"|dk(  r| jW                  |"      }"|s|"fS tY        |"      S )a  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                The CLIP image embeddings for the text prompt, which will be used to condition the image generation.
            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                The CLIP image embeddings for the negative text prompt, which will be used to condition the image
                generation.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            height (`int`, *optional*, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. A higher guidance scale encourages the model to generate images that are
                closely linked to the text `prompt`, usually at the expense of lower image quality.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`
        """
        if isinstance(prompt, str):
            batch_size = 1
        elif isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        device = self._execution_device

        batch_size = batch_size * num_images_per_prompt
        do_classifier_free_guidance = guidance_scale > 1.0

        prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt(
            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
        )

        if isinstance(image_embeds, list):
            image_embeds = torch.cat(image_embeds, dim=0)
        if isinstance(negative_image_embeds, list):
            negative_image_embeds = torch.cat(negative_image_embeds, dim=0)

        if do_classifier_free_guidance:
            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
            negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0)

            image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(
                dtype=prompt_embeds.dtype, device=device
            )

        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps_tensor = self.scheduler.timesteps

        num_channels_latents = self.unet.config.in_channels

        height, width = get_new_h_w(height, width, self.movq_scale_factor)

        # create initial latent
        latents = self.prepare_latents(
            (batch_size, num_channels_latents, height, width),
            text_encoder_hidden_states.dtype,
            device,
            generator,
            latents,
            self.scheduler,
        )

        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
            # expand the latents if we are doing classifier-free guidance
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents

            added_cond_kwargs = {"text_embeds": prompt_embeds, "image_embeds": image_embeds}
            noise_pred = self.unet(
                sample=latent_model_input,
                timestep=t,
                encoder_hidden_states=text_encoder_hidden_states,
                added_cond_kwargs=added_cond_kwargs,
                return_dict=False,
            )[0]

            if do_classifier_free_guidance:
                # the UNet predicts noise and variance in one tensor; split before applying guidance
                noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1)
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                _, variance_pred_text = variance_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
                noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1)

            if not (
                hasattr(self.scheduler.config, "variance_type")
                and self.scheduler.config.variance_type in ["learned", "learned_range"]
            ):
                noise_pred, _ = noise_pred.split(latents.shape[1], dim=1)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(
                noise_pred,
                t,
                latents,
                generator=generator,
            ).prev_sample

            if callback is not None and i % callback_steps == 0:
                step_idx = i // getattr(self.scheduler, "order", 1)
                callback(step_idx, t, latents)

            if XLA_AVAILABLE:
                xm.mark_step()

        # post-processing: decode the latents with MoVQ
        image = self.movq.decode(latents, force_not_quantize=True)["sample"]

        self.maybe_free_model_hooks()

        if output_type not in ["pt", "np", "pil"]:
            raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported, not output_type={output_type}")

        if output_type in ["np", "pil"]:
            # map from [-1, 1] to [0, 1] and move channels last
            image = image * 0.5 + 0.5
            image = image.clamp(0, 1)
            image = image.cpu().permute(0, 2, 3, 1).float().numpy()

        if output_type == "pil":
            image = self.numpy_to_pil(image)

        if not return_dict:
            return (image,)

        return ImagePipelineOutput(images=image)
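

# A minimal sketch (editor's note, for exposition only; not part of the pipeline
# API) of the classifier-free guidance arithmetic used inside `__call__` above:
#
#     import torch
#
#     guidance_scale = 4.0
#     noise_pred_uncond = torch.zeros(1, 4, 96, 96)
#     noise_pred_text = torch.ones(1, 4, 96, 96)
#     noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
#     # every element equals 4.0: the text-conditioned prediction is pushed away
#     # from the unconditional one by a factor of `guidance_scale`.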