from typing import Callable, List, Optional, Union

import torch
from transformers import XLMRobertaTokenizer

from ...models import UNet2DConditionModel, VQModel
from ...schedulers import DDIMScheduler, DDPMScheduler
from ...utils import is_torch_xla_available, logging, replace_example_docstring
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
from .text_encoder import MultilingualCLIP


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> from diffusers import KandinskyPipeline, KandinskyPriorPipeline
        >>> import torch

        >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior")
        >>> pipe_prior.to("cuda")

        >>> prompt = "red cat, 4k photo"
        >>> out = pipe_prior(prompt)
        >>> image_emb = out.image_embeds
        >>> negative_image_emb = out.negative_image_embeds

        >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1")
        >>> pipe.to("cuda")

        >>> image = pipe(
        ...     prompt,
        ...     image_embeds=image_emb,
        ...     negative_image_embeds=negative_image_emb,
        ...     height=768,
        ...     width=768,
        ...     num_inference_steps=100,
        ... ).images

        >>> image[0].save("cat.png")
        ```
"""


def get_new_h_w(h, w, scale_factor=8):
    new_h = h // scale_factor**2
    if h % scale_factor**2 != 0:
        new_h += 1
    new_w = w // scale_factor**2
    if w % scale_factor**2 != 0:
        new_w += 1
    return new_h * scale_factor, new_w * scale_factor
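

# Worked example of the rounding above (editor's note, for clarity): with the
# default scale_factor=8 the latent side is h // 64 rounded up, times 8, so
# get_new_h_w(768, 768) == (96, 96) -- a 96x96 latent that MoVQ decodes back to
# 768x768 -- while get_new_h_w(500, 500) == (64, 64), i.e. a 512x512 output.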
<!555    c            !           e Zd ZdZdZdedededee	e
f   def
 fdZd	 Z	 ddZ ej                           ee      	 	 	 	 	 	 	 	 	 	 	 	 ddeeee   f   deej*                  eej*                     f   deej*                  eej*                     f   deeeee   f      dedededededeeej2                  eej2                     f      deej*                     dee   deeeeej*                  gd
f      dedefd              Z xZS )KandinskyPipelinea1  
    Pipeline for text-to-image generation using Kandinsky.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        text_encoder ([`MultilingualCLIP`]):
            Frozen text-encoder.
        tokenizer ([`XLMRobertaTokenizer`]):
            Tokenizer of class [`XLMRobertaTokenizer`].
        scheduler (Union[`DDIMScheduler`, `DDPMScheduler`]):
            A scheduler to be used in combination with `unet` to generate image latents.
        unet ([`UNet2DConditionModel`]):
            Conditional U-Net architecture to denoise the image embedding.
        movq ([`VQModel`]):
            MoVQ Decoder to generate the image from the latents.
    ztext_encoder->unet->movqtext_encoder	tokenizerunet	schedulermovqc                     t         |           | j                  |||||       dt        | j                  j
                  j                        dz
  z  | _        y )N)r"   r#   r$   r%   r&   r   r   )super__init__register_moduleslenr&   configblock_out_channelsmovq_scale_factor)selfr"   r#   r$   r%   r&   	__class__s         r   r)   zKandinskyPipeline.__init__k   s\     	% 	 	
 "#s499+;+;+N+N'ORS'S!Tr   c                     |t        ||||      }n;|j                  |k7  rt        d|j                   d|       |j                  |      }||j                  z  }|S )N)	generatordevicedtypezUnexpected latents shape, got z, expected )r   shape
ValueErrortoinit_noise_sigma)r/   r5   r4   r3   r2   latentsr%   s          r   prepare_latentsz!KandinskyPipeline.prepare_latents   sg    ?"5IfTYZG}}% #A'--P[\a[b!cddjj(GI666r   Nc                 h   t        |t              rt        |      nd}| j                  |dddddd      }|j                  }| j                  |dd      j                  }	|	j
                  d	   |j
                  d	   k\  rt        j                  ||	      sj| j                  j                  |	d d | j                  j                  dz
  d	f         }
t        j                  d
| j                  j                   d|
        |j                  |      }|j                  j                  |      }| j                  ||      \  }}|j                  |d      }|j                  |d      }|j                  |d      }|r|dg|z  }nt!        |      t!        |      ur$t#        dt!        |       dt!        |       d      t        |t$              r|g}n1|t        |      k7  r!t'        d| dt        |       d| d| d	      |}| j                  |dddddd      }|j                  j                  |      }|j                  j                  |      }| j                  ||      \  }}|j
                  d   }|j)                  d|      }|j+                  ||z  |      }|j
                  d   }|j)                  d|d      }|j+                  ||z  |d	      }|j                  |d      }t        j,                  ||g      }t        j,                  ||g      }t        j,                  ||g      }|||fS )Nr   
max_lengthTM   pt)padding
truncationr<   return_attention_maskadd_special_tokensreturn_tensorslongest)r?   rC   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: )	input_idsattention_maskr   dim z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)r?   r<   r@   rA   rB   rC   )
isinstancelistr+   r#   rF   r5   torchequalbatch_decodemodel_max_lengthloggerwarningr7   rG   r"   repeat_interleavetype	TypeErrorstrr6   repeatviewcat)r/   promptr3   num_images_per_promptdo_classifier_free_guidancenegative_prompt
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_text	text_maskprompt_embedstext_encoder_hidden_statesuncond_tokensuncond_inputuncond_text_input_idsuncond_text_masknegative_prompt_embeds!uncond_text_encoder_hidden_statesseq_lens                        r   _encode_promptz KandinskyPipeline._encode_prompt   s    %/vt$<S[!
nn "&# % 
 %....SW.Xbb  $(<(<R(@@UcetIu>>66q$..JiJilmJmprJrGr7stLNNNN334Il^M
 (**62..11&9	484E4E$Y 5F 5
11 &778MST7U%?%Q%QRgmn%Q%o"//0E1/M	&&!#z 1fT/%::UVZ[jVkUl mV~Q(  OS1!0 1s?33 )/)::J3K_J` ax/
| <33  !0>>$&*#'# * L %1$:$:$=$=f$E!+::==fEHLHYHY/@P IZ IE"$E -2215G%;%B%B1F[%\"%;%@%@NcAcel%m"7==a@G0Q0X0XYZ\qst0u-0Q0V0V22GR1-  0AABW]^A_ "II'=}&MNM).4UWq3r)s&		#3Y"?@I8)CCr   r[   image_embedsnegative_image_embedsr^   heightwidthnum_inference_stepsguidance_scaler\   r2   r9   output_typecallbackcallback_stepsreturn_dictc                 P   t        |t              rd}n3t        |t              rt        |      }nt	        dt        |             | j                  }||	z  }|dkD  }| j                  |||	||      \  }}}t        |t              rt        j                  |d      }t        |t              rt        j                  |d      }|rZ|j                  |	d      }|j                  |	d      }t        j                  ||gd      j                  |j                  |      }| j                  j                  ||       | j                  j                  }| j                   j"                  j$                  }t'        ||| j(                        \  }}| j+                  ||||f|j                  ||
|| j                        }t-        | j/                  |            D ]  \  }}|rt        j                  |gdz        n|}||d	}| j!                  ||||d
      d   }|ro|j1                  |j2                  d   d      \  }}|j5                  d      \  }}|j5                  d      \  }} ||||z
  z  z   }t        j                  || gd      }t7        | j                  j"                  d      r"| j                  j"                  j8                  dv s#|j1                  |j2                  d   d      \  }}| j                  j;                  ||||
      j<                  }|,||z  dk(  r$|t?        | j                  dd      z  }! ||!||       t@        swtC        jD                           | jF                  jI                  |d      d   }"| jK                          |dvrt	        d|       |dv rX|"dz  dz   }"|"jM                  dd      }"|"jO                         jQ                  dddd      jS                         jU                         }"|dk(  r| jW                  |"      }"|s|"fS tY        |"      S )a  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                The CLIP image embeddings for the text prompt, which will be used to condition the image generation.
            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                The CLIP image embeddings for the negative text prompt, which will be used to condition the image
                generation.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            height (`int`, *optional*, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. A higher guidance scale encourages the model to generate images that are
                closely linked to the text `prompt`, usually at the expense of lower image quality.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`
        """
        if isinstance(prompt, str):
            batch_size = 1
        elif isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        device = self._execution_device

        batch_size = batch_size * num_images_per_prompt
        do_classifier_free_guidance = guidance_scale > 1.0

        prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt(
            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
        )

        if isinstance(image_embeds, list):
            image_embeds = torch.cat(image_embeds, dim=0)
        if isinstance(negative_image_embeds, list):
            negative_image_embeds = torch.cat(negative_image_embeds, dim=0)

        if do_classifier_free_guidance:
            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
            negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0)

            image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(
                dtype=prompt_embeds.dtype, device=device
            )

        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps_tensor = self.scheduler.timesteps

        num_channels_latents = self.unet.config.in_channels

        height, width = get_new_h_w(height, width, self.movq_scale_factor)

        # create initial latent
        latents = self.prepare_latents(
            (batch_size, num_channels_latents, height, width),
            text_encoder_hidden_states.dtype,
            device,
            generator,
            latents,
            self.scheduler,
        )

        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
            # expand the latents if we are doing classifier-free guidance
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents

            added_cond_kwargs = {"text_embeds": prompt_embeds, "image_embeds": image_embeds}
            noise_pred = self.unet(
                sample=latent_model_input,
                timestep=t,
                encoder_hidden_states=text_encoder_hidden_states,
                added_cond_kwargs=added_cond_kwargs,
                return_dict=False,
            )[0]

            if do_classifier_free_guidance:
                # the UNet predicts noise and variance in one tensor; split before applying guidance
                noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1)
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                _, variance_pred_text = variance_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
                noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1)

            if not (
                hasattr(self.scheduler.config, "variance_type")
                and self.scheduler.config.variance_type in ["learned", "learned_range"]
            ):
                noise_pred, _ = noise_pred.split(latents.shape[1], dim=1)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(
                noise_pred,
                t,
                latents,
                generator=generator,
            ).prev_sample

            if callback is not None and i % callback_steps == 0:
                step_idx = i // getattr(self.scheduler, "order", 1)
                callback(step_idx, t, latents)

            if XLA_AVAILABLE:
                xm.mark_step()

        # post-processing: decode the latents with MoVQ
        image = self.movq.decode(latents, force_not_quantize=True)["sample"]

        self.maybe_free_model_hooks()

        if output_type not in ["pt", "np", "pil"]:
            raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported, not output_type={output_type}")

        if output_type in ["np", "pil"]:
            # map from [-1, 1] to [0, 1] and move channels last
            image = image * 0.5 + 0.5
            image = image.clamp(0, 1)
            image = image.cpu().permute(0, 2, 3, 1).float().numpy()

        if output_type == "pil":
            image = self.numpy_to_pil(image)

        if not return_dict:
            return (image,)

        return ImagePipelineOutput(images=image)
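

# A minimal sketch (editor's note, for exposition only; not part of the pipeline
# API) of the classifier-free guidance arithmetic used inside `__call__` above:
#
#     import torch
#
#     guidance_scale = 4.0
#     noise_pred_uncond = torch.zeros(1, 4, 96, 96)
#     noise_pred_text = torch.ones(1, 4, 96, 96)
#     noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
#     # every element equals 4.0: the text-conditioned prediction is pushed away
#     # from the unconditional one by a factor of `guidance_scale`.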