
    bil                        d dl mZmZmZmZmZ d dlZd dlmZm	Z	 ddl
mZ ddlmZmZ ddlmZ ddlmZmZmZmZ dd	lmZ d
dlmZmZ  e       rd dlmc mZ dZndZ ej@                  e!      Z"dZ#ddZ$ G d dee      Z%y)    )CallableDictListOptionalUnionN)T5EncoderModelT5Tokenizer   )StableDiffusionLoraLoaderMixin)Kandinsky3UNetVQModel)DDPMScheduler)	deprecateis_torch_xla_availableloggingreplace_example_docstring)randn_tensor   )DiffusionPipelineImagePipelineOutputTFa  
    Examples:
        ```py
        >>> from diffusers import AutoPipelineForText2Image
        >>> import torch

        >>> pipe = AutoPipelineForText2Image.from_pretrained(
        ...     "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16
        ... )
        >>> pipe.enable_model_cpu_offload()

        >>> prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background."

        >>> generator = torch.Generator(device="cpu").manual_seed(0)
        >>> image = pipe(prompt, num_inference_steps=25, generator=generator).images[0]
        ```

c                 v    | |dz  z  }| |dz  z  dk7  r|dz  }||dz  z  }||dz  z  dk7  r|dz  }||z  ||z  fS )Nr   r       )heightwidthscale_factor
new_height	new_widths        m/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.pydownscale_height_and_widthr    1   sg    <?*Ja1$a
q(I|Q!#Q	$i,&>>>    c            %           e Zd ZdZg dZdedededede	f
 fdZ
d	 Z ej                         	 	 	 	 	 	 	 	 	 d)deej                     deej                     deej                     deej                     fd       Zd Z	 	 	 	 	 	 d*dZed        Zed        Zed        Z ej                          ee      dddddddddddddd
dddgfdeeee   f   dededeeeee   f      d ee   d!ee   d"ee   d#eeej<                  eej<                     f      deej                     deej                     deej                     deej                     d$ee   d%ed&ee eee!gdf      d'ee   f d(              Z" xZ#S )+Kandinsky3Pipelineztext_encoder->unet->movq)latentsprompt_embedsnegative_prompt_embedsnegative_attention_maskattention_mask	tokenizertext_encoderunet	schedulermovqc                 N    t         |           | j                  |||||       y )N)r)   r*   r+   r,   r-   )super__init__register_modules)selfr)   r*   r+   r,   r-   	__class__s         r   r0   zKandinsky3Pipeline.__init__E   s0     	lQZae 	 	
r!   c                     |rYt        j                  ||dk(           ||dk(  <   |j                  d      j                         dz   }|d d d |f   }|d d d |f   }||fS )Nr   r   )torch
zeros_likesummax)r2   
embeddingsr(   cut_contextmax_seq_lengths        r   process_embedsz!Kandinsky3Pipeline.process_embedsS   sy    .3.>.>z.\]J]?^._J~*++//3779A=N#A$67J+A,>?N>))r!   Tr   Nr%   r&   r(   r'   c                    |<|:t        |      t        |      ur$t        dt        |       dt        |       d      || j                  }|t        |t              rd}n-|t        |t
              rt        |      }n|j                  d   }d}|| j                  |d|d	d
      }|j                  j                  |      }|j                  j                  |      }	| j                  ||	      }|d   }| j                  ||	|      \  }}	||	j                  d      z  }| j                  | j                  j                  }nd}|j                  ||      }|j                  \  }}}|j!                  d|d      }|j#                  ||z  |d      }|	j!                  |d      }	|r'|$|dg|z  }nEt        |t              r|g}n1|t        |      k7  r!t%        d| dt        |       d| d| d	      |}|| j                  |ddd	d	d
      }|j                  j                  |      }|j                  j                  |      }
| j                  ||
      }|d   }|ddd|j                  d   f   }|
ddd|j                  d   f   }
||
j                  d      z  }n*t'        j(                  |      }t'        j(                  |	      }
|rw|j                  d   }|j                  ||      }|j                  |j                  k7  r@|j!                  d|d      }|j#                  ||z  |d      }|
j!                  |d      }
nd}d}
|||	|
fS )aX  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            device: (`torch.device`, *optional*):
                torch device to place the resulting embeddings on
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            attention_mask (`torch.Tensor`, *optional*):
                Pre-generated attention mask. Must provide if passing `prompt_embeds` directly.
            negative_attention_mask (`torch.Tensor`, *optional*):
                Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly.
        Nz?`negative_prompt` should be the same type to `prompt`, but got z != .r   r      
max_lengthTpt)paddingrA   
truncationreturn_tensors)r(   r   )dtypedevicer5    z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)rC   rA   rD   return_attention_maskrE   )type	TypeError_execution_device
isinstancestrlistlenshaper)   	input_idstor(   r*   r=   	unsqueezerF   repeatview
ValueErrorr6   r7   )r2   promptdo_classifier_free_guidancenum_images_per_promptrG   negative_promptr%   r&   _cut_contextr(   r'   
batch_sizerA   text_inputstext_input_idsrF   bs_embedseq_len_uncond_tokensuncond_inputs                        r   encode_promptz Kandinsky3Pipeline.encode_prompt[   s	   T /"=F|4#88UVZ[jVkUl mV~Q( 
 >++F*VS"9JJvt$<VJ&,,Q/J
 ..$%# ) K )2255f=N(77::6BN --- . M *!,M,0,?,?~_k,l)M>)N,D,DQ,GGM(%%++EE%((uV(D,22'1%,,Q0EqI%**86K+KWVXY'../DaH&+A+I &!#z 1OS1!0 1s?33 )/)::J3K_J` ax/
| <33  !0*#~~!("#*.#'  .   ".!7!7!:!:6!B*6*E*E*H*H*P')-):):"#: *; *& *@)B&)?C[]EXEXYZE[C[@[)\&*A!E]}GZGZ[\G]E]B]*^')?BYBcBcdeBf)f& */)9)9-)H&*/*:*:>*J'&,2215G%;%>%>USY%>%Z"%++}/B/BB)?)F)FqJ_ab)c&)?)D)DZRgEgiprt)u&*A*H*HI^`a*b' &*"&*#4nF]]]r!   c                     |t        ||||      }n;|j                  |k7  rt        d|j                   d|       |j                  |      }||j                  z  }|S )N)	generatorrG   rF   zUnexpected latents shape, got z, expected )r   rQ   rW   rS   init_noise_sigma)r2   rQ   rF   rG   rg   r$   r,   s          r   prepare_latentsz"Kandinsky3Pipeline.prepare_latents   sg    ?"5IfTYZG}}% #A'--P[\a[b!cddjj(GI666r!   c	           
          |0t        |t              r|dk  rt        d| dt        |       d      |Lt	         fd|D              s8t        d j
                   d|D 	cg c]  }	|	 j
                  vs|	 c}	       ||t        d| d	| d
      ||t        d      |7t        |t              s't        |t              st        dt        |             ||t        d| d| d
      |A|?|j                  |j                  k7  r&t        d|j                   d|j                   d      ||t        d      |G|E|j                  d d |j                  k7  r)t        d|j                  d d  d|j                   d      ||t        d      |I|F|j                  d d |j                  k7  r)t        d|j                  d d  d|j                   d      y y y c c}	w )Nr   z5`callback_steps` has to be a positive integer but is z	 of type r?   c              3   :   K   | ]  }|j                   v   y wN_callback_tensor_inputs.0kr2   s     r   	<genexpr>z2Kandinsky3Pipeline.check_inputs.<locals>.<genexpr>  #      F
23A---F
   2`callback_on_step_end_tensor_inputs` has to be in , but found zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` zLPlease provide `negative_attention_mask` along with `negative_prompt_embeds`r   z`negative_prompt_embeds` and `negative_attention_mask` must have the same batch_size and token length when passed directly, but got: `negative_prompt_embeds` z != `negative_attention_mask` z:Please provide `attention_mask` along with `prompt_embeds`z`prompt_embeds` and `attention_mask` must have the same batch_size and token length when passed directly, but got: `prompt_embeds` z != `attention_mask` )	rM   intrW   rJ   allrn   rN   rO   rQ   )
r2   rX   callback_stepsr[   r%   r&   "callback_on_step_end_tensor_inputsr(   r'   rq   s
   `         r   check_inputszKandinsky3Pipeline.check_inputs   s     %z.#/NR`deReGGW X(),  .9# F
7YF
 C
 DTEaEaDbbn  |^  pHvw  bc  ko  kG  kG  bGpq  pH  oI  J  -";08N}o ^0 0  ^ 5w  FC)@TZ\`IaQRVW]R^Q_`aa&+A+M9/9J K*++]_ 
 $)?)K""&<&B&BB --:-@-@,A B.445Q8 
 "-2I2Qkll!-2I2U%++BQ/3J3P3PP 66L6R6RSUTU6V5W X/556a9  $)?YZZ$)C""2A&.*>*>> --:-@-@!-D,E F&,,-Q0  ? *D$U pHs   !G,5G,c                     | j                   S rl   _guidance_scaler2   s    r   guidance_scalez!Kandinsky3Pipeline.guidance_scaleB  s    ###r!   c                      | j                   dkD  S )Nr   r}   r   s    r   rY   z.Kandinsky3Pipeline.do_classifier_free_guidanceF  s    ##a''r!   c                     | j                   S rl   )_num_timestepsr   s    r   num_timestepsz Kandinsky3Pipeline.num_timestepsJ  s    """r!      g      @i   pilr$   rX   num_inference_stepsr   r[   rZ   r   r   rg   output_typereturn_dictcallback_on_step_endrz   c                 
    |j                  dd      }|j                  dd      }|t        ddd       |t        ddd       |Lt         fd|D              s8t        d j                   d	|D cg c]  }| j                  vs| c}       d
} j
                  } j                  ||||	|
|||       | _        |t        |t              rd}n-|t        |t              rt        |      }n|	j                  d   } j                  | j                  ||||	|
|||
      \  }	}
}} j                  r<t        j                   |
|	g      }	t        j                   ||g      j#                         } j$                  j'                  ||        j$                  j(                  }t+        ||d      \  }} j-                  ||z  d||f|	j.                  ||| j$                        }t1         d      r& j2                   j2                  j5                          t        |      | j$                  j6                  z  z
  }t        |       _         j;                  |      5 }t=        |      D ]  \  }} j                  rt        j                   |gdz        n|} j?                  |||	|d      d   } j                  r"|jA                  d      \  } }!|dz   |!z  || z  z
  } j$                  jC                  ||||      jD                  }|~i }"|D ]  }tG               |   |"|<     | |||"      }#|#j                  d|      }|#j                  d|	      }	|#j                  d|
      }
|#j                  d|      }|#j                  d|      }|t        |      dz
  k(  s'|dz   |kD  r]|dz    j$                  j6                  z  dk(  r>|jI                          |,||z  dk(  r$|tK         j$                  dd      z  }$ ||$||       tL        stO        jP                           |dvrt        d|       |d k(  s jR                  jU                  |d
!      d"   }%|d#v rX|%d$z  d$z   }%|%jW                  dd      }%|%jY                         j[                  ddd%d      j]                         j_                         }%|d&k(  r ja                  |%      }%n|}% jc                          |s|%fcddd       S te        |%'      cddd       S c c}w # 1 sw Y   yxY w)(u6  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                instead.
            num_inference_steps (`int`, *optional*, defaults to 25):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            timesteps (`List[int]`, *optional*):
                Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
                timesteps are used. Must be in descending order.
            guidance_scale (`float`, *optional*, defaults to 3.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                the text `prompt`, usually at the expense of lower image quality.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            height (`int`, *optional*, defaults to self.unet.config.sample_size):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to self.unet.config.sample_size):
                The width in pixels of the generated image.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://huggingface.co/papers/2010.02502. Only
                applies to [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            attention_mask (`torch.Tensor`, *optional*):
                Pre-generated attention mask. Must provide if passing `prompt_embeds` directly.
            negative_attention_mask (`torch.Tensor`, *optional*):
                Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
            clean_caption (`bool`, *optional*, defaults to `True`):
                Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
                be installed. If the dependencies are not installed, the embeddings will be created from the raw
                prompt.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`

        callbackNry   z1.0.0zhPassing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`znPassing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`c              3   :   K   | ]  }|j                   v   y wrl   rm   ro   s     r   rr   z.Kandinsky3Pipeline.__call__.<locals>.<genexpr>  rs   rt   ru   rv   Tr   r   )rZ   rG   r[   r%   r&   r\   r(   r'   )rG         text_encoder_offload_hook)totalr   F)encoder_hidden_statesencoder_attention_maskr   g      ?)rg   r$   r%   r&   r(   r'   order)rB   npr   latentzSOnly the output types `pt`, `pil`, `np` and `latent` are supported not output_type=r   )force_not_quantizesample)r   r   g      ?r
   r   )images)3popr   rx   rW   rn   rL   r{   r~   rM   rN   rO   rP   rQ   re   rY   r6   catboolr,   set_timesteps	timestepsr    ri   rF   hasattrr   offloadr   r   progress_bar	enumerater+   chunkstepprev_samplelocalsupdategetattrXLA_AVAILABLExm	mark_stepr-   decodeclampcpupermutefloatnumpynumpy_to_pilmaybe_free_model_hooksr   )&r2   rX   r   r   r[   rZ   r   r   rg   r%   r&   r(   r'   r   r   r$   r   rz   kwargsr   ry   rq   r;   rG   r]   r   num_warmup_stepsr   itlatent_model_input
noise_prednoise_pred_uncondnoise_pred_textcallback_kwargscallback_outputsstep_idximages&   `                                     r   __call__zKandinsky3Pipeline.__call__N  s   @ ::j$/$4d;z
 %  A .9# F
7YF
 C
 DTEaEaDbbn  |^  pHvw  bc  ko  kG  kG  bGpq  pH  oI  J  '' 	".#		
  .*VS"9JJvt$<VJ&,,Q/J Z^YkYk,,"7+'#9$)$; Zl Z
V-~?V ++!II'=}&MNM"YY(?'PQVVXN$$%8$HNN,,	 365!D&&//FEBNN
 445$:X:X:d**224 y>,?$..BVBV,VV!)n%89 H	5\!), -#1AEAaAaUYYy1}%=gn" "YY&*7+9 % '  
 339C9I9I!9L6%"03"6/!IN]nLn!nJ ..--'	 . 
 +  (3&(O? 9-3Xa[*9';D!Q'X$.229gFG$4$8$8-$XM-=-A-ABZ\r-s*%5%9%9:JN%[N.>.B.BC\^u.v+I**A9I/IqSTuX\XfXfXlXlNlpqNq '')+N0Ba0G#$(K#K 1g6 LLN[-#` "?? ijuivw  (*		((T(J8T-/!CK#-E!KK1-E!IIK//1a;AACIIKE%' --e4E'')xMH	5 H	5P 'e4QH	5 H	5E pHDH	5 H	5s&   1S>S> F&TCT)TT)	Tr   NNNNFNN)NNNNNN)$__name__
__module____qualname__model_cpu_offload_seqrn   r	   r   r   r   r   r0   r=   r6   no_gradr   Tensorre   ri   r{   propertyr   rY   r   r   EXAMPLE_DOC_STRINGr   rN   r   rw   r   	Generatorr   r   r   r   __classcell__)r3   s   @r   r#   r#   ;   s   6

 %
 	

 !
 
* U]]_ %)049=15:>S^  -S^ !) 6S^ !.S^ "*%,,!7S^ S^j	 #+/ $DL $ $ ( ( # # U]]_12 )-#% #;?/0 $#MQ049=15:>%* KO9B%|5c49n%|5 !|5 	|5
 "%T#Y"78|5  (}|5 |5 }|5 E%//43H"HIJ|5  -|5 !) 6|5 !.|5 "*%,,!7|5 c]|5 |5" 'xc40@$0F'GH#|5$ -1I%|5 3 |5r!   r#   )r   )&typingr   r   r   r   r   r6   transformersr   r	   loadersr   modelsr   r   
schedulersr   utilsr   r   r   r   utils.torch_utilsr   pipeline_utilsr   r   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr   loggerr   r    r#   r   r!   r   <module>r      su    8 8  4 5 - '  . C ))MM			H	% (?Q5*,J Q5r!   