
from typing import Callable, Dict, List, Optional, Union

import numpy as np
import torch
from transformers import CLIPTextModel, CLIPTokenizer

from ...schedulers import DDPMWuerstchenScheduler
from ...utils import deprecate, is_torch_xla_available, logging, replace_example_docstring
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DeprecatedPipelineMixin, DiffusionPipeline, ImagePipelineOutput
from .modeling_paella_vq_model import PaellaVQModel
from .modeling_wuerstchen_diffnext import WuerstchenDiffNeXt


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import WuerstchenPriorPipeline, WuerstchenDecoderPipeline

        >>> prior_pipe = WuerstchenPriorPipeline.from_pretrained(
        ...     "warp-ai/wuerstchen-prior", torch_dtype=torch.float16
        ... ).to("cuda")
        >>> gen_pipe = WuerstchenDecoderPipeline.from_pretrained("warp-ai/wuerstchen", torch_dtype=torch.float16).to(
        ...     "cuda"
        ... )

        >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet"
        >>> prior_output = prior_pipe(prompt)
        >>> images = gen_pipe(prior_output.image_embeddings, prompt=prompt)
        ```
"""


class WuerstchenDecoderPipeline(DeprecatedPipelineMixin, DiffusionPipeline):
    """
    Pipeline for generating images from the Wuerstchen model.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        tokenizer (`CLIPTokenizer`):
            The CLIP tokenizer.
        text_encoder (`CLIPTextModel`):
            The CLIP text encoder.
        decoder ([`WuerstchenDiffNeXt`]):
            The WuerstchenDiffNeXt UNet decoder.
        vqgan ([`PaellaVQModel`]):
            The VQGAN model.
        scheduler ([`DDPMWuerstchenScheduler`]):
            A scheduler to be used in combination with `decoder` to denoise the image latents.
        latent_dim_scale (float, `optional`, defaults to 10.67):
            Multiplier to determine the VQ latent space size from the image embeddings. If the image embeddings are
            height=24 and width=24, the VQ latent shape needs to be height=int(24*10.67)=256 and
            width=int(24*10.67)=256 in order to match the training conditions.
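            For example, image embeddings of shape `(batch_size, channels, 24, 24)` yield VQ latents of shape
            `(batch_size * num_images_per_prompt, 4, 256, 256)` in `__call__`, since `int(24 * 10.67) = 256` and the
            VQGAN latent space used by this pipeline has 4 channels.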
    ztext_encoder->decoder->vqgan)latentstext_encoder_hidden_statesnegative_prompt_embedsimage_embeddings	tokenizertext_encoderdecoder	schedulervqganlatent_dim_scalereturnNc                 r    t         |           | j                  |||||       | j                  |       y )N)r   r   r    r!   r"   )r#   )super__init__register_modulesregister_to_config)selfr   r   r    r!   r"   r#   	__class__s          m/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.pyr'   z"WuerstchenDecoderPipeline.__init__[   sF     	% 	 	
 	1AB    c                     |t        ||||      }n;|j                  |k7  rt        d|j                   d|       |j                  |      }||j                  z  }|S )N)	generatordevicedtypezUnexpected latents shape, got z, expected )r   shape
ValueErrortoinit_noise_sigma)r*   r2   r1   r0   r/   r   r!   s          r,   prepare_latentsz)WuerstchenDecoderPipeline.prepare_latentso   sg    ?"5IfTYZG}}% #A'--P[\a[b!cddjj(GI666r-   c                    t        |t              rt        |      nd}| j                  |d| j                  j                  dd      }|j
                  }|j                  }	| j                  |dd      j
                  }
|
j                  d   |j                  d   k\  rt        j                  ||
      s| j                  j                  |
d d | j                  j                  dz
  df         }t        j                  d	| j                  j                   d
|        |d d d | j                  j                  f   }|	d d d | j                  j                  f   }	| j                  |j                  |      |	j                  |            }|j                  }|j!                  |d      }d }|r;|dg|z  }nt#        |      t#        |      ur$t%        dt#        |       dt#        |       d      t        |t&              r|g}n1|t        |      k7  r!t)        d| dt        |       d| d| d	      |}| j                  |d| j                  j                  dd      }| j                  |j
                  j                  |      |j                  j                  |            }|j                  }|j                  d   }|j+                  d|d      }|j-                  ||z  |d      }||fS )Nr   
max_lengthTpt)paddingr8   
truncationreturn_tensorslongest)r:   r<   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: )attention_maskr   dim z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)
isinstancelistlenr   model_max_length	input_idsr?   r2   torchequalbatch_decodeloggerwarningr   r4   last_hidden_staterepeat_interleavetype	TypeErrorstrr3   repeatview)r*   promptr0   num_images_per_promptdo_classifier_free_guidancenegative_prompt
batch_sizetext_inputstext_input_idsr?   untruncated_idsremoved_texttext_encoder_outputr   !uncond_text_encoder_hidden_statesuncond_tokensuncond_input*negative_prompt_embeds_text_encoder_outputseq_lens                      r,   encode_promptz'WuerstchenDecoderPipeline.encode_promptz   s-    %/vt$<S[!
nn ~~66 % 
 %..$33..SW.Xbb  $(<(<R(@@UcetIu>>66q$..JiJilmJmprJrGr7stLNNNN334Il^M ,A/P1P1P/P,PQN+A/P1P1P/P,PQN"//0A0A&0IZhZkZklrZs/t%8%J%J"%?%Q%QRgmn%Q%o",0)&&!#z 1fT/%::UVZ[jVkUl mV~Q(  OS1!0 1s?33 )/)::J3K_J` ax/
| <33  !0>>$>>::# * L :>9J9J&&))&1,B]B]B`B`agBh :K :6 1[0l0l- 8==a@G0Q0X0XYZ\qst0u-0Q0V0V22GR1- *+LLLr-   c                     | j                   S N_guidance_scaler*   s    r,   guidance_scalez(WuerstchenDecoderPipeline.guidance_scale   s    ###r-   c                      | j                   dkD  S )Nr   rg   ri   s    r,   rW   z5WuerstchenDecoderPipeline.do_classifier_free_guidance   s    ##a''r-   c                     | j                   S rf   )_num_timestepsri   s    r,   num_timestepsz'WuerstchenDecoderPipeline.num_timesteps   s    """r-      g        r   pilTr   r   rU   num_inference_steps	timestepsrj   rX   rV   r/   output_typereturn_dictcallback_on_step_end"callback_on_step_end_tensor_inputsc           
      <    |j                  dd      }|j                  dd      }|t        ddd       |t        ddd       |Lt         fd|D              s8t        d j                   d	|D cg c]  }| j                  vs| c}        j
                  } j                  j                  }| _        t        |t              s,t        |t              r|g}nt        d
t        |       d       j                  r>|<t        |t              s,t        |t              r|g}nt        dt        |       d      t        |t              rt        j                   |d      }t        |t"        j$                        r't        j&                  ||      j)                  |      }t        |t        j&                        st        dt        |       d      t        |t*              st        dt        |       d       j-                  |||j/                  d      |z   j                  |      \  }}|t        j                   ||g      n|} j                  r*t        j                   |t        j0                  |      g      n|}t+        |j/                  d       j2                  j4                  z        }t+        |j/                  d       j2                  j4                  z        }|j/                  d      |z  d||f}|? j6                  j9                  ||        j6                  j:                  }t=        |      }n3 j6                  j9                  ||        j6                  j:                  } j?                  |||||	 j6                        }	t=        |dd        _         tC         jE                  |dd             D ]  \  }}|jG                  |	j/                  d            j)                  |      } j                   j                  rt        j                   |	gdz        n|	 j                  rt        j                   |gdz        n|||      } j                  r5|jI                  d      \  }} t        jJ                  | | jL                        } j6                  jO                  |||	|      jP                  }	|Zi }!|D ]  }tS               |   |!|<     | |||!      }"|"j                  d|	      }	|"j                  d|      }|"j                  d|      }|,||z  dk(  r$|tU         j6                  dd      z  }# ||#||	       tV        stY        jZ                           |
d vrt        d!|
       |
d"k(  s j\                  j2                  j^                  |	z  }	 j\                  ja                  |	      jb                  je                  dd      }$|
d#k(  r?|$jg                  dddd      ji                         jk                         jm                         }$nW|
d$k(  rR|$jg                  dddd      ji                         jk                         jm                         }$ jo                  |$      }$n|	}$ jq                          |s|$S ts        |$      S c c}w )%aq  
        Function invoked when calling the pipeline for generation.

        Args:
            image_embeddings (`torch.Tensor` or `List[torch.Tensor]`):
                Image embeddings either extracted from an image or generated by a prior model such as
                [`WuerstchenPriorPipeline`].
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            num_inference_steps (`int`, *optional*, defaults to 12):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            timesteps (`List[int]`, *optional*):
                Custom timesteps to use for the denoising process. If not defined, equally spaced `num_inference_steps`
                timesteps are used. Must be in descending order.
            guidance_scale (`float`, *optional*, defaults to 0.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `decoder_guidance_scale` is defined as `w` of
                equation 2 of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by
                setting `decoder_guidance_scale > 1`. A higher guidance scale encourages the model to generate images
                that are closely linked to the text `prompt`, usually at the expense of lower image quality.
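                Concretely, with `w = decoder_guidance_scale`, the conditional and unconditional decoder predictions
                are combined in the denoising loop below as `torch.lerp(uncond, cond, w) = uncond + w * (cond - uncond)`.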
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `decoder_guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
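                For example, `generator=torch.Generator(device="cuda").manual_seed(42)` makes the sampled noise
                reproducible across runs.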
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`), `"pt"` (`torch.Tensor`), or `"latent"` to return the unscaled VQ latents instead of
                decoded images.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
            callback_on_step_end (`Callable`, *optional*):
                A function that is called at the end of each denoising step during inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`. A minimal usage sketch is shown below, after this argument list.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as the `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
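
        The following is a minimal sketch of a compatible `callback_on_step_end` callable; any function with this
        signature works, and it should return the (possibly modified) `callback_kwargs` dict:

        ```py
        def on_step_end(pipe, step, timestep, callback_kwargs):
            print(step, callback_kwargs["latents"].shape)
            return callback_kwargs
        ```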

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple` [`~pipelines.ImagePipelineOutput`] if `return_dict` is True,
            otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images.
        """
        callback = kwargs.pop("callback", None)
        callback_steps = kwargs.pop("callback_steps", None)

        if callback is not None:
            deprecate(
                "callback",
                "1.0.0",
                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
            )
        if callback_steps is not None:
            deprecate(
                "callback_steps",
                "1.0.0",
                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
            )

        if callback_on_step_end_tensor_inputs is not None and not all(
            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
        ):
            raise ValueError(
                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found"
                f" {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
            )

        # 0. Define commonly used variables
        device = self._execution_device
        dtype = self.decoder.dtype
        self._guidance_scale = guidance_scale

        # 1. Check inputs. Raise error if not correct
        if not isinstance(prompt, list):
            if isinstance(prompt, str):
                prompt = [prompt]
            else:
                raise TypeError(f"'prompt' must be of type 'list' or 'str', but got {type(prompt)}.")

        if self.do_classifier_free_guidance:
            if negative_prompt is not None and not isinstance(negative_prompt, list):
                if isinstance(negative_prompt, str):
                    negative_prompt = [negative_prompt]
                else:
                    raise TypeError(
                        f"'negative_prompt' must be of type 'list' or 'str', but got {type(negative_prompt)}."
                    )

        if isinstance(image_embeddings, list):
            image_embeddings = torch.cat(image_embeddings, dim=0)
        if isinstance(image_embeddings, np.ndarray):
            image_embeddings = torch.tensor(image_embeddings, device=device, dtype=dtype)
        if not isinstance(image_embeddings, torch.Tensor):
            raise TypeError(
                f"'image_embeddings' must be of type 'torch.Tensor' or 'np.array', but got {type(image_embeddings)}."
            )

        if not isinstance(num_inference_steps, int):
            raise TypeError(
                f"'num_inference_steps' must be of type 'int', but got {type(num_inference_steps)}."
                " In case you want to provide explicit timesteps, please use the 'timesteps' argument."
            )

        # 2. Encode caption
        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
            prompt,
            device,
            image_embeddings.size(0) * num_images_per_prompt,
            self.do_classifier_free_guidance,
            negative_prompt,
        )
        text_encoder_hidden_states = (
            torch.cat([prompt_embeds, negative_prompt_embeds]) if negative_prompt_embeds is not None else prompt_embeds
        )
        # For classifier-free guidance the unconditional branch receives zeroed-out image embeddings.
        effnet = (
            torch.cat([image_embeddings, torch.zeros_like(image_embeddings)])
            if self.do_classifier_free_guidance
            else image_embeddings
        )

        # 3. Determine latent shape of latents
        latent_height = int(image_embeddings.size(2) * self.config.latent_dim_scale)
        latent_width = int(image_embeddings.size(3) * self.config.latent_dim_scale)
        latent_features_shape = (image_embeddings.size(0) * num_images_per_prompt, 4, latent_height, latent_width)

        # 4. Prepare and set timesteps
        if timesteps is not None:
            self.scheduler.set_timesteps(timesteps=timesteps, device=device)
            timesteps = self.scheduler.timesteps
            num_inference_steps = len(timesteps)
        else:
            self.scheduler.set_timesteps(num_inference_steps, device=device)
            timesteps = self.scheduler.timesteps

        # 5. Prepare latents
        latents = self.prepare_latents(latent_features_shape, dtype, device, generator, latents, self.scheduler)

        # 6. Run denoising loop
        self._num_timesteps = len(timesteps[:-1])
        for i, t in enumerate(self.progress_bar(timesteps[:-1])):
            ratio = t.expand(latents.size(0)).to(dtype)

            # 7. Denoise latents
            predicted_latents = self.decoder(
                torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents,
                r=torch.cat([ratio] * 2) if self.do_classifier_free_guidance else ratio,
                effnet=effnet,
                clip=text_encoder_hidden_states,
            )

            # 8. Check for classifier free guidance and apply it
            if self.do_classifier_free_guidance:
                predicted_latents_text, predicted_latents_uncond = predicted_latents.chunk(2)
                predicted_latents = torch.lerp(predicted_latents_uncond, predicted_latents_text, self.guidance_scale)

            # 9. Renoise latents to next timestep
            latents = self.scheduler.step(
                model_output=predicted_latents,
                timestep=ratio,
                sample=latents,
                generator=generator,
            ).prev_sample

            if callback_on_step_end is not None:
                callback_kwargs = {}
                for k in callback_on_step_end_tensor_inputs:
                    callback_kwargs[k] = locals()[k]
                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                latents = callback_outputs.pop("latents", latents)
                image_embeddings = callback_outputs.pop("image_embeddings", image_embeddings)
                text_encoder_hidden_states = callback_outputs.pop(
                    "text_encoder_hidden_states", text_encoder_hidden_states
                )

            if callback is not None and i % callback_steps == 0:
                step_idx = i // getattr(self.scheduler, "order", 1)
                callback(step_idx, t, latents)

            if XLA_AVAILABLE:
                xm.mark_step()

        if output_type not in ["pt", "np", "pil", "latent"]:
            raise ValueError(
                f"Only the output types `pt`, `np`, `pil` and `latent` are supported not output_type={output_type}"
            )

        if not output_type == "latent":
            # 10. Scale and decode the image latents with vq-vae
            latents = self.vqgan.config.scale_factor * latents
            images = self.vqgan.decode(latents).sample.clamp(0, 1)
            if output_type == "np":
                images = images.permute(0, 2, 3, 1).cpu().float().numpy()
            elif output_type == "pil":
                images = images.permute(0, 2, 3, 1).cpu().float().numpy()
                images = self.numpy_to_pil(images)
        else:
            images = latents

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return images
        return ImagePipelineOutput(images)