
    bi                     8   d dl Z d dlmZmZmZmZmZmZ d dlZ	d dl
Z
d dlmZmZmZmZmZ d dlmZ ddlmZmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(  e"       rd dl)m*c m+Z, dZ-ndZ- e#j\                  e/      Z0dZ1 e	jd                  g d      Z3ddgg dg ddgdggZ4e4D  cg c]*  }  e	jd                  | D cg c]
  \  }}||z   c}}      , c}}} Z5g dZ6d Z7d Z8d dZ9 G d de(      Z:yc c}}w c c}}} w )!    N)CallableDictListOptionalTupleUnion)	BertModelBertTokenizerCLIPImageProcessorMT5TokenizerT5EncoderModel)StableDiffusionPipelineOutput   )MultiPipelineCallbacksPipelineCallback)VaeImageProcessor)AutoencoderKLHunyuanDiT2DModel)get_2d_rotary_pos_embed)StableDiffusionSafetyChecker)DDPMScheduler)is_torch_xla_availableloggingreplace_example_docstring)randn_tensor   )DiffusionPipelineTFu  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import HunyuanDiTPipeline

        >>> pipe = HunyuanDiTPipeline.from_pretrained(
        ...     "Tencent-Hunyuan/HunyuanDiT-Diffusers", torch_dtype=torch.float16
        ... )
        >>> pipe.to("cuda")

        >>> # You may also use English prompt as HunyuanDiT supports both English and Chinese
        >>> # prompt = "An astronaut riding a horse"
        >>> prompt = "一个宇航员在骑马"
        >>> image = pipe(prompt).images[0]
        ```
)g      ?gUUUUUU?g      ?gqq?g      ?   r      r!   )r        `  r!     )r#   r   r&   r%   r(   r!   r!   r#   r#   r!   )
r   r    r"   r$   r'   r)   r*   r+   r,   r-   c                     | |z  }t        j                  t        j                  t        |z
              }t        j                  t        j                  t        |   | |z  z
              }t
        |   |   \  }}||fS N)npargminabsSTANDARD_RATIOSTANDARD_AREASTANDARD_SHAPE)target_widthtarget_heighttarget_ratioclosest_ratio_idxclosest_area_idxwidthheights          m/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/diffusers/pipelines/hunyuandit/pipeline_hunyuandit.pymap_to_standard_shapesr>   b   sq    -/L		"&&,)F"GHyy6G(H<ZgKg(g!hi"#456FGME6&=    c                    |x}}| \  }}||z  }|dkD  r|}t        t        ||z  |z              }n|}t        t        ||z  |z              }t        t        ||z
  dz              }	t        t        ||z
  dz              }
|	|
f|	|z   |
|z   ffS )N   g       @)intround)srctgt_sizethtwhwrresize_heightresize_widthcrop_top	crop_lefts              r=   get_resize_crop_region_for_gridrO   j   s    BDAq	AA 	1u5a!,-E"q&1*-.5"},345HE2,345Ii 8m#;Y=U"VVVr?   c                     |j                  t        t        d|j                              d      }| j                  t        t        d| j                              d      }| ||z  z  }||z  d|z
  | z  z   } | S )a  
    Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
    Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
    Flawed](https://huggingface.co/papers/2305.08891).

    Args:
        noise_cfg (`torch.Tensor`):
            The predicted noise tensor for the guided diffusion process.
        noise_pred_text (`torch.Tensor`):
            The predicted noise tensor for the text-guided diffusion process.
        guidance_rescale (`float`, *optional*, defaults to 0.0):
            A rescale factor applied to the noise predictions.

    Returns:
        noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
    rA   T)dimkeepdim)stdlistrangendim)	noise_cfgnoise_pred_textguidance_rescalestd_textstd_cfgnoise_pred_rescaleds         r=   rescale_noise_cfgr]      s    " ""tE!_5I5I,J'KUY"ZHmmU1inn%= >mMG#x''9: #66!>N:NR[9[[Ir?   c            9           e Zd ZdZdZg dZdgZg dZ	 	 	 dDded	e	d
e
dedededededee   dee   f fdZ	 	 	 	 	 	 	 	 	 	 	 dEdedej,                  dej.                  dededee   deej2                     deej2                     deej2                     deej2                     dee   defdZd  Zd! Z	 	 	 	 	 	 	 	 	 	 dFd"ZdGd#Zed$        Z ed%        Z!ed&        Z"ed'        Z#ed(        Z$ ejJ                          e&e'      dddd)d*ddd+ddddddddddd,ddd-gd+d.dd/dfde(ee)e   f   d0ee   d1ee   d2ee   d3ee*   dee(ee)e   f      dee   d4ee*   d5ee(ejV                  e)ejV                     f      d-eej2                     deej2                     d6eej2                     deej2                     d7eej2                     deej2                     d8eej2                     deej2                     d9eej2                     d:ee   d;ed<ee(e,eee-gdf   e.e/f      d=e)e   d>e*d?ee0eef      d@ee0eef      dAe0eef   dBef6dC              Z1 xZ2S )HHunyuanDiTPipelinea  
    Pipeline for English/Chinese-to-image generation using HunyuanDiT.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    HunyuanDiT uses two text encoders: [mT5](https://huggingface.co/google/mt5-base) and [bilingual CLIP](fine-tuned by
    ourselves)

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. We use
            `sdxl-vae-fp16-fix`.
        text_encoder (Optional[`~transformers.BertModel`, `~transformers.CLIPTextModel`]):
            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
            HunyuanDiT uses a fine-tuned [bilingual CLIP].
        tokenizer (Optional[`~transformers.BertTokenizer`, `~transformers.CLIPTokenizer`]):
            A `BertTokenizer` or `CLIPTokenizer` to tokenize text.
        transformer ([`HunyuanDiT2DModel`]):
            The HunyuanDiT model designed by Tencent Hunyuan.
        text_encoder_2 (`T5EncoderModel`):
            The mT5 embedder. Specifically, it is 't5-v1_1-xxl'.
        tokenizer_2 (`MT5Tokenizer`):
            The tokenizer for the mT5 embedder.
        scheduler ([`DDPMScheduler`]):
            A scheduler to be used in combination with HunyuanDiT to denoise the encoded image latents.
    z.text_encoder->text_encoder_2->transformer->vae)safety_checkerfeature_extractortext_encoder_2tokenizer_2text_encoder	tokenizerr`   )latentsprompt_embedsnegative_prompt_embedsprompt_embeds_2negative_prompt_embeds_2TNvaerd   re   transformer	schedulerra   requires_safety_checkerrb   rc   c                 ,   t         |           | j                  ||||
|||||		       |%|r#t        j	                  d| j
                   d       ||t        d      t        | dd       r/dt        | j                  j                  j                        dz
  z  nd| _        t        | j                  	      | _        | j                  |
       t!        | d      r2| j"                  &| j"                  j                  j$                  | _        y d| _        y )N)	rk   rd   re   rc   rl   rm   r`   ra   rb   z)You have disabled the safety checker for a   by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .zMake sure to define a feature extractor when loading {self.__class__} if you want to use the safety checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead.rk   r   rA      )vae_scale_factor)rn   rl      )super__init__register_modulesloggerwarning	__class__
ValueErrorgetattrlenrk   configblock_out_channelsrq   r   image_processorregister_to_confighasattrrl   sample_sizedefault_sample_size)selfrk   rd   re   rl   rm   r`   ra   rn   rb   rc   rx   s              r=   rt   zHunyuanDiTPipeline.__init__   s2    	%##)/) 	 
	
 !&=NN;DNN;K Lj j %*;*Cx 
 W^^bdikoVpc$((//*L*L&MPQ&Q Rvw0$BWBWX8OP t]+0@0@0L ##// 	   	 r?   rA   promptdevicedtypenum_images_per_promptdo_classifier_free_guidancenegative_promptrg   rh   prompt_attention_masknegative_prompt_attention_maskmax_sequence_lengthtext_encoder_indexc                    |H| j                   | j                   j                  }n%| j                  | j                  j                  }nd}|| j                  }| j                  | j
                  g}| j                  | j                   g}||   }||   }||dk(  rd}|dk(  rd}n|}|t        |t              rd}n-|t        |t              rt        |      }n|j                  d   }| ||dddd	      }|j                  } ||d
d      j                  }|j                  d   |j                  d   k\  rbt        j                  ||      sL|j                  |dd|j                   dz
  df         }t"        j%                  d|j                    d|        |j&                  j)                  |      }	 ||j)                  |      |	      }|d   }|	j+                  |d      }	|j)                  ||      }|j                  \  }}}|j+                  d|d      }|j-                  ||z  |d      }|r||dg|z  }n|:t/        |      t/        |      ur$t1        dt/        |       dt/        |       d      t        |t              r|g}n1|t        |      k7  r!t3        d| dt        |       d| d| d	      |}|j                  d   } ||d|dd      }|j&                  j)                  |      }
 ||j                  j)                  |      |
      }|d   }|
j+                  |d      }
|rK|j                  d   }|j)                  ||      }|j+                  d|d      }|j-                  ||z  |d      }|||	|
fS )a  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            device: (`torch.device`):
                torch device
            dtype (`torch.dtype`):
                torch dtype
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            prompt_attention_mask (`torch.Tensor`, *optional*):
                Attention mask for the prompt. Required when `prompt_embeds` is passed directly.
            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
                Attention mask for the negative prompt. Required when `negative_prompt_embeds` is passed directly.
            max_sequence_length (`int`, *optional*): maximum sequence length to use for the prompt.
            text_encoder_index (`int`, *optional*):
                Index of the text encoder to use. `0` for clip and `1` for T5.
        Nr   M   rA      
max_lengthTpt)paddingr   
truncationreturn_attention_maskreturn_tensorslongest)r   r   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: )attention_maskr   r    z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)r   r   r   r   )rb   r   rl   _execution_devicere   rc   rd   
isinstancestrrT   r{   shape	input_idstorchequalbatch_decodemodel_max_lengthrv   rw   r   torepeatviewtype	TypeErrorry   )r   r   r   r   r   r   r   rg   rh   r   r   r   r   
tokenizerstext_encodersre   rd   r   
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_textbs_embedseq_len_uncond_tokensuncond_inputs                               r=   encode_promptz HunyuanDiTPipeline.encode_prompt   s1   ` ="".++11!!-((..>++Fnnd&6&67
**D,?,?@12	$%78&!Q&
!Q& 
,J*VS"9JJvt$<VJ&,,Q/J #$%&*#K )22N'	RVWaaO$$R(N,@,@,DDU[[N  )55oaIcIcfgIgjlIlFl6mn!2239\NL
 %0$>$>$A$A&$I!(!!&)4M *!,M$9$@$@AVXY$Z!%((uV(D,22'1%,,Q0EqI%**86K+KWVXY '+A+I&!#z 1#VD<Q(QUVZ[jVkUl mV~Q(  OS1!0 1s?33 )/)::J3K_J` ax/
| <33  !0&,,Q/J$$%#L .:-H-H-K-KF-S*%1&&))&1=&" &<A%>"-K-R-RShjk-l*&,2215G%;%>%>USY%>%Z"%;%B%B1F[]^%_"%;%@%@NcAcelnp%q"46KMkkkr?   c                 l   | j                   d }||fS t        j                  |      r| j                  j	                  |d      }n| j                  j                  |      }| j                  |d      j                  |      }| j                  ||j                  j                  |            \  }}||fS )Npil)output_typer   )r   )images
clip_input)	r`   r   	is_tensorr~   postprocessnumpy_to_pilra   r   pixel_values)r   imager   r   has_nsfw_conceptfeature_extractor_inputsafety_checker_inputs          r=   run_safety_checkerz%HunyuanDiTPipeline.run_safety_checker  s    &# &&& u%*.*>*>*J*J5^c*J*d'*.*>*>*K*KE*R'#'#9#9:Qbf#9#g#j#jkq#r &*&9&9)=)J)J)M)Me)T ': '#E# &&&r?   c                 V   dt        t        j                  | j                  j                        j
                  j                               v }i }|r||d<   dt        t        j                  | j                  j                        j
                  j                               v }|r||d<   |S )Neta	generator)setinspect	signaturerm   step
parameterskeys)r   r   r   accepts_etaextra_step_kwargsaccepts_generators         r=   prepare_extra_step_kwargsz,HunyuanDiTPipeline.prepare_extra_step_kwargs  s     s7#4#4T^^5H5H#I#T#T#Y#Y#[\\'*e$ (3w/@/@ATAT/U/`/`/e/e/g+hh-6k*  r?   c           
      ~    |dz  dk7  s|dz  dk7  rt        d| d| d      |Lt         fd|D              s8t        d j                   d|D cg c]  }| j                  vs| c}       ||t        d	| d
| d      ||t        d      ||	t        d      |7t        |t              s't        |t
              st        dt        |             ||t        d      |	|t        d      ||t        d| d| d      ||t        d      |
|t        d      |A|?|j                  |j                  k7  r&t        d|j                   d|j                   d      |	C|
@|	j                  |
j                  k7  r&t        d|	j                   d|
j                   d      y y y c c}w )Nrp   r   z7`height` and `width` have to be divisible by 8 but are z and r   c              3   :   K   | ]  }|j                   v   y wr/   )_callback_tensor_inputs).0kr   s     r=   	<genexpr>z2HunyuanDiTPipeline.check_inputs.<locals>.<genexpr>  s#      F
23A---F
s   z2`callback_on_step_end_tensor_inputs` has to be in z, but found zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.ziProvide either `prompt` or `prompt_embeds_2`. Cannot leave both `prompt` and `prompt_embeds_2` undefined.z2`prompt` has to be of type `str` or `list` but is zEMust provide `prompt_attention_mask` when specifying `prompt_embeds`.zIMust provide `prompt_attention_mask_2` when specifying `prompt_embeds_2`.z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: zWMust provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.z[Must provide `negative_prompt_attention_mask_2` when specifying `negative_prompt_embeds_2`.zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` z{`prompt_embeds_2` and `negative_prompt_embeds_2` must have the same shape when passed directly, but got: `prompt_embeds_2` z != `negative_prompt_embeds_2` )ry   allr   r   r   rT   r   r   )r   r   r<   r;   r   rg   rh   r   r   ri   rj   prompt_attention_mask_2 negative_prompt_attention_mask_2"callback_on_step_end_tensor_inputsr   s   `              r=   check_inputszHunyuanDiTPipeline.check_inputs  s     A:?eai1nVW]V^^cdicjjklmm-9# F
7YF
 C
 DTEaEaDbbn  |^  pHvw  bc  ko  kG  kG  bGpq  pH  oI  J  -";08N}o ^0 0  ^ 5w  ^ 7{  FC)@TZ\`IaQRVW]R^Q_`aa$)>)Fdee&+B+Jhii&+A+M9/9J K*++]_ 
 "-2P2Xvww#/4T4\m  $)?)K""&<&B&BB --:-@-@,A B.445Q8 
 &+C+O$$(@(F(FF //>/D/D.E F0667q:  G ,P&[ pHs   F:%F:c	                 T   ||t        |      | j                  z  t        |      | j                  z  f}	t        |t              r)t	        |      |k7  rt        dt	        |       d| d      |t        |	|||      }n|j                  |      }|| j                  j                  z  }|S )Nz/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)r   r   r   )
rB   rq   r   rT   r{   ry   r   r   rm   init_noise_sigma)
r   r   num_channels_latentsr<   r;   r   r   r   rf   r   s
             r=   prepare_latentsz"HunyuanDiTPipeline.prepare_latents  s     K4000J$///	
 i&3y>Z+GA#i.AQ R&<'gi 
 ?"5IfTYZGjj(G DNN;;;r?   c                     | j                   S r/   _guidance_scaler   s    r=   guidance_scalez!HunyuanDiTPipeline.guidance_scale%  s    ###r?   c                     | j                   S r/   )_guidance_rescaler   s    r=   rY   z#HunyuanDiTPipeline.guidance_rescale)  s    %%%r?   c                      | j                   dkD  S )NrA   r   r   s    r=   r   z.HunyuanDiTPipeline.do_classifier_free_guidance0  s    ##a''r?   c                     | j                   S r/   )_num_timestepsr   s    r=   num_timestepsz HunyuanDiTPipeline.num_timesteps4  s    """r?   c                     | j                   S r/   )
_interruptr   s    r=   	interruptzHunyuanDiTPipeline.interrupt8  s    r?   2   g      @        r   rf   r   )r   r   r<   r;   num_inference_stepsr   r   r   ri   rj   r   r   r   return_dictcallback_on_step_endr   rY   original_sizetarget_sizecrops_coords_top_leftuse_resolution_binningc                    t        |t        t        f      r|j                  }|xs | j                  | j
                  z  }|xs | j                  | j
                  z  }t        |dz  dz        }t        |dz  dz        }|rQ||ft        vrGt        ||      \  }}t        |      }t        |      }t        j                  d| d| dt                | j                  |||||||||||||       || _        || _        d| _        |t        |t              rd}n-|t        |t               rt#        |      }n|j$                  d   }| j&                  }| j)                  ||| j*                  j,                  || j.                  |||||d	d
      \  }}}}| j)                  ||| j*                  j,                  || j.                  |||||dd
      \  }}}}| j0                  j3                  ||       | j0                  j4                  }| j*                  j6                  j8                  }| j;                  ||z  ||||j,                  ||	|
      }
| j=                  |	|      } |dz  | j*                  j6                  j>                  z  }!|dz  | j*                  j6                  j>                  z  }"d| j*                  j6                  j>                  z  }#tA        |!|"f|#      }$tC        | j*                  jD                  | j*                  jF                  z  |$|!|"f|d      }%tI        jJ                  dg|      }&|xs ||f}t!        ||z   |z         }'tI        jJ                  |'g|j,                        }'| j.                  rtI        jL                  ||g      }tI        jL                  ||g      }tI        jL                  ||g      }tI        jL                  ||g      }tI        jL                  |'gdz  d      }'tI        jL                  |&gdz  d      }&|jO                  |      }|jO                  |      }|jO                  |      }|jO                  |      }|'jO                  |j,                  |      jQ                  ||z  d      }'|&jO                  |      jQ                  ||z        }&t#        |      || j0                  jR                  z  z
  }(t#        |      | _*        | jW                  |      5 })tY        |      D ]   \  }*}+| jZ                  r| j.                  rtI        jL                  |
gdz        n|
},| j0                  j]                  |,|+      },tI        jJ                  |+g|,j$                  d   z  |      jO                  |,j,                        }-| j+                  |,|-|||||'|&|%d
      d   }.|.j_                  dd      \  }.}/| j.                  r|.j_                  d      \  }0}1|0||1|0z
  z  z   }.| j.                  r|dkD  rta        |.1|      }. | j0                  jb                  |.|+|
fi | ddid   }
|~i }2|D ]  }3te               |3   |2|3<     || |*|+|2      }4|4jg                  d|
      }
|4jg                  d|      }|4jg                  d|      }|4jg                  d|      }|4jg                  d|      }|*t#        |      dz
  k(  s'|*dz   |(kD  r/|*dz   | j0                  jR                  z  dk(  r|)ji                          tj        stm        jn                          # 	 ddd       |dk(  sb| jp                  js                  |
| jp                  j6                  jt                  z  d       d   }5| jw                  |5||j,                        \  }5}6n|
}5d}6|6d!g|5j$                  d   z  }7n|6D 8cg c]  }8|8  }7}8| jx                  j{                  |5||7"      }5| j}                          |s|5|6fS t        |5|6#      S # 1 sw Y   xY wc c}8w )$u  
        The call function to the pipeline for generation with HunyuanDiT.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
            height (`int`):
                The height in pixels of the generated image.
            width (`int`):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference. This parameter is modulated by `strength`.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
                applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
            prompt_embeds_2 (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
            negative_prompt_embeds_2 (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
            prompt_attention_mask (`torch.Tensor`, *optional*):
                Attention mask for the prompt. Required when `prompt_embeds` is passed directly.
            prompt_attention_mask_2 (`torch.Tensor`, *optional*):
                Attention mask for the prompt. Required when `prompt_embeds_2` is passed directly.
            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
                Attention mask for the negative prompt. Required when `negative_prompt_embeds` is passed directly.
            negative_prompt_attention_mask_2 (`torch.Tensor`, *optional*):
                Attention mask for the negative prompt. Required when `negative_prompt_embeds_2` is passed directly.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback_on_step_end (`Callable[[int, int, Dict], None]`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
                A callback function or a list of callback functions to be called at the end of each denoising step.
            callback_on_step_end_tensor_inputs (`List[str]`, *optional*):
                A list of tensor inputs that should be passed to the callback function. If not defined, all tensor
                inputs will be passed.
            guidance_rescale (`float`, *optional*, defaults to 0.0):
                Rescale the noise_cfg according to `guidance_rescale`. Based on findings of [Common Diffusion Noise
                Schedules and Sample Steps are Flawed](https://huggingface.co/papers/2305.08891). See Section 3.4
            original_size (`Tuple[int, int]`, *optional*, defaults to `(1024, 1024)`):
                The original size of the image. Used to calculate the time ids.
            target_size (`Tuple[int, int]`, *optional*):
                The target size of the image. Used to calculate the time ids.
            crops_coords_top_left (`Tuple[int, int]`, *optional*, defaults to `(0, 0)`):
                The top left coordinates of the crop. Used to calculate the time ids.
            use_resolution_binning (`bool`, *optional*, defaults to `True`):
                Whether to use resolution binning or not. If `True`, the input resolution will be mapped to the closest
                standard resolution. Supported resolutions are 1024x1024, 1280x1280, 1024x768, 1152x864, 1280x960,
                768x1024, 864x1152, 960x1280, 1280x768, and 768x1280. It is recommended to set this to `True`.

        Examples:

        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
                otherwise a `tuple` is returned where the first element is a list with the generated images and the
                second element is a list of `bool`s indicating whether the corresponding generated image contains
                "not-safe-for-work" (nsfw) content.
           zReshaped to (height, width)=(z, z), Supported shapes are FNrA   r   r   )r   r   r   r   r   r   rg   rh   r   r   r   r   r   )r   rp   @   r   )r   r   )r   r   )rQ   r   )total)encoder_hidden_statestext_embedding_maskencoder_hidden_states_t5text_embedding_mask_t5image_meta_sizestyleimage_rotary_embr   r   )rY   r   rf   rg   rh   ri   rj   latent)r   T)r   do_denormalize)r   nsfw_content_detected)@r   r   r   tensor_inputsr   rq   rB   SUPPORTED_SHAPEr>   rv   rw   r   r   r   r   r   rT   r{   r   r   r   rl   r   r   rm   set_timesteps	timestepsr|   in_channelsr   r   
patch_sizerO   r   	inner_dim	num_headsr   tensorcatr   r   orderr   progress_bar	enumerater   scale_model_inputchunkr]   r   localspopupdateXLA_AVAILABLExm	mark_steprk   decodescaling_factorr   r~   r   maybe_free_model_hooksr   )9r   r   r<   r;   r   r   r   r   r   r   rf   rg   ri   rh   rj   r   r   r   r   r   r   r   r   rY   r   r   r   r   r   r   r
  r   r   grid_height
grid_width	base_sizegrid_crops_coordsr  r  add_time_idsnum_warmup_stepsr  itlatent_model_inputt_expand
noise_predr   noise_pred_uncondrX   callback_kwargsr   callback_outputsr   r   r  has_nsfws9                                                            r=   __call__zHunyuanDiTPipeline.__call__<  s   f *-=?U,VW1E1S1S. K433d6K6KKI11D4I4IIflb()Ub[B&'!vuo_&L25&AME6[FJENN:6("UGKcdsctuv 	"!*$#,.	
  .!1 *VS"9JJvt$<VJ&,,Q/J'' ""(("7(,(H(H+'#9"7+I "   
	
"!*( ""(("7(,(H(H+)#;"9+K #   
	
$#," 	$$%8$HNN,,	  $//66BB&&.. 	
 !::9cJ kT%5%5%<%<%G%GGaZ4#3#3#:#:#E#EE
 0 0 7 7 B BB	;[*<UW`a2&&$*:*:*D*DD*%
 aS0!4fe_MK7:OOP||\N-:M:MN++!II'=}&MNM$)II/MOd.e$f!#ii)A?(STO&+ii1QSj0k&l# 99l^a%7Q?LIIugkq1E%(((7 5 8 8 8 G),,F,;"9"<"<F"<"K#]-@-@PWW..
 '..z<Q/QR y>,?$..BVBV,VV!)n%89 <	#\!), ;#1>> BFAaAaUYYy1}%=gn"%)^^%E%EFXZ[%\" !<<.@.F.Fq.I(IRXY\\,22 ] 
 "--&*7(=-<+B$0%5 % .  
 !+ 0 0 0 :
A 339C9I9I!9L6%!2^YjGj5k!kJ338H38N!2:aq!rJ .$..--j!WmHYmglmnop'3&(O? 9-3Xa[*9';D!Q'X$.229gFG$4$8$8-$XM-=-A-ABZ\r-s*&6&:&:;Lo&^O/?/C/C24L0, I**A9I/IqSTuX\XfXfXlXlNlpqNq '') LLNw;#<	#| h&HHOOGdhhoo.L.L$LZ_O`abcE&*&=&=eV]M`M`&a#E#E##"Vekk!n4N;KLx(lLNL$$00K`n0o 	##%+,,,EQabbe<	# <	#P Ms   *H_5_5(
`5_>)TNN)NNrA   TNNNNNNr   )
NNNNNNNNNNr/   )3__name__
__module____qualname____doc__model_cpu_offload_seq_optional_components_exclude_from_cpu_offloadr   r   r	   r
   r   r   r   r   boolr   r   r   rt   r   r   r   r   rB   Tensorr   r   r   r   r   propertyr   rY   r   r   r   no_gradr   EXAMPLE_DOC_STRINGr   r   float	Generatorr   r   r   r   r   r.  __classcell__)rx   s   @r=   r_   r_      s   8 M "2 2" )-37.22
2
  2
 !	2

 '2
 !2
 52
 .2
 "&2
 !02
 l+2
n  $!%&,0)-049=8<AE-1"#clcl cl {{	cl
  #cl &*cl "#cl  -cl !) 6cl  (5cl )1(>cl &c]cl  clL'!, #"'+!% $)-+/JZ, $ $ & & ( ( # #   U]]_12 )- $#-/*-;?/0"MQ*.04269=;?8<:>AECG%*  9B"%3?1517'+=Tcc49n%Tc Tc }	Tc
 &c]Tc !Tc "%T#Y"78Tc  (}Tc e_Tc E%//43H"HIJTc %,,'Tc  -Tc "%,,/Tc !) 6Tc #+5<<"8Tc   (5!Tc" "*%,,!7#Tc$ )1(>%Tc& +35<<*@'Tc( c])Tc* +Tc, '(Cd+T124DF\\]
-Tc2 -1I3Tc4  5Tc6  c3h07Tc8 eCHo.9Tc:  %S#X;Tc< !%=Tc 3 Tcr?   r_   )r   );r   typingr   r   r   r   r   r   numpyr0   r   transformersr	   r
   r   r   r   $diffusers.pipelines.stable_diffusionr   	callbacksr   r   r~   r   modelsr   r   models.embeddingsr   )pipelines.stable_diffusion.safety_checkerr   
schedulersr   utilsr   r   r   utils.torch_utilsr   pipeline_utilsr   torch_xla.core.xla_modelcore	xla_modelr  r  
get_loggerr/  rv   r:  arrayr3   r5   r4   r  r>   rO   r]   r_   )shapesrI   rH   s   000r=   <module>rP     s    ? ?   c c N A 0 6 8 U ' 
 . . ))MM 
		H	% $  < ++MM ESSS&V4TQ1q545SW*4yc* ycM 5Ss   DD&
DD