
    bi%                         d dl Z d dlmZmZmZ d dlZd dlZd dlm	Z	m
Z
 ddlmZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZmZmZmZ ddlmZ  e       rd dlZ ej8                  e      Zd Zd Z d Z! G d de      Z"y)    N)ListOptionalUnion)AutoTokenizerUMT5EncoderModel   )
FrozenDict)ClassifierFreeGuidance)is_ftfy_availablelogging   )ModularPipelineBlocksPipelineState)ComponentSpec
ConfigSpec
InputParamOutputParam   )WanModularPipelinec                     t        j                  |       } t        j                  t        j                  |             } | j	                         S N)ftfyfix_texthtmlunescapestriptexts    c/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/diffusers/modular_pipelines/wan/encoders.pybasic_cleanr    %   s3    ==D==t,-D::<    c                 T    t        j                  dd|       } | j                         } | S )Nz\s+ )resubr   r   s    r   whitespace_cleanr&   +   s$    66&#t$D::<DKr!   c                 .    t        t        |             } | S r   )r&   r    r   s    r   prompt_cleanr(   1   s    K-.DKr!   c                      e Zd ZdZedefd       Zedee   fd       Z	edee
   fd       Zedee   fd       Zedee   fd       Zed        Zed	eeee   f   d
edej*                  fd       Ze	 	 	 	 	 	 	 dd	edeej*                     dededee   deej2                     deej2                     d
efd       Z ej6                         dededefd       Zy)WanTextEncoderStepwanreturnc                      y)NzMText Encoder step that generate text_embeddings to guide the video generation selfs    r   descriptionzWanTextEncoderStep.description9   s    ^r!   c           
      |    t        dt              t        dt              t        dt        t	        ddi      d      gS )Ntext_encoder	tokenizerguiderguidance_scaleg      @from_config)configdefault_creation_method)r   r   r   r
   r	   r/   s    r   expected_componentsz&WanTextEncoderStep.expected_components=   sB     .*:;+}5&!#3S"9:(5		
 		
r!   c                     g S r   r.   r/   s    r   expected_configsz#WanTextEncoderStep.expected_configsJ   s    	r!   c                 B    t        d      t        d      t        d      gS )Npromptnegative_promptattention_kwargs)r   r/   s    r   inputszWanTextEncoderStep.inputsN   s(     x ())*
 	
r!   c                 v    t        dt        j                  dd      t        dt        j                  dd      gS )Nprompt_embedsguider_input_fieldsz2text embeddings used to guide the image generation)	type_hintkwargs_typer1   negative_prompt_embedsz;negative text embeddings used to guide the image generation)r   torchTensorr/   s    r   intermediate_outputsz'WanTextEncoderStep.intermediate_outputsV   s@     ,,1P	 (,,1Y	
 	
r!   c                     | j                   Wt        | j                   t              s<t        | j                   t              s!t	        dt        | j                                y y y )Nz2`prompt` has to be of type `str` or `list` but is )r>   
isinstancestrlist
ValueErrortype)block_states    r   check_inputszWanTextEncoderStep.check_inputsg   s]    );--s3J{GYGY[_<`QRVWbWiWiRjQklmm =a3 *r!   r>   max_sequence_lengthdevicec                    | j                   j                  }t        |t              r|gn|}|D cg c]  }t	        |       }}| j                  |d|dddd      }|j                  |j                  }}|j                  d      j                  d      j                         }	| j                  |j                  |      |j                  |            j                  }
|
j                  ||      }
t        |
|	      D cg c]
  \  }}|d |  }
}}t        j                  |
D cg c]J  }t        j                   ||j#                  ||j%                  d      z
  |j%                  d            g      L c}d      }
|
S c c}w c c}}w c c}w )	N
max_lengthTpt)paddingrV   
truncationadd_special_tokensreturn_attention_maskreturn_tensorsr   r   )dim)dtyperT   )r3   r^   rL   rM   r(   r4   	input_idsattention_maskgtsumlongtolast_hidden_stateziprH   stackcat	new_zerossize)
componentsr>   rS   rT   r^   utext_inputstext_input_idsmaskseq_lensrC   vs               r   _get_t5_prompt_embedsz(WanTextEncoderStep._get_t5_prompt_embedsn   sj    ''--'4&&+12a,q/22 ** *#"& + 
  +44k6P6P771:>>a>(--/"//0A0A&0I477SY?[mm%((uV(D+.}h+GH41a2AHH^klYZUYY1;;':QVVAY'Fq	RSTlrs
 ) 3 Ils   E>?F$AF	Nnum_videos_per_promptprepare_unconditional_embedsr?   rC   rG   c	                    |xs | j                   }t        |t              r|gn|}|t        |      n|j                  d   }	|t
        j                  | |||      }|r||xs d}t        |t              r|	|gz  n|}|:t        |      t        |      ur$t        dt        |       dt        |       d      |	t        |      k7  r!t        d| dt        |       d| d|	 d		      t
        j                  | |||      }|j                  \  }
}}|j                  d
|d
      }|j                  |
|z  |d      }|r)|j                  d
|d
      }|j                  |	|z  |d      }||fS )a  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            device: (`torch.device`):
                torch device
            num_videos_per_prompt (`int`):
                number of videos that should be generated per prompt
            prepare_unconditional_embeds (`bool`):
                whether to use prepare unconditional embeddings or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            max_sequence_length (`int`, defaults to `512`):
                The maximum number of text tokens to be used for the generation process.
        r    z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.r   )_execution_devicerL   rM   lenshaper*   rr   rP   	TypeErrorrO   repeatview)rk   r>   rT   rs   rt   r?   rC   rG   rS   
batch_sizebs_embedseq_len_s                r   encode_promptz WanTextEncoderStep.encode_prompt   s   L 7:77'4&&$*$6S[M<O<OPQ<R
 .DDZQWYlntuM',B,J-3O@J?\_@`jO+<<fuO!d6l$:O&OUVZ[jVkUl mV~Q(  s?33 )/)::J3K_J` ax/
| <33  &8%M%MO-@&&"  -22'1%,,Q0EqI%**86K+KWVXY'%;%B%B1F[]^%_"%;%@%@NcAcelnp%q"444r!   rk   statec           
      l   | j                  |      }| j                  |       |j                  j                  dkD  |_        |j
                  |_        | j                  ||j                  |j                  d|j                  |j                  d d       \  |_
        |_        | j                  ||       ||fS )Nr   )rC   rG   )get_block_staterR   r5   num_conditionsrt   ry   rT   r   r>   r?   rC   rG   set_block_state)r0   rk   r   rQ   s       r   __call__zWanTextEncoderStep.__call__   s     **51+&3=3D3D3S3SVW3W0'99 44''#'  	
	
%. 	UK05  r!   )Nr   TNNNi   )__name__
__module____qualname__
model_namepropertyrM   r1   r   r   r:   r   r<   r   rA   r   rJ   staticmethodrR   r   intrH   rT   rr   r   boolrI   r   no_gradr   r   r   r.   r!   r   r*   r*   6   s   J_S _ _ 

T-%8 

 

 $z"2   
Z( 
 
 
d;&7 
 
  n n c49n% ! 	 <  *.%&-1)-049=#&H5H5 &H5  #	H5
 '+H5 "#H5  -H5 !) 6H5 !H5 H5T U]]_!#5 !m !P] ! !r!   r*   )#r   typingr   r   r   regexr$   rH   transformersr   r   configuration_utilsr	   guidersr
   utilsr   r   modular_pipeliner   r   modular_pipeline_utilsr   r   r   r   r   r   
get_loggerr   loggerr    r&   r(   r*   r.   r!   r   <module>r      sk     ( (   8 - - / C W W 0  
		H	%
|!. |!r!   