import html
from typing import List, Optional, Union

import regex as re
import torch
from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast

from ...configuration_utils import FrozenDict
from ...image_processor import VaeImageProcessor
from ...loaders import FluxLoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL
from ...utils import USE_PEFT_BACKEND, is_ftfy_available, logging, scale_lora_layers, unscale_lora_layers
from ..modular_pipeline import ModularPipelineBlocks, PipelineState
from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, OutputParam
from .modular_pipeline import FluxModularPipeline


if is_ftfy_available():
    import ftfy


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


def basic_clean(text):
    text = ftfy.fix_text(text)
    text = html.unescape(html.unescape(text))
    return text.strip()


def whitespace_clean(text):
    text = re.sub(r"\s+", " ", text)
    text = text.strip()
    return text


def prompt_clean(text):
    text = whitespace_clean(basic_clean(text))
    return text


def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    elif hasattr(encoder_output, "latents"):
        return encoder_output.latents
    else:
        raise AttributeError("Could not access latents of provided encoder_output")


class FluxVaeEncoderStep(ModularPipelineBlocks):
    model_name = "flux"

    @property
    def description(self) -> str:
        return "Vae Encoder step that encodes the input image into a latent representation"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        return [
            ComponentSpec("vae", AutoencoderKL),
            ComponentSpec(
                "image_processor",
                VaeImageProcessor,
                config=FrozenDict({"vae_scale_factor": 16, "vae_latent_channels": 16}),
                default_creation_method="from_config",
            ),
        ]

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam("image", required=True),
            InputParam("height"),
            InputParam("width"),
            InputParam("generator"),
            InputParam("dtype", type_hint=torch.dtype, description="Data type of model tensor inputs"),
            InputParam(
                "preprocess_kwargs",
                type_hint=Optional[dict],
                description="A kwargs dictionary that if specified is passed along to the `ImageProcessor` as defined under `self.image_processor` in [diffusers.image_processor.VaeImageProcessor]",
            ),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
            OutputParam(
                "image_latents",
                type_hint=torch.Tensor,
                description="The latents representing the reference image for image-to-image/inpainting generation",
            )
        ]

    @staticmethod
    def _encode_vae_image(components, image: torch.Tensor, generator: torch.Generator):
        if isinstance(generator, list):
            image_latents = [
                retrieve_latents(components.vae.encode(image[i : i + 1]), generator=generator[i])
                for i in range(image.shape[0])
            ]
            image_latents = torch.cat(image_latents, dim=0)
        else:
            image_latents = retrieve_latents(components.vae.encode(image), generator=generator)

        image_latents = (image_latents - components.vae.config.shift_factor) * components.vae.config.scaling_factor

        return image_latents

    @torch.no_grad()
    def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

        block_state.preprocess_kwargs = block_state.preprocess_kwargs or {}
        block_state.device = components._execution_device
        block_state.dtype = block_state.dtype if block_state.dtype is not None else components.vae.dtype

        block_state.image = components.image_processor.preprocess(
            block_state.image, height=block_state.height, width=block_state.width, **block_state.preprocess_kwargs
        )
        block_state.image = block_state.image.to(device=block_state.device, dtype=block_state.dtype)
        block_state.batch_size = block_state.image.shape[0]

        # if the generator is a list, its length has to match the effective batch size
        if isinstance(block_state.generator, list) and len(block_state.generator) != block_state.batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(block_state.generator)}, but requested an"
                f" effective batch size of {block_state.batch_size}. Make sure the batch size matches the length of"
                " the generators."
            )

        block_state.image_latents = self._encode_vae_image(
            components, image=block_state.image, generator=block_state.generator
        )

        self.set_block_state(state, block_state)

        return components, state
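

# ----------------------------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the library API): how a block such as `FluxVaeEncoderStep` above can be
# inspected and wired up. Only properties defined in this file (`description`, `inputs`, `intermediate_outputs`)
# are relied on; the pipeline-building helpers and the repository id in the commented lines are assumptions about
# the surrounding modular-pipeline machinery and may differ between diffusers versions.
#
#   vae_step = FluxVaeEncoderStep()
#   print(vae_step.description)                                 # short human-readable summary of the block
#   print([param.name for param in vae_step.inputs])            # ["image", "height", "width", "generator", ...]
#   print([out.name for out in vae_step.intermediate_outputs])  # ["image_latents"]
#
#   # Assumed wiring (hypothetical helpers, verify against your diffusers version):
#   # pipeline = vae_step.init_pipeline("black-forest-labs/FLUX.1-dev")
#   # state = pipeline(image=pil_image, generator=torch.Generator("cpu").manual_seed(0))
#   # the resulting state then carries "image_latents" for downstream image-to-image / inpainting blocks
# ----------------------------------------------------------------------------------------------------------------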


class FluxTextEncoderStep(ModularPipelineBlocks):
    model_name = "flux"

    @property
    def description(self) -> str:
        return "Text Encoder step that generates text_embeddings to guide the image generation"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        return [
            ComponentSpec("text_encoder", CLIPTextModel),
            ComponentSpec("tokenizer", CLIPTokenizer),
            ComponentSpec("text_encoder_2", T5EncoderModel),
            ComponentSpec("tokenizer_2", T5TokenizerFast),
        ]

    @property
    def expected_configs(self) -> List[ConfigSpec]:
        return []

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam("prompt"),
            InputParam("prompt_2"),
            InputParam("joint_attention_kwargs"),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
            OutputParam(
                "prompt_embeds",
                type_hint=torch.Tensor,
                description="text embeddings used to guide the image generation",
            ),
            OutputParam(
                "pooled_prompt_embeds",
                type_hint=torch.Tensor,
                description="pooled text embeddings used to guide the image generation",
            ),
            OutputParam(
                "text_ids",
                type_hint=torch.Tensor,
                description="ids from the text sequence for RoPE",
            ),
        ]

    @staticmethod
    def check_inputs(block_state):
        for prompt in [block_state.prompt, block_state.prompt_2]:
            if prompt is not None and not isinstance(prompt, str) and not isinstance(prompt, list):
                raise ValueError(f"`prompt` or `prompt_2` has to be of type `str` or `list` but is {type(prompt)}")

    @staticmethod
    def _get_t5_prompt_embeds(
        components,
        prompt: Union[str, List[str]] = None,
        num_images_per_prompt: int = 1,
        max_sequence_length: int = 512,
        device: Optional[torch.device] = None,
    ):
        prompt = [prompt] if isinstance(prompt, str) else prompt
        batch_size = len(prompt)

        if isinstance(components, TextualInversionLoaderMixin):
            prompt = components.maybe_convert_prompt(prompt, components.tokenizer_2)

        text_inputs = components.tokenizer_2(
            prompt,
            padding="max_length",
            max_length=max_sequence_length,
            truncation=True,
            return_length=False,
            return_overflowing_tokens=False,
            return_tensors="pt",
        )
        text_input_ids = text_inputs.input_ids
        untruncated_ids = components.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids

        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
            removed_text = components.tokenizer_2.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
            logger.warning(
                "The following part of your input was truncated because `max_sequence_length` is set to "
                f" {max_sequence_length} tokens: {removed_text}"
            )

        prompt_embeds = components.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]

        dtype = components.text_encoder_2.dtype
        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)

        _, seq_len, _ = prompt_embeds.shape

        # duplicate text embeddings for each generation per prompt, using mps friendly method
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

        return prompt_embeds

    @staticmethod
    def _get_clip_prompt_embeds(
        components,
        prompt: Union[str, List[str]],
        num_images_per_prompt: int = 1,
        device: Optional[torch.device] = None,
    ):
        prompt = [prompt] if isinstance(prompt, str) else prompt
        batch_size = len(prompt)

        if isinstance(components, TextualInversionLoaderMixin):
            prompt = components.maybe_convert_prompt(prompt, components.tokenizer)

        text_inputs = components.tokenizer(
            prompt,
            padding="max_length",
            max_length=components.tokenizer.model_max_length,
            truncation=True,
            return_overflowing_tokens=False,
            return_length=False,
            return_tensors="pt",
        )

        text_input_ids = text_inputs.input_ids
        tokenizer_max_length = components.tokenizer.model_max_length
        untruncated_ids = components.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
            removed_text = components.tokenizer.batch_decode(untruncated_ids[:, tokenizer_max_length - 1 : -1])
            logger.warning(
                "The following part of your input was truncated because CLIP can only handle sequences up to"
                f" {tokenizer_max_length} tokens: {removed_text}"
            )
        prompt_embeds = components.text_encoder(text_input_ids.to(device), output_hidden_states=False)

        # Use the pooled output of the CLIPTextModel
        prompt_embeds = prompt_embeds.pooler_output
        prompt_embeds = prompt_embeds.to(dtype=components.text_encoder.dtype, device=device)

        # duplicate text embeddings for each generation per prompt, using mps friendly method
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)

        return prompt_embeds

    def encode_prompt(
        self,
        components,
        prompt: Union[str, List[str]],
        prompt_2: Union[str, List[str]],
        device: Optional[torch.device] = None,
        num_images_per_prompt: int = 1,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        max_sequence_length: int = 512,
        lora_scale: Optional[float] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                used in all text-encoders
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
            lora_scale (`float`, *optional*):
                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
        """
        device = device or components._execution_device

        # set the lora scale so that the monkey-patched LoRA forward of the text encoders can access it
        if lora_scale is not None and isinstance(components, FluxLoraLoaderMixin):
            components._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
            if components.text_encoder is not None and USE_PEFT_BACKEND:
                scale_lora_layers(components.text_encoder, lora_scale)
            if components.text_encoder_2 is not None and USE_PEFT_BACKEND:
                scale_lora_layers(components.text_encoder_2, lora_scale)

        prompt = [prompt] if isinstance(prompt, str) else prompt

        if prompt_embeds is None:
            prompt_2 = prompt_2 or prompt
            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2

            # we only use the pooled output of the CLIP text encoder
            pooled_prompt_embeds = self._get_clip_prompt_embeds(
                components,
                prompt=prompt,
                device=device,
                num_images_per_prompt=num_images_per_prompt,
            )
            prompt_embeds = self._get_t5_prompt_embeds(
                components,
                prompt=prompt_2,
                num_images_per_prompt=num_images_per_prompt,
                max_sequence_length=max_sequence_length,
                device=device,
            )

        if components.text_encoder is not None:
            if isinstance(components, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
                # retrieve the original scale by scaling back the LoRA layers
                unscale_lora_layers(components.text_encoder, lora_scale)

        if components.text_encoder_2 is not None:
            if isinstance(components, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
                # retrieve the original scale by scaling back the LoRA layers
                unscale_lora_layers(components.text_encoder_2, lora_scale)

        dtype = components.text_encoder.dtype if components.text_encoder is not None else torch.bfloat16
        text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)

        return prompt_embeds, pooled_prompt_embeds, text_ids

    @torch.no_grad()
    def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
        # Get inputs and intermediates
        block_state = self.get_block_state(state)
        self.check_inputs(block_state)

        block_state.device = components._execution_device

        # Encode input prompt
        block_state.text_encoder_lora_scale = (
            block_state.joint_attention_kwargs.get("scale", None)
            if block_state.joint_attention_kwargs is not None
            else None
        )
        (
            block_state.prompt_embeds,
            block_state.pooled_prompt_embeds,
            block_state.text_ids,
        ) = self.encode_prompt(
            components,
            prompt=block_state.prompt,
            prompt_2=block_state.prompt_2,
            prompt_embeds=None,
            pooled_prompt_embeds=None,
            device=block_state.device,
            num_images_per_prompt=1,
            max_sequence_length=512,
            lora_scale=block_state.text_encoder_lora_scale,
        )

        # Add outputs
        self.set_block_state(state, block_state)
        return components, state
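

# ----------------------------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the library API): exercising `FluxTextEncoderStep` directly. `encode_prompt`
# only needs an object exposing `tokenizer`, `text_encoder`, `tokenizer_2`, `text_encoder_2` and
# `_execution_device`, which is exactly what the method above accesses on `components`. The loading helper in the
# commented lines is an assumption and may differ between diffusers versions.
#
#   text_step = FluxTextEncoderStep()
#   # components = FluxModularPipeline.from_pretrained("black-forest-labs/FLUX.1-dev")  # assumed loading helper
#   # prompt_embeds, pooled_prompt_embeds, text_ids = text_step.encode_prompt(
#   #     components, prompt="a photo of a cat", prompt_2=None, max_sequence_length=512
#   # )
#   # prompt_embeds:        [batch, max_sequence_length, T5 hidden size]
#   # pooled_prompt_embeds: [batch, CLIP pooled size]
#   # text_ids:             zeros of shape [max_sequence_length, 3], consumed by RoPE in later denoising blocks
# ----------------------------------------------------------------------------------------------------------------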