
    biL                     :   d dl Z d dlmZmZmZmZmZmZmZ d dl	Z
d dlZd dlmZmZmZmZmZmZ ddlmZmZ ddlmZmZmZmZ ddlmZ ddlmZmZ dd	l m!Z! dd
l"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0  e&       rd dl1m2c m3Z4 dZ5ndZ5 e'jl                  e7      Z8dZ9	 	 	 	 d$de:de:de;de;fdZ<	 d%dejz                  deej|                     de?fdZ@	 	 	 	 d&dee:   deee?ej                  f      deee:      d eee;      fd!ZB G d" d#e.eee      ZCy)'    N)AnyCallableDictListOptionalTupleUnion)CLIPImageProcessorCLIPTextModelCLIPTokenizerCLIPVisionModelWithProjectionT5EncoderModelT5TokenizerFast   )PipelineImageInputVaeImageProcessor)FluxIPAdapterMixinFluxLoraLoaderMixinFromSingleFileMixinTextualInversionLoaderMixin)AutoencoderKL)FluxControlNetModelFluxMultiControlNetModel)FluxTransformer2DModel)FlowMatchEulerDiscreteScheduler)USE_PEFT_BACKENDis_torch_xla_availableloggingreplace_example_docstringscale_lora_layersunscale_lora_layers)randn_tensor   )DiffusionPipeline   )FluxPipelineOutputTFa  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers.utils import load_image
        >>> from diffusers import FluxControlNetPipeline
        >>> from diffusers import FluxControlNetModel

        >>> base_model = "black-forest-labs/FLUX.1-dev"
        >>> controlnet_model = "InstantX/FLUX.1-dev-controlnet-canny"
        >>> controlnet = FluxControlNetModel.from_pretrained(controlnet_model, torch_dtype=torch.bfloat16)
        >>> pipe = FluxControlNetPipeline.from_pretrained(
        ...     base_model, controlnet=controlnet, torch_dtype=torch.bfloat16
        ... )
        >>> pipe.to("cuda")
        >>> control_image = load_image("https://huggingface.co/InstantX/SD3-Controlnet-Canny/resolve/main/canny.jpg")
        >>> prompt = "A girl in city, 25 years old, cool, futuristic"
        >>> image = pipe(
        ...     prompt,
        ...     control_image=control_image,
        ...     control_guidance_start=0.2,
        ...     control_guidance_end=0.8,
        ...     controlnet_conditioning_scale=1.0,
        ...     num_inference_steps=28,
        ...     guidance_scale=3.5,
        ... ).images[0]
        >>> image.save("flux.png")
        ```
base_seq_lenmax_seq_len
base_shift	max_shiftc                 <    ||z
  ||z
  z  }|||z  z
  }| |z  |z   }|S N )image_seq_lenr'   r(   r)   r*   mbmus           l/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/diffusers/pipelines/flux/pipeline_flux_controlnet.pycalculate_shiftr3   Z   s;     
Z	K,$>?AQ%%A		Q	BI    encoder_output	generatorsample_modec                     t        | d      r |dk(  r| j                  j                  |      S t        | d      r|dk(  r| j                  j                         S t        | d      r| j                  S t        d      )Nlatent_distsampleargmaxlatentsz3Could not access latents of provided encoder_output)hasattrr9   r:   moder<   AttributeError)r5   r6   r7   s      r2   retrieve_latentsr@   h   st     ~}-+2I))00;;		/K84K))..00		+%%%RSSr4   num_inference_stepsdevice	timestepssigmasc                    ||t        d      |dt        t        j                  | j                        j
                  j                               v }|st        d| j                   d       | j                  d
||d| | j                  }t        |      }||fS |dt        t        j                  | j                        j
                  j                               v }|st        d| j                   d       | j                  d
||d| | j                  }t        |      }||fS  | j                  |fd	|i| | j                  }||fS )a  
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    zYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesrC   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)rC   rB   rD   zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)rD   rB   rB   r-   )

ValueErrorsetinspect	signatureset_timesteps
parameterskeys	__class__rC   len)	schedulerrA   rB   rC   rD   kwargsaccepts_timestepsaccept_sigmass           r2   retrieve_timestepsrS   v   s   > !3tuu'3w/@/@AXAX/Y/d/d/i/i/k+ll .y/B/B.C Da b  	 	M)FMfM''	!)n ))) 
	 C(9(9):Q:Q(R(](](b(b(d$ee.y/B/B.C D_ `  	 	GvfGG''	!)n ))) 	 	 3MFMfM''	)))r4   c            C           e Zd ZdZdZddgZg dZ	 	 dKdeded	e	d
e
dedededeeee   ee   ef   dedef fdZ	 	 	 	 	 dLdeeee   f   dededeej4                     deej6                     f
dZ	 	 dMdeeee   f   dedeej4                     fdZ	 	 	 	 	 	 	 dNdeeee   f   deeeee   f      deej4                     dedeej<                     deej<                     dedee   fdZ d Z!d Z"	 	 	 	 	 	 	 	 dOd Z#e$d!        Z%e$d"        Z&e$d#        Z'	 dPd$Z(	 	 dQd%Z)e*d&        Z+e*d'        Z,e*d(        Z-e*d)        Z. ej^                          e0e1      ddddd*ddd+dd,d-d*ddd*dddddddddddd.d/ddd0gdf deeee   f   deeeee   f      d1eeee   f   d2eeeee   f      d3ed4ee   d5ee   d6ed7eee      d8ed9eeee   f   d:eeee   f   d;e2d<eeeee   f      d=eeee   f   dee   d>eeejf                  eejf                     f      d0eej<                     deej<                     deej<                     d?ee2   d@eeejh                        dAee2   dBeeejh                        dCeej<                     dDeej<                     dEee   dFe5dGee6ee7f      dHee8eee6gdf      dIee   def@dJ              Z9 xZ:S )RFluxControlNetPipelinea  
    The Flux pipeline for text-to-image generation.

    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/

    Args:
        transformer ([`FluxTransformer2DModel`]):
            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
        text_encoder_2 ([`T5EncoderModel`]):
            [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
            the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
        tokenizer (`CLIPTokenizer`):
            Tokenizer of class
            [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
        tokenizer_2 (`T5TokenizerFast`):
            Second Tokenizer of class
            [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
    z=text_encoder->text_encoder_2->image_encoder->transformer->vaeimage_encoderfeature_extractor)r<   prompt_embedscontrol_imageNrO   vaetext_encoder	tokenizertext_encoder_2tokenizer_2transformer
controlnetc                    t         |           t        |t        t        f      rt        |      }| j                  |||||||||	|

       t        | dd       r/dt        | j                  j                  j                        dz
  z  nd| _        t        | j                  dz        | _        t        | d      r"| j                   | j                   j"                  nd| _        d	| _        y )
N)
rZ   r[   r]   r\   r^   r_   rO   r`   rV   rW   rZ   r#   r%      )vae_scale_factorr\   M      )super__init__
isinstancelisttupler   register_modulesgetattrrN   rZ   configblock_out_channelsrc   r   image_processorr=   r\   model_max_lengthtokenizer_max_lengthdefault_sample_size)selfrO   rZ   r[   r\   r]   r^   r_   r`   rV   rW   rM   s              r2   rg   zFluxControlNetPipeline.__init__   s     	j4-01*=J%)##!'/ 	 	
 W^^bdikoVpc$((//*L*L&MPQ&Q Rvw  1$BWBWZ[B[\/6t[/IdnnNhDNN++np 	! $' r4   r%      promptnum_images_per_promptmax_sequence_lengthrB   dtypec           	         |xs | j                   }|xs | j                  j                  }t        |t              r|gn|}t        |      }t        | t              r| j                  || j                        }| j                  |d|dddd      }|j                  }| j                  |dd      j                  }	|	j                  d   |j                  d   k\  rbt        j                  ||	      sL| j                  j                  |	d d | j                  d	z
  df         }
t         j#                  d
| d|
        | j%                  |j'                  |      d      d   }| j$                  j                  }|j'                  ||      }|j                  \  }}}|j)                  d	|d	      }|j+                  ||z  |d      }|S )N
max_lengthTFpt)paddingrz   
truncationreturn_lengthreturn_overflowing_tokensreturn_tensorslongestr|   r   r%   zXThe following part of your input was truncated because `max_sequence_length` is set to  	 tokens: output_hidden_statesr   rx   rB   )_execution_devicer[   rx   rh   strrN   r   maybe_convert_promptr\   r^   	input_idsshapetorchequalbatch_decoderq   loggerwarningr]   torepeatview)rs   ru   rv   rw   rB   rx   
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_textrX   _seq_lens                 r2   _get_t5_prompt_embedsz,FluxControlNetPipeline._get_t5_prompt_embeds   s    14110**00'4&&[
d78..vt~~FF&& *&+ ' 
 %..**69UY*Zdd  $(<(<R(@@UcetIu++88DLeLehiLilnLnIn9opLNN'(	,A
 ++N,=,=f,E\a+bcde##))%((uV(D%++7A &,,Q0EqI%**:8M+MwXZ[r4   c           	      d   |xs | j                   }t        |t              r|gn|}t        |      }t        | t              r| j                  || j                        }| j                  |d| j                  dddd      }|j                  }| j                  |dd      j                  }|j                  d   |j                  d   k\  rlt        j                  ||      sV| j                  j                  |d d | j                  d	z
  df         }t        j                  d
| j                   d|        | j                  |j!                  |      d      }	|	j"                  }	|	j!                  | j                  j$                  |      }	|	j'                  d	|      }	|	j)                  ||z  d      }	|	S )Nrz   TFr{   )r|   rz   r}   r   r~   r   r   r   r   r%   z\The following part of your input was truncated because CLIP can only handle sequences up to r   r   r   )r   rh   r   rN   r   r   r\   rq   r   r   r   r   r   r   r   r[   r   pooler_outputrx   r   r   )
rs   ru   rv   rB   r   r   r   r   r   rX   s
             r2   _get_clip_prompt_embedsz.FluxControlNetPipeline._get_clip_prompt_embeds)  s    1411'4&&[
d78..vt~~FFnn 00&+ % 
 %....SW.Xbb  $(<(<R(@@UcetIu>>66q$JcJcfgJgjlJlGl7mnLNN--.i~G )).*;*;F*CZ_)` &33%((t/@/@/F/Fv(V &,,Q0EF%**:8M+MrRr4   prompt_2rX   pooled_prompt_embeds
lora_scalec	                 l   |xs | j                   }|gt        | t              rW|| _        | j                  t
        rt        | j                  |       | j                  t
        rt        | j                  |       t        |t              r|gn|}|D|xs |}t        |t              r|gn|}| j                  |||      }| j                  ||||      }| j                  ,t        | t              rt
        rt        | j                  |       | j                  ,t        | t              rt
        rt        | j                  |       | j                  | j                  j                  n| j                  j                  }	t        j                  |j                   d   d      j#                  ||	      }
|||
fS )a  

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                used in all text-encoders
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
            lora_scale (`float`, *optional*):
                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
        )ru   rB   rv   )ru   rv   rw   rB   r%   r   rB   rx   )r   rh   r   _lora_scaler[   r   r    r]   r   r   r   r!   rx   r_   r   zerosr   r   )rs   ru   r   rB   rv   rX   r   rw   r   rx   text_idss              r2   encode_promptz$FluxControlNetPipeline.encode_promptU  s   F 1411 !j7J&K)D   ,1A!$"3"3Z@"".3C!$"5"5zB'4&& )6H%/#%>zHH $(#?#?&; $@ $ 
 !66&;$7	 7 M ($ 349I#D$5$5zB*$ 349I#D$7$7D+/+<+<+H!!''dN^N^NdNd;;}2215q9<<FRW<X2H<<r4   c                 P   t        | j                  j                               j                  }t	        |t
        j                        s| j                  |d      j                  }|j                  ||      }| j                  |      j                  }|j                  |d      }|S )Nr{   )r   r   r   dim)nextrV   rK   rx   rh   r   TensorrW   pixel_valuesr   image_embedsrepeat_interleave)rs   imagerB   rv   rx   r   s         r2   encode_imagez#FluxControlNetPipeline.encode_image  s    T''2245;;%.**5*FSSEe4))%0==#556KQR5Sr4   c                    g }|t        |t              s|g}t        |      | j                  j                  j
                  k7  r9t        dt        |       d| j                  j                  j
                   d      |D ]-  }| j                  ||d      }|j                  |d d d f          / nt        |t              s|g}t        |      | j                  j                  j
                  k7  r9t        dt        |       d| j                  j                  j
                   d      |D ]  }|j                  |        g }|D ]@  }t        j                  |g|z  d      }|j                  |	      }|j                  |       B |S )
NzK`ip_adapter_image` must have same length as the number of IP Adapters. Got z images and z IP Adapters.r%   zR`ip_adapter_image_embeds` must have same length as the number of IP Adapters. Got z image embeds and r   r   rB   )rh   ri   rN   r_   encoder_hid_projnum_ip_adaptersrF   r   appendr   catr   )rs   ip_adapter_imageip_adapter_image_embedsrB   rv   r   single_ip_adapter_imagesingle_image_embedss           r2   prepare_ip_adapter_image_embedsz6FluxControlNetPipeline.prepare_ip_adapter_image_embeds  s    "*.5$4#5 #$(8(8(I(I(Y(YY abefvbwax  yE  FJ  FV  FV  Fg  Fg  Fw  Fw  Ex  xE  F  ,< B'&*&7&78OQWYZ&[###$7a$@AB 5t<+B*C'*+t/?/?/P/P/`/`` hil  nE  jF  iG  GY  Z^  Zj  Zj  Z{  Z{  ZK  ZK  YL  LY  Z  (? 9###$789 #%#/ 	@"'))-@,ADY,Y_`"a"5"8"8"8"G#**+>?	@
 '&r4   c           
          | j                   dz  z  dk7  s| j                   dz  z  dk7  r,t        j                  d j                   dz   d| d| d       |Lt         fd|D              s8t	        d j
                   d	|D cg c]  }| j
                  vs| c}       ||t	        d
| d| d      ||t	        d| d| d      ||t	        d      |7t        |t              s't        |t              st	        dt        |             |7t        |t              s't        |t              st	        dt        |             ||t	        d| d| d      ||t	        d| d| d      |A|?|j                  |j                  k7  r&t	        d|j                   d|j                   d      ||	t	        d      ||
t	        d      ||dkD  rt	        d|       y y c c}w )Nr#   r   z-`height` and `width` have to be divisible by z	 but are z and z(. Dimensions will be resized accordinglyc              3   :   K   | ]  }|j                   v   y wr,   )_callback_tensor_inputs).0krs   s     r2   	<genexpr>z6FluxControlNetPipeline.check_inputs.<locals>.<genexpr>  s#      F
23A---F
s   z2`callback_on_step_end_tensor_inputs` has to be in z, but found zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.z Cannot forward both `prompt_2`: zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z4`prompt_2` has to be of type `str` or `list` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: z)Cannot forward both `negative_prompt_2`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` .zIf `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`.zIf `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`.rt   z8`max_sequence_length` cannot be greater than 512 but is )rc   r   r   allrF   r   rh   r   ri   typer   )rs   ru   r   heightwidthnegative_promptnegative_prompt_2rX   negative_prompt_embedsr   negative_pooled_prompt_embeds"callback_on_step_end_tensor_inputsrw   r   s   `             r2   check_inputsz#FluxControlNetPipeline.check_inputs  s1    T**Q./14AVAVYZAZ8[_`8`NN?@U@UXY@Y?ZZcdjckkpqvpw  x`  a .9# F
7YF
 C
 DTEaEaDbbn  |^  pHvw  bc  ko  kG  kG  bGpq  pH  oI  J  -";08N}o ^0 0  !m&?28*<RS`Ra b0 0  ^ 5w  FC)@TZ\`IaQRVW]R^Q_`aa!:h+DZX`bfMgSTXYaTbScdee&+A+M9/9J K*++]_  */E/Q;<M;N O*++]_ 
 $)?)K""&<&B&BB --:-@-@,A B.445Q8  $)=)E U  "-2O2W y  */BS/HWXkWlmnn 0I*e pHs   G7G7c                 4   t        j                  ||d      }|d   t        j                  |      d d d f   z   |d<   |d   t        j                  |      d d d f   z   |d<   |j                  \  }}}|j	                  ||z  |      }|j                  ||      S )Nr   ).r%   ).r#   r   )r   r   aranger   reshaper   )	r   r   r   rB   rx   latent_image_idslatent_image_id_heightlatent_image_id_widthlatent_image_id_channelss	            r2   _prepare_latent_image_idsz0FluxControlNetPipeline._prepare_latent_image_ids%  s     !;;vua8#3F#;ell6>RSTVZSZ>[#[ #3F#;ell5>QRVXYRY>Z#Z RbRhRhO 57O+33"%::<T
  ""&">>r4   c                     | j                  |||dz  d|dz  d      } | j                  dddddd      } | j                  ||dz  |dz  z  |dz        } | S )Nr#   r      r%   r      )r   permuter   )r<   r   num_channels_latentsr   r   s        r2   _pack_latentsz$FluxControlNetPipeline._pack_latents4  sl     ,,z+?1aQVZ[Q[]^_//!Q1a3//*v{uz.JL`cdLder4   c                    | j                   \  }}}dt        |      |dz  z  z  }dt        |      |dz  z  z  }| j                  ||dz  |dz  |dz  dd      } | j                  dddddd      } | j	                  ||dz  ||      } | S )Nr#   r   r   r   r%   r   )r   intr   r   r   )r<   r   r   rc   r   num_patcheschannelss          r2   _unpack_latentsz&FluxControlNetPipeline._unpack_latents=  s     -4MM)
K c&k&6&:;<SZ$4q$89:,,z6Q;
HPQMSTVWX//!Q1a3//*h5.A65Qr4   c	                    dt        |      | j                  dz  z  z  }dt        |      | j                  dz  z  z  }||||f}	|0| j                  ||dz  |dz  ||      }
|j                  ||      |
fS t	        |t
              r)t        |      |k7  rt        dt        |       d| d      t        |	|||      }| j                  |||||      }| j                  ||dz  |dz  ||      }
||
fS )Nr#   r   z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)r6   rB   rx   )
r   rc   r   r   rh   ri   rN   rF   r"   r   )rs   r   r   r   r   rx   rB   r6   r<   r   r   s              r2   prepare_latentsz&FluxControlNetPipeline.prepare_latentsO  s8    c&kd&;&;a&?@ASZD$9$9A$=>?165A#==j&TU+W\`aWacikpq::V5:9;KKKi&3y>Z+GA#i.AQ R&<'gi 
 u	&PUV$$Wj:NPVX]^99*fPQkSX\]S]_eglm(((r4   c
                 0   t        |t        j                        rn| j                  j	                  |||      }|j
                  d   }
|
dk(  r|}n|}|j                  |d      }|j                  ||      }|r|	st        j                  |gdz        }|S )N)r   r   r   r%   r   r   r#   )	rh   r   r   ro   
preprocessr   r   r   r   )rs   r   r   r   r   rv   rB   rx   do_classifier_free_guidance
guess_modeimage_batch_size	repeat_bys               r2   prepare_imagez$FluxControlNetPipeline.prepare_images  s     eU\\*((33E&PU3VE ;;q>q "I .I''	q'9e4&zIIugk*Er4   c                     | j                   S r,   )_guidance_scalers   s    r2   guidance_scalez%FluxControlNetPipeline.guidance_scale  s    ###r4   c                     | j                   S r,   )_joint_attention_kwargsr   s    r2   joint_attention_kwargsz-FluxControlNetPipeline.joint_attention_kwargs  s    +++r4   c                     | j                   S r,   )_num_timestepsr   s    r2   num_timestepsz$FluxControlNetPipeline.num_timesteps  s    """r4   c                     | j                   S r,   )
_interruptr   s    r2   	interruptz FluxControlNetPipeline.interrupt  s    r4         ?   g      @g        pilTr<   r   r   true_cfg_scaler   r   rA   rD   r   control_guidance_startcontrol_guidance_endrY   control_modecontrolnet_conditioning_scaler6   r   r   negative_ip_adapter_image negative_ip_adapter_image_embedsr   r   output_typereturn_dictr   callback_on_step_endr   c!                    |xs | j                   | j                  z  }|xs | j                   | j                  z  }t        |t              s t        |t              rt	        |      |gz  }nt        |t              s t        |t              rt	        |      |gz  }ngt        |t              sWt        |t              sGt        | j
                  t              rt	        | j
                  j                        nd}!|!|gz  |!|gz  }}| j                  ||||||||||||        |
| _	        || _
        d| _        |t        |t              rd}"n-|t        |t              rt	        |      }"n|j                  d   }"| j                  }#| j                  j                   }$| j"                  | j"                  j%                  dd      nd}%|dkD  xr |du}&| j'                  |||||#|| |%      \  }}}'|&r| j'                  |||||#|| |%      \  }}}(| j                  j(                  j*                  dz  })t        | j
                  t,              r| j/                  ||||"|z  ||#| j0                  j                   	      }|j                  d
d \  }}| j
                  j2                  dnd}*| j
                  j2                  t5        | j0                  j7                  |      |      }|| j0                  j(                  j8                  z
  | j0                  j(                  j:                  z  }|j                  dd \  }+},| j=                  ||"|z  |)|+|,      }|t        |t>              stA        d      tC        jD                  |      jG                  |#tB        jH                        }|jK                  dd      jM                  |j                  d   d      }n(t        | j
                  t              rg }-| j
                  j                  d   j2                  dnd}*tO        |      D ]  \  }.}/| j/                  |/|||"|z  ||#| j0                  j                   	      }/|/j                  d
d \  }}| j
                  j                  d   j2                  t5        | j0                  j7                  |/      |      }/|/| j0                  j(                  j8                  z
  | j0                  j(                  j:                  z  }/|/j                  dd \  }+},| j=                  |/|"|z  |)|+|,      }/|-jQ                  |/        |-}t        |t              r"t	        |      t	        |      k7  rtA        d      t        |t              s|gt	        |      z  }g }0|D ]j  }1|1d}1tC        jD                  |1      jM                  |-d   j                  d         jG                  |#tB        jH                        }|0jQ                  |       l |0}| j                  j(                  j*                  dz  })| jS                  |"|z  |)|||j                   |#||      \  }}2|	tU        jV                  dd|z  |      n|	}	|j                  d   }3tY        |3| jZ                  j(                  j%                  dd      | jZ                  j(                  j%                  dd      | jZ                  j(                  j%                  dd      | jZ                  j(                  j%                  dd            }4t]        | jZ                  ||#|	|4      \  }5}t_        t	        |5      || jZ                  j`                  z  z
  d      }6t	        |5      | _1        g }7te        t	        |5            D ]  }.tg        ||      D 89cg c]8  \  }8}9dti        |.t	        |5      z  |8k  xs |.dz   t	        |5      z  |9kD        z
  : }:}8}9|7jQ                  t        | j
                  t,              r|:d   n|:        ||-|+|)tU        jj                  ||dftT        jl                        }n0|.|,||(tU        jj                  ||dftT        jl                        }| j"                  i | _
        d};d}<||| jo                  |||#|"|z        };||| jo                  |||#|"|z        }<| jq                  |      5 }=tO        |5      D ]  \  }.}>| jr                  r|;|;| j                  d<   |>jM                  |j                  d         jG                  |j                         }?t        | j
                  t              r.| j
                  j                  d   j(                  jt                  }@n | j
                  j(                  jt                  }@@rtC        jD                  |
g|#      nd}A|AAjM                  |j                  d         nd}At        |7|.   t              r%tg        ||7|.         D B8cg c]
  \  }B}8|B|8z   }C}B}8n|}Dt        |Dt              rDd   }DD|7|.   z  }C| j                  |||C|?d z  A|||'|2| j"                  d!      \  }E}F| j                  j(                  jt                  rtC        jD                  |
g|#      nd}A|AAjM                  |j                  d         nd}A| j                  ||?d z  |A||EF|'|2| j"                  d*"      d   }G|&rI|<|<| j                  d<   | j                  ||?d z  A||EF|'|2| j"                  d|*"      d   }H|H|G|Hz
  z  z   }G|j                   }I| jZ                  jw                  G|>|d#      d   }|j                   |Ik7  r9tB        jx                  jz                  j}                         r|jG                  I      }|Zi }J|D ]  }Kt               |K   J|K<     || |.|>J      }L|Lj                  d$|      }|Lj                  d%|      }|Lj                  d&|      }|.t	        |5      dz
  k(  s'|.dz   |6kD  r/|.dz   | jZ                  j`                  z  dk(  r|=j                          t        s|t        j                           	 ddd       |d'k(  r|}Mn| j                  |||| j                        }|| j0                  j(                  j:                  z  | j0                  j(                  j8                  z   }| j0                  j                  |d#      d   }M| j                  j                  |M|(      }M| j                          |sMfS t        M)      S c c}9}8w c c}8}Bw # 1 sw Y   xY w)*a  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
                instead.
            prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt`
                will be used instead.
            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The height in pixels of the generated image. This is set to 1024 by default for the best results.
            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The width in pixels of the generated image. This is set to 1024 by default for the best results.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            sigmas (`List[float]`, *optional*):
                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                will be used.
            guidance_scale (`float`, *optional*, defaults to 7.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. A higher guidance scale encourages the model to generate images that are closely
                linked to the text `prompt`, usually at the expense of lower image quality.
            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
                The percentage of total steps at which the ControlNet starts applying.
            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
                The percentage of total steps at which the ControlNet stops applying.
            control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
                    `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
                The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
                specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted
                as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or
                width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`,
                images must be passed as a list such that each element of the list can be correctly batched for input
                to a single ControlNet.
            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
                the corresponding scale as a list.
            control_mode (`int` or `List[int]`, *optional*, defaults to None):
                The control mode when applying ControlNet-Union.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
                provided, embeddings are computed from the `ip_adapter_image` input argument.
            negative_ip_adapter_image:
                (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
            negative_ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
                provided, embeddings are computed from the `ip_adapter_image` input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
            joint_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.

        Examples:

        Returns:
            [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
            is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
            images.
        r%   )r   r   rX   r   r   r   r   rw   FNr   scale)ru   r   rX   r   rB   rv   rw   r   r   )r   r   r   r   rv   rB   rx   T)r6   r#   zB For `FluxControlNet`, `control_mode` should be an `int` or `None`)rx   r   zFor Multi-ControlNet, `control_mode` must be a list of the same  length as the number of controlnets (control images) specifiedr   base_image_seq_len   max_image_seq_len   r)         ?r*   ffffff?)rD   r1   r   )totalr   r   i  )hidden_statescontrolnet_condcontrolnet_modeconditioning_scaletimestepguidancepooled_projectionsencoder_hidden_statestxt_idsimg_idsr   r  )r  r  r  r  r  controlnet_block_samplescontrolnet_single_block_samplesr  r  r   r  controlnet_blocks_repeat)r  r<   rX   rY   latent)r  )images)Krr   rc   rh   ri   rN   r`   r   netsr   r   r   r   r   r   r   r_   rx   r   getr   rm   in_channelsr   r   rZ   input_hint_blockr@   encodeshift_factorscaling_factorr   r   rF   r   tensorr   longr   expand	enumerater   r   nplinspacer3   rO   rS   maxorderr   rangezipfloatr   uint8r   progress_barr   guidance_embedsstepbackendsmpsis_availablelocalspopupdateXLA_AVAILABLExm	mark_stepr   decodero   postprocessmaybe_free_model_hooksr&   )Nrs   ru   r   r   r   r   r   r   rA   rD   r   r   r   rY   r   r   rv   r6   r<   rX   r   r   r   r   r  r   r   r  r  r   r  r   rw   multr   rB   rx   r   do_true_cfgr   r   r   r  height_control_imagewidth_control_imagecontrol_imagesicontrol_image_control_modescmoder   r.   r1   rC   num_warmup_stepscontrolnet_keepsekeepsr   negative_image_embedsr1  tr  use_guidancer  c
cond_scalecontrolnet_cond_scaler  r  
noise_predneg_noise_predlatents_dtypecallback_kwargsr   callback_outputsr   sN                                                                                 r2   __call__zFluxControlNetPipeline.__call__  s   P K433d6K6KKI11D4I4II0$7JG[]a<b%()=%>BXAY%Y"0$7JG]_c<d#&'=#>BVAW#W 2D9*MacgBh0:4??Ld0e3t++,klD.//,-- %9" 	+/'#9!5*G/Q 3 	 	
  .'=$ *VS"9JJvt$<VJ&,,Q/J''  && ?C>Y>Y>eD''++GT:ko 	 %q(H_D-H
 '!5"7 3!  	
		
  
 ""&*4%B&;$7% # 		&-  $//66BBaGdoo':; ..#%(==&;hhnn / M *//4MFE 150P0P0Xu^b$//7 01O[d e!.1M1M!MQUQYQYQ`Q`QoQo o =J<O<OPQPR<S9$&9 $ 2 2!!66(('! '!,4$%ijj$||L9<<V5::<V+00Q7>>}?R?RST?UWXY)ABN040D0DQ0G0X0X0`ufj$%.}%= 6!>!%!3!3(!),AA*?!((.. "4 " !/ 4 4RS 9??''*;;C%5dhhoon6Uaj%kN&4txx7S7S&SW[W_W_WfWfWuWu%uN AO@T@TUVUW@X=(*=%)%7%7&"%::,,+&N %%n5568 +M ,-#l2Cs=GY2Y X  lD1 ,~M0BBM% 3=E$||E299.:K:Q:QRS:TUXXY_glgqgqXr$$\2	3
 )L  $//66BBaG$($8$8.. 	%
!! TZSaS!&9"9;NOgma(NN!!%%&:C@NN!!%%&94@NN!!%%lC8NN!!%%k48
 *<NN*
&	& s9~0CdnnFZFZ0ZZ\]^!)n s9~& 	lA   68LMAq eAI.2Rq1uI6NQR6RSSE  ""z$//K^/_58ejk	l (,C,O%-2R2Z(*%1C288(T%&+B+J%15U5a!xx(:"((K&&.+-D( $'+B+N?? '22	L %04T4`$($H$H)022	%! %89 i	#\!), h#1>>+NZD001JK88GMM!$4588Gdoo/GH#'??#7#7#:#A#A#Q#QL#'??#9#9#I#ILLX5<<(8H^b@H@T8??7==+;<Z^oa0$7478UWfghWi4j!kDAq!a%!kJ!k,I)!"7>0Ea0H-!69K!KJ MQOO")$1$0'1%_%';*7$,+/+F+F % M\ MI(*I  FJEUEUE\E\ElElELL.!1&Arv  AI@T8??7==+;<Z^!--")%_%';*7-E4S$,+/+F+F %-E .  
 ,8Rg445NO%)%5%5&-!)D!)+H.D1I8W ( 0/3/J/J$)1I &6 & &N "0.JQ_D_2`!`J !(..--j!WRW-XYZ[==M1~~))668")**]";'3&(O? 9-3Xa[*9';D!Q'X$.229gFG$4$8$8-$XM$4$8$8-$XM I**A9I/IqSTuX\XfXfXlXlNlpqNq '') LLNQh#i	#V ("E **7FE4CXCXYG!?!??488??C_C__GHHOOGO?BE((44U4TE 	##%8O!//n "l'i	# i	#s,   =t4D3u 7t:
Iu u :u  u	)NN)Nr%   rt   NN)r%   N)NNr%   NNrt   N)NNNNNNNNr,   )FF);__name__
__module____qualname____doc__model_cpu_offload_seq_optional_componentsr   r   r   r   r   r   r   r   r	   r   r   r   r   r   r
   rg   r   r   r   r   rB   rx   r   r   FloatTensorr/  r   r   r   r   staticmethodr   r   r   r   r   propertyr   r   r   r   no_gradr   EXAMPLE_DOC_STRINGr   	Generatorr   boolr   r   r   rY  __classcell__)rM   s   @r2   rU   rU      s   4 \+-@AK 8<04&'2&' &' $	&'
 !&' '&' %&' ,&' &9!:EBU<VXpp
&' 5&' .&'T )-%&#&)-'+/c49n%/  #/ !	/
 &/ $/h &')-	*c49n%*  #* &	*^ 59)-%&59<@#&&*P=c49n%P= 5d3i01P= &	P=
  #P=   1 12P= 'u'8'89P= !P= UOP=f	"'T #!&*+/ KoZ ? ?    2 !)Z %* D $ $ , , # #   U]]_12 )-4815=A # $##%(, #<?:=,08<CF/0MQ/359<@9=@DBFIM>BEI%* ;?KO9B#&Cs0c49n%s0 5d3i01s0 sDI~.	s0
 $E#tCy.$9:s0 s0 s0 }s0 !s0 e%s0 s0 !&eT%[&8 9s0 $E4;$67s0 *s0 uS$s)^45s0  (-UDK-?'@!s0"  (}#s0$ E%//43H"HIJ%s0& %++,'s0(   1 12)s0* 'u'8'89+s0, ##56-s0. "*$u||*<!=/s00 $,,>#?1s02 +343E*F3s04 !)):): ;5s06 (00A0A'B7s08 c]9s0: ;s0< !)c3h 8=s0> 'xc40@$0F'GH?s0@ -1IAs0B !Cs0 3 s0r4   rU   )r	  r  r  r  )Nr:   )NNNN)DrH   typingr   r   r   r   r   r   r	   numpyr)  r   transformersr
   r   r   r   r   r   ro   r   r   loadersr   r   r   r   models.autoencodersr   "models.controlnets.controlnet_fluxr   r   models.transformersr   
schedulersr   utilsr   r   r   r   r    r!   utils.torch_utilsr"   pipeline_utilsr$   pipeline_outputr&   torch_xla.core.xla_modelcore	xla_modelr;  r:  
get_loggerrZ  r   rd  r   r/  r3   r   re  r   r@   rB   rS   rU   r-   r4   r2   <module>rx     sn    D D D    E p p 0 _ 9 9  . . / ))MM 
		H	% D 

 
 	

 
 ck
TLL
T-5eoo-F
T\_
T  *.15%)$(8*!#8* U3,-.8* S	"	8*
 T%[!8*vi0.0CEXZl i0r4   