from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union

import torch
import torch.nn as nn

from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
from ..attention import JointTransformerBlock
from ..attention_processor import Attention, AttentionProcessor, FusedJointAttnProcessor2_0
from ..embeddings import CombinedTimestepTextProjEmbeddings, PatchEmbed
from ..modeling_outputs import Transformer2DModelOutput
from ..modeling_utils import ModelMixin
from ..transformers.transformer_sd3 import SD3SingleTransformerBlock
from .controlnet import BaseOutput, zero_module


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


@dataclass
class SD3ControlNetOutput(BaseOutput):
    controlnet_block_samples: Tuple[torch.Tensor]


class SD3ControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
    r"""
    ControlNet model for [Stable Diffusion 3](https://huggingface.co/papers/2403.03206).

    Parameters:
        sample_size (`int`, defaults to `128`):
            The width/height of the latents. This is fixed during training since it is used to learn a number of
            position embeddings.
        patch_size (`int`, defaults to `2`):
            Patch size to turn the input data into small patches.
        in_channels (`int`, defaults to `16`):
            The number of latent channels in the input.
        num_layers (`int`, defaults to `18`):
            The number of layers of transformer blocks to use.
        attention_head_dim (`int`, defaults to `64`):
            The number of channels in each head.
        num_attention_heads (`int`, defaults to `18`):
            The number of heads to use for multi-head attention.
        joint_attention_dim (`int`, defaults to `4096`):
            The embedding dimension to use for joint text-image attention.
        caption_projection_dim (`int`, defaults to `1152`):
            The embedding dimension of caption embeddings.
        pooled_projection_dim (`int`, defaults to `2048`):
            The embedding dimension of pooled text projections.
        out_channels (`int`, defaults to `16`):
            The number of latent channels in the output.
        pos_embed_max_size (`int`, defaults to `96`):
            The maximum latent height/width of positional embeddings.
        extra_conditioning_channels (`int`, defaults to `0`):
            The number of extra channels to use in the patch embedding of the conditioning input.
        dual_attention_layers (`Tuple[int, ...]`, defaults to `()`):
            The indices of the transformer blocks that use dual-stream attention.
        qk_norm (`str`, *optional*, defaults to `None`):
            The normalization to use for query and key in the attention layer. If `None`, no normalization is used.
        pos_embed_type (`str`, defaults to `"sincos"`):
            The type of positional embedding to use. Choose between `"sincos"` and `None`.
        use_pos_embed (`bool`, defaults to `True`):
            Whether to use positional embeddings.
        force_zeros_for_pooled_projection (`bool`, defaults to `True`):
            Whether to force zeros for pooled projection embeddings. This is handled in the pipelines by reading the
            config value of the ControlNet model.
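
    Example (a minimal loading sketch; the checkpoint name below is illustrative — substitute an SD3
    ControlNet checkpoint you actually have):

        ```py
        >>> import torch
        >>> from diffusers import SD3ControlNetModel

        >>> controlnet = SD3ControlNetModel.from_pretrained(
        ...     "InstantX/SD3-Controlnet-Canny", torch_dtype=torch.float16
        ... )
        ```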
    TNsample_size
patch_sizein_channels
num_layersattention_head_dimnum_attention_headsjoint_attention_dimcaption_projection_dimpooled_projection_dimout_channelspos_embed_max_sizeextra_conditioning_channelsdual_attention_layers.qk_normpos_embed_typeuse_pos_embed!force_zeros_for_pooled_projectionc                    t         |           |}|
|
n|| _        ||z  | _        |r"t	        ||||| j                  ||      | _        nd | _        t        | j                  |	      | _        |lt        j                  ||      | _
        t        j                  t        |      D cg c]#  }t        | j                  ||d|||v rdnd      % c}      | _        nNd | _
        t        j                  t        |      D cg c]  }t        | j                  ||       c}      | _        t        j                  g       | _        t        t#        | j                              D ]R  }t        j                  | j                  | j                        }t%        |      }| j                   j'                  |       T t	        |||||z   | j                  d       }t%        |      | _        d| _        y c c}w c c}w )N)heightwidthr/   r0   	embed_dimr8   r<   )embedding_dimr6   FT)dimr3   r2   context_pre_onlyr;   use_dual_attention)rD   r3   r2   )r@   rA   r/   r0   rB   r<   )super__init__r7   	inner_dimr   	pos_embedr   time_text_embednnLinearcontext_embedder
ModuleListranger   transformer_blocksr   controlnet_blockslenr   appendpos_embed_inputgradient_checkpointing)selfr.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   default_out_channelsi_controlnet_blockrU   	__class__s                          r+   rH   zSD3ControlNetModel.__init__W   s   * 	*,8,DLJ^,/AA'"!%'..#5-DN "DNA..@U 
 *$&II.ACY$ZD! ')mm #:.
  * NN,?+=). '348M3M4SX
'D# %)D!&(mm #:.  . NN,?+=	'D# "$r!2s42234 	<A!yyH*+;<""))*:;	< %!#&AAnn
  +?;&+#S
s   '(G-?G2
chunk_sizerD   returnc                     |dvrt        d|       |xs d}dt        j                  j                  dt        dt        ffd| j                         D ]  } |||        y)	aX  
        Enables [feed forward
        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).

        Parameters:
            chunk_size (`int`, *optional*):
                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
                over each tensor of dim=`dim`.
            dim (`int`, *optional*, defaults to `0`):
                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
                or dim=1 (sequence length).
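
        Example (a usage sketch, assuming `controlnet` is an instantiated `SD3ControlNetModel`; chunking
        over `dim=1` trades some speed for lower peak memory on long sequences):

            ```py
            >>> controlnet.enable_forward_chunking(chunk_size=2, dim=1)
            ```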
        )r   r   z-Make sure to set `dim` to either 0 or 1, not r   moduler]   rD   c                     t        | d      r| j                  ||       | j                         D ]  } |||        y )Nset_chunk_feed_forward)r]   rD   )hasattrrb   children)r`   r]   rD   childfn_recursive_feed_forwards       r+   rf   zMSD3ControlNetModel.enable_forward_chunking.<locals>.fn_recursive_feed_forward   sE    v78---M* B)%SABr*   N)
ValueErrorr&   rL   Moduleintrd   )rW   r]   rD   r`   rf   s       @r+   enable_forward_chunkingz*SD3ControlNetModel.enable_forward_chunking   su     fLSERSS  _1
	Behhoo 	B3 	BUX 	B mmo 	?F%fj#>	?r*   c                     i }dt         dt        j                  j                  dt        t         t
        f   ffd| j                         D ]  \  }} |||        |S )z
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model with
            indexed by its weight name.
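
        Example (an inspection sketch, assuming `controlnet` is an instantiated model):

            ```py
            >>> procs = {name: type(proc).__name__ for name, proc in controlnet.attn_processors.items()}
            ```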
        """
        # set recursively
        processors = {}

        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
            if hasattr(module, "get_processor"):
                processors[f"{name}.processor"] = module.get_processor()

            for sub_name, child in module.named_children():
                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)

            return processors

        for name, module in self.named_children():
            fn_recursive_add_processors(name, module, processors)

        return processors

    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
        r"""
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

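        Example (a sketch that installs the default joint processor on every attention layer; assumes
        `controlnet` is an instantiated model):

            ```py
            >>> from diffusers.models.attention_processor import JointAttnProcessor2_0

            >>> controlnet.set_attn_processor(JointAttnProcessor2_0())
            ```
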
        """
        count = len(self.attn_processors.keys())

        if isinstance(processor, dict) and len(processor) != count:
            raise ValueError(
                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
            )

        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
            if hasattr(module, "set_processor"):
                if not isinstance(processor, dict):
                    module.set_processor(processor)
                else:
                    module.set_processor(processor.pop(f"{name}.processor"))

            for sub_name, child in module.named_children():
                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)

        for name, module in self.named_children():
            fn_recursive_attn_processor(name, module, processor)

    def fuse_qkv_projections(self):
        """
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
        are fused. For cross-attention modules, key and value projection matrices are fused.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>
        """
        self.original_attn_processors = None

        for _, attn_processor in self.attn_processors.items():
            if "Added" in str(attn_processor.__class__.__name__):
                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")

        self.original_attn_processors = self.attn_processors

        for module in self.modules():
            if isinstance(module, Attention):
                module.fuse_projections(fuse=True)

        self.set_attn_processor(FusedJointAttnProcessor2_0())

    def unfuse_qkv_projections(self):
        """Disables the fused QKV projection if enabled.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>

        """
        if self.original_attn_processors is not None:
            self.set_attn_processor(self.original_attn_processors)

    def _get_pos_embed_from_transformer(self, transformer):
        pos_embed = PatchEmbed(
            height=transformer.config.sample_size,
            width=transformer.config.sample_size,
            patch_size=transformer.config.patch_size,
            in_channels=transformer.config.in_channels,
            embed_dim=transformer.inner_dim,
            pos_embed_max_size=transformer.config.pos_embed_max_size,
        )
        pos_embed.load_state_dict(transformer.pos_embed.state_dict(), strict=True)
        return pos_embed

    @classmethod
    def from_transformer(
        cls, transformer, num_layers=12, num_extra_conditioning_channels=1, load_weights_from_transformer=True
    ):
        config = transformer.config
        config["num_layers"] = num_layers or config.num_layers
        config["extra_conditioning_channels"] = num_extra_conditioning_channels
        controlnet = cls.from_config(config)

        if load_weights_from_transformer:
            controlnet.pos_embed.load_state_dict(transformer.pos_embed.state_dict())
            controlnet.time_text_embed.load_state_dict(transformer.time_text_embed.state_dict())
            controlnet.context_embedder.load_state_dict(transformer.context_embedder.state_dict())
            controlnet.transformer_blocks.load_state_dict(transformer.transformer_blocks.state_dict(), strict=False)

            controlnet.pos_embed_input = zero_module(controlnet.pos_embed_input)

        return controlnet

    def forward(
        self,
        hidden_states: torch.Tensor,
        controlnet_cond: torch.Tensor,
        conditioning_scale: float = 1.0,
        encoder_hidden_states: torch.Tensor = None,
        pooled_projections: torch.Tensor = None,
        timestep: torch.LongTensor = None,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
        return_dict: bool = True,
    ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
        """
        The [`SD3ControlNetModel`] forward method.

        Args:
            hidden_states (`torch.Tensor` of shape `(batch size, channel, height, width)`):
                Input `hidden_states`.
            controlnet_cond (`torch.Tensor`):
                The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
            conditioning_scale (`float`, defaults to `1.0`):
                The scale factor for ControlNet outputs.
            encoder_hidden_states (`torch.Tensor` of shape `(batch size, sequence_len, embed_dims)`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
            pooled_projections (`torch.Tensor` of shape `(batch_size, projection_dim)`): Embeddings projected
                from the embeddings of input conditions.
            timestep ( `torch.LongTensor`):
                Used to indicate denoising step.
            joint_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
                tuple.

        Returns:
            If `return_dict` is True, an [`SD3ControlNetOutput`] is returned, otherwise a `tuple` where the first
            element is the list of controlnet block samples.
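
        Example (a shape-level sketch with random tensors and a deliberately tiny `num_layers`; all
        values are illustrative):

            ```py
            >>> import torch

            >>> controlnet = SD3ControlNetModel(num_layers=2)
            >>> out = controlnet(
            ...     hidden_states=torch.randn(1, 16, 64, 64),
            ...     controlnet_cond=torch.randn(1, 16, 64, 64),
            ...     conditioning_scale=1.0,
            ...     encoder_hidden_states=torch.randn(1, 77, 4096),
            ...     pooled_projections=torch.randn(1, 2048),
            ...     timestep=torch.tensor([10]),
            ... )
            >>> len(out.controlnet_block_samples)
            2
            ```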
        Nscale      ?z\Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective.   z/hidden_states must be 4D when pos_embed is usedr
   z3hidden_states must be 3D when pos_embed is not usedzDencoder_hidden_states must be provided when context_embedder is usedzNencoder_hidden_states should not be provided when context_embedder is not usedr)   )r   r   temb)r"   )copyr}   r   r   getloggerwarningrJ   ndimrg   rN   rK   rU   rQ   r&   is_grad_enabledrV   _gradient_checkpointing_funcziprR   r   r!   )rW   r   r   r   r   r   r   r   r   
lora_scaler   block_res_samplesblockcontrolnet_block_res_samplesblock_res_sampler[   samples                    r+   forwardzSD3ControlNetModel.forwardT  s   N "-%;%@%@%B"/33GSAJJdJ/%16L6P6PQXZ^6_6kr >>%-*<*<*ANOO ^^#(:(:a(?RSS  ,1F1Ncdd""*/D/Pmnn>>% NN=9M##H.@A  ,$($9$9:O$P! &(<(<_(MM,, 	EE$$&4+F+F((4;?;\;\%-	<8)= %)$E$Ee]\`$aM ((4;@&3K`gk<8)=
 %*-$>M 1]4D D-	E0 (*$256GI_I_2` 	^../0@A+GK[J]+](	^
 So'o1C(C'o$'oj1022"<XYY (ps   	H:)   r         @   r   i   i  i   r   `   r   r)   NsincosTT)Nr   )   r   T)r   NNNNT) r#   r$   r%   __doc__ _supports_gradient_checkpointingr   ri   r   r   ru   boolrH   rj   propertyr   r   rv   r	   r   r   r   r   classmethodr   r&   r'   float
LongTensorr   r   r   __classcell__r\   s   @r+   r-   r-   *   sl   (T (,$ "$#%#'&*%)"$+,13!%(0"26%W,W, W, 	W,
 W,  W, !W, !W, !$W,  #W, W,  W, &)W,  %S#XW, #W,  !!W," #W,$ ,0%W, W,t?(3- ?S ?Y] ?: c+=&=!>  0 AE2Dd3PbKbFc2c,d  AF>4C
 jn , %(.2+/%);? xZ||xZ xZ "	xZ
  %||xZ "LLxZ ""xZ !)c3h 8xZ xZ 
u||55	6xZr*   r-   c                        e Zd ZdZ fdZ	 	 	 	 ddej                  deej                     dee	   dej                  dej                  dej                  d	eeeef      d
edeeef   fdZ xZS )SD3MultiControlNetModela  
    `SD3ControlNetModel` wrapper class for Multi-SD3ControlNet

    This module is a wrapper for multiple instances of the `SD3ControlNetModel`. The `forward()` API is designed to be
    compatible with `SD3ControlNetModel`.

    Args:
        controlnets (`List[SD3ControlNetModel]`):
            Provides additional conditioning to the transformer during the denoising process. You must set multiple
            `SD3ControlNetModel` as a list.
    """

    def __init__(self, controlnets):
        super().__init__()
        self.nets = nn.ModuleList(controlnets)

    def forward(
        self,
        hidden_states: torch.Tensor,
        controlnet_cond: List[torch.Tensor],
        conditioning_scale: List[float],
        pooled_projections: torch.Tensor,
        encoder_hidden_states: torch.Tensor = None,
        timestep: torch.LongTensor = None,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
        return_dict: bool = True,
    ) -> Union[SD3ControlNetOutput, Tuple]:
        for i, (image, scale, controlnet) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)):
            block_samples = controlnet(
                hidden_states=hidden_states,
                timestep=timestep,
                encoder_hidden_states=encoder_hidden_states,
                pooled_projections=pooled_projections,
                controlnet_cond=image,
                conditioning_scale=scale,
                joint_attention_kwargs=joint_attention_kwargs,
                return_dict=return_dict,
            )

            # merge the per-block residuals from each controlnet by summation
            if i == 0:
                control_block_samples = block_samples
            else:
                control_block_samples = [
                    control_block_sample + block_sample
                    for control_block_sample, block_sample in zip(control_block_samples[0], block_samples[0])
                ]
                control_block_samples = (tuple(control_block_samples),)

        return control_block_samples