
    biM                    t   d dl mZ d dlmZ d dlmZmZmZmZm	Z	m
Z
 d dlZd dlZd dlmZmZ ddlmZmZ ddlmZmZ dd	lmZ d
dlmZmZmZmZmZmZmZ d
dlm Z m!Z! d
dl"m#Z# d
dl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+ d
dl,m-Z- ddl.m/Z/  ej`                  e1      Z2e G d de             Z3 G d dejh                        Z5 G d dejh                        Z6 G d dejh                        Z7	 	 	 	 	 	 	 	 d;de8de8de8de8de8dee8   d ee
e8e	e8   f      d!ee8   d"ee8   d#e9d$ee9   d%ee9   fd&Z:	 	 	 	 	 	 	 d<d'e8d(e8dee8   dee8   d e8d!ee8   d"ee8   d$e9d%e9fd)Z;d*e8d+e8d,ee8   fd-Z< G d. d/e#e      Z= G d0 d1e#e      Z> G d2 d3ejh                        Z? G d4 d5ejh                        Z@ G d6 d7ejh                        ZAd=d8ZBd9 ZCd: ZDy)>    )	dataclass)gcd)AnyDictListOptionalTupleUnionN)Tensornn   )ConfigMixinregister_to_config)
BaseOutputlogging)apply_freeu   )ADDED_KV_ATTENTION_PROCESSORSCROSS_ATTENTION_PROCESSORS	AttentionAttentionProcessorAttnAddedKVProcessorAttnProcessorFusedAttnProcessor2_0)TimestepEmbedding	Timesteps)
ModelMixin)CrossAttnDownBlock2DCrossAttnUpBlock2DDownsample2DResnetBlock2DTransformer2DModelUNetMidBlock2DCrossAttn
Upsample2D)UNet2DConditionModel   )ControlNetConditioningEmbeddingc                        e Zd ZU dZdZeed<   y)ControlNetXSOutputa=  
    The output of [`UNetControlNetXSModel`].

    Args:
        sample (`Tensor` of shape `(batch_size, num_channels, height, width)`):
            The output of the `UNetControlNetXSModel`. Unlike `ControlNetOutput` this is NOT to be added to the base
            model output, but is already the final output.
    Nsample)__name__
__module____qualname____doc__r*   r   __annotations__     e/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/diffusers/models/controlnets/controlnet_xs.pyr)   r)   4   s     FFr1   r)   c                        e Zd ZdZ	 	 ddej
                  dej
                  dej
                  deej
                     deej                     f
 fdZ xZ	S )	DownBlockControlNetXSAdapterz}Components that together with corresponding components from the base model will form a
    `ControlNetXSCrossAttnDownBlock2D`resnetsbase_to_ctrlctrl_to_base
attentionsdownsamplerc                 h    t         |           || _        || _        || _        || _        || _        y N)super__init__r5   r6   r7   r8   downsamplers)selfr5   r6   r7   r8   r9   	__class__s         r2   r=   z%DownBlockControlNetXSAdapter.__init__F   s6     	(($'r1   )NN)
r+   r,   r-   r.   r   
ModuleListr   Conv2dr=   __classcell__r@   s   @r2   r4   r4   B   sh    * /3+/(( mm( mm	(
 R]]+( bii(( (r1   r4   c                   X     e Zd ZdZdedej                  dej                  f fdZ xZS )MidBlockControlNetXSAdapterz|Components that together with corresponding components from the base model will form a
    `ControlNetXSCrossAttnMidBlock2D`midblockr6   r7   c                 L    t         |           || _        || _        || _        y r;   )r<   r=   rG   r6   r7   )r?   rG   r6   r7   r@   s       r2   r=   z$MidBlockControlNetXSAdapter.__init__Z   s%     ((r1   )	r+   r,   r-   r.   r#   r   rA   r=   rC   rD   s   @r2   rF   rF   V   s3    ))!8 ) )egerer ) )r1   rF   c                   <     e Zd ZdZdej
                  f fdZ xZS )UpBlockControlNetXSAdapterzwComponents that together with corresponding components from the base model will form a `ControlNetXSCrossAttnUpBlock2D`r7   c                 0    t         |           || _        y r;   )r<   r=   r7   )r?   r7   r@   s     r2   r=   z#UpBlockControlNetXSAdapter.__init__d   s    (r1   )r+   r,   r-   r.   r   rA   r=   rC   rD   s   @r2   rJ   rJ   a   s     B)R]] ) )r1   rJ   base_in_channelsbase_out_channelsctrl_in_channelsctrl_out_channelstemb_channelsmax_norm_num_groupstransformer_layers_per_blocknum_attention_headscross_attention_dimadd_downsampleupcast_attentionuse_linear_projectionc                 J   d}g }g }g }g }t        |t              r|g|z  }t        |      D ]  }|dk(  r| n|} |dk(  r|n|}|j                  t	        | |              |j                  t        || z   ||t        || z   |      t        ||      d             |r3|j                  t        |||z  |||   |	||t        ||                   |j                  t	        ||              |
rI|j                  t	        ||             t        ||z   d|d	      }|j                  t	        ||             nd }t        t        j                  |      t        j                  |      t        j                  |      
      }|rt        j                  |      |_        |||_        |S )Nr   r   
max_factorh㈵>in_channelsout_channelsrP   groups
groups_outepsr]   
num_layersrT   rW   rV   norm_num_groupsTopuse_convr^   name)r5   r6   r7   )
isinstanceintrangeappendmake_zero_convr!   find_largest_factorr"   r    r4   r   rA   r8   r>   )rL   rM   rN   rO   rP   rQ   has_crossattnrR   rS   rT   rU   rV   rW   rc   r5   r8   r7   r6   ir>   down_block_componentss                        r2   get_down_block_adapterrr   i   s    JGJLL.4(D'E
'R$: #R/0Av+;L/0Av+;L 	N+;=MNO,/??.+*+;>N+N[no./@M`a		
 "'%)<< 1;A>(;*?%5$78IVi$j	 	N+<>OPQG#RJ  	N+<>OPQ# 11DO`gk
 	N+<>OPQ8g&]]<0]]<0 +-==+D(-9*  r1   base_channelsctrl_channelsc	                     t        | |       }	t        ||| z   ||t        t        ||| z         |      ||||	      }
t        ||       }t	        |	|
|      S )N	rR   r]   r^   rP   resnet_groupsrT   rS   rW   rV   )r6   rG   r7   )rm   r#   rn   r   rF   )rs   rt   rP   rQ   rR   rS   rT   rV   rW   r6   rG   r7   s               r2   get_mid_block_adapterrx      sl     "-?L&%A!M1"#)#m]]=Z*[]pq//3)H "-?L&L8bnoor1   r^   prev_output_channelctrl_skip_channelsc                     g }d}t        |      D ])  }|dk(  r|n| }|j                  t        ||   |             + t        t	        j
                  |            S )Nr   r   )r7   )rk   rl   rm   rJ   r   rA   )r^   ry   rz   r7   rc   rp   resnet_in_channelss          r2   get_up_block_adapterr}      se    
 LJ: W45F0N+=a+@BTUVW &2==3NOOr1   c                    :    e Zd ZdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddededee   dede	de
eee   f   dee   d	ee   d
edee   dee   de
eee   f   de	dede	f fd       Ze	 	 	 	 	 	 	 	 ddedee   deee      deee      de	dedededee   fd       Zd Z xZS )ControlNetXSAdaptera  
    A `ControlNetXSAdapter` model. To use it, pass it into a `UNetControlNetXSModel` (together with a
    `UNet2DConditionModel` base model).

    This model inherits from [`ModelMixin`] and [`ConfigMixin`]. Check the superclass documentation for it's generic
    methods implemented for all models (such as downloading or saving).

    Like `UNetControlNetXSModel`, `ControlNetXSAdapter` is compatible with StableDiffusion and StableDiffusion-XL. It's
    default parameters are compatible with StableDiffusion.

    Parameters:
        conditioning_channels (`int`, defaults to 3):
            Number of channels of conditioning input (e.g. an image)
        conditioning_channel_order (`str`, defaults to `"rgb"`):
            The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
        conditioning_embedding_out_channels (`tuple[int]`, defaults to `(16, 32, 96, 256)`):
            The tuple of output channels for each block in the `controlnet_cond_embedding` layer.
        time_embedding_mix (`float`, defaults to 1.0):
            If 0, then only the control adapters's time embedding is used. If 1, then only the base unet's time
            embedding is used. Otherwise, both are combined.
        learn_time_embedding (`bool`, defaults to `False`):
            Whether a time embedding should be learned. If yes, `UNetControlNetXSModel` will combine the time
            embeddings of the base model and the control adapter. If no, `UNetControlNetXSModel` will use the base
            model's time embedding.
        num_attention_heads (`list[int]`, defaults to `[4]`):
            The number of attention heads.
        block_out_channels (`list[int]`, defaults to `[4, 8, 16, 16]`):
            The tuple of output channels for each block.
        base_block_out_channels (`list[int]`, defaults to `[320, 640, 1280, 1280]`):
            The tuple of output channels for each block in the base unet.
        cross_attention_dim (`int`, defaults to 1024):
            The dimension of the cross attention features.
        down_block_types (`list[str]`, defaults to `["CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"]`):
            The tuple of downsample blocks to use.
        sample_size (`int`, defaults to 96):
            Height and width of input/output sample.
        transformer_layers_per_block (`Union[int, Tuple[int]]`, defaults to 1):
            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
        upcast_attention (`bool`, defaults to `True`):
            Whether the attention computation should always be upcasted.
        max_norm_num_groups (`int`, defaults to 32):
            Maximum number of groups in group normal. The actual number will be the largest divisor of the respective
            channels, that is <= max_norm_num_groups.
    conditioning_channelsconditioning_channel_order#conditioning_embedding_out_channelstime_embedding_mixlearn_time_embeddingrS   block_out_channelsbase_block_out_channelsrT   down_block_typessample_sizerR   rV   rQ   rW   c                    t         !|           |d   }|d   dz  }|dvrt        d|       t        |      t        |
      k7  rt        d| d|
 d      t	        |t
        t        f      s|gt        |
      z  }t	        |	t
        t        f      s|	gt        |
      z  }	t	        |t
        t        f      s|gt        |
      z  }t        |      t        |
      k7  rt        d| d|
 d      t        |d   ||	      | _        |rt        ||      | _
        nd | _
        t        j                  g       | _        t        j                  g       | _        t        j                  d|d   d
d      | _        t#        |d   |d         | _        |d   }|d   }t'        |
      D ]c  \  }}|}||   }|}||   }d|v }|t        |
      dz
  k(  }| j                  j)                  t+        |||||||||   ||   |	|   | ||             e t-        |d   |d   ||d   |d   |	d   ||      | _        |d   g}t'        |      D ]/  \  }}|t        |      dz
  k  rd
nd}|j1                  |g|z         1 t        t3        |            }|d   }t5        t        |
            D ]V  }|}||   }t5        d
      D cg c]  }|j7                          } }| j                  j)                  t9        |||              X y c c}w )Nr      )rgbbgrz&unknown `conditioning_channel_order`: zbMust provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: z. `down_block_types`: .zdMust provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: conditioning_embedding_channelsr   r   r   r&   kernel_sizepadding	CrossAttn)rL   rM   rN   rO   rP   rQ   ro   rR   rS   rT   rU   rV   rW   )rs   rt   rP   rR   rS   rT   rV   rW   r   )r^   ry   rz   )r<   r=   
ValueErrorlenri   listtupler'   controlnet_cond_embeddingr   time_embeddingr   rA   down_blocksup_connectionsrB   conv_inrm   control_to_base_for_conv_in	enumeraterl   rr   rx   	mid_blockextendreversedrk   popr}   )"r?   r   r   r   r   r   rS   r   r   rT   r   r   rR   rV   rQ   rW   time_embedding_input_dimtime_embedding_dimrM   rO   rp   down_block_typerL   rN   ro   is_final_blockrz   r^   number_of_subblocks reversed_base_block_out_channelsprev_base_output_channel_ctrl_skip_channels_r@   s"                                    r2   r=   zControlNetXSAdapter.__init__$  s   0 	#:1#= 4Q7!; &^;EF`Eabcc!"c*:&;;t  vH  uI  I_  `p  _q  qr  s  6uF,H+ICP`La+a(-e}=#6"7#>N:O"O-e}=#6"7#>N:O"O"#s+;'<<v  xK  wL  Lb  cs  bt  tu  v 
 *I,>q,AB"7*
&  "34LN`"aD"&D==, mmB/ yy$6q$9qRST+9:LQ:OQhijQk+l( 4A6.q1"+,<"= 	A0 7 :0 21 5'?:M#&6"7!";;N##&%5&7%5&7"4(;"/1Ma1P(;A(>(;A(>'5#5%5*?	6 /1"5,R0,)Eb)I 3B 7 3B 7-"7	
 134();< 	LOA|/0144!   %%|n7J&JK		L ,09P0Q+R(<Q?s+,- 	A'8$ @ CEJ1X"N#5#9#9#;"N"N&&$!2(@':	 #Os   6K9unet
size_ratioc
                 |   |du}
|du}|
|z  st        d      |xs0 |j                  j                  D cg c]  }t        ||z         c}}||j                  j                  } | |||	|||||j                  j                  |j                  j
                  |j                  j                  |j                  j                  |j                  j                  |j                  j                  |j                  j                  |j                  j                        }|j                  |j                         |S c c}w )a8  
        Instantiate a [`ControlNetXSAdapter`] from a [`UNet2DConditionModel`].

        Parameters:
            unet (`UNet2DConditionModel`):
                The UNet model we want to control. The dimensions of the ControlNetXSAdapter will be adapted to it.
            size_ratio (float, *optional*, defaults to `None`):
                When given, block_out_channels is set to a fraction of the base model's block_out_channels. Either this
                or `block_out_channels` must be given.
            block_out_channels (`List[int]`, *optional*, defaults to `None`):
                Down blocks output channels in control model. Either this or `size_ratio` must be given.
            num_attention_heads (`List[int]`, *optional*, defaults to `None`):
                The dimension of the attention heads. The naming seems a bit confusing and it is, see
                https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 for why.
            learn_time_embedding (`bool`, defaults to `False`):
                Whether the `ControlNetXSAdapter` should learn a time embedding.
            time_embedding_mix (`float`, defaults to 1.0):
                If 0, then only the control adapter's time embedding is used. If 1, then only the base unet's time
                embedding is used. Otherwise, both are combined.
            conditioning_channels (`int`, defaults to 3):
                Number of channels of conditioning input (e.g. an image)
            conditioning_channel_order (`str`, defaults to `"rgb"`):
                The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
            conditioning_embedding_out_channels (`Tuple[int]`, defaults to `(16, 32, 96, 256)`):
                The tuple of output channel for each block in the `controlnet_cond_embedding` layer.
        NzePass exactly one of `block_out_channels` (for absolute sizing) or `size_ratio` (for relative sizing).)r   r   r   r   r   rS   r   r   rT   r   r   rR   rV   rQ   rW   )r   configr   rj   attention_head_dimrT   r   r   rR   rV   rd   rW   todtype)clsr   r   r   rS   r   r   r   r   r   
fixed_sizerelative_sizebmodels                 r2   	from_unetzControlNetXSAdapter.from_unet  s   R (t3
"$.]*w 
 0pQUQ\Q\QoQo3pACJ4G3p&"&++"@"@"7'A0S1!5 31$(KK$B$B $ ? ?![[99//)-)Q)Q![[99 $ ; ;"&++"C"C
& 	5 4qs   D9c                     t        d      )NzA ControlNetXSAdapter cannot be run by itself. Use it together with a UNet2DConditionModel to instantiate a UNetControlNetXSModel.)r   )r?   argskwargss      r2   forwardzControlNetXSAdapter.forward  s     Q
 	
r1   )r   r          `            ?Fr   r      r   r   i@  i     r      r   r   r   DownBlock2Dr   r&   Tr   T)NNNFr   r   r   r   )r+   r,   r-   r.   r   rj   strr	   floatboolr
   r   r=   classmethodr%   r   r   r   rC   rD   s   @r2   r   r      s   ,\  &'*/:K$'%*67)7.D#'(
 &(?@!%#%&*+F"F %(F .33Z	F
 "F #F #3c
?3F "#JF "'sF !F  *F" c]#F$ ',CsO&<%F& 'F( !)F*  $+F FP  '+2637%*"%%&*/:KJ"J UOJ %T#Y/	J
 &d3i0J #J  J  #J %(J .33ZJ JX
r1   r   c            /       <    e Zd ZdZdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d:dee   dee	   dee	   dee   dee   de
eee   f   d	e
eee   f   d
e
eee   f   dee	   dee   dededee   dee   dededee   de	dedee   de
eee   f   def, fd       Ze	 	 	 	 	 d;dedee   dee   deee      dee   dee   fd       Zd<d Zedee	ef   fd!       Zd"e
eee	ef   f   fd#Zd$ Zd%ed&ed'ed(efd)Zd* Zd+ Zd, Z	 	 	 	 	 	 	 	 	 d=d-ed.e
ej<                  eef   d/ej<                  d0eej<                     d1ee   d2eej<                     d3eej<                     d4eej<                     d5eee	e f      d6eee	ej<                  f      d7ed8ede
e!ef   fd9Z" xZ#S )>UNetControlNetXSModela9  
    A UNet fused with a ControlNet-XS adapter model

    This model inherits from [`ModelMixin`] and [`ConfigMixin`]. Check the superclass documentation for it's generic
    methods implemented for all models (such as downloading or saving).

    `UNetControlNetXSModel` is compatible with StableDiffusion and StableDiffusion-XL. It's default parameters are
    compatible with StableDiffusion.

    It's parameters are either passed to the underlying `UNet2DConditionModel` or used exactly like in
    `ControlNetXSAdapter` . See their documentation for details.
    Tr   r   up_block_typesr   rd   rT   rR   rS   addition_embed_typeaddition_time_embed_dimrV   rW   time_cond_proj_dim%projection_class_embeddings_input_dimr   ctrl_conditioning_channels(ctrl_conditioning_embedding_out_channelsctrl_conditioning_channel_orderctrl_learn_time_embeddingctrl_block_out_channelsctrl_num_attention_headsctrl_max_norm_num_groupsc                    t         0|           |dk  s|dkD  rt        d      |dk  r|st        d      |	|	dk7  rt        d      t        |t        t
        f      s|gt        |      z  }t        |t        t
        f      s|gt        |      z  }t        |t        t
        f      s|gt        |      z  }t        |t        t
        f      s|gt        |      z  }|}d| _        t        j                  d|d   dd	      | _
        t        |d   ||
      | _        t        j                  d|d   dd	      | _        t        |d   |d         | _        |d   }|d   dz  }t!        |d   dd      | _        t%        |||      | _        |rt%        ||      | _        nd | _        |	d | _        d | _        n$t!        |
dd      | _        t%        ||      | _        g }|d   }|d   }t/        |      D ]^  \  }}|}||   }|} ||   }d|v }!|t        |      dz
  k(  }"|j1                  t3        ||| |||||!||   ||   ||   ||   |" ||             ` t5        |d   |d   ||||d   |d   |d   |d   ||      | _        g }#t	        t9        |            }$t	        t9        |            }%t	        t9        |            }&|d   g}'t/        |      D ]/  \  }}(|t        |      dz
  k  rdnd})|'j;                  |(g|)z         1 t	        t9        |            }*|*d   }(t/        |      D ]  \  }}+|(},|*|   }(|*t=        |dz   t        |      dz
           }-t?        d      D .cg c]  }.|'jA                          }/}.d|+v }!|t        |      dz
  k(  }"|#j1                  tC        |-|(|,|/|||!|$|   |%|   |&|   |" |||              t        jD                  |      | _#        t        jD                  |#      | _$        t        jJ                  |d   |      | _&        t        jN                         | _(        t        j                  |d   ddd	      | _)        y c c}.w )Nr   r&   z1`time_embedding_mix` needs to be between 0 and 1.zKTo use `time_embedding_mix` < 1, `ctrl_learn_time_embedding` must be `True`	text_timezAs `UNetControlNetXSModel` currently only supports StableDiffusion and StableDiffusion-XL, `addition_embed_type` must be `None` or `'text_time'`.r   r   r   r   T)flip_sin_to_cosdownscale_freq_shift)cond_proj_dim)r]   time_embed_dimr   rL   rM   rN   rO   rP   rd   r   ro   rR   base_num_attention_headsr   rT   rU   rV   rW   r   rs   rt   rP   rd   r   rR   r   r   rT   rV   rW   r   )r]   r^   ry   rz   rP   resolution_idxro   rR   rS   rT   add_upsamplerV   rd   rW   )num_channels
num_groups)*r<   r=   r   ri   r   r   r   r]   r   rB   base_conv_inr'   r   ctrl_conv_inrm   r   r   base_time_projr   base_time_embeddingctrl_time_embeddingbase_add_time_projbase_add_embeddingr   rl    ControlNetXSCrossAttnDownBlock2DControlNetXSCrossAttnMidBlock2Dr   r   r   minrk   r   ControlNetXSCrossAttnUpBlock2DrA   r   	up_blocks	GroupNormbase_conv_norm_outSiLUbase_conv_actbase_conv_out)1r?   r   r   r   r   rd   rT   rR   rS   r   r   rV   rW   r   r   r   r   r   r   r   r   r   r   r   time_embed_input_dimr   r   rM   rO   rp   r   rL   rN   ro   r   r    rev_transformer_layers_per_blockrev_num_attention_headsrev_cross_attention_dimrz   r^   r   reversed_block_out_channelsup_block_typery   r]   r   r   r@   s1                                                   r2   r=   zUNetControlNetXSModel.__init__  sB   B 	!%7!%;PQQ!*Cjkk*/Bk/Q d  6uF,H+ICP`La+a(-e}=#6"7#>N:O"O-e}=#6"7#>N:O"O2T5MB(@'ACHXDY'Y$#6  IIa);A)>AWXY)H,CA,FG"<*
&
 IIa)@)CQR\]^+9:QRS:TVhijVk+l(  2!4+A.2'(:1(=tjkl#4 ,$
 
 %'80(D$ (,D$&&*D#&*D#&/0GY]tu&vD#&78]_m&nD# .q13A6"+,<"= 	A0 21 50 7 :'?:M#&6"7!";;N0%5&7%5&7"0$3-E"/1Ma1P-Ea-H-Ea-H(;A(>'5#5%5*?	: 9,R01"5(+%=)Eb)I%=b%A%=b%A 3B 7-"7
 	+/9U0V+W("&x0H'I"J"&x0C'D"E 6a89()@A 	LOA|4599q   %%|n7J&JK		L '+84F+G&H#215 ). 9 	A}".6q9L5c!a%EWAX[\A\6]^KEJ1X"N#5#9#9#;"N"N'=8M#&8"9A"==N. +!-(;':"0#$"/1QRS1T(?(B(?(B%3!3%5$3*?	8 ==5y1"$,,<Nq<Q^m"nWWYYY'9!'<aQXYZ; #Os   Qr   
controlnetr   ctrl_optional_kwargsc                 z   |t        j                  |||fi |}n!t        d ||||fD              rt        d      g d}|j                  j                         D 	ci c]  \  }}	||v s||	 }}}	|j                  j                  |d<   g d}
|j                  j                         D 	ci c]  \  }}	||
v sd|z   |	 }
}}	|j                  j                  |
d<   | j                  i ||
      }g d}|D ]8  }t        |d	|z         j                  t        ||      j                                : d
dg}|D ]R  }t        ||      st        ||      t        |d	|z         j                  t        ||      j                                T |j                  j                  |j                  j                                |j                  j                  |j                  j                                |j                   3|j"                  j                  |j                   j                                |j$                  j                  |j$                  j                                t'        j(                  d t+        |j,                  |j,                        D              |_        t.        j1                  |j2                  |j2                        |_        t'        j(                  d t+        |j4                  |j6                        D              |_        |j9                  |j:                         |S c c}	}w c c}	}w )a  
        Instantiate a [`UNetControlNetXSModel`] from a [`UNet2DConditionModel`] and an optional [`ControlNetXSAdapter`]
        .

        Parameters:
            unet (`UNet2DConditionModel`):
                The UNet model we want to control.
            controlnet (`ControlNetXSAdapter`):
                The ControlNet-XS adapter with which the UNet will be fused. If none is given, a new ControlNet-XS
                adapter will be created.
            size_ratio (float, *optional*, defaults to `None`):
                Used to construct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details.
            ctrl_block_out_channels (`List[int]`, *optional*, defaults to `None`):
                Used to construct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details,
                where this parameter is called `block_out_channels`.
            time_embedding_mix (`float`, *optional*, defaults to None):
                Used to construct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details.
            ctrl_optional_kwargs (`Dict`, *optional*, defaults to `None`):
                Passed to the `init` of the new controlnet if no controlnet was given.
        c              3   $   K   | ]  }|d u 
 y wr;   r0   ).0os     r2   	<genexpr>z2UNetControlNetXSModel.from_unet.<locals>.<genexpr>  s      "#s   zWhen a controlnet is passed, none of these parameters should be passed: size_ratio, ctrl_block_out_channels, time_embedding_mix, ctrl_optional_kwargs.)r   r   r   r   rd   rT   rR   r   r   rV   rW   r   r   rS   )r   r   r   r   r   rS   rQ   ctrl_r   )r   r   conv_norm_outconv_outbase_add_time_projadd_embeddingc              3   N   K   | ]  \  }}t         j                  ||        y wr;   )r   from_modulesr  r   cs      r2   r  z2UNetControlNetXSModel.from_unet.<locals>.<genexpr>7  s)      *
1 -99!Q?*
   #%c              3   N   K   | ]  \  }}t         j                  ||        y wr;   )r   r  r  s      r2   r  z2UNetControlNetXSModel.from_unet.<locals>.<genexpr><  s)      (
1 +771=(
r  )r   r   anyr   r   itemsr   r   from_configgetattrload_state_dict
state_dicthasattrr   r   r   r   r   r   r   rA   zipr   r   r  r   r   r   r   r   )r   r   r  r   r   r   r  params_for_unetkvparams_for_controlnetr   modules_from_unetmoptional_modules_from_unets                  r2   r   zUNetControlNetXSModel.from_unet  s   < ,66j"9=QJ  (24KM_au'v  ! m 

 -1KK,=,=,?XDAq1CW1a4XX151O1O-.!
 =G<M<M<S<S<U tDAqYZ^sYs1a t t6@6G6G6Z6Z23  L? L6K LM
 # 	WAE7Q;'77a8H8S8S8UV	W &
" , 	[AtQGD!$4$@w{+;;GD!<L<W<W<YZ	[
 	''77
8\8\8g8g8ij**:+=+=+H+H+JK$$0%%55j6O6O6Z6Z6\]))99*:`:`:k:k:mn MM *
D,,j.D.DE*
 
 :FFt~~WaWkWkl-- (
DNNJ,E,EF(
 
 	w Y !us   L1+L1-L7:L7returnc                    | j                         D ]	  }d|_         g d}|D cg c]  }t        | |      t        | |       }}|D ]  }|j                         D ]	  }d|_           | j                  D ]  }|j	                           | j
                  j	                          | j                  D ]  }|j	                           yc c}w )Freeze the weights of the parts belonging to the base UNet2DConditionModel, and leave everything else unfrozen for fine
        tuning.T)r   r   r   r   r   r   r   r   NF)
parametersrequires_gradr  r   freeze_base_paramsr   r   )r?   param
base_partspartdus         r2   freeze_unet_paramsz(UNetControlNetXSModel.freeze_unet_paramsF  s     __& 	'E"&E	'	

 7AddGDRVDWDcgdD)d
d 	,D* ,&+#,	, !! 	#A  "	#))+ 	#A  "	# es
   CCc                     i }dt         dt        j                  j                  dt        t         t
        f   ffd| j                         D ]  \  }} |||        |S )z
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model with
            indexed by its weight name.
        rh   module
processorsc                     t        |d      r|j                         ||  d<   |j                         D ]  \  }} |  d| ||        |S )Nget_processor
.processorr   )r  r3  named_children)rh   r0  r1  sub_namechildfn_recursive_add_processorss        r2   r8  zJUNetControlNetXSModel.attn_processors.<locals>.fn_recursive_add_processorsn  sd    v/282F2F2H
dV:./#)#8#8#: U%+tfAhZ,@%TU r1   )r   torchr   Moduler   r   r5  )r?   r1  rh   r0  r8  s       @r2   attn_processorsz%UNetControlNetXSModel.attn_processorsc  sm     
	c 	588?? 	X\]`bt]tXu 	 !//1 	BLD&'fjA	B r1   	processorc           	      T   t        | j                  j                               }t        |t              r,t        |      |k7  rt        dt        |       d| d| d      dt        dt        j                  j                  ffd| j                         D ]  \  }} |||        y)	a4  
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        z>A dict of processors was passed, but the number of processors z0 does not match the number of attention layers: z. Please make sure to pass z processor classes.rh   r0  c                     t        |d      rEt        |t              s|j                  |       n#|j                  |j	                  |  d             |j                         D ]  \  }} |  d| ||        y )Nset_processorr4  r   )r  ri   dictr?  r   r5  )rh   r0  r<  r6  r7  fn_recursive_attn_processors        r2   rA  zMUNetControlNetXSModel.set_attn_processor.<locals>.fn_recursive_attn_processor  sx    v/!)T2((3(($z7J)KL#)#8#8#: T%+tfAhZ,@%STr1   N)r   r;  keysri   r@  r   r   r9  r   r:  r5  )r?   r<  countrh   r0  rA  s        @r2   set_attn_processorz(UNetControlNetXSModel.set_attn_processor}  s     D((--/0i&3y>U+BPQTU^Q_P` a005w6QRWQXXkm 
	Tc 	T588?? 	T !//1 	ALD&'fi@	Ar1   c           	      j   t        d | j                  j                         D              rt               }nmt        d | j                  j                         D              rt	               }n8t        dt        t        | j                  j                                            | j                  |       y)ze
        Disables custom attention processors and sets the default attention implementation.
        c              3   @   K   | ]  }|j                   t        v   y wr;   )r@   r   r  procs     r2   r  zCUNetControlNetXSModel.set_default_attn_processor.<locals>.<genexpr>  s     i4t~~!>>i   c              3   @   K   | ]  }|j                   t        v   y wr;   )r@   r   rG  s     r2   r  zCUNetControlNetXSModel.set_default_attn_processor.<locals>.<genexpr>  s     h$#==hrI  zOCannot call `set_default_attn_processor` when attention processors are of type N)	allr;  valuesr   r   r   nextiterrD  )r?   r<  s     r2   set_default_attn_processorz0UNetControlNetXSModel.set_default_attn_processor  s     i4K_K_KfKfKhii,.Ih$J^J^JeJeJghh%Iabfgklp  mA  mA  mH  mH  mJ  hK  cL  bM  N  		*r1   s1s2b1b2c                     t        | j                        D ]9  \  }}t        |d|       t        |d|       t        |d|       t        |d|       ; y)aF  Enables the FreeU mechanism from https://huggingface.co/papers/2309.11497.

        The suffixes after the scaling factors represent the stage blocks where they are being applied.

        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
        are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.

        Args:
            s1 (`float`):
                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
                mitigate the "oversmoothing effect" in the enhanced denoising process.
            s2 (`float`):
                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
                mitigate the "oversmoothing effect" in the enhanced denoising process.
            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
        rP  rQ  rR  rS  N)r   r   setattr)r?   rP  rQ  rR  rS  rp   upsample_blocks          r2   enable_freeuz"UNetControlNetXSModel.enable_freeu  sQ    $ "+4>>!: 	.A~ND"-ND"-ND"-ND"-		.r1   c                     h d}t        | j                        D ]3  \  }}|D ])  }t        ||      st        ||d      t	        ||d       + 5 y)zDisables the FreeU mechanism.>   rR  rS  rP  rQ  N)r   r   r  r  rU  )r?   
freeu_keysrp   rV  r  s        r2   disable_freeuz#UNetControlNetXSModel.disable_freeu  sW    -
!*4>>!: 	5A~ 5>1-D1Q1]NAt45	5r1   c                 r   d| _         | j                  j                         D ]1  \  }}dt        |j                  j
                        v s(t        d       | j                  | _         | j                         D ]%  }t        |t              s|j                  d       ' | j                  t                      y)u1  
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
        are fused. For cross-attention modules, key and value projection matrices are fused.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>
        NAddedzQ`fuse_qkv_projections()` is not supported for models having added KV projections.T)fuse)original_attn_processorsr;  r  r   r@   r+   r   modulesri   r   fuse_projectionsrD  r   )r?   r   attn_processorr0  s       r2   fuse_qkv_projectionsz*UNetControlNetXSModel.fuse_qkv_projections  s     )-%!%!5!5!;!;!= 	vA~#n66??@@ !tuu	v )-(<(<%lln 	3F&),''T'2	3 	 5 78r1   c                 T    | j                   | j                  | j                          yy)u   Disables the fused QKV projection if enabled.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>

        N)r^  rD  )r?   s    r2   unfuse_qkv_projectionsz,UNetControlNetXSModel.unfuse_qkv_projections  s)     ((4##D$A$AB 5r1   r*   timestepencoder_hidden_statescontrolnet_condconditioning_scaleclass_labelstimestep_condattention_maskcross_attention_kwargsadded_cond_kwargsreturn_dictapply_controlc                 	   | j                   j                  dk(  rt        j                  |dg      }|2d|j	                  |j
                        z
  dz  }|j                  d      }|}t        j                  |      s|j                  j                  dk(  }|j                  j                  dk(  }t        |t              r%|s|rt        j                  nt        j                  }n$|s|rt        j                  nt        j                  }t        j                   |g||j                        }n6t#        |j$                        d	k(  r|d   j	                  |j                        }|j'                  |j$                  d	         }| j)                  |      }|j	                  |j
                  
      }| j                   j*                  rN|rL| j-                  ||      }| j/                  ||      }| j                   j0                  dz  }||z  |d|z
  z  z   }n| j/                  |      }d}| j                   j2                  n| j                   j2                  dk(  rd|
vrt5        | j6                   d      |
j9                  d      }d|
vrt5        | j6                   d      |
j9                  d      }| j;                  |j=                               }|j?                  |j$                  d	   df      }t        j@                  ||gd      }|j	                  |j
                        }| jC                  |      }n#t5        d| j                   j2                   d      |||z   n|}|}|x}}g g }}| jE                  |      } | jG                  |      }| jI                  |      }| || z  }|r|| jK                  |      |z  z   }|jM                  |       |jM                  |       | jN                  D ]9  }! |!||||||	||      \  }}}"}#|jQ                  |"       |jQ                  |#       ; | jS                  ||||||	||      \  }}| jT                  D ]@  }$t#        |$jV                        }%||% d }&||% d }'|d|%  }|d|%  } |$||&|'||||	||	      }B | jY                  |      }| j[                  |      }| j]                  |      }|s|fS t_        |      S )a	  
        The [`ControlNetXSModel`] forward method.

        Args:
            sample (`Tensor`):
                The noisy input tensor.
            timestep (`Union[torch.Tensor, float, int]`):
                The number of timesteps to denoise an input.
            encoder_hidden_states (`torch.Tensor`):
                The encoder hidden states.
            controlnet_cond (`Tensor`):
                The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
            conditioning_scale (`float`, defaults to `1.0`):
                How much the control model affects the base model outputs.
            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
            timestep_cond (`torch.Tensor`, *optional*, defaults to `None`):
                Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
                timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
                embeddings.
            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
                negative values to the attention scores corresponding to "discard" tokens.
            cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
                A kwargs dictionary that if specified is passed along to the `AttnProcessor`.
            added_cond_kwargs (`dict`):
                Additional conditions for the Stable Diffusion XL UNet.
            return_dict (`bool`, defaults to `True`):
                Whether or not to return a [`~models.controlnets.controlnet.ControlNetOutput`] instead of a plain
                tuple.
            apply_control (`bool`, defaults to `True`):
                If `False`, the input is run only through the base model.

        Returns:
            [`~models.controlnetxs.ControlNetXSOutput`] **or** `tuple`:
                If `return_dict` is `True`, a [`~models.controlnetxs.ControlNetXSOutput`] is returned, otherwise a
                tuple is returned where the first element is the sample tensor.
        r   r&   )dimsNg     mpsnpu)r   devicer   )r   g333333?r   text_embedsz has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`time_idsz has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`r   dimzgControlNet-XS currently only supports StableDiffusion and StableDiffusion-XL, so addition_embed_type = z is currently not supported.)hidden_states_basehidden_states_ctrltembrf  rh  rl  rk  ro  )	hidden_statesres_hidden_states_tuple_baseres_hidden_states_tuple_ctrlr{  rf  rh  rl  rk  ro  )r*   )0r   r   r9  flipr   r   	unsqueeze	is_tensorrt  typeri   r   float32float64int32int64tensorr   shapeexpandr   r   r   r   r   r   r   r@   getr   flattenreshapeconcatr   r   r   r   r   rl   r   r   r   r   r5   r   r   r   r)   )(r?   r*   re  rf  rg  rh  ri  rj  rk  rl  rm  rn  ro  	timestepsis_mpsis_npur   t_emb	ctrl_temb	base_tembinterpolation_paramr{  aug_embru  rv  time_embeds
add_embedscembh_ctrlh_basehs_basehs_ctrlguided_hintdownresidual_hbresidual_hcup	n_resnetsskips_hbskips_hcs(                                           r2   r   zUNetControlNetXSModel.forward  s   p ;;66%?#jjsCO %."3"3FLL"AAXMN+55a8N 	y) ]]''50F]]''50F(E**0F(.&u{{i[fmmTI!Q&!$**6==9I $$V\\!_5	##I.
 v||,;;00]00FI00FI"&++"@"@#"E22Y!FYBY5ZZD++E2D ;;**2[[,,;$55 ~~&  '{  |  ,//>K!22 ~~&  'x  y  ),,Z8H11(2B2B2DEK%--{/@/@/CR.HIK{K&@bIJ#tzz2J--j9Gyz~  {F  {F  {Z  {Z  z[  [w  x  ")!4tg~$ % ! r 44_E ""6*""6*"k!Fd>>vFI[[[Fvv$$ 	(D7;#)#)&*#5'=-+	84FFK NN;'NN;'	( %%"&1#9)' ( 	
 .. 	BBJJI	z{+H	z{+Hk	z*Gk	z*G$-5-5&*#5'=-+
F	& ((0##F+##F+9!00r1   )r   r   )	UpBlock2Dr   r   r   r   r   r   r&   r   NNTTNNr   r   r   r   Fr   r   r   )NNNNNr#  N)	Nr   NNNNNTT)$r+   r,   r-   r.    _supports_gradient_checkpointingr   r   rj   r	   r   r
   r   r   r=   r   r%   r   r   r   r   r.  propertyr   r;  rD  rO  rW  rZ  rb  rd  r   r9  r   r)   r   rC   rD   s   @r2   r   r      s    (,$ &((
 &u)?)+6:?@67-115!%&*,0?C$'*+?P/4*/.<;<(*=|[ c]|[  *	|[ c
|[ "#J|[ "#|[ #3c
?3|[ ',CsO&<|[ #3c
?3|[  &c]!|[" "*##|[$ %|[&  $'|[( %SM)|[* 08}+|[. "/|[0 %(1|[2 38*3|[4 *-5|[6 $(7|[8 "'s9|[: #(U3Z"8;|[< #&=|[ |[|  59&*9=.2/3t"t 01t UO	t
 "*$u+!6t %UOt 'tnt tl#: c+=&=!>  0 AE2Dd3PbKbFc2c,d  AF+ .u .% .U . .2594C$ 37.1/30415;??C "J1J1 eS01J1  %||	J1
 "%,,/J1 %UOJ1 u||,J1  -J1 !.J1 !)c3h 8J1 $Dell):$;<J1 J1 J1 
!5(	)J1r1   r   c                   T    e Zd Z	 	 	 	 	 	 	 	 	 	 d dedededededededeeeee   f      d	ee   d
ee   dee   dedee   dee   f fdZe	de
defd       Zd!dZ	 	 	 	 	 	 	 d"dededee   dee   dee   dee   deeeef      dee   dedeeeeedf   eedf   f   fdZ xZS )#r   rL   rM   rN   rO   rP   rd   r   rR   r   r   rT   rU   rV   rW   c                    t         |           g }g }g }g }g }g }d}t        |	t              r|	g|z  }	t	        |      D ]  }|dk(  r|n|}|dk(  r|n|}|j                  t        ||             |j                  t        ||||             |j                  t        ||z   ||t        ||z   |      t        ||      d             |r[|j                  t        |
||
z  ||	|   ||||             |j                  t        |||z  ||	|   |||t        ||                   |j                  t        ||              |rb|j                  t        ||             t        |d|d	
      | _        t        ||z   d|d	
      | _        |j                  t        ||             nd | _        d | _        t        j                  |      | _        t        j                  |      | _        |rt        j                  |      nd g|z  | _        |rt        j                  |      nd g|z  | _        t        j                  |      | _        t        j                  |      | _        d| _        y )Nr   r   r]   r^   rP   r_   rY   r[   r\   rb   Tre   rf   F)r<   r=   ri   rj   rk   rl   rm   r!   rn   r"   r    base_downsamplersctrl_downsamplersr   rA   base_resnetsctrl_resnetsbase_attentionsctrl_attentionsr6   r7   gradient_checkpointing)r?   rL   rM   rN   rO   rP   rd   r   ro   rR   r   r   rT   rU   rV   rW   r  r  r  r  r7   r6   rc   rp   r@   s                           r2   r=   z)ControlNetXSCrossAttnDownBlock2D.__init__  s   $ 	
2C8,H+IJ+V(z" 9	VA346/?P346/?P /?AQ RS 0!2"/*	  03C C!2"/.(+;;H`  33DQij	 &&&0)-EE$5#?#B,?.C)9(7	  &&&0)-EE$5#?#B,?.C)9(;<MZr(s	 /@BS TUs9	Vv  /@BS TU%1!D?PW[&D" &2!$55Sdko&D" /@BS TU%)D"%)D"MM,7MM,7ANr}}_=UYTZ]gTgANr}}_=UYTZ]gTgMM,7MM,7&+#r1   base_downblockctrl_downblockc                 t   d }|j                   d   j                  }|j                   d   j                  }|j                   d   j                  |z
  }|j                   d   j                  }|j                   d   j                  j                  }|j                   d   j
                  j                  }	|j                   d   j
                  j                  }
t        |d      rd}t        |j                  d   j                        } ||      j                  } ||      j                  } ||      j                  } ||      j                  }|j                  d   j                  }nd}d }d }d }d }d }d }|j                  d u} | ||||||	|
||||||||      }|j                   j#                  |j                   j%                                |j&                  j#                  |j                   j%                                |rf|j(                  j#                  |j                  j%                                |j*                  j#                  |j                  j%                                |ri|j,                  j#                  |j                  d   j%                                |j.                  j#                  |j                  j%                                |j0                  j#                  |j0                  j%                                |j2                  j#                  |j2                  j%                                |S )Nc                 N    | j                   d   j                  d   j                  S Nr   r8   transformer_blocksattn2blocks    r2   get_first_cross_attentionzPControlNetXSCrossAttnDownBlock2D.from_modules.<locals>.get_first_cross_attention@  $    ##A&99!<BBBr1   r   r8   TFr   )r5   r]   r^   time_emb_projin_featuresnorm1r   r  r   r8   r  headsrT   rV   rW   r>   r  r  r  r  r  r  r  r  r6   r7   )r   r  r  r  rL   rM   rN   rO   rP   r   ctrl_num_groupsro   rR   r   r   rT   rV   rW   rU   r   s                       r2   r  z-ControlNetXSCrossAttnDownBlock2D.from_modules=  s   	C *11!4@@*2215BB""1%114DD 	 +2215BB&..q1??KK#++A.44??
(00399DD><0 M+.~/H/H/K/^/^+_('@'P'V'V$'@'P'V'V$";N"K"_"_8HYY$2$=$=a$@$V$V!!M+/('+$'+$"&#$(!'44D@ -/-/'&%4')E%=%= 3)-"7
& 	**>+A+A+L+L+NO**>+A+A+L+L+NO!!11.2K2K2V2V2XY!!11.2K2K2V2V2XY##33N4O4OPQ4R4]4]4_`##33N4O4O4Z4Z4\]**>+F+F+Q+Q+ST**>+F+F+Q+Q+STr1   r#  c                 h   | j                         D ]	  }d|_         | j                  g}t        | j                  t
        j                        r|j                  | j                         | j                  |j                  | j                         |D ]  }|j                         D ]	  }d|_           yr%  TNF)	r&  r'  r  ri   r  r   rA   rl   r  r?   r)  r*  r+  s       r2   r(  z3ControlNetXSCrossAttnDownBlock2D.freeze_base_params  s     __& 	'E"&E	' ''(
d**BMM:d223!!-d445 	,D* ,&+#,	,r1   ry  r{  rf  rz  rh  rk  rl  encoder_attention_maskro  .c
           	      L   |'|j                  dd       t        j                  d       |}
|}d}d}t        t	        | j
                  | j                              }t        t	        | j                  | j                              }t	        ||| j                  | j                        D ]  \  \  }}\  }}}}|	rt        j                  | ||
      gd      }t        j                         r | j                  r| j                  ||
|      }
n	 ||
|      }
| ||
||||d      d   }
|	rPt        j                         r | j                  r| j                  |||      }n	 |||      }| ||||||d      d   }|	r|
 ||      |z  z   }
||
fz   }||fz   } | j                   | j                  d	   }| j                  d	   }|	rt        j                  | ||
      gd      }| j!                  |
      }
|	r| j#                  |      }|	r|
 ||      |z  z   }
||
fz   }||fz   }|
|||fS )
NscaleSPassing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.r0   r&   rw  Frf  rl  rk  r  rn  r   r   )r  loggerwarningr   r  r  r  r  r  r6   r7   r9  catis_grad_enabledr  _gradient_checkpointing_funcr  r  )r?   ry  r{  rf  rz  rh  rk  rl  r  ro  r  r  base_output_statesctrl_output_statesbase_blocksctrl_blocksb_resb_attnc_resc_attnb2cc2bs                         r2   r   z(ControlNetXSCrossAttnDownBlock2D.forward  s    "-%))'48Dtu##3t00$2F2FGH3t00$2F2FGH:=d&7&79J9J;
 ,	@6OUF_eVc3 FCK#8a@ $$&4+F+F::5&$Ovt,!*?+A#1+A %  ((*t/J/J!>>ufdSF"640F%#.C/E'5/E$) F #f+0B"BB!3vi!?!3vi!?Y,	@\ !!-##B'C##B'C FCK#8a@++F3F//7#f+0B"BB!3vi!?!3vi!?v13EEEr1   )
r   r   Tr&   r&   r&   r   TFTr  )NNr   NNNT)r+   r,   r-   rj   r   r
   r	   r   r=   r   r   r4   r  r(  r   r   r   r   r   r   rC   rD   s   @r2   r   r     s     "(*IJ2323-1#+004!t,t, t, 	t,
 t, t, t, #&t, '/uS%*_/E&Ft, #+3-t, #+3-t, &c]t, t, #4.t,   (~!t,l ?*> ?Pl ? ?B,* 37/3.1+/;?37"ZF"ZF ZF  (/	ZF
 %V,ZF %UOZF !(ZF !)c3h 8ZF !) 0ZF ZF 
vvuVS[153EE	FZFr1   r   c                       e Zd Z	 	 	 	 	 	 	 	 	 ddededee   dedededee   dee   d	ee   d
edee   f fdZedede	fd       Z
ddZ	 	 	 	 	 	 ddedededee   dee   deeeef      dee   dee   dedeeef   fdZ xZS )r   rs   rt   rP   rd   r   rR   r   r   rT   rV   rW   c                 
   t         |           t        ||      | _        t	        |||||	|||
      | _        t	        |||z   ||t        t        |||z         |      |	|||
	      | _        t        ||      | _	        d| _
        y )N)rR   r]   rP   rw   rT   rS   rW   rV   rv   F)r<   r=   rm   r6   r#   base_midblockrn   r   ctrl_midblockr7   r  )r?   rs   rt   rP   rd   r   rR   r   r   rT   rV   rW   r@   s               r2   r=   z(ControlNetXSCrossAttnMidBlock2D.__init__  s     	 +=-H4)E%') 3 8"7-	
 5)E%5&'-M==#@AC[ !4 8"7-
" +=-H&+#r1   r  r  c                    |j                   }|j                  }|j                  }d }|j                  }|j                  }t        |j                  d   j                        }|j                  d   j                  j                  }	|j                  d   j                  j                  }
|j                  d   j                  j                  } ||      j                  } ||      j                  } ||      j                  } ||      j                  }|j                  d   j                   } | |||	|
|||||||      }|j                   j#                  |j%                                |j&                  j#                  |j%                                |j(                  j#                  |j%                                |j                  j#                  |j%                                |S )Nc                 N    | j                   d   j                  d   j                  S r  r  )rG   s    r2   r  zOControlNetXSCrossAttnMidBlock2D.from_modules.<locals>.get_first_cross_attention-  s$    &&q)<<Q?EEEr1   r   r   )r6   r7   rG   r^   r]   r   r8   r  r5   r  r  r  r   r  rT   rV   rW   r  r  r  r  )r   r  r  r6   r7   r  rs   rt   rR   rP   r   r  r   r   rT   rV   rW   r   s                     r2   r  z,ControlNetXSCrossAttnMidBlock2D.from_modules"  s    %11$11%..	F %11$00'*=+C+CA+F+Y+Y'Z$%--a0>>JJ"**1-33>>
'//288CC#<]#K#Q#Q #<]#K#Q#Q 7FZZ4]CTT - 8 8 ; Q Q '''&%4)E%=%= 3-"7
 	**<+B+B+DE++M,D,D,FG++M,D,D,FG**<+B+B+DEr1   r#  c                     | j                         D ]	  }d|_         | j                  j                         D ]	  }d|_         y)r%  TFN)r&  r'  r  )r?   r)  s     r2   r(  z2ControlNetXSCrossAttnMidBlock2D.freeze_base_paramsS  sJ     __& 	'E"&E	' ''224 	(E"'E	(r1   ry  r{  rf  rz  rh  rl  rk  r  ro  c
                 F   |'|j                  dd       t        j                  d       |}
|}|||||d}|	r(t        j                  || j                  |
      gd      } | j                  |
fi |}
|	r* | j                  |fi |}|
| j                  |      |z  z   }
|
|fS )Nr  r  )r{  rf  rk  rl  r  r&   rw  )	r  r  r  r9  r  r6   r  r  r7   )r?   ry  r{  rf  rz  rh  rl  rk  r  ro  r  r  
joint_argss                r2   r   z'ControlNetXSCrossAttnMidBlock2D.forward^  s     "-%))'48Dtu## %:,&<&<

 YY(9(9&(ABJF###F9j9'T''=*=Fd//7:LLLFv~r1   )	Nr   r   r&   r&   r&   r   FTr  )Nr   NNNT)r+   r,   r-   rj   r   r   r=   r   r#   rF   r  r(  r   r   r   r   r   r	   r   rC   rD   s   @r2   r   r     sv   
 (,!(*,-2323-1!&042,2, 2,  }	2,
 2, #&2, '*2, #+3-2, #+3-2, &c]2, 2,  (~2,h ... 3. .`	(  04.1;?+/37"""" "  &	"
 %V," %UO" !)c3h 8" !(" !) 0" " 
vv~	"r1   r   c                   &    e Zd Z	 	 	 	 	 	 	 	 	 d!dedededee   dededee   ded	ed
edededee   f fdZede	de
fd       Zd"dZ	 	 	 	 	 	 	 d#dedeedf   deedf   dedee   dee   deeeef      dee   dee   dee   dedefd Z xZS )$r   r]   r^   ry   rz   rP   rd   r   rR   rS   rT   r   rV   rW   c                    t         |           g }g }g }d}|| _        |
| _        t	        |	t
              r|	g|z  }	t        |      D ]  }||dz
  k(  r|n|}|dk(  r|n|}|j                  t        ||   |             |j                  t        ||z   |||             |sZ|j                  t        |
||
z  ||	|   ||||              t        j                  |      | _        |rt        j                  |      nd g|z  | _        t        j                  |      | _        |rt!        |d|      | _        nd | _        d| _        || _        y )	Nr   r&   r   r  rb   T)rg   r^   F)r<   r=   has_cross_attentionrS   ri   rj   rk   rl   rm   r!   r"   r   rA   r5   r8   r7   r$   
upsamplersr  r   )r?   r]   r^   ry   rz   rP   rd   r   ro   rR   rS   rT   r   rV   rW   r5   r8   r7   rc   rp   res_skip_channelsr|   r@   s                         r2   r=   z'ControlNetXSCrossAttnUpBlock2D.__init__  sm   " 	

#0 #6 2C8,H+IJ+V(z" 	A01Z!^0C,89Q!4L/A!/DFX YZNN 25F F!-"/*	 !!&+$(;;$0#?#B,?.C)9(7	!	: }}W-7D"--
34&S]J]MM,7(S_`DO"DO&+#,r1   base_upblockctrl_upblockc                    |j                   }d }|j                  d   j                  }|j                  d   j                  |z
  }|j                  d   j                  |z
  }|D cg c]  }|j                   }	}|j                  d   j                  j
                  }
|j                  d   j                  j                  }|j                  }t        |d      rtd}t        |j                  d   j                        } ||      j                  } ||      j                  } ||      j                  }|j                  d   j                   }nd}d }d }d }d }d }|j"                  d u} | ||||	|
|||||||||      }|j                  j%                  |j                  j'                                |r3|j                  j%                  |j                  j'                                |r6|j"                  j%                  |j"                  d   j'                                |j                   j%                  |j'                                |S c c}w )Nc                 N    | j                   d   j                  d   j                  S r  r  r  s    r2   r  zNControlNetXSCrossAttnUpBlock2D.from_modules.<locals>.get_first_cross_attention  r  r1   r   r   r8   TF)r]   r^   ry   rz   rP   rd   r   ro   rR   rS   rT   r   rV   rW   )r7   r5   r^   r]   r  r  r  r   r   r  r   r8   r  r  rT   rV   rW   r  r  r  )r   r  r  ctrl_to_base_skip_connectionsr  r^   r]   prev_output_channelsr  ctrl_skip_channelssrP   r   r   ro   rR   rS   rT   rV   rW   r   r   s                        r2   r  z+ControlNetXSCrossAttnUpBlock2D.from_modules  s:   (4(A(A%	C $++A.;;"**2.::\I+33A6BB\Q6STq}}TT$,,Q/==II!))!,22==
%44<. M+.|/F/Fq/I/\/\+](";L"I"O"O";L"I"]"]8FWW$0$;$;A$>$T$T!!M+/("&"&#$(!#..d: #% 42'&)')E 3 3%-"7
$ 	%%l&:&:&E&E&GH,,\-D-D-O-O-QR,,\-D-DQ-G-R-R-TU**+H+S+S+UV] Us   %Ir#  c                 h   | j                         D ]	  }d|_         | j                  g}t        | j                  t
        j                        r|j                  | j                         | j                  |j                  | j                         |D ]  }|j                         D ]	  }d|_           yr  )	r&  r'  r5   ri   r8   r   rA   rl   r  r  s       r2   r(  z1ControlNetXSCrossAttnUpBlock2D.freeze_base_params  s     __& 	'E"&E	' ll^
door}}5doo.??&doo. 	,D* ,&+#,	,r1   r|  r}  .r~  r{  rf  rh  rl  rk  upsample_sizer  ro  c           	          |'|j                  dd       t        j                  d       t         dd       xr+ t         dd       xr t         dd       xr t         dd        fd}t	         j
                   j                   j                  t        |      t        |            D ]  \  }}}}}|r| ||      |z  z  } |||      \  }}t        j                  ||gd	      }t        j                         r  j                  r j                  |||      }n	 |||      }|~ ||||||
d
      d   }  j                   j                  ||	      }|S )Nr  r  rP  rQ  rR  rS  c           	          rDt        j                  | |j                  j                  j                  j
                        S | |fS )N)rP  rQ  rR  rS  )r   r   rP  rQ  rR  rS  )r|  
res_h_baseis_freeu_enabledr?   s     r2   maybe_apply_freeu_to_subblockzMControlNetXSCrossAttnUpBlock2D.forward.<locals>.maybe_apply_freeu_to_subblock0  sK    "''!wwwwwwww  %j00r1   r&   rw  Fr  r   )r  r  r  r  r  r5   r8   r7   r   r9  r  r  r  r  r  )r?   r|  r}  r~  r{  rf  rh  rl  rk  r  r  ro  r  resnetattnr  r  
res_h_ctrlr  s   `                 @r2   r   z&ControlNetXSCrossAttnUpBlock2D.forward  s    "-%))'48Dtu D$% *dD)*dD)* dD)	 		1 :=LLOO1212:
 	5FD#z: Z3E!EE(EmU_(`%M:!II}j&AqIM$$&4+F+F $ A A&-Y] ^ &}d ; $!*?+A#1+A %! !'	8 ??& OOM=IMr1   )	r   NTr&   r&   r   TFTr  )Nr   NNNNT)r+   r,   r-   rj   r   r   r   r=   r   r   rJ   r  r(  r   r	   r   r   r   r   r   rC   rD   s   @r2   r   r     s     "(,,-#$#'!!&04E-E- E- !	E-
 !IE- E- E- !E- '*E- !E- !E- E- E-  (~E-N 8(: 8Jd 8 8t,. 37.1;?+/'+37"GG ',FCK&8G ',FCK&8	G
 G  (/G %UOG !)c3h 8G !(G  }G !) 0G G 
Gr1   r   c                 F    t        t        j                  | |dd            S )Nr&   r   )r   )zero_moduler   rB   )r]   r^   s     r2   rm   rm   a  s    ryylAqIJJr1   c                 n    | j                         D ]!  }t        j                  j                  |       # | S r;   )r&  r   initzeros_)r0  ps     r2   r  r  e  s/      
qMr1   c                 P    |}|| k\  r| S |dk7  r| |z  }|dk(  r|S |dz  }|dk7  ry y )Nr   r&   r0   )numberrZ   factorresiduals       r2   rn   rn   k  sB    F
A+F?q=M!	 A+r1   )r   Tr&   r&   r   TFT)Nr   r&   r&   r   FTr;   )Edataclassesr   mathr   typingr   r   r   r   r	   r
   r9  torch.utils.checkpointr   r   configuration_utilsr   r   utilsr   r   utils.torch_utilsr   attention_processorr   r   r   r   r   r   r   
embeddingsr   r   modeling_utilsr   unets.unet_2d_blocksr   r   r    r!   r"   r#   r$   unets.unet_2d_conditionr%   r  r'   
get_loggerr+   r  r)   r:  r4   rF   rJ   rj   r   rr   rx   r}   r   r   r   r   r   rm   r  rn   r0   r1   r2   <module>r     s   "  : :    B ( ,   6 '   ; 7 
		H	% 
 
 
(299 (()")) )) ) *,EF)*)-',,0X!X!X! X! 	X!
 X! "#X! #+5eCj+A"BX! "#X! "#X! X! tnX! $D>X!| $()+())*)-""& p p p C= p "#	 p
 #& p "# p "# p  p   pFPPP S	PH
*k H
VC1J C1LdFryy dFN	Sbii Sl[RYY [|Kr1   