
    bi}                     >   d dl mZ d dlmZmZmZ d dlZd dlmZ d dl	mc m
Z ddlmZ ddlmZ ddlmZ dd	lmZmZmZmZmZ dd
lmZ ddlmZmZmZmZmZm Z   G d dejB                        Z" G d dejB                        Z#dejH                  dejH                  fdZ% G d dejB                        Z& G d dejB                        Z' G d dejB                        Z( G d dejB                        Z) G d dejB                        Z* G d dejB                        Z+y)    )partial)OptionalTupleUnionN   )	deprecate   )get_activation)SpatialNorm)Downsample1DDownsample2DFirDownsample2DKDownsample2Ddownsample_2d)AdaGroupNorm)FirUpsample2DKUpsample2D
Upsample1D
Upsample2Dupfirdn2d_nativeupsample_2dc            "            e Zd ZdZddddddddd	d
dddddddedee   dededededee   dededededee   dedededee   f  fdZ	de
j                  de
j                  d e
j                  fd!Z xZS )"ResnetBlockCondNorm2Da)  
    A Resnet block that use normalization layer that incorporate conditioning information.

    Parameters:
        in_channels (`int`): The number of channels in the input.
        out_channels (`int`, *optional*, default to be `None`):
            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
        groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
        groups_out (`int`, *optional*, default to None):
            The number of groups to use for the second normalization layer. if set to None, same as `groups`.
        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
        non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use.
        time_embedding_norm (`str`, *optional*, default to `"ada_group"` ):
            The normalization layer for time embedding `temb`. Currently only support "ada_group" or "spatial".
        kernel (`torch.Tensor`, optional, default to None): FIR filter, see
            [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`].
        output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
        use_in_shortcut (`bool`, *optional*, default to `True`):
            If `True`, add a 1x1 nn.conv2d layer for skip-connection.
        up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer.
        down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer.
        conv_shortcut_bias (`bool`, *optional*, default to `True`):  If `True`, adds a learnable bias to the
            `conv_shortcut` output.
        conv_2d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output.
            If None, same as `out_channels`.
    NF               ư>swish	ada_group      ?T)out_channelsconv_shortcutdropouttemb_channelsgroups
groups_outepsnon_linearitytime_embedding_normoutput_scale_factoruse_in_shortcutupdownconv_shortcut_biasconv_2d_out_channelsin_channelsr!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   c                .   t         |           || _        ||n|}|| _        || _        || _        || _        || _        |
| _        ||}| j                  dk(  rt        ||||      | _
        n9| j                  dk(  rt        ||      | _
        nt        d| j                         t        j                  ||ddd      | _        | j                  dk(  rt        ||||      | _        n9| j                  dk(  rt        ||      | _        nt        d| j                         t"        j                  j%                  |      | _        |xs |}t        j                  ||ddd      | _        t+        |	      | _        d x| _        | _        | j
                  rt3        |d	      | _        n | j                  rt5        |ddd
      | _        || j                  |k7  n|| _        d | _        | j6                  r!t        j                  ||ddd|      | _        y y )Nr   )r'   spatialz" unsupported time_embedding_norm:    r	   kernel_sizestridepaddingFuse_convopr9   r7   namer   r5   r6   r7   bias)super__init__r0   r!   use_conv_shortcutr,   r-   r*   r)   r   norm1r   
ValueErrornnConv2dconv1norm2torchDropoutr#   conv2r
   nonlinearityupsample
downsampler   r   r+   r"   )selfr0   r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   	__class__s                    R/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/diffusers/models/resnet.pyr@   zResnetBlockCondNorm2D.__init__J   s    ( 	&&2&:{(!.	#6 #6 J##{2%m[&cRDJ%%2$[-@DJA$BZBZA[\]]YY{LaPQ[\]
##{2%m\:SVWDJ%%2$\=ADJA$BZBZA[\]]xx''03C|YY|-AqYZdef
*=9*..77&{UCDMYY*;PQX\]DOKZKbt//3GGhw!!#$'"D      input_tensortembreturnc                    t        |      dkD  s|j                  dd       d}t        dd|       |}| j                  ||      }| j	                  |      }| j
                  U|j                  d   dk\  r |j                         }|j                         }| j                  |      }| j                  |      }n.| j                  "| j                  |      }| j                  |      }| j                  |      }| j                  ||      }| j	                  |      }| j                  |      }| j                  |      }| j                  | j                  |      }||z   | j                  z  }|S )Nr   scaleThe `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`.1.0.0@   )lengetr   rB   rK   rL   shape
contiguousrM   rF   rG   r#   rJ   r"   r*   )rN   rR   rS   argskwargsdeprecation_messagehidden_statesoutput_tensors           rP   forwardzResnetBlockCondNorm2D.forward   sT   t9q=FJJw5A #Ugw(;<$

=$7))-8==$""1%++668 - 8 8 :==6L MM-8M__(??<8L OOM:M

=1

=$7))-8]3

=1)--l;L%59Q9QQrQ   )__name__
__module____qualname____doc__intr   boolfloatstrr@   rH   Tensorrc   __classcell__rO   s   @rP   r   r   ,   s(   B '+# $($#.%(*.#'.2%I I sm	I
 I I I I SMI I I !I #I "$I I  !I" !#I$ 'sm%IV%ELL % %Z_ZfZf %rQ   r   c            (           e Zd ZdZddddddddd	dd
dddddddddedee   dededededee   dedededededee	j                     dedee   dedededee   f& fd Zd!e	j                  d"e	j                  d#e	j                  fd$Z xZS )%ResnetBlock2Da9  
    A Resnet block.

    Parameters:
        in_channels (`int`): The number of channels in the input.
        out_channels (`int`, *optional*, default to be `None`):
            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
        groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
        groups_out (`int`, *optional*, default to None):
            The number of groups to use for the second normalization layer. if set to None, same as `groups`.
        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
        non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use.
        time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config.
            By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" for a
            stronger conditioning with scale and shift.
        kernel (`torch.Tensor`, optional, default to None): FIR filter, see
            [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`].
        output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
        use_in_shortcut (`bool`, *optional*, default to `True`):
            If `True`, add a 1x1 nn.conv2d layer for skip-connection.
        up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer.
        down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer.
        conv_shortcut_bias (`bool`, *optional*, default to `True`):  If `True`, adds a learnable bias to the
            `conv_shortcut` output.
        conv_2d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output.
            If None, same as `out_channels`.
    NFr   r   r   Tr   r   defaultr    )r!   r"   r#   r$   r%   r&   pre_normr'   r(   skip_time_actr)   kernelr*   r+   r,   r-   r.   r/   r0   r!   r"   r#   r$   r%   r&   rr   r'   r(   rs   r)   rt   r*   r+   r,   r-   r.   r/   c                   t         |           |dk(  rt        d      |dk(  rt        d      d| _        || _        ||n|}|| _        || _        || _        || _        || _	        || _
        || _        ||}t        j                  j                  |||	d      | _        t        j                   ||ddd	      | _        |r| j                  d
k(  rt        j$                  ||      | _        nN| j                  dk(  rt        j$                  |d|z        | _        n t        d| j                   d      d | _        t        j                  j                  |||	d      | _        t        j                  j+                  |      | _        |xs |}t        j                   ||ddd	      | _        t1        |
      | _        d x| _        | _        | j                  rL|dk(  rdfd| _        n|dk(  r"t9        t:        j<                  dd      | _        nlt?        |d      | _        nY| j                  rM|dk(  rdfd| _        n;|dk(  r"t9        t:        j@                  dd      | _        ntC        |ddd      | _        || j                  |k7  n|| _"        d | _#        | jD                  r!t        j                   ||ddd|      | _#        y y )Nr   zkThis class cannot be used with `time_embedding_norm==ada_group`, please use `ResnetBlockCondNorm2D` insteadr2   ziThis class cannot be used with `time_embedding_norm==spatial`, please use `ResnetBlockCondNorm2D` insteadT
num_groupsnum_channelsr'   affiner3   r	   r4   rq   scale_shiftr   zunknown time_embedding_norm :  fir)r	   r3   r3   r	   c                     t        |       S N)rt   )r   x
fir_kernels    rP   <lambda>z(ResnetBlock2D.__init__.<locals>.<lambda>%  s    +a
*K rQ   sde_vpg       @nearest)scale_factormodeFr8   c                     t        |       S r~   )r   r   s    rP   r   z(ResnetBlock2D.__init__.<locals>.<lambda>-  s    M!J,O rQ   )r5   r6   r:   r;   r   r=   )$r?   r@   rC   rr   r0   r!   rA   r,   r-   r*   r)   rs   rH   rD   	GroupNormrB   rE   rF   Lineartime_emb_projrG   rI   r#   rJ   r
   rK   rL   rM   r   Finterpolater   
avg_pool2dr   r+   r"   )rN   r0   r!   r"   r#   r$   r%   r&   rr   r'   r(   rs   r)   rt   r*   r+   r,   r-   r.   r/   r   rO   s                       @rP   r@   zResnetBlock2D.__init__   s   . 	+-}  )+{  &&2&:{(!.	#6 #6 *JXX''6Y\ei'j
YY{LaPQ[\]
$''94%'YY}l%K"))]:%'YY}a,>N%O" #A$BZBZA[[\!]^^!%DXX'':L^ajn'o
xx''03C|YY|-AqYZdef
*=9*..77)
 K8# 'Ci X *; GYY)
"O8#")!,,Aa"P".{UTU\`"aKZKbt//3GGhw!!#$'"D  rQ   rR   rS   rT   c                    t        |      dkD  s|j                  dd       d}t        dd|       |}| j                  |      }| j	                  |      }| j
                  U|j                  d   dk\  r |j                         }|j                         }| j                  |      }| j                  |      }n.| j                  "| j                  |      }| j                  |      }| j                  |      }| j                  9| j                  s| j	                  |      }| j                  |      d d d d d d f   }| j                  dk(  r|||z   }| j                  |      }nr| j                  dk(  rR|t        d| j                         t        j                   |d	d
      \  }}| j                  |      }|d
|z   z  |z   }n| j                  |      }| j	                  |      }| j#                  |      }| j%                  |      }| j&                  | j'                  |j                               }||z   | j(                  z  }	|	S )Nr   rV   rW   rX   rY   rq   rz   z9 `temb` should not be None when `time_embedding_norm` is r   r	   )dim)rZ   r[   r   rB   rK   rL   r\   r]   rM   rF   r   rs   r)   rG   rC   rH   chunkr#   rJ   r"   r*   )
rN   rR   rS   r^   r_   r`   ra   
time_scale
time_shiftrb   s
             rP   rc   zResnetBlock2D.forward@  s4   t9q=FJJw5A #Ugw(;<$

=1))-8==$""1%++668 - 8 8 :==6L MM-8M__(??<8L OOM:M

=1)%%((.%%d+Aq$,<=D##y0 - 4 JJ}5M%%6| OPTPhPhOij  &+[[qa%@"J
 JJ}5M)Q^<zIM JJ}5M))-8]3

=1)--l.E.E.GHL%59Q9QQrQ   )rd   re   rf   rg   rh   r   ri   rj   rk   rH   rl   r@   rc   rm   rn   s   @rP   rp   rp      s[   D '+# $($##,)-%(*.#'.2+b b sm	b
 b b b b SMb b b b b !b &b  #!b" "$#b$ %b& 'b( !)b* 'sm+bH5ELL 5 5Z_ZfZf 5rQ   rp   tensorrT   c                    t        | j                        dk(  r| d d d d d f   S t        | j                        dk(  r| d d d d d d d f   S t        | j                        dk(  r| d d d d dd d f   S t        dt        |        d      )Nr   r3      r   z`len(tensor)`: z has to be 2, 3 or 4.)rZ   r\   rC   )r   s    rP   rearrange_dimsr   y  s    
6<<AaDj!!
6<<AaD!m$$	V\\	a	aAqj!!?3v;-7LMNNrQ   c                        e Zd ZdZ	 	 ddededeeeeef   f   dedef
 fdZde	j                  d	e	j                  fd
Z xZS )Conv1dBlocka  
    Conv1d --> GroupNorm --> Mish

    Parameters:
        inp_channels (`int`): Number of input channels.
        out_channels (`int`): Number of output channels.
        kernel_size (`int` or `tuple`): Size of the convolving kernel.
        n_groups (`int`, default `8`): Number of groups to separate the channels into.
        activation (`str`, defaults to `mish`): Name of the activation function.
    inp_channelsr!   r5   n_groups
activationc                     t         |           t        j                  ||||dz        | _        t        j
                  ||      | _        t        |      | _        y )Nr   r7   )	r?   r@   rD   Conv1dconv1dr   
group_normr
   mish)rN   r   r!   r5   r   r   rO   s         rP   r@   zConv1dBlock.__init__  sK     	iilKQ\`aQab,,x>":.	rQ   inputsrT   c                     | j                  |      }t        |      }| j                  |      }t        |      }| j                  |      }|S N)r   r   r   r   )rN   r   intermediate_reproutputs       rP   rc   zConv1dBlock.forward  sM     KK/*+<= OO,=>*+<=,-rQ   )   r   rd   re   rf   rg   rh   r   r   rk   r@   rH   rl   rc   rm   rn   s   @rP   r   r     sr    	   // / 3c3h/0	/
 / /ell u|| rQ   r   c                        e Zd ZdZ	 	 ddedededeeeeef   f   def
 fdZde	j                  d	e	j                  d
e	j                  fdZ xZS )ResidualTemporalBlock1Da  
    Residual 1D block with temporal convolutions.

    Parameters:
        inp_channels (`int`): Number of input channels.
        out_channels (`int`): Number of output channels.
        embed_dim (`int`): Embedding dimension.
        kernel_size (`int` or `tuple`): Size of the convolving kernel.
        activation (`str`, defaults `mish`): It is possible to choose the right activation function.
    r   r!   	embed_dimr5   r   c                 6   t         |           t        |||      | _        t        |||      | _        t        |      | _        t        j                  ||      | _	        ||k7  rt        j                  ||d      | _        y t        j                         | _        y )Nr	   )r?   r@   r   conv_inconv_outr
   time_emb_actrD   r   time_embr   Identityresidual_conv)rN   r   r!   r   r5   r   rO   s         rP   r@   z ResidualTemporalBlock1D.__init__  s     	"<{K#L,L*:6		)\: 9E8TBIIlL!4 	Z\ZeZeZg 	rQ   r   trT   c                     | j                  |      }| j                  |      }| j                  |      t        |      z   }| j	                  |      }|| j                  |      z   S )z
        Args:
            inputs : [ batch_size x inp_channels x horizon ]
            t : [ batch_size x embed_dim ]

        returns:
            out : [ batch_size x out_channels x horizon ]
        )r   r   r   r   r   r   )rN   r   r   outs       rP   rc   zResidualTemporalBlock1D.forward  s^     a MM!ll6"^A%66mmC T''///rQ   )   r   r   rn   s   @rP   r   r     s}    	  45 

 
 	

 3c3h/0
 
&0ell 0u|| 0 0rQ   r   c            	            e Zd ZdZ	 	 	 ddedee   dedef fdZddej                  ded	ej                  fd
Z
 xZS )TemporalConvLayera  
    Temporal convolutional layer that can be used for video (sequence of images) input Code mostly copied from:
    https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/models/multi_modal/video_synthesis/unet_sd.py#L1016

    Parameters:
        in_dim (`int`): Number of input channels.
        out_dim (`int`): Number of output channels.
        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
    in_dimout_dimr#   norm_num_groupsc                 b   t         |           |xs |}|| _        || _        t	        j
                  t	        j                  ||      t	        j                         t	        j                  ||dd            | _	        t	        j
                  t	        j                  ||      t	        j                         t	        j                  |      t	        j                  ||dd            | _        t	        j
                  t	        j                  ||      t	        j                         t	        j                  |      t	        j                  ||dd            | _        t	        j
                  t	        j                  ||      t	        j                         t	        j                  |      t	        j                  ||dd            | _        t        j                  j                  | j                  d   j                          t        j                  j                  | j                  d   j"                         y )Nr3   r	   r	   )r	   r   r   r   )r?   r@   r   r   rD   
Sequentialr   SiLUConv3drF   rI   rJ   conv3conv4initzeros_weightr>   )rN   r   r   r#   r   rO   s        rP   r@   zTemporalConvLayer.__init__  sv    	#V ]]LL&1GGIIIfgy)D


 ]]LL'2GGIJJwIIgvy)D	

 ]]LL'2GGIJJwIIgvy)D	

 ]]LL'2GGIJJwIIgvy)D	

 	tzz"~,,-
tzz"~**+rQ   ra   
num_framesrT   c                    |d d d f   j                  d|f|j                  dd  z         j                  ddddd      }|}| j                  |      }| j	                  |      }| j                  |      }| j                  |      }||z   }|j                  ddddd      j                  |j                  d   |j                  d   z  df|j                  dd  z         }|S )Nr   r	   r   r   r3   r   )reshaper\   permuterF   rJ   r   r   )rN   ra   r   identitys       rP   rc   zTemporalConvLayer.forward  s    $'"**B
+;m>Q>QRSRT>U+UV^^_`bcefhiklm 	 !

=1

=1

=1

=1 =0%--aAq!<DD  #m&9&9!&<<bAMDWDWXYXZD[[
 rQ   )Nr   r   )r	   rd   re   rf   rg   rh   r   rj   r@   rH   rl   rc   rm   rn   s   @rP   r   r     se     "&!',', #', 	',
 ',RU\\ s 5<< rQ   r   c            	            e Zd ZdZ	 	 	 ddedee   dedef fdZdej                  dej                  d	ej                  fd
Z
 xZS )TemporalResnetBlocka  
    A Resnet block.

    Parameters:
        in_channels (`int`): The number of channels in the input.
        out_channels (`int`, *optional*, default to be `None`):
            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
    r0   r!   r$   r'   c                    t         |           || _        ||n|}|| _        d}|D cg c]  }|dz  	 }}t        j
                  j                  d||d      | _        t        j                  |||d|      | _	        |t        j                  ||      | _        nd | _        t        j
                  j                  d||d      | _        t        j
                  j                  d      | _        t        j                  |||d|      | _        t!        d	      | _        | j                  |k7  | _        d | _        | j$                  r t        j                  ||ddd
      | _        y y c c}w )Nr   r   r   Trv   r	   r4   r   silur   )r?   r@   r0   r!   rH   rD   r   rB   r   rF   r   r   rG   rI   r#   rJ   r
   rK   r+   r"   )	rN   r0   r!   r$   r'   r5   kr7   rO   s	           rP   r@   zTemporalResnetBlock.__init__*  s_    	&&2&:{(#./a16//XX''2KUXae'f
YY#

 $!#=,!GD!%DXX''2LVYbf'g
xx'',YY#

 +62#//<?!!#"D  A 0s   E'rR   rS   rT   c                    |}| j                  |      }| j                  |      }| j                  |      }| j                  J| j                  |      }| j                  |      d d d d d d d d f   }|j	                  ddddd      }||z   }| j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  | j                  |      }||z   }|S )Nr   r   r	   r3   r   )	rB   rK   rF   r   r   rG   r#   rJ   r"   )rN   rR   rS   ra   rb   s        rP   rc   zTemporalResnetBlock.forward`  s    $

=1))-8

=1)$$T*D%%d+Aq!T4,?@D<<1aA.D)D0M

=1))-8]3

=1)--l;L$}4rQ   )Nr   r   r   rn   s   @rP   r   r     si    	 '+ 44 sm4 	4
 4lELL   rQ   r   c                        e Zd ZdZ	 	 	 	 	 	 	 ddedee   dededee   dedef fd	Z	 	 dd
e	j                  dee	j                     dee	j                     fdZ xZS )SpatioTemporalResBlocka  
    A SpatioTemporal Resnet block.

    Parameters:
        in_channels (`int`): The number of channels in the input.
        out_channels (`int`, *optional*, default to be `None`):
            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the spatial resenet.
        temporal_eps (`float`, *optional*, defaults to `eps`): The epsilon to use for the temporal resnet.
        merge_factor (`float`, *optional*, defaults to `0.5`): The merge factor to use for the temporal mixing.
        merge_strategy (`str`, *optional*, defaults to `learned_with_images`):
            The merge strategy to use for the temporal mixing.
        switch_spatial_to_temporal_mix (`bool`, *optional*, defaults to `False`):
            If `True`, switch the spatial and temporal mixing.
    r0   r!   r$   r'   temporal_epsmerge_factorswitch_spatial_to_temporal_mixc	                     t         	|           t        ||||      | _        t	        ||n|||n||||n|      | _        t        |||      | _        y )N)r0   r!   r$   r'   )alphamerge_strategyr   )r?   r@   rp   spatial_res_blockr   temporal_res_blockAlphaBlender
time_mixer)
rN   r0   r!   r$   r'   r   r   r   r   rO   s
            rP   r@   zSpatioTemporalResBlock.__init__  sp     	!.#%'	"
 #6(4(@k)5)A{' , 8c	#
 ')+I
rQ   ra   rS   image_only_indicatorc                    |j                   d   }| j                  ||      }|j                   \  }}}}||z  }	|d d d f   j                  |	||||      j                  ddddd      }
|d d d f   j                  |	||||      j                  ddddd      }||j                  |	|d      }| j	                  ||      }| j                  |
||      }|j                  ddddd      j                  ||||      }|S )Nr   r   r   r	   r3   r   )	x_spatial
x_temporalr   )r\   r   r   r   r   r   )rN   ra   rS   r   r   batch_frameschannelsheightwidth
batch_sizehidden_states_mixs              rP   rc   zSpatioTemporalResBlock.forward  s>    *//3
..}dC0=0C0C-h!Z/
 $'"**:z8VUZ[ccdeghjkmnpqr 	 $'"**:z8VUZ[ccdeghjkmnpqr 	 <<
J;D//tD'$!5 ( 
 &--aAq!<DD\S[]cejkrQ   )Nr   r   Ng      ?learned_with_imagesF)NN)rd   re   rf   rg   rh   r   rj   ri   r@   rH   rl   rc   rm   rn   s   @rP   r   r   {  s    ( '+ (,!,/4

 sm
 	

 
 uo
 
 )-
H (,7;	|| u||$ 'u||4	rQ   r   c            	            e Zd ZdZg dZ	 	 ddededef fdZde	j                  ded	e	j                  fd
Z	 dde	j                  de	j                  dee	j                     d	e	j                  fdZ xZS )r   a  
    A module to blend spatial and temporal features.

    Parameters:
        alpha (`float`): The initial value of the blending factor.
        merge_strategy (`str`, *optional*, defaults to `learned_with_images`):
            The merge strategy to use for the temporal mixing.
        switch_spatial_to_temporal_mix (`bool`, *optional*, defaults to `False`):
            If `True`, switch the spatial and temporal mixing.
    )learnedfixedr   r   r   r   c                    t         |           || _        || _        || j                  vrt        d| j                         | j                  dk(  r'| j                  dt        j                  |g             y | j                  dk(  s| j                  dk(  rD| j                  dt        j                  j                  t        j                  |g                   y t        d| j                         )Nzmerge_strategy needs to be in r   
mix_factorr   r   zUnknown merge strategy )r?   r@   r   r   
strategiesrC   register_bufferrH   rl   register_parameterrD   	Parameter)rN   r   r   r   rO   s       rP   r@   zAlphaBlender.__init__  s     	,.L+0=doo=NOPP')  u||UG/DE  I-1D1DH]1]##L%((2D2DU\\SXRYEZ2[\6t7J7J6KLMMrQ   r   ndimsrT   c                 2   | j                   dk(  r| j                  }|S | j                   dk(  r!t        j                  | j                        }|S | j                   dk(  r|t	        d      t        j
                  |j                         t        j                  dd|j                        t        j                  | j                        d         }|dk(  r|d d d d d d d f   }|S |d	k(  r|j                  d
      d d d d f   }|S t	        d| d      t        )Nr   r   r   zMPlease provide image_only_indicator to use learned_with_images merge strategyr	   )device).Nr   r3   r   zUnexpected ndims z. Dimensions should be 3 or 5)r   r   rH   sigmoidrC   whereri   onesr   r   NotImplementedError)rN   r   r   r   s       rP   	get_alphazAlphaBlender.get_alpha  s    ')OOE6 3   I-MM$//2E0 -   $99#+ !pqqKK$))+

1a(<(C(CDdoo.y9E zaq$45  !b)!T4-8  !#4UG;X!YZZ &%rQ   r   r   c                     | j                  ||j                        }|j                  |j                        }| j                  rd|z
  }||z  d|z
  |z  z   }|S )Nr    )r   ndimtodtyper   )rN   r   r   r   r   r   s         rP   rc   zAlphaBlender.forward  sZ     3Y^^D)..%KEIu
 ::rQ   )r   Fr   )rd   re   rf   rg   r   rj   rk   ri   r@   rH   rl   rh   r   r   rc   rm   rn   s   @rP   r   r     s    	 =J
 4/4	NN N )-	N(ell 3 5<< F 8<	<< LL 'u||4	
 
rQ   r   ),	functoolsr   typingr   r   r   rH   torch.nnrD   torch.nn.functional
functionalr   utilsr   activationsr
   attention_processorr   downsamplingr   r   r   r   r   normalizationr   
upsamplingr   r   r   r   r   r   Moduler   rp   rl   r   r   r   r   r   r   r    rQ   rP   <module>r     s      ) )      ' ,  ( NBII NbxBII xxO5<< OELL O "))  H,0bii ,0^D		 DNY")) YzQRYY QhN299 NrQ   