
    bi*                       d dl mZ d dlmZmZmZmZmZ d dlZd dl	m
Z
 d dlm
c mZ d dlZddlmZmZmZ ddlmZmZmZ ddlmZmZmZ ddlmZ d	d
lmZ d	dlm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) d	dl*m+Z+m,Z, d	dl-m.Z. d	dl/m0Z0m1Z1m2Z2 d	dl3m4Z4 d	dl5m6Z6 ddl7m8Z8 ddl9m:Z:  ejv                  e<      Z=e G d de             Z> G d de
j~                        Z@ G d de
j~                        ZA G d de
j~                        ZB G d de
j~                        ZC G d de
j~                        ZD G d  d!e
j~                        ZE G d" d#e
j~                        ZF G d$ d%e.ee      ZG G d& d'e.eee      ZHy)(    )	dataclass)AnyDictOptionalTupleUnionN   )ConfigMixin
FrozenDictregister_to_config)FromOriginalModelMixinPeftAdapterMixinUNet2DConditionLoadersMixin)
BaseOutput	deprecatelogging)apply_freeu   )BasicTransformerBlock)
ADDED_KV_ATTENTION_PROCESSORSCROSS_ATTENTION_PROCESSORS	AttentionAttentionProcessorAttnAddedKVProcessorAttnProcessorAttnProcessor2_0FusedAttnProcessor2_0IPAdapterAttnProcessorIPAdapterAttnProcessor2_0)TimestepEmbedding	Timesteps)
ModelMixin)Downsample2DResnetBlock2D
Upsample2D)DualTransformer2DModel)Transformer2DModel   )UNetMidBlock2DCrossAttn)UNet2DConditionModelc                   0    e Zd ZU dZej
                  ed<   y)UNetMotionOutputa  
    The output of [`UNetMotionOutput`].

    Args:
        sample (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`):
            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
    sampleN)__name__
__module____qualname____doc__torchTensor__annotations__     c/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/diffusers/models/unets/unet_motion_model.pyr,   r,   4   s     LLr6   r,   c                    Z    e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddededee   dee   dededed	ee   d
edee   dedededee   dee   f fdZ		 	 	 	 	 dde
j                  dee
j                     dee
j                     dee
j                     dedeeeef      de
j                  fdZ xZS )AnimateDiffTransformer3Das  
    A Transformer model for video-like data.

    Parameters:
        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
        in_channels (`int`, *optional*):
            The number of channels in the input and output (specify if the input is **continuous**).
        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
        attention_bias (`bool`, *optional*):
            Configure if the `TransformerBlock` attention should contain a bias parameter.
        sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
            This is fixed during training since it is used to learn a number of position embeddings.
        activation_fn (`str`, *optional*, defaults to `"geglu"`):
            Activation function to use in feed-forward. See `diffusers.models.activations.get_activation` for supported
            activation functions.
        norm_elementwise_affine (`bool`, *optional*):
            Configure if the `TransformerBlock` should use learnable elementwise affine parameters for normalization.
        double_self_attention (`bool`, *optional*):
            Configure if each `TransformerBlock` should contain two self-attention layers.
        positional_embeddings: (`str`, *optional*):
            The type of positional embeddings to apply to the sequence input before passing use.
        num_positional_embeddings: (`int`, *optional*):
            The maximum length of the sequence over which to apply positional embeddings.
    num_attention_headsattention_head_dimin_channelsout_channels
num_layersdropoutnorm_num_groupscross_attention_dimattention_biassample_sizeactivation_fnnorm_elementwise_affinedouble_self_attentionpositional_embeddingsnum_positional_embeddingsc                    t         |           || _        || _        ||z  }|| _        t        j                  ||dd      | _        t        j                  ||      | _	        t        j                  t        |      D cg c]  }t        |||||||	||||       c}      | _        t        j                  ||      | _        y c c}w )Nư>T)
num_groupsnum_channelsepsaffine)r?   rA   rD   rB   rF   rE   rG   rH   )super__init__r:   r;   r<   nn	GroupNormnormLinearproj_in
ModuleListranger   transformer_blocksproj_out)selfr:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   	inner_dim_	__class__s                     r7   rP   z!AnimateDiffTransformer3D.__init__^   s    $ 	#6 "4'*<<	&LLO+[_hlm	yyi8 #%-- z*  &'&#(;"/#1*?,C*?.G#
& 		)[9%s   ?Chidden_statesencoder_hidden_statestimestepclass_labels
num_framescross_attention_kwargsreturnc                 D   |j                   \  }}}	}
||z  }|}|dddf   j                  ||||	|
      }|j                  ddddd      }| j                  |      }|j                  ddddd      j                  ||	z  |
z  ||      }| j	                  |      }| j
                  D ]  } ||||||      } | j                  |      }|ddddf   j                  ||	|
||      j                  ddddd      j                         }|j                  |||	|
      }||z   }|S )	a0  
        The [`AnimateDiffTransformer3D`] forward method.

        Args:
            hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.Tensor` of shape `(batch size, channel, height, width)` if continuous):
                Input hidden_states.
            encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
                self-attention.
            timestep ( `torch.LongTensor`, *optional*):
                Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
            class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
                Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
                `AdaLayerZeroNorm`.
            num_frames (`int`, *optional*, defaults to 1):
                The number of frames to be processed per batch. This is used to reshape the hidden states.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).

        Returns:
            torch.Tensor:
                The output tensor.
        Nr   r   r(   r	      )input)r^   r_   r`   rc   ra   )shapereshapepermuterS   rU   rX   rY   
contiguous)rZ   r^   r_   r`   ra   rb   rc   batch_frameschannelheightwidth
batch_sizeresidualblockoutputs                  r7   forwardz AnimateDiffTransformer3D.forward   s]   F 0=/B/B,gvu!Z/
 %dAg.66z:wX^`ef%--aAq!<		-0%--aAq!<DDZRXEX[`E`blnuv=9 ,, 	E!+&;!'=)M	 M:$a-(WZ
GDWQ1a#Z\	 	 &--lGVUS)r6   )   X   NNr(               NFNgegluTTNN)NNNr(   N)r.   r/   r0   r1   intr   floatboolstrrP   r2   r3   
LongTensorr   r   rt   __classcell__r]   s   @r7   r9   r9   A   s   < $&"$%)&*!-1$%)$(,&*/337!0: 0:  0: c]	0:
 sm0: 0: 0: 0: &c]0: 0: c]0: 0: "&0:  $0:  (}0:  $,C=!0:j =A/337;?E||E  ((8(89E 5++,	E
 u//0E E !)c3h 8E 
Er6   r9   c            &       6    e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddedededededededed	ed
ededededeeee   f   de	e   dedeeee   f   def$ fdZ
	 	 ddej                  de	ej                     dedeej                  eej                  df   f   fdZ xZS )DownBlockMotionr<   r=   temb_channelsr?   r>   
resnet_epsresnet_time_scale_shiftresnet_act_fnresnet_groupsresnet_pre_normoutput_scale_factoradd_downsampledownsample_paddingtemporal_num_attention_headstemporal_cross_attention_dimtemporal_max_seq_length%temporal_transformer_layers_per_blocktemporal_double_self_attentionc                    t         |           g }g }t        |t              r|f|z  }nt	        |      |k7  rt        d|       t        |t              r|f|z  }nt	        |      |k7  rt        d|       t        |      D ]`  }|dk(  r|n|}|j                  t        |||||	|||||

             |j                  t        ||   |||   |	|ddd||||   z  |             b t        j                  |      | _        t        j                  |      | _        |r1t        j                  t        |d	||d
      g      | _        d| _        y d | _        d| _        y )Nz\`temporal_transformer_layers_per_block` must be an integer or a tuple of integers of length zS`temporal_num_attention_heads` must be an integer or a tuple of integers of length r   
r<   r=   r   rM   groupsr?   time_embedding_normnon_linearityr   pre_normFry   
sinusoidalr:   r<   r>   r@   rA   rB   rD   rG   rH   r;   rF   Topuse_convr=   paddingname)rO   rP   
isinstancerz   len
ValueErrorrW   appendr$   r9   rQ   rV   resnetsmotion_modulesr#   downsamplersgradient_checkpointing)rZ   r<   r=   r   r?   r>   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   ir]   s                         r7   rP   zDownBlockMotion.__init__   s   * 	 ;SA5Z4\_i4i167:Enoynz{ 
 2C8,H+JZ+W(-.*<efpeqr  z" 	A)*a+\KNN +!-"/"(#(?"/(;, !!((DQ(G ,DQG$1(D#(")*6.E'37STU7V'V*H!	@ }}W- mmN; " $!%%1 2!
!D ',# !%D&+#r6   r^   tembrb   rd   .c                    t        |      dkD  s|j                  dd       d}t        dd|       d}t        | j                  | j
                        }|D ]S  \  }	}
t        j                         r | j                  r| j                  |	||      }n
 |	||      } |
||      }||fz   }U | j                   | j                  D ]  } ||      } ||fz   }||fS )	Nr   scaleThe `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`.1.0.0r5   input_tensorr   rb   r^   )r   getr   zipr   r   r2   is_grad_enabledr   _gradient_checkpointing_funcr   )rZ   r^   r   rb   argskwargsdeprecation_messageoutput_statesblocksresnetmotion_moduledownsamplers               r7   rt   zDownBlockMotion.forward6  s     t9q=FJJw5A #Ugw(;<T\\4#6#67%+ 	=!FM$$&4+F+F $ A A&-Y] ^ &M M)-JOM)],<<M	= (#00 I +- HI *],<<Mm++r6   )rw   r(   rJ   defaultswishrx   T      ?Tr(   r(   Nrx   r(   T)Nr(   )r.   r/   r0   rz   r{   r}   r|   r   r   r   rP   r2   r3   rt   r   r   s   @r7   r   r      sw     '0$ $%(#"#?@6:')HI/3'[,[, [, 	[,
 [, [, [, "%[, [, [, [, #[, [,  [, ',CsO&<[,  '/sm![," "%#[,$ 05S%*_/E%[,& )-'[,@ (,	,||, u||$, 	, 
u||U5<<#455	6,r6   r   c            6           e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d%dedededededeeee   f   deded	ed
edededededededededededede	e   dededeeee   f   def4 fdZ
	 	 	 	 	 	 	 d&dej                  de	ej                     de	ej                     de	ej                     d ed!e	ej                     d"e	eeef      d#e	ej                     fd$Z xZS )'CrossAttnDownBlockMotionr<   r=   r   r?   r>   transformer_layers_per_blockr   r   r   r   r   r:   rA   r   r   r   dual_cross_attentionuse_linear_projectiononly_cross_attentionupcast_attentionattention_typer   r   r   r   r   c                    t         |           g }g }g }d| _        || _        t	        |t
              r|f|z  }nt        |      |k7  rt        d|       t	        |t
              r|f|z  }nt        |      |k7  rt        d|       t        |      D ]  }|dk(  r|n|}|j                  t        |||||
|||	||
             |s+|j                  t        |||z  |||   ||
||||
             n#|j                  t        |||z  |d||
             |j                  t        ||||   |
|d	d
d|||z  |              t        j                  |      | _        t        j                  |      | _        t        j                  |      | _        |r1t        j                  t'        |d||d      g      | _        d	| _        y d | _        d	| _        y )NTPtransformer_layers_per_block must be an integer or a list of integers of length Ytemporal_transformer_layers_per_block must be an integer or a list of integers of length r   r   r<   r>   rA   r@   r   r   r   r   r(   r<   r>   rA   r@   Fry   r   r   r   r   )rO   rP   has_cross_attentionr:   r   rz   r   r   rW   r   r$   r'   r&   r9   rQ   rV   
attentionsr   r   r#   r   r   ) rZ   r<   r=   r   r?   r>   r   r   r   r   r   r   r:   rA   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r]   s                                   r7   rP   z!CrossAttnDownBlockMotion.__init__Y  sB   : 	
#' #6  2C8,H+JZ+W(-.*<bcmbno 
 ;SA5Z4\_i4i167:Eklvkwx  z" :	A)*a+\KNN +!-"/"(#(?"/(;, (!!&+$(;;$0#?#B,?(5.C-A)9'5 !!*+$(;;$0#$,?(5	 !!((D ,DQG$1(D#(")*6.E'37S'S*HY:	x --
3}}W- mmN; " $!%%1 2!
!D ',# !%D&+#r6   r^   r   r_   attention_maskrb   encoder_attention_maskrc   additional_residualsc	           	      <   |'|j                  dd       t        j                  d       d}	t        t	        | j
                  | j                  | j                              }
t        |
      D ]  \  }\  }}}t        j                         r | j                  r| j                  |||      }n
 |||      } ||||||d      d   } |||      }|t        |
      d	z
  k(  r|||z   }|	|fz   }	 | j                   | j                  D ]  } ||
      } |	|fz   }	||	fS )Nr   SPassing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.r5   r   Fr^   r_   rc   r   r   return_dictr   r   r(   r   )r   loggerwarninglistr   r   r   r   	enumerater2   r   r   r   r   r   )rZ   r^   r   r_   r   rb   r   rc   r   r   r   r   r   attnr   r   s                   r7   rt   z CrossAttnDownBlockMotion.forward  sR    "-%))'48Dtuc$,,9L9LMN09&0A 	=,A,m$$&4+F+F $ A A&-Y] ^ &M M +&;'=-'=! M *-JOM CK!O#(<(H -0D D)],<<M+	=. (#00 I +- HI *],<<Mm++r6   )rw   r(   r(   rJ   r   r   rx   Tr(      r   r(   TFFFFr   N   rx   r(   T)NNNr(   NNNr.   r/   r0   rz   r{   r   r   r}   r|   r   rP   r2   r3   r   r   rt   r   r   s   @r7   r   r   X  s3    ?@ '0$ $#$#'%("##%*&+%*!&'6:,-')HI/37D,D, D, 	D,
 D, D, ',CsO&<D, D, "%D, D, D, D, !D, !D, #D,   !D," #D,$ #%D,&  $'D,( #)D,* +D,, -D,. '/sm/D,0 '*1D,2 "%3D,4 05S%*_/E5D,6 )-7D,R (,8<159=;?7;/,||/, u||$/,  (5	/,
 !./, /, !) 6/, !)c3h 8/, 'u||4/,r6   r   c            8           e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d(dededededee   dededeeee   f   d	ed
ededede	dededede	de	de	de	de	dedee   dededeeee   f   f4 fdZ
	 	 	 	 	 	 	 d)dej                  deej                  df   deej                     d eej                     d!eeeef      d"ee   d#eej                     d$eej                     d%ed&ej                  fd'Z xZS )*CrossAttnUpBlockMotionr<   r=   prev_output_channelr   resolution_idxr?   r>   r   r   r   r   r   r   r:   rA   r   add_upsampler   r   r   r   r   r   r   r   r   c                    t         !|           g }g }g }d| _        || _        t	        |t
              r|f|z  }n(t        |      |k7  rt        d| dt        |             t	        |t
              r|f|z  }n(t        |      |k7  rt        d| dt        |             t        |      D ]  }||dz
  k(  r|n|}|dk(  r|n|} |j                  t        | |z   |||	|||
|||
             |s+|j                  t        |||z  |||   ||||||
             n#|j                  t        |||z  |d||	             |j                  t        ||||   ||d
dd|||z  
              t        j                  |      | _        t        j                  |      | _        t        j                  |      | _        |r(t        j                  t'        |d|      g      | _        nd | _        d
| _        || _        y )NTr   z, got r   r(   r   r   r   r   Fry   r   
r:   r<   r>   r@   rA   rB   rD   rG   rH   r;   r   r=   )rO   rP   r   r:   r   rz   r   r   rW   r   r$   r'   r&   r9   rQ   rV   r   r   r   r%   
upsamplersr   r   )"rZ   r<   r=   r   r   r   r?   r>   r   r   r   r   r   r   r:   rA   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   res_skip_channelsresnet_in_channelsr]   s"                                    r7   rP   zCrossAttnUpBlockMotion.__init__  s   : 	
#' #6  2C8,H+JZ+W(-.*<bcmbnntux  zV  vW  uX  Y 
 ;SA5Z4\_i4i167:Eklvkww}  B  Ch  i  ~j  k  z" :	A01Z!^0C,89Q!4LNN 25F F!-"/"(#(?"/(;, (!!&+$(;;$0#?#B,?(5.C-A)9'5 !!*+$(;;$0#$,?(5	 !!((D ,DQG$1(D#(")*6.E'37S'S[:	x --
3}}W- mmN; mmZtbn-o,pqDO"DO&+#,r6   r^   res_hidden_states_tuple.r   r_   rc   upsample_sizer   r   rb   rd   c
           
         |'|j                  dd       t        j                  d       t        | dd       xr+ t        | dd       xr t        | dd       xr t        | dd       }
t	        | j
                  | j                  | j                        }|D ]  \  }}}|d   }|d d }|
rGt        | j                  ||| j                  | j                  | j                  | j                        \  }}t        j                  ||gd	
      }t        j                          r | j"                  r| j%                  |||      }n
 |||      } ||||||d      d   } |||	      } | j&                  | j&                  D ]  } |||      } |S )Nr   r   s1s2b1b2r   r   r   r   r(   dimr   Fr   r   r   r^   output_size)r   r   r   getattrr   r   r   r   r   r   r   r   r   r   r2   catr   r   r   r   )rZ   r^   r   r   r_   rc   r   r   r   rb   is_freeu_enabledr   r   r   r   res_hidden_states	upsamplers                    r7   rt   zCrossAttnUpBlockMotion.forward  s    "-%))'48Dtu D$% *dD)*dD)* dD)	 	 T\\4??D4G4GH+1 !	P'FD- 7 ;&=cr&B#  3>''!%wwwwwwww400 "II}6G&HaPM$$&4+F+F $ A A&-Y] ^ &M M +&;'=-'=! M *-JOMC!	PF ??&!__ b	 )S` ab r6   )Nrw   r(   r(   rJ   r   r   rx   Tr(   r   r   TFFFFr   Nr   rx   r(   )NNNNNNr(   )r.   r/   r0   rz   r   r{   r   r   r}   r|   rP   r2   r3   r   r   rt   r   r   s   @r7   r   r     sU    )-?@ '0$ $#$#'%(!%*&+%*!&'6:,-')HI7{-{- {- !	{-
 {- !{- {- {- ',CsO&<{- {- "%{- {- {- {- !{-  !!{-" ##{-$ %{-& #'{-(  $){-* #+{-, -{-. /{-0 '/sm1{-2 '*3{-4 "%5{-6 05S%*_/E7{-B (,8<;?'+159=?||? "'u||S'8!9? u||$	?
  (5? !)c3h 8?  }? !.? !) 6? ? 
?r6   r   c            (       &    e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddededededee   dededed	ed
edededededee   dededeee	e   f   f$ fdZ
	 	 	 ddej                  de	ej                  df   deej                     dedej                  f
dZ xZS )UpBlockMotionr<   r   r=   r   r   r?   r>   r   r   r   r   r   r   r   r   r   r   r   c                 b   t         |           g }g }t        |t              r|f|z  }nt	        |      |k7  rt        d|       t        |      D ]h  }||dz
  k(  r|n|}|dk(  r|n|}|j                  t        ||z   ||||||	|
||
             |j                  t        ||||   ||ddd|||z  
             j t        j                  |      | _        t        j                  |      | _        |r(t        j                  t        |d	|
      g      | _        nd | _        d| _        || _        y )Nr   r(   r   r   Fry   r   r   Tr   )rO   rP   r   rz   r   r   rW   r   r$   r9   rQ   rV   r   r   r%   r   r   r   )rZ   r<   r   r=   r   r   r?   r>   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r]   s                           r7   rP   zUpBlockMotion.__init__  si   * 	 ;SA5Z4\_i4i167:Eklvkwx  z"  	A01Z!^0C,89Q!4LNN 25F F!-"/"(#(?"/(;, !!((D ,DQG$1(D#(")*6.E'37S'S' 	D }}W- mmN; mmZtbn-o,pqDO"DO&+#,r6   r^   r   .r   rb   rd   c           
         t        |      dkD  s|j                  dd       d}t        dd|       t        | dd       xr+ t        | dd       xr t        | dd       xr t        | dd       }	t	        | j
                  | j                        }
|
D ]  \  }}|d	   }|d d	 }|	rGt        | j                  ||| j                  | j                  | j                  | j                  
      \  }}t        j                  ||gd      }t        j                         r | j                   r| j#                  |||      }n
 |||      } |||      } | j$                  | j$                  D ]  } |||      } |S )Nr   r   r   r   r   r   r   r   r   r   r(   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r2   r   r   r   r   r   )rZ   r^   r   r   r   rb   r   r   r   r   r   r   r   r   r   s                  r7   rt   zUpBlockMotion.forward   s    t9q=FJJw5A #Ugw(;< D$% *dD)*dD)* dD)	 	 T\\4#6#67%+ 	P!FM 7 ;&=cr&B#  3>''!%wwwwwwww400 "II}6G&HaPM$$&4+F+F $ A A&-Y] ^ &M M)-JOM1	P4 ??&!__ b	 )S` ab r6   )Nrw   r(   rJ   r   r   rx   Tr   TNr   rx   r(   )NNr(   )r.   r/   r0   rz   r   r{   r}   r|   r   r   rP   r2   r3   rt   r   r   s   @r7   r   r     sn    )- '0$ $%(!6:,-')HI'L-L- !L- 	L-
 L- !L- L- L- L- "%L- L- L- L- #L- L-  '/sm!L-" '*#L-$ "%%L-& 05S%*_/E'L-d (,5||5 "'u||S'8!95 u||$	5 5 
5r6   r   c            .           e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d dededededeeee   f   dededed	ed
ededededededededede	e   dedeeee   f   f* fdZ
	 	 	 	 	 	 d!dej                  de	ej                     de	ej                     de	ej                     de	eeef      de	ej                     dedej                  fdZ xZS )"UNetMidBlockCrossAttnMotionr<   r   r?   r>   r   r   r   r   r   r   r:   r   rA   r   r   r   r   r   r   r   r   c                 l   t         |           d| _        || _        |	|	nt	        |dz  d      }	t        |t              r|f|z  }nt        |      |k7  rt        d| d      t        |t              r|f|z  }nt        |      |k7  rt        d| d      t        |||||	|||||

      g}g }g }t        |      D ]  }|s*|j                  t        |||z  |||   ||	|||	             n#|j                  t        |||z  |d	||	
             |j                  t        |||||	|||||

             |j                  t        |||z  |||   |	|dd|d
              t        j                   |      | _        t        j                   |      | _        t        j                   |      | _        d| _        y )NTrf   rx   zT`transformer_layers_per_block` should be an integer or a list of integers of length .z]`temporal_transformer_layers_per_block` should be an integer or a list of integers of length r   )r<   r>   rA   r@   r   r   r   r(   r   Fr   ry   )
r:   r;   r<   r>   r@   rA   rB   rG   rH   rD   )rO   rP   r   r:   minr   rz   r   r   r$   rW   r   r'   r&   r9   rQ   rV   r   r   r   r   )rZ   r<   r   r?   r>   r   r   r   r   r   r   r:   r   rA   r   r   r   r   r   r   r   r   r   r   r   r   r]   s                             r7   rP   z$UNetMidBlockCrossAttnMotion.__init__Y  s5   0 	#' #6 )6)BK[\L\^`Ha 2C8,H+JZ+W(-.*<fgqfrrst 
 ;SA5Z4\_i4i167:Eopzo{{|}  '(+$$;+$7(
 
z" 5	A'!!&+#'::$/#?#B,?(5.C)9'5
 !!*+#'::$/#$,?(5	 NN +!,"/"(#(?"/(;, !!((D'26R'R +DQG$1(D#(*6.E")Q5	n --
3}}W- mmN;&+#r6   r^   r   r_   r   rc   r   rb   rd   c           
         |'|j                  dd       t        j                  d        | j                  d   ||      }t	        | j
                  | j                  dd  | j                        }|D ]y  \  }	}
} |	|||||d      d   }t        j                         r7| j                  r+| j                  ||d d d |d       }| j                  |
||      }c ||d d d |d       } |
||      }{ |S )Nr   r   r   r   r(   Fr   )r   r   r   r   r   r   r   r2   r   r   r   )rZ   r^   r   r_   r   rc   r   rb   r   r   r   r   s               r7   rt   z#UNetMidBlockCrossAttnMotion.forward  s    "-%))'48Dtu'Q]NT__dll12&68K8KL+1 	N'D&- +&;'=-'=! M $$&4+F+F $ A A!=$dJPT! !% A A&-Y] ^ -mT4z[_ ` &M M#	N& r6   )rw   r(   r(   rJ   r   r   rx   Tr(   r   r   FFFr   r(   Nrx   r(   )NNNNNr(   r   r   s   @r7   r   r   X  s   
 ?@ '0$ $#$%(#'%*&+!&',-6:')HI-{,{, {, 	{,
 {, ',CsO&<{, {, "%{, {, {, {, !{, #{, !{, #{,   $!{," #{,$ %{,& '*'{,( '/sm){,* "%+{,, 05S%*_/E-{,@ (,8<15;?9=$||$ u||$$  (5	$
 !.$ !)c3h 8$ !) 6$ $ 
$r6   r   c                   |     e Zd Z	 	 	 	 	 	 	 	 ddededeeee   f   deeee   f   dedee   deded	ef fd
Z	 xZ
S )MotionModulesr<   layers_per_blockr   r:   rB   rA   rD   r@   max_seq_lengthc
                 \   t         |           t        j                  g       | _        t        |t              r|f|z  }n(t        |      |k7  rt        d| dt        |             t        |      D ]6  }
| j                  j                  t        |||
   |||||||z  d|	
             8 y )NzZThe number of transformer layers per block must match the number of layers per block, got  and r   )
r<   r>   r@   rA   rD   rB   r:   r;   rG   rH   )rO   rP   rQ   rV   r   r   rz   r   r   rW   r   r9   )rZ   r<   r  r   r:   rB   rA   rD   r@   r  r   r]   s              r7   rP   zMotionModules.__init__  s     	 mmB/2C8,H+JM]+](-.2BB'(c2N.O-PR 
 '( 	A&&( +;A>$3(;"/#1(;'26I'I*6.<	r6   )r   r   r   FNry   rx   rx   )r.   r/   r0   rz   r   r   r|   r   r}   rP   r   r   s   @r7   r  r    s     !"?@67$-1$! %% % ',CsO&<	%
 #3c
?3% % &c]% % % % %r6   r  c                        e Zd Ze	 	 	 	 	 	 	 	 	 	 ddeedf   deeee   f   deeee   eee      f   dedeeee   f   deeee   f   ded	ed
edee   f fd       Z	d Z
 xZS )MotionAdapterblock_out_channels.motion_layers_per_block#motion_transformer_layers_per_block!motion_mid_block_layers_per_block'motion_transformer_layers_per_mid_blockmotion_num_attention_headsmotion_norm_num_groupsmotion_max_seq_lengthuse_motion_mid_blockconv_in_channelsc                     t         |           g }g }t        |t              r|ft	        |      z  }n:t	        |      t	        |      k7  r#t        dt	        |       dt	        |             t        |t              r|ft	        |      z  }t        |t              r|f|z  }n)t	        |      |k7  rt        d| dt	        |       d      t        |t              r|ft	        |      z  }n:t	        |      t	        |      k7  r#t        dt	        |       dt	        |             |
r"t        j                  |
|d   dd	
      | _        nd| _        t        |      D ]6  \  }}||   }|j                  t        ||ddd||   |||   ||   	             8 |	r t        |d   |ddd|d   |||	      | _        nd| _        t        t        |            }|d   }t        t        |            }t        t        |            }t        t        |            }t        |      D ]9  \  }}||   }|j                  t        ||ddd||   |||   d	z   ||   	             ; t        j                  |      | _        t        j                  |      | _        y)a3  Container to store AnimateDiff Motion Modules

        Args:
            block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
            The tuple of output channels for each UNet block.
            motion_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 2):
                The number of motion layers per UNet block.
            motion_transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple[int]]`, *optional*, defaults to 1):
                The number of transformer layers to use in each motion layer in each block.
            motion_mid_block_layers_per_block (`int`, *optional*, defaults to 1):
                The number of motion layers in the middle UNet block.
            motion_transformer_layers_per_mid_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
                The number of transformer layers to use in each motion layer in the middle block.
            motion_num_attention_heads (`int` or `Tuple[int]`, *optional*, defaults to 8):
                The number of heads to use in each attention layer of the motion module.
            motion_norm_num_groups (`int`, *optional*, defaults to 32):
                The number of groups to use in each group normalization layer of the motion module.
            motion_max_seq_length (`int`, *optional*, defaults to 32):
                The maximum sequence length to use in the motion module.
            use_motion_mid_block (`bool`, *optional*, defaults to True):
                Whether to use a motion module in the middle of the UNet.
        zKThe number of motion layers per block must match the number of blocks, got r	  z$The number of layers per mid block (zD) must match the length of motion_transformer_layers_per_mid_block ()zgThe length of the attention head number tuple in the motion module must match the number of block, got r   r	   r(   kernel_sizer   Nry   F)	r<   r@   rA   rD   rB   r:   r  r  r   r   )rO   rP   r   rz   r   r   rQ   Conv2dconv_inr   r   r  	mid_blockr   reversedrV   down_blocks	up_blocks)rZ   r  r  r  r  r  r  r  r  r  r  r  r  r   rm   output_channelreversed_block_out_channels reversed_motion_layers_per_block,reversed_motion_transformer_layers_per_block#reversed_motion_num_attention_headsr]   s                       r7   rP   zMotionAdapter.__init__'  s    J 		-s3'>&@3GYCZ&Z#()S1C-DD-./uS9P5Q4RT 
 93?3V2X[^_q[r2r/=sC771723 89=^^67X6Y ZUUX  ZA  VB  UC  CDE 
 0#6*D)FM_I`)`&+,4F0GG((+,F(G'HcRdNeMfh 
 99%57I!7LZ[efgDLDL#$67 	JAw/2N .$:(,")#((B1(E#8%<Q%?1TUV1W
	   *.r2 6$(%$$>r$B4!B-T
DN "DN&*84F+G&H#4Q7+/9P0Q+R(7;HEh<i7j4.28<V3W.X+#$?@ 	JAw8;N .$:(,")#((KA(N#8%Ea%H1%L1]^_1`
	  ==5y1r6   c                      y Nr5   )rZ   r-   s     r7   rt   zMotionAdapter.forward  s    r6   )
i@  i  r   r   r   r(   r(   r(   r   rx   rx   TN)r.   r/   r0   r   r   rz   r   r|   r   rP   rt   r   r   s   @r7   r  r  &  s     /E:;YZ12JK=>&(%'%)*.A2!#s(OA2 "'sE#J!7A2 .33c
E%PS*DU3U-V	A2
 ,/A2 27sE#J1GA2 %*#uSz/$:A2 !$A2  #A2 #A2 #3-A2 A2Fr6   r  c            C           e Zd ZdZdZdgZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dOdee   dedede	e
d	f   d
e	e
d	f   de	ed	f   deee	e   f   dedede
dedededeee	e   e	e	   f   deeee	e   e	e	   f      deee	e   e	e	   f   deeee	e   e	e	   f      deeee	e   f      deeee	e   f      dedeee	ed	f   f   dedeee	ed	f   f   deeee	ed	f   e	e	ed	f   d	f   f      deded ee   d!ee
   d"ee
   d#ee   d$ee   d%ee   f@ fd&       Ze	 	 dPd'ed(ee   d)efd*       ZdQd,Zd(ee   d+dfd-Z	 	 	 	 dRd.e
d/ed0ed1ee
   d2ed+dfd3Zed+ee
ef   fd4       Zd5eeee
ef   f   fd6ZdSd7ee   d8ed+dfd9ZdQd:ZdQd;Zd<ed=ed>ed?ed+df
d@ZdQdAZdB Z dC Z!	 	 	 	 	 	 	 dTdDe"jF                  dEee"jF                  eef   dFe"jF                  dGee"jF                     dHee"jF                     dIeee
e$f      dJeee
e"jF                  f      dKee	e"jF                        dLee"jF                     dMed+ee%e	e"jF                     f   fdNZ& xZ'S )UUNetMotionModela=  
    A modified conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a
    sample shaped output.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
    for all models (such as downloading or saving).
    TrS   NrC   r<   r=   down_block_types.up_block_typesr  r  r   mid_block_scale_factoract_fnr@   norm_epsrA   r   $reverse_transformer_layers_per_blockr   -reverse_temporal_transformer_layers_per_block transformer_layers_per_mid_block)temporal_transformer_layers_per_mid_blockr   r:   r  r  "reverse_motion_num_attention_headsr  mid_block_layersencoder_hid_dimencoder_hid_dim_typeaddition_embed_typeaddition_time_embed_dim%projection_class_embeddings_input_dimtime_cond_proj_dimc!                 4   t         7|           || _        t        |      t        |      k7  rt	        d| d| d      t        |      t        |      k7  rt	        d| d| d      t        |t              s)t        |      t        |      k7  rt	        d| d| d      t        |t              r)t        |      t        |      k7  rt	        d| d| d      t        |t              s)t        |      t        |      k7  rt	        d| d| d      t        |t              r$|"|D ]  }!t        |!t              st	        d	       t        |t              r$|"|D ]  }!t        |!t              st	        d
       d}"d}#|"dz
  dz  }$t        j                  ||d   |"|$      | _
        |d   dz  }%t        |d   dd      | _        |d   }&t        |&|%|
|       | _        |d | _        |dk(  r#t        |dd      | _        t        ||%      | _        t        j$                  g       | _        t        j$                  g       | _        t        |t              r|ft        |      z  }t        |t              r|ft        |      z  }t        |t              r|gt        |      z  }t        |t              r|gt        |      z  }t        |t              r|gt        |      z  }t        |t              r|gt        |      z  }t        |t              r|gt        |      z  }t        |t              r|ft        |      z  }|d   }'t+        |      D ]  \  }(})|'}*||(   }'|(t        |      dz
  k(  }+|)dk(  rMt-        d4i d|*d|'d|%d||(   d||(   d|d|
d|d||(   d||(   d|d |+ d!|d"||(   d#|d$||(   },n2|)d%k(  r"t/        |*|'|%||(   ||
||+ |||(   |||(   &      },nt	        d'      | j&                  j1                  |,        |t        |d(   t              r|d(   nd}|r,t3        |d(   |%||
|	|d(   |d(   |d)|||d(   |||*      | _        n%t7        |d(   |%||
|	|d(   |d(   |d)|||+      | _        d| _        t        t;        |            }-t        t;        |            }.t        t;        |            }/t        t;        |            }0t        t;        |            }1|t        t;        |            }|t        t;        |            }|-d   }'t+        |      D ]  \  }(}2|(t        |      dz
  k(  }+|'}3|-|(   }'|-t=        |(dz   t        |      dz
           }*|+sd}4| xj8                  dz  c_        nd)}4|2d,k(  rRt?        d4i d|*d|'d-|3d|%d.|(d|/|(   dz   d||(   d|d|
d|d|.|(   d|0|(   d/|4d!|d"|1|(   d#|d$||(   }5n5|2d0k(  r%tA        |*|3|'|%|(|/|(   dz   ||
||4|1|(   |||(   1      }5nt	        d2      | j(                  j1                  |5       |'}3 |:t        jB                  |d   ||3      | _"        t        jF                         | _$        nd | _"        d | _$        |#dz
  dz  }6t        j                  |d   ||#|6      | _%        y )5Nz\Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: z. `up_block_types`: r  zbMust provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: z. `down_block_types`: zdMust provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: zdMust provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: z^Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: zOMust provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.ziMust provide 'reverse_temporal_transformer_layers_per_block` if using asymmetrical motion module in UNet.r	   r(   r   r   r  rf   T)r-  cond_proj_dim	text_timer   r<   r=   r   r>   r   r   r   r   r:   rA   r   r   r   r   r   r   r   )r<   r=   r   r>   r   r   r   r   r   r   r   r   zeInvalid `down_block_type` encountered. Must be one of `CrossAttnDownBlockMotion` or `DownBlockMotion`r   F)r<   r   r   r   r   rA   r:   r   r   r   r>   r   r   r   r   )r<   r   r   r   r   rA   r:   r   r   r   r>   r   r   r   r   r   r   )r<   r   r=   r   r   r>   r   r   r   r   r   r   r   z_Invalid `up_block_type` encountered. Must be one of `CrossAttnUpBlockMotion` or `UpBlockMotion`)rL   rK   rM   r5   )&rO   rP   rC   r   r   r   rz   r   rQ   r  r  r!   	time_projr    time_embeddingencoder_hid_projadd_time_projadd_embeddingrV   r  r  r   r   r   r   r   r  r)   num_upsamplersr  r  r   r   rR   conv_norm_outSiLUconv_actconv_out)8rZ   rC   r<   r=   r*  r+  r  r  r   r,  r-  r@   r.  rA   r   r/  r   r0  r1  r2  r   r:   r  r  r3  r  r4  r5  r6  r7  r8  r9  r:  layer_number_per_blockconv_in_kernelconv_out_kernelconv_in_paddingtime_embed_dimtimestep_input_dimr   r   down_block_typeinput_channelis_final_block
down_blockr!  reversed_num_attention_headsreversed_layers_per_blockreversed_cross_attention_dimr$  up_block_typer   r   up_blockconv_out_paddingr]   s8                                                          r7   rP   zUNetMotionModel.__init__  s	   \ 	&  C$77no  oA  AU  Vd  Ue  ef  g  !"c*:&;;t  vH  uI  I_  `p  _q  qr  s  -s3<O8PTWXhTi8iv  xK  wL  Lb  cs  bt  tu  v  )40S9L5MQTUeQf5fv  xK  wL  Lb  cs  bt  tu  v  *C0S9I5JcRbNc5cp  rB  qC  CY  Zj  Yk  kl  m  2D9>b>j*F x&4d;$%vwwx
 <dC=E*O &4d;$ D  )A-!3yy+A.NTc

 ,A.2"#5a#8$B/2/vM_
  '$(D!+-!*+BD!!LD!23XZh!iD ==,r*)3/#6"83?O;P"P)3/#6"83?O;P"P&, 01C8H4II2C8,H+ICP`La+a(:C@4X3Y\_`p\q3q0;SA5Z4[^abr^s4s1CSI=j<knq o =9 0#6*D)FM]I^)^& ,A."+,<"= ,	0A*M/2N#&8"9A"==N"<<5  -!/ #1  02	
 2Na1P  ( #) #2 )<A(> )<A(> (: (6#5 +@ 2LA1N -B  ;``a:b!
$ !$55, -!/"0/2'"("1'5#5'91KA1N,A:_`a:b
 !{  ##J/Y,	0^ ,34>?[\^?_ad4e,R0kl -  8.r2,#$$:$7$;$7$;-%*&;+-G-K(=-M6_DN& 5.r2,#$$:$7$;$7$;-%*&;+-MDN    '+84F+G&H#'+H5H,I'J$$(2B)C$D!'+H5H,I'J$.28<V3W.X+/737A]8^3_08@<@JoAp<q94Q7 ). 9 8	1A}#&8"9A"==N"08;N7AE3GYCZ]^C^8_`M "###q(#$ 881  -!/ )< #1	
 $%  9;a? 2VVW1X  ( #) #2 )EQ(G )EQ(G ". +@ 2UUV1W  -B!" ;hhi:j#& /1( -(;!/"0#$8;a?'"("1!-1TUV1W,A:ghi:j  !u  NN!!(+"0q8	1v &!#/2T\"D GGIDM!%D DM+a/A5		q!<_Vf
r6   unetmotion_adapterload_weightsc           	         |d u}|r&|j                  |j                         t        |j                  d         t        |j                  d         k7  rt	        d      t        |j                  d   t              r*|j                  d   gt        |j                  d         z  }nt        |j                  d         }t        |j                  d   t              r*|j                  d   gt        |j                  d         z  }nt        |j                  d         }||k7  rt	        d      t        |j                        }| j                  |d<   g }|d   D ])  }	d	|	v r|j                  d
       |j                  d       + ||d<   g }
|d   D ])  }	d	|	v r|
j                  d       |
j                  d       + |
|d<   |r|j                  d   |d<   |j                  d   |d<   |j                  d   |d<   |j                  d   |d<   |j                  d   |d<   |j                  d   |d<   |j                  d   |d<   |j                  d   r|j                  d   |d<   |j                  d      s|d   |d<   | j                  |       \  }}t        |D ci c]  }||v s||v s||j                  |       c}      }| j                  |d<   | j                  |      }|s|S |r|j                  d   r|j                  |_        t!        j"                  |j                  j$                  |j                  j$                  d d dd d d d d f   gd      }|j                  j'                  ||j                  j(                  d       n3|j                  j'                  |j                  j+                                |j,                  j'                  |j,                  j+                                |j.                  j'                  |j.                  j+                                t1        d |j2                  j5                         D              ri }|j2                  j7                         D ]  \  }}|j9                  d      r't;        t<        d       rt>        nt@        } |       ||<   >t;        t<        d       rtB        ntD        } ||jF                  |jH                  |jJ                  |jL                  !      ||<    |j2                  j7                         D ]  \  }}||vs|jO                         ||<    |jQ                  |       d"|j                  _)        |jT                  |_*        tW        |jX                        D ]  \  }}|jX                  |   jZ                  j'                  |jZ                  j+                                t;        |jX                  |   d#      r@|jX                  |   j\                  j'                  |j\                  j+                                |jX                  |   j^                  s|jX                  |   j^                  j'                  |j^                  j+                                 tW        |j`                        D ]  \  }}|j`                  |   jZ                  j'                  |jZ                  j+                                t;        |j`                  |   d#      r@|j`                  |   j\                  j'                  |j\                  j+                                |j`                  |   jb                  s|j`                  |   jb                  j'                  |jb                  j+                                 |jd                  jZ                  j'                  |jd                  jZ                  j+                                |jd                  j\                  j'                  |jd                  j\                  j+                                |jf                  3|jf                  j'                  |jf                  j+                                |jh                  3|jh                  j'                  |jh                  j+                                |jj                  j'                  |jj                  j+                                |r|jm                  |       |j                  |jn                         |S c c}w )$N)devicer*  r  z;Incompatible Motion Adapter, got different number of blocksr  r  zEIncompatible Motion Adapter, got different number of layers per block_class_name	CrossAttnr   r   r+  r   r   r  r  r  r  r2  r  r   r  r<   r:   r;   rf   r(   r   )weightbiasc              3   H   K   | ]  }t        |t        t        f        y wr&  )r   r   r   .0procs     r7   	<genexpr>z.UNetMotionModel.from_unet2d.<locals>.<genexpr>\  s&      
 t46OPQ
s    "zattn1.processorscaled_dot_product_attention)hidden_sizerA   r   
num_tokensip_image_projr   )8tor\  r   configr   r   rz   r   dictr.   r   r   _get_signature_keysr   from_configr  r2   r   r_  load_state_dictr`  
state_dictr>  r?  anyattn_processorsvaluesitemsendswithhasattrFr   r   r   r   rg  rA   r   rh  r]   set_attn_processorr6  r@  r   r  r   r   r   r  r   r  rD  rF  rG  load_motion_modulesdtype)clsrX  rY  rZ  has_motion_adapterexpanded_layers_per_block!expanded_adapter_layers_per_blockrk  r  down_blocks_typer  expected_kwargsoptional_kwargskmodelupdated_conv_in_weight
attn_procsr   	processorattn_processor_classr   rQ  rV  s                          r7   from_unet2dzUNetMotionModel.from_unet2d  s    ,47T[[1 4;;123s>;P;PQe;f7gg !^__ $++&893?-1[[9K-L,MPSTXT_T_`rTsPt,t),0=O1P,Q).//0IJCP5C5J5JKd5e4fil"))*>?j 51 599N9NOh9i4j1(,MM !hii dkk" #} &'9 : 	6..""#=>""#45		6
 &1!"	 &'7 8 	2..  !9:  1		2
 $- 3A3H3HIe3fF/0.<.C.CD[.\F*+-;-B-BCY-ZF)*)7)>)>?X)YF%&BPBWBW9CF>? ?M>S>S5?F:; 4B3H3HIe3fF/0 $$%78(6(=(=>P(Q}% zz/0,23G,HF()+.+B+B3+G(vn!oAUYZ^mYmQ

1-no #}'L ."7"78J"K*22EM%*YY$$n&<&<&C&CAqr1aK&PQWX&" MM))5KUYUaUaUfUf*ghMM))$,,*A*A*CD''(A(A(CD,,T-@-@-K-K-MN 
,,335
 
 J#'#7#7#=#=#? i==!23,3A7U,V(\i ) (<'=Jt$ #1&DE 23 )
 (<$-$9$9,5,I,I'oo#,#7#7	(Jt$$ $)#8#8#>#>#@ =iz)'0':':'<Jt$= $$Z00?ELL-%)%:%:E"&t'7'78 	hMAza ((889K9K9V9V9XYu((+\:!!!$//??
@U@U@`@`@bc  #00!!!$11AA*BYBYBdBdBfg	h %T^^4 	`KAxOOA&&66x7G7G7R7R7TUuq)<8"--==h>Q>Q>\>\>^_q!,,"--==h>Q>Q>\>\>^_	` 	//0F0F0Q0Q0ST""224>>3L3L3W3W3YZ)//0B0B0M0M0OP==$NN**4==+C+C+EF&&t}}'?'?'AB%%n5 	a os   c:'c:rd   c                    | j                         D ]	  }d|_         | j                  D ]*  }|j                  }|j                         D ]	  }d|_         , | j                  D ]*  }|j                  }|j                         D ]	  }d|_         , t        | j                  d      r3| j                  j                  }|j                         D ]	  }d|_         yy)z|Freeze the weights of just the UNet2DConditionModel, and leave the motion modules
        unfrozen for fine tuning.
        FTr   N)
parametersrequires_gradr  r   r  rv  r  )rZ   paramrQ  r   rV  s        r7   freeze_unet2d_paramsz$UNetMotionModel.freeze_unet2d_params  s    
 __& 	(E"'E	( ** 	+J'66N'224 +&*#+	+
  	+H%44N'224 +&*#+	+
 4>>#34!^^::N'224 +&*#+ 5r6   c                 4   t        |j                        D ]E  \  }}| j                  |   j                  j                  |j                  j	                                G t        |j
                        D ]E  \  }}| j
                  |   j                  j                  |j                  j	                                G t        | j                  d      rH| j                  j                  j                  |j                  j                  j	                                y y )Nr   )r   r  r   ro  rp  r  rv  r  )rZ   rY  r   rQ  rV  s        r7   ry  z#UNetMotionModel.load_motion_modules  s    &~'A'AB 	gMAzQ..>>z?X?X?c?c?ef	g$^%=%=> 	cKAxNN1,,<<X=T=T=_=_=ab	c 4>>#34NN))99.:R:R:a:a:l:l:no 5r6   save_directoryis_main_processsafe_serializationvariantpush_to_hubc           	      |   | j                         }i }|j                         D ]  \  }	}
d|	v s|
||	<    t        | j                  d   | j                  d   | j                  d   | j                  d   | j                  d   | j                  d         }|j	                  |        |j
                  d
|||||d	| y )Nr   r  r  r@   r  r  r  )r  r  r  r  r  r  )r  r  r  r  r  r5   )rp  rt  r  rk  ro  save_pretrained)rZ   r  r  r  r  r  r   rp  motion_state_dictr  vadapters               r7   save_motion_modulesz#UNetMotionModel.save_motion_modules  s     __&
 $$& 	)DAq1$'(!!$	)  #{{+?@$(KK0B$C#';;/@#A'+{{3O'P"&++.E"F!%-C!D
 	 12 	
)+1#	
 	
r6   c                     i }dt         dt        j                  j                  dt        t         t
        f   ffd| j                         D ]  \  }} |||        |S )z
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model with
            indexed by its weight name.
        r   module
processorsc                     t        |d      r|j                         ||  d<   |j                         D ]  \  }} |  d| ||        |S )Nget_processor
.processorr  )rv  r  named_children)r   r  r  sub_namechildfn_recursive_add_processorss        r7   r  zDUNetMotionModel.attn_processors.<locals>.fn_recursive_add_processors  sd    v/282F2F2H
dV:./#)#8#8#: U%+tfAhZ,@%TU r6   )r}   r2   rQ   Moduler   r   r  )rZ   r  r   r  r  s       @r7   rr  zUNetMotionModel.attn_processors  sm     
	c 	588?? 	X\]`bt]tXu 	 !//1 	BLD&'fjA	B r6   r  c           	      T   t        | j                  j                               }t        |t              r,t        |      |k7  rt        dt        |       d| d| d      dt        dt        j                  j                  ffd| j                         D ]  \  }} |||        y)	a4  
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        z>A dict of processors was passed, but the number of processors z0 does not match the number of attention layers: z. Please make sure to pass z processor classes.r   r  c                     t        |d      rEt        |t              s|j                  |       n#|j                  |j	                  |  d             |j                         D ]  \  }} |  d| ||        y )Nset_processorr  r  )rv  r   rl  r  popr  )r   r  r  r  r  fn_recursive_attn_processors        r7   r  zGUNetMotionModel.set_attn_processor.<locals>.fn_recursive_attn_processor  sx    v/!)T2((3(($z7J)KL#)#8#8#: T%+tfAhZ,@%STr6   N)r   rr  keysr   rl  r   r}   r2   rQ   r  r  )rZ   r  countr   r  r  s        @r7   rx  z"UNetMotionModel.set_attn_processor  s     D((--/0i&3y>U+BPQTU^Q_P` a005w6QRWQXXkm 
	Tc 	T588?? 	T !//1 	ALD&'fi@	Ar6   
chunk_sizer   c                     |dvrt        d|       |xs d}dt        j                  j                  dt        dt        ffd| j                         D ]  } |||        y)	aX  
        Sets the attention processor to use [feed forward
        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).

        Parameters:
            chunk_size (`int`, *optional*):
                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
                over each tensor of dim=`dim`.
            dim (`int`, *optional*, defaults to `0`):
                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
                or dim=1 (sequence length).
        )r   r(   z-Make sure to set `dim` to either 0 or 1, not r(   r  r  r   c                     t        | d      r| j                  ||       | j                         D ]  } |||        y Nset_chunk_feed_forward)r  r   rv  r  childrenr  r  r   r  fn_recursive_feed_forwards       r7   r  zJUNetMotionModel.enable_forward_chunking.<locals>.fn_recursive_feed_forward-  E    v78---M* B)%SABr6   N)r   r2   rQ   r  rz   r  )rZ   r  r   r  r  s       @r7   enable_forward_chunkingz'UNetMotionModel.enable_forward_chunking  su     fLSERSS  _1
	Behhoo 	B3 	BUX 	B mmo 	?F%fj#>	?r6   c                     dt         j                  j                  dt        dt        ffd| j	                         D ]  } |d d        y )Nr  r  r   c                     t        | d      r| j                  ||       | j                         D ]  } |||        y r  r  r  s       r7   r  zKUNetMotionModel.disable_forward_chunking.<locals>.fn_recursive_feed_forward8  r  r6   r   )r2   rQ   r  rz   r  )rZ   r  r  s     @r7   disable_forward_chunkingz(UNetMotionModel.disable_forward_chunking7  sM    	Behhoo 	B3 	BUX 	B mmo 	7F%fdA6	7r6   c           	      j   t        d | j                  j                         D              rt               }nmt        d | j                  j                         D              rt	               }n8t        dt        t        | j                  j                                            | j                  |       y)ze
        Disables custom attention processors and sets the default attention implementation.
        c              3   @   K   | ]  }|j                   t        v   y wr&  )r]   r   rb  s     r7   re  z=UNetMotionModel.set_default_attn_processor.<locals>.<genexpr>G  s     i4t~~!>>i   c              3   @   K   | ]  }|j                   t        v   y wr&  )r]   r   rb  s     r7   re  z=UNetMotionModel.set_default_attn_processor.<locals>.<genexpr>I  s     h$#==hr  zOCannot call `set_default_attn_processor` when attention processors are of type N)	allrr  rs  r   r   r   nextiterrx  )rZ   r  s     r7   set_default_attn_processorz*UNetMotionModel.set_default_attn_processorC  s     i4K_K_KfKfKhii,.Ih$J^J^JeJeJghh%Iabfgklp  mA  mA  mH  mH  mJ  hK  cL  bM  N  		*r6   r   r   r   r   c                     t        | j                        D ]9  \  }}t        |d|       t        |d|       t        |d|       t        |d|       ; y)aF  Enables the FreeU mechanism from https://huggingface.co/papers/2309.11497.

        The suffixes after the scaling factors represent the stage blocks where they are being applied.

        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
        are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.

        Args:
            s1 (`float`):
                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
                mitigate the "oversmoothing effect" in the enhanced denoising process.
            s2 (`float`):
                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
                mitigate the "oversmoothing effect" in the enhanced denoising process.
            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
        r   r   r   r   N)r   r  setattr)rZ   r   r   r   r   r   upsample_blocks          r7   enable_freeuzUNetMotionModel.enable_freeuS  sQ    $ "+4>>!: 	.A~ND"-ND"-ND"-ND"-		.r6   c                     h d}t        | j                        D ]3  \  }}|D ])  }t        ||      st        ||d      t	        ||d       + 5 y)zDisables the FreeU mechanism.>   r   r   r   r   N)r   r  rv  r   r  )rZ   
freeu_keysr   r  r  s        r7   disable_freeuzUNetMotionModel.disable_freeul  sW    -
!*4>>!: 	5A~ 5>1-D1Q1]NAt45	5r6   c                 r   d| _         | j                  j                         D ]1  \  }}dt        |j                  j
                        v s(t        d       | j                  | _         | j                         D ]%  }t        |t              s|j                  d       ' | j                  t                      y)u1  
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
        are fused. For cross-attention modules, key and value projection matrices are fused.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>
        NAddedzQ`fuse_qkv_projections()` is not supported for models having added KV projections.T)fuse)original_attn_processorsrr  rt  r}   r]   r.   r   modulesr   r   fuse_projectionsrx  r   )rZ   r\   attn_processorr  s       r7   fuse_qkv_projectionsz$UNetMotionModel.fuse_qkv_projectionsu  s     )-%!%!5!5!;!;!= 	vA~#n66??@@ !tuu	v )-(<(<%lln 	3F&),''T'2	3 	 5 78r6   c                 T    | j                   | j                  | j                          yy)u   Disables the fused QKV projection if enabled.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>

        N)r  rx  )rZ   s    r7   unfuse_qkv_projectionsz&UNetMotionModel.unfuse_qkv_projections  s)     ((4##D$A$AB 5r6   r-   r`   r_   timestep_condr   rc   added_cond_kwargsdown_block_additional_residualsmid_block_additional_residualr   c                 &  $ d| j                   z  $d}d}t        $fd|j                  dd D              rt        j	                  d       d}|2d|j                  |j                        z
  d	z  }|j                  d      }|}t        j                  |      s|j                  j                  d
k(  }|j                  j                  dk(  }t        |t              r%|s|rt        j                  nt        j                  }n$|s|rt        j                   nt        j"                  }t        j$                  |g||j                        }n6t'        |j                        dk(  r|d   j                  |j                        }|j                  d   }|j)                  |j                  d         }| j+                  |      }|j                  | j                        }| j-                  ||      }d}| j.                  j0                  dk(  rd|vrt3        | j4                   d      |j7                  d      }d|vrt3        | j4                   d      |j7                  d      }| j9                  |j;                               }|j=                  |j                  d   df      }t        j>                  ||gd      }|j                  |j                        }| jA                  |      }||n||z   }|jC                  |d|j                  d   |z        }| jD                  | j.                  jF                  dk(  rsd|vrt3        | j4                   d      |j7                  d      }| jE                  |      }|D cg c]&  }|jC                  |d|j                  d   |z        ( }}||f}|jI                  ddddd      j=                  |j                  d   |z  df|j                  dd z         }| jK                  |      }|f}| jL                  D ]?  }tO        |d      r|jP                  r |||||||      \  }}n ||||      \  }}||z  }A |#d}tS        ||      D ]  \  }} || z   }||fz  } |}| jT                  DtO        | jT                  d       r| jU                  ||||||!      }n| jU                  |||||"      }|	||	z   }tW        | jX                        D ]  \  }!}"|!t'        | jX                        dz
  k(  }#|t'        |"jZ                         d }|dt'        |"jZ                          }|#s|r|d   j                  dd }tO        |"d      r|"jP                  r |"||||||||#      } |"|||||$      } | j\                  r"| j]                  |      }| j_                  |      }| ja                  |      }|dddf   j=                  d|f|j                  dd z         jI                  ddddd      }|
s|fS tc        |%      S c c}w )&aG	  
        The [`UNetMotionModel`] forward method.

        Args:
            sample (`torch.Tensor`):
                The noisy input tensor with the following shape `(batch, num_frames, channel, height, width`.
            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
            encoder_hidden_states (`torch.Tensor`):
                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
            timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
                Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
                through the `self.time_embedding` layer to obtain the timestep embeddings.
            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
                negative values to the attention scores corresponding to "discard" tokens.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
                A tuple of tensors that if specified are added to the residuals of down unet blocks.
            mid_block_additional_residual: (`torch.Tensor`, *optional*):
                A tensor that if specified is added to the residual of the middle unet block.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.unets.unet_motion_model.UNetMotionOutput`] instead of a plain
                tuple.

        Returns:
            [`~models.unets.unet_motion_model.UNetMotionOutput`] or `tuple`:
                If `return_dict` is True, an [`~models.unets.unet_motion_model.UNetMotionOutput`] is returned,
                otherwise a `tuple` is returned where the first element is the sample tensor.
        r   FNc              3   .   K   | ]  }|z  d k7    yw)r   Nr5   )rc  sdefault_overall_up_factors     r7   re  z*UNetMotionModel.forward.<locals>.<genexpr>  s     Maq,,1Ms   z9Forward upsample size to force interpolation output size.Tr(   g     mpsnpu)rz  r\  r   )rz  r=  text_embedsz has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`time_idsz has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`r   r   )r   r   ri  image_embedsz has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`r	   rf   r   )r^   r   r_   r   rb   rc   )r^   r   rb   r5   r   )r_   r   rb   rc   )r_   r   rc   )r^   r   r   r_   r   r   rb   rc   )r^   r   r   r   rb   )r-   )2rC  rq  rh   r   inforj  rz  	unsqueezer2   	is_tensorr\  typer   r{   float32float64int32int64tensorr   expandr>  r?  rk  r7  r   r]   r   rA  flattenri   concatrB  repeat_interleaver@  r6  rj   r  r  rv  r   r   r  r   r  r   rD  rF  rG  r,   )%rZ   r-   r`   r_   r  r   rc   r  r  r  r   forward_upsample_sizer   	timestepsis_mpsis_npurz  rb   t_embembaug_embr  r  time_embeds
add_embedsr  image_embeddown_block_res_samplesdownsample_blockres_samplesnew_down_block_res_samplesdown_block_res_sampledown_block_additional_residualr   r  rP  r  s%                                       @r7   rt   zUNetMotionModel.forward  s   d %&t':':$:! !&M6<<;LMMKKST$(! %."3"3FLL"AAXMN+55a8N 	y) ]]''50F]]''50F(E**0F(.&u{{i[fmmTI!Q&!$**6==9I \\!_
$$V\\!_5	y)
 tzz*!!%7;;**k9$55 ~~&  '{  |  ,//>K!22 ~~&  'x  y  ),,Z8H,,X-=-=-?@K%--{/@/@/CR.HIK{K&@bIJ#syy1J((4G_c#-##JA399Q<R\C\#]  ,1Q1QUd1d%66 ~~&  'A  B  -00@L00>L $0 --ja[M^M^_`MadnMn-oL  &;L$I! 1aA.66Q*8TVX7Y\b\h\hijik\l7lmf% #) $ 0 0 	2')>?DTDhDh&6"(*?#1)+A'# '7VRUbl&m#"k1"	2 +6)+&IL&(GJ GE%'E )>@^(^%*/D.FF*	G &@" >>%t~~'78*?#1)+A (  *?#1+A (  )4;;F "+4>>!: 	A~#dnn"5"99N0#n6L6L2M1M1OPK%;<Zs>CYCY?Z>Z%[" "&; 6r : @ @ D~'<=.BdBd'"(,7*?"/#1)+A	 ("(,7"/)/	@ ''/F]]6*Fv& a(("j)9FLL<L)LMUUVWYZ\]_`bcd9v..Ws   6+X) Nrf   rf   )r   r   r   r   )r   r   r   r   r'  r   r(   r(   silurx   gh㈵>r   r(   Nr(   NNr(   Fr   rx   r   NTr(   NNNNNN)NT)rd   N)TTNF)Nr   )NNNNNNT)(r.   r/   r0   r1    _supports_gradient_checkpointing _skip_layerwise_casting_patternsr   r   rz   r   r}   r   r{   r|   rP   classmethodr*   r  r  r  ry  r  propertyr   r   rr  rx  r  r  r  r  r  r  r  r2   r3   r   r,   rt   r   r   s   @r7   r)  r)    s    (,$(.x$ &*-
+
 /E34"#()!#'MN_cVWhlMQVW&+;<%'BCqu%) !)-.2-115?C,0WA
c]A
 A
 	A

  S/A
 c3hA
" "#s(O#A
$  U3Z0%A
&  'A
( !&)A
* +A
, -A
. /A
0 !1A
2 ',CsU5\,I&J3A
4 /7uS%*eTYl=Z7[.\5A
6 05S%*eEl5R/S7A
8 8@c5QT:W\]bWcFc@d7e9A
: +35eCj3I*J;A
< 4<E#uSz/<R3S=A
>  $?A
@ #3c3h#78AA
B  #CA
D %*#uS#X*>$?EA
F -5U3c3hQVW\]`be]eWfhkWkQl;l5m,nGA
H #IA
J KA
L "#MA
N 'smOA
P &c]QA
R "*#SA
T 08}UA
V %SMWA
 A
F
  37!	W"W !/W 	W Wr+0p(=2I pd p !%#'!%!!
!
 !
 !	!

 #!
 !
 
!
F c+=&=!>  0 AE2Dd3PbKbFc2c,d  AD?(3- ?S ?Y] ?:	7+ .u .% .U . .$ .2594C$ 1515;??CIM@D f/f/ eS01f/  %||	f/
  -f/ !.f/ !)c3h 8f/ $Dell):$;<f/ *2%2E)Ff/ (0'=f/ f/ 
u||!44	5f/r6   r)  )Idataclassesr   typingr   r   r   r   r   r2   torch.nnrQ   torch.nn.functional
functionalrw  torch.utils.checkpointconfiguration_utilsr
   r   r   loadersr   r   r   utilsr   r   r   utils.torch_utilsr   	attentionr   attention_processorr   r   r   r   r   r   r   r   r   r   
embeddingsr    r!   modeling_utilsr"   r   r#   r$   r%    transformers.dual_transformer_2dr&   transformers.transformer_2dr'   unet_2d_blocksr)   unet_2d_conditionr*   
get_loggerr.   r   r,   r  r9   r   r   r   r   r   r  r  r)  r5   r6   r7   <module>r     s<   " 4 4      N N \ \ 3 3 , -   6 ' < < E < 3 3 
		H	% 	z 	 	Tryy Tn},bii },@v,ryy v,r}RYY }@DBII DNb")) bJ&BII &RFJ-C FRS/j+/JL\ S/r6   