
    bi~                     P   d dl mZmZmZmZmZ d dlZd dlmZ d dl	Zddl
mZmZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZmZmZmZmZ ddlmZmZ ddl m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z' ddl(m)Z)  ejT                  e+      Z, G d dejZ                        Z. G d de!ee      Z/y)    )AnyDictOptionalTupleUnionN   )ConfigMixinregister_to_config)UNet2DConditionLoadersMixin)logging   )get_activation)	AttentionFeedForward)ADDED_KV_ATTENTION_PROCESSORSCROSS_ATTENTION_PROCESSORSAttentionProcessorAttnAddedKVProcessorAttnProcessorFusedAttnProcessor2_0)TimestepEmbedding	Timesteps)
ModelMixin)TransformerTemporalModel   )UNetMidBlock3DCrossAttnget_down_blockget_up_block)UNet3DConditionOutputc                        e Zd Z	 	 	 	 ddedededededee   def fdZd	ej                  d
ej                  fdZ
 xZS )"I2VGenXLTransformerTemporalEncoderdimnum_attention_headsattention_head_dimactivation_fnupcast_attentionff_inner_dimdropoutc           	          t         |           t        j                  |dd      | _        t        ||||d|d      | _        t        |||d|d      | _        y )NTh㈵>)elementwise_affineepsF)	query_dimheadsdim_headr(   biasr&   out_bias)r(   r%   final_dropout	inner_dimr0   )	super__init__nn	LayerNormnorm1r   attn1r   ff)	selfr"   r#   r$   r%   r&   r'   r(   	__class__s	           `/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/diffusers/models/unets/unet_i2vgen_xl.pyr5   z+I2VGenXLTransformerTemporalEncoder.__init__1   se     	\\#$DI
%'-

 '"
    hidden_statesreturnc                    | j                  |      }| j                  |d       }||z   }|j                  dk(  r|j                  d      }| j	                  |      }||z   }|j                  dk(  r|j                  d      }|S )N)encoder_hidden_states   r   )r8   r9   ndimsqueezer:   )r;   r?   norm_hidden_statesattn_output	ff_outputs        r=   forwardz*I2VGenXLTransformerTemporalEncoder.forwardO   s     "ZZ6jj!34jP#m3")11!4MGGM*	!M1")11!4Mr>   )gegluFNg        )__name__
__module____qualname__intstrboolr   r5   torchTensorrI   __classcell__r<   s   @r=   r!   r!   0   s}     %!&&*

 !
  	

 
 
 sm
 
<|| 
r>   r!   c                   `    e Zd ZdZdZe	 	 	 	 	 	 	 	 	 	 	 d(dee   dededee	df   d	ee	df   d
eedf   dedee   dede
eee   f   dee
eee   f      f fd       Zedee	ef   fd       Zde
eee	ef   f   fdZd)dee   deddfdZd Zd Zd Zd Zd Zd Z	 	 	 	 	 d*dej2                  de
ej2                  eef   d ej2                  d!ej2                  d"eej2                     d#eej2                     d$eej2                     d%eee	ef      d&ede
eeej2                     f   fd'Z xZS )+I2VGenXLUNeta	  
    I2VGenXL UNet. It is a conditional 3D UNet model that takes a noisy sample, conditional state, and a timestep and
    returns a sample-shaped output.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
    for all models (such as downloading or saving).

    Parameters:
        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
            Height and width of input/output sample.
        in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
        out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
            The tuple of downsample blocks to use.
        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
            The tuple of upsample blocks to use.
        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
            The tuple of output channels for each block.
        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
            If `None`, normalization and activation layers is skipped in post-processing.
        cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
        attention_head_dim (`int`, *optional*, defaults to 64): Attention head dim.
        num_attention_heads (`int`, *optional*): The number of attention heads.
    FNsample_sizein_channelsout_channelsdown_block_types.up_block_typesblock_out_channelslayers_per_blocknorm_num_groupscross_attention_dimr$   r#   c                 &
   t         |           |
}t        |      t        |      k7  rt        d| d| d      t        |      t        |      k7  rt        d| d| d      t	        |t
              s)t        |      t        |      k7  rt        d| d| d      t        j                  ||z   |d   dd	
      | _        t        d||d   d	|      | _
        t        j                  t        j                  d|dz  dd	      t        j                         t        j                  |dz  |dz  dd	d	      t        j                         t        j                  |dz  |dd	d	            | _        t        |d|dz  |d      | _        t        j                  t        j                  d|dz  dd	      t        j                         t        j                   d      t        j                  |dz  |dz  ddd	      t        j                         t        j                  |dz  |	ddd	            | _        |d   dz  }t%        |d   dd      | _        |d   }t)        ||d      | _        t        j                  t        j,                  |	|      t        j                         t        j,                  ||	|z              | _        t        j                  t        j,                  ||      t        j                         t        j,                  ||            | _        t        j2                  g       | _        t        j2                  g       | _        t	        |t
              r|ft        |      z  }|d   }t9        |      D ]T  \  }}|}||   }|t        |      d	z
  k(  }t;        |||||| dd||	||   d	d      }| j4                  j=                  |       V t?        |d   |ddd	|	|d   |d	      | _         d| _!        tE        tG        |            }tE        tG        |            }|d   }t9        |      D ]  \  }}|t        |      d	z
  k(  }|}||   }|tI        |d	z   t        |      d	z
           }|sd}| xjB                  d	z  c_!        nd}tK        ||d	z   |||||dd||	||   d|      }| j6                  j=                  |       |} t        jL                  |d   |d      | _'        tQ        d      | _)        t        j                  |d   |dd	
      | _*        y )Nz\Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: z. `up_block_types`: .zbMust provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: z. `down_block_types`: zdMust provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: r   r   r   )kernel_sizepadding   )r#   r$   rX   
num_layersr^   rC   )rc   )striderc   r   gelu)r"   r#   r'   r$   r%   )    rh      Tsilu)act_fnr*   F)re   rX   rY   temb_channelsadd_downsample
resnet_epsresnet_act_fnresnet_groupsr_   r#   downsample_paddingdual_cross_attention)	rX   rl   rn   ro   output_scale_factorr_   r#   rp   rr   )re   rX   rY   prev_output_channelrl   add_upsamplern   ro   rp   r_   r#   rr   resolution_idx)num_channels
num_groupsr,   )+r4   r5   len
ValueError
isinstancerN   r6   Conv2dconv_inr   transformer_in
SequentialSiLUimage_latents_proj_inr!   image_latents_temporal_encoderAdaptiveAvgPool2dimage_latents_context_embeddingr   	time_projr   time_embeddingLinearcontext_embeddingfps_embedding
ModuleListdown_blocks	up_blocks	enumerater   appendr   	mid_blocknum_upsamplerslistreversedminr   	GroupNormconv_norm_outr   conv_actconv_out)r;   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r$   r#   time_embed_dimtimestep_input_dimoutput_channelidown_block_typeinput_channelis_final_block
down_blockreversed_block_out_channelsreversed_num_attention_headsup_block_typeru   rv   up_blockr<   s                             r=   r5   zI2VGenXLUNet.__init__~   s   2 	 1  C$77no  oA  AU  Vd  Ue  ef  g  !"c*:&;;t  vH  uI  I_  `p  _q  qr  s  -s3<O8PTWXhTi8iv  xK  wL  Lb  cs  bt  tu  v 
 yy{!:<Nq<Q_`jkl6 !2*1-+
 &(]]IIaq!Q7GGIIIkAo{Q!QOGGIIIkAo{AaK&
" /Q !$q* /
+ 02}}IIaq!Q7GGI  *IIkAo{R'71aPGGIIIkB&(;QqRST0
, ,A.2"#5a#8$B/2/0BN[ab!#II)>:GGIIIn&9K&GH"

  ]]II(.92779biiP^`nFo

 ==,r*)3/#6"83?O;P"P ,A."+,<"= 	0A*M/2N#&8"9A"==N'+)+,#11 $-$7$7$:#$%*J ##J/)	0. 1*2.(  ! 3 3B 7)!&

   '+84F+G&H#'+H5H,I'J$4Q7 ). 9 	1A}#&8"9A"==N"08;N7AE3GYCZ]^C^8_`M "###q(#$#+a/)+$7,) $-$7$@$C%* H  NN!!(+"0?	1D  \\7I!7LYhnst&v.		"4Q"7ST^_`r>   r@   c                     i }dt         dt        j                  j                  dt        t         t
        f   ffd| j                         D ]  \  }} |||        |S )z
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model with
            indexed by its weight name.
        namemodule
processorsc                     t        |d      r|j                         ||  d<   |j                         D ]  \  }} |  d| ||        |S )Nget_processor
.processorra   )hasattrr   named_children)r   r   r   sub_namechildfn_recursive_add_processorss        r=   r   zAI2VGenXLUNet.attn_processors.<locals>.fn_recursive_add_processorsH  sd    v/282F2F2H
dV:./#)#8#8#: U%+tfAhZ,@%TU r>   )rO   rQ   r6   Moduler   r   r   )r;   r   r   r   r   s       @r=   attn_processorszI2VGenXLUNet.attn_processors=  sm     
	c 	588?? 	X\]`bt]tXu 	 !//1 	BLD&'fjA	B r>   	processorc           	      T   t        | j                  j                               }t        |t              r,t        |      |k7  rt        dt        |       d| d| d      dt        dt        j                  j                  ffd| j                         D ]  \  }} |||        y)	a4  
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        z>A dict of processors was passed, but the number of processors z0 does not match the number of attention layers: z. Please make sure to pass z processor classes.r   r   c                     t        |d      rEt        |t              s|j                  |       n#|j                  |j	                  |  d             |j                         D ]  \  }} |  d| ||        y )Nset_processorr   ra   )r   r|   dictr   popr   )r   r   r   r   r   fn_recursive_attn_processors        r=   r   zDI2VGenXLUNet.set_attn_processor.<locals>.fn_recursive_attn_processorl  sx    v/!)T2((3(($z7J)KL#)#8#8#: T%+tfAhZ,@%STr>   N)rz   r   keysr|   r   r{   rO   rQ   r6   r   r   )r;   r   countr   r   r   s        @r=   set_attn_processorzI2VGenXLUNet.set_attn_processorW  s     D((--/0i&3y>U+BPQTU^Q_P` a005w6QRWQXXkm 
	Tc 	T588?? 	T !//1 	ALD&'fi@	Ar>   
chunk_sizer"   c                     |dvrt        d|       |xs d}dt        j                  j                  dt        dt        ffd| j                         D ]  } |||        y)	aX  
        Sets the attention processor to use [feed forward
        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).

        Parameters:
            chunk_size (`int`, *optional*):
                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
                over each tensor of dim=`dim`.
            dim (`int`, *optional*, defaults to `0`):
                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
                or dim=1 (sequence length).
        )r   r   z-Make sure to set `dim` to either 0 or 1, not r   r   r   r"   c                     t        | d      r| j                  ||       | j                         D ]  } |||        y Nset_chunk_feed_forward)r   r"   r   r   childrenr   r   r"   r   fn_recursive_feed_forwards       r=   r   zGI2VGenXLUNet.enable_forward_chunking.<locals>.fn_recursive_feed_forward  E    v78---M* B)%SABr>   N)r{   rQ   r6   r   rN   r   )r;   r   r"   r   r   s       @r=   enable_forward_chunkingz$I2VGenXLUNet.enable_forward_chunkingz  su     fLSERSS  _1
	Behhoo 	B3 	BUX 	B mmo 	?F%fj#>	?r>   c                     dt         j                  j                  dt        dt        ffd| j	                         D ]  } |d d        y )Nr   r   r"   c                     t        | d      r| j                  ||       | j                         D ]  } |||        y r   r   r   s       r=   r   zHI2VGenXLUNet.disable_forward_chunking.<locals>.fn_recursive_feed_forward  r   r>   r   )rQ   r6   r   rN   r   )r;   r   r   s     @r=   disable_forward_chunkingz%I2VGenXLUNet.disable_forward_chunking  sM    	Behhoo 	B3 	BUX 	B mmo 	7F%fdA6	7r>   c           	      j   t        d | j                  j                         D              rt               }nmt        d | j                  j                         D              rt	               }n8t        dt        t        | j                  j                                            | j                  |       y)ze
        Disables custom attention processors and sets the default attention implementation.
        c              3   @   K   | ]  }|j                   t        v   y wN)r<   r   .0procs     r=   	<genexpr>z:I2VGenXLUNet.set_default_attn_processor.<locals>.<genexpr>  s     i4t~~!>>i   c              3   @   K   | ]  }|j                   t        v   y wr   )r<   r   r   s     r=   r   z:I2VGenXLUNet.set_default_attn_processor.<locals>.<genexpr>  s     h$#==hr   zOCannot call `set_default_attn_processor` when attention processors are of type N)	allr   valuesr   r   r{   nextiterr   )r;   r   s     r=   set_default_attn_processorz'I2VGenXLUNet.set_default_attn_processor  s     i4K_K_KfKfKhii,.Ih$J^J^JeJeJghh%Iabfgklp  mA  mA  mH  mH  mJ  hK  cL  bM  N  		*r>   c                     t        | j                        D ]9  \  }}t        |d|       t        |d|       t        |d|       t        |d|       ; y)aF  Enables the FreeU mechanism from https://huggingface.co/papers/2309.11497.

        The suffixes after the scaling factors represent the stage blocks where they are being applied.

        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
        are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.

        Args:
            s1 (`float`):
                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
                mitigate the "oversmoothing effect" in the enhanced denoising process.
            s2 (`float`):
                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
                mitigate the "oversmoothing effect" in the enhanced denoising process.
            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
        s1s2b1b2N)r   r   setattr)r;   r   r   r   r   r   upsample_blocks          r=   enable_freeuzI2VGenXLUNet.enable_freeu  sQ    $ "+4>>!: 	.A~ND"-ND"-ND"-ND"-		.r>   c                     h d}t        | j                        D ]3  \  }}|D ])  }t        ||      st        ||d      t	        ||d       + 5 y)zDisables the FreeU mechanism.>   r   r   r   r   N)r   r   r   getattrr   )r;   
freeu_keysr   r   ks        r=   disable_freeuzI2VGenXLUNet.disable_freeu  sW    -
!*4>>!: 	5A~ 5>1-D1Q1]NAt45	5r>   c                 r   d| _         | j                  j                         D ]1  \  }}dt        |j                  j
                        v s(t        d       | j                  | _         | j                         D ]%  }t        |t              s|j                  d       ' | j                  t                      y)u1  
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
        are fused. For cross-attention modules, key and value projection matrices are fused.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>
        NAddedzQ`fuse_qkv_projections()` is not supported for models having added KV projections.T)fuse)original_attn_processorsr   itemsrO   r<   rK   r{   modulesr|   r   fuse_projectionsr   r   )r;   _attn_processorr   s       r=   fuse_qkv_projectionsz!I2VGenXLUNet.fuse_qkv_projections  s     )-%!%!5!5!;!;!= 	vA~#n66??@@ !tuu	v )-(<(<%lln 	3F&),''T'2	3 	 5 78r>   c                 T    | j                   | j                  | j                          yy)u   Disables the fused QKV projection if enabled.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>

        N)r   r   )r;   s    r=   unfuse_qkv_projectionsz#I2VGenXLUNet.unfuse_qkv_projections  s)     ((4##D$A$AB 5r>   sampletimestepfpsimage_latentsimage_embeddingsrB   timestep_condcross_attention_kwargsreturn_dictc
           
        & |j                   \  }
}}}}d| j                  z  &d}d}t        &fd|j                   dd D              rt        j	                  d       d}|}t        j                  |      s|j                  j                  dk(  }|j                  j                  d	k(  }t        |t              r%|s|rt
        j                  nt
        j                  }n$|s|rt
        j                  nt
        j                  }t        j                  |g||j                  
      }n6t!        |j                         dk(  r|d   j#                  |j                        }|j%                  |j                   d         }| j'                  |      }|j#                  | j(                        }| j+                  ||      }|j%                  |j                   d         }| j-                  | j'                  |      j#                  | j(                              }||z   }|j/                  |d|j                   d   |z        }|j1                  |
d| j2                  j4                        }t        j6                  ||gd      }|ddddddddf   }|j9                  ddddd      j;                  |j                   d   |j                   d   z  |j                   d   |j                   d   |j                   d         }| j=                  |      }|j                   \  }}}}|j9                  dddd      j;                  |||z  |      }t        j6                  ||gd      }| j?                  |      }|jA                  d| j2                  jB                  | j2                  j4                        }t        j6                  ||gd      }|j/                  |d|j                   d   |z        }|j9                  ddddd      j;                  |j                   d   |j                   d   z  |j                   d   |j                   d   |j                   d         }| jE                  |      }|dddf   j;                  |
||||      j9                  ddddd      j;                  |
|z  |z  ||      }| jG                  |      }|j;                  |
||||      j9                  ddddd      }t        j6                  ||gd      }|j9                  ddddd      j;                  |j                   d   |z  df|j                   dd z         }| jI                  |      }| jK                  |||d      d   }|f} | jL                  D ]>  }!tO        |!d      r|!jP                  r |!|||||      \  }}"n |!|||      \  }}"| |"z  } @ | jR                  | jS                  |||||      }tU        | jV                        D ]  \  }#}$|#t!        | jV                        dz
  k(  }%| t!        |$jX                         d }"| dt!        |$jX                          } |%s|r| d   j                   dd }tO        |$d      r|$jP                  r |$|||"||||      } |$|||"||      } | j[                  |      }| j]                  |      }| j_                  |      }|dddf   j;                  d|f|j                   dd z         j9                  ddddd      }|	s|fS ta        |      S )a  
        The [`I2VGenXLUNet`] forward method.

        Args:
            sample (`torch.Tensor`):
                The noisy input tensor with the following shape `(batch, num_frames, channel, height, width`.
            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
            fps (`torch.Tensor`): Frames per second for the video being generated. Used as a "micro-condition".
            image_latents (`torch.Tensor`): Image encodings from the VAE.
            image_embeddings (`torch.Tensor`):
                Projection embeddings of the conditioning image computed with a vision encoder.
            encoder_hidden_states (`torch.Tensor`):
                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
                tuple.

        Returns:
            [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
                If `return_dict` is True, an [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] is returned,
                otherwise a `tuple` is returned where the first element is the sample tensor.
        r   FNc              3   .   K   | ]  }|z  d k7    yw)r   N )r   sdefault_overall_up_factors     r=   r   z'I2VGenXLUNet.forward.<locals>.<genexpr>/  s     Maq,,1Ms   z9Forward upsample size to force interpolation output size.Tmpsnpu)dtypedevicer   )r  )r"   output_sizer   )r"   r   rC   rs   )
num_framesr   r   has_cross_attention)r?   tembrB   r  r   )r?   r  r  )rB   r  r   )r?   r  res_hidden_states_tuplerB   upsample_sizer  r   )r?   r  r  r	  r  )r   )1shaper   anyloggerinforQ   	is_tensorr  typer|   floatfloat32float64int32int64tensorrz   toexpandr   r  r   r   repeat_interleave	new_zerosconfigr_   catpermutereshaper   r   viewrX   r   r   r~   r   r   r   r  r   r   r   resnetsr   r   r   r   )'r;   r   r   r   r   r   rB   r   r   r   
batch_sizechannelsr  heightwidthforward_upsample_sizer	  	timestepsis_mpsis_npur  t_embfps_embembcontext_embimage_latents_for_context_embdsimage_latents_context_embs_batch_size	_channels_height_width	image_embdown_block_res_samplesdownsample_blockres_samplesr   r   r   r   s'                                         @r=   rI   zI2VGenXLUNet.forward  s*   L ;A,,7
Hj&% %&t':':$:! !&M6<<;LMMKKST$(! 	y) ]]''50F]]''50F)U+*0F(.&u{{i[fmmTI!Q&!$**6==9I $$V\\!_5	y)
 tzz*##E=9 jj1&$$T^^C%8%;%;$**%;%MN go##JA399Q<R\C\#] &&z1dkk6U6UVii.C D!L*71bqb!*D'%D%L%LQPQSTVWYZ%[%c%c+11!47V7\7\]^7__+11!4+11!4+11!4	&
" &*%I%IJd%e"2L2R2R/Y%?%G%G1aQR%S%[%[6)9&
" ii.H IqQ**+;<	NN2t{{'>'>@_@_`	iii 8a@!33JAS^SdSdefSgjtSt3u%--aAq!<DD"]%8%8%;;"""	
 22=A$'"WZXvuEWQ1a#WZ&(50*hG	 	 ;;MJ%--j&%U]^ffghjkmnpqstu FM2:1aA.66Q*8TVX7Y\b\h\hijik\l7lmf%$$!#9	 % 

  #) $ 0 0 	2')>?DTDhDh&6"(*5)+A'# '7VRUbl&m#"k1"	2 >>%^^&1%'= $ F "+4>>!: 	A~#dnn"5"99N0#n6L6L2M1M1OPK%;<Zs>CYCY?Z>Z%[" "&; 6r : @ @ D~'<=.BdBd'"(,7*5"/)+A ("(,7"/)-	> ##F+v&v& a(("j)9FLL<L)LMUUVWYZ\]_`bcd9$F33r>   )NrC   rC   )CrossAttnDownBlock3Dr6  r6  DownBlock3D)	UpBlock3DCrossAttnUpBlock3Dr9  r9  )i@  i     r:  r   rh   i   @   N)Nr   )NNNNT) rK   rL   rM   __doc__ _supports_gradient_checkpointingr
   r   rN   r   rO   r   r5   propertyr   r   r   r   r   r   r   r   r   r   r   rQ   rR   r  r   rP   r   rI   rS   rT   s   @r=   rV   rV   a   sv   4 (-$ &*-
+
 /E !)+#'57@D-|ac]|a |a 	|a
  S/|a c3h|a" "#s(O#|a$ %|a& "#'|a( !)|a* "#uSz/2+|a, &eCsO&<=-|a |a| c+=&=!>  0 AE2Dd3PbKbFc2c,d  AF?(3- ?S ?Y] ?<	7+ .2594C& 488<04;? S4S4 eS01S4 \\	S4
 ||S4 #5<<0S4  (5S4  -S4 !)c3h 8S4 S4 
$eELL&99	:S4r>   rV   )0typingr   r   r   r   r   rQ   torch.nnr6   torch.utils.checkpointconfiguration_utilsr	   r
   loadersr   utilsr   activationsr   	attentionr   r   attention_processorr   r   r   r   r   r   
embeddingsr   r   modeling_utilsr   !transformers.transformer_temporalr   unet_3d_blocksr   r   r   unet_3d_conditionr   
get_loggerrK   r  r   r!   rV   r   r>   r=   <module>rN     s    5 4    B 2  ( .  6 ' H 
 5 
		H	%. .bo	4:{,G o	4r>   