
    bi                        d dl mZmZmZmZ d dlZd dlZd dlm	Z	 d dl
m	c mZ ddlmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddl m!Z!m"Z"  ejF                  e$      Z% G d de	jL                        Z' G d de	jP                        Z) G d de	jP                        Z* G d de	jP                        Z+ G d de	jP                        Z, G d de	jP                        Z- G d de	jP                        Z. G d de	jP                        Z/ G d  d!e	jP                        Z0 G d" d#eee      Z1y)$    )DictOptionalTupleUnionN   )ConfigMixinregister_to_config)FromOriginalModelMixin)logging)apply_forward_hook   )get_activation)CogVideoXDownsample3D)AutoencoderKLOutput)
ModelMixin)CogVideoXUpsample3D   )DecoderOutputDiagonalGaussianDistributionc                   T     e Zd ZdZdej
                  dej
                  f fdZ xZS )CogVideoXSafeConv3dzq
    A 3D convolution layer that splits the input tensor into smaller parts to avoid OOM in CogVideoX Model.
    inputreturnc                    |j                   d   |j                   d   z  |j                   d   z  |j                   d   z  |j                   d   z  dz  dz  }|dkD  r| j                  d   }t        |dz        dz   }t        j                  ||d      }|dkD  rZ|d   gt        dt        |            D cg c]4  }t        j                  ||dz
     d d d d | dz   d f   ||   fd      6 c}z   }g }|D ]   }|j                  t        
| )  |             " t        j                  |d      }	|	S t        
| )  |      S c c}w )Nr   r   r   r      i   @dim)shapekernel_sizeinttorchchunkrangelencatappendsuperforward)selfr   memory_countr   part_numinput_chunksioutput_chunksinput_chunkoutput	__class__s             q/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/diffusers/models/autoencoders/autoencoder_kl_cogvideox.pyr(   zCogVideoXSafeConv3d.forward+   s_   [[^ekk!n,u{{1~=ANQVQ\Q\]^Q__cddgnn 	
 !**1-K<!+,q0H ;;uhA>LQ ,Q0"1c,&784 II|AE21a+9I9K3KLl[\o^def4  
 M+ C$$UW_[%ABCYY}!4FM7?5))4s   59E)__name__
__module____qualname____doc__r!   Tensorr(   __classcell__r1   s   @r2   r   r   &   s(    *U\\ *ell * *    r   c                       e Zd ZdZ	 	 	 ddededeeeeeef   f   dededef fdZ	 dd	e	j                  d
ee	j                     de	j                  fdZdd	e	j                  d
ee	j                     de	j                  fdZ xZS )CogVideoXCausalConv3da=  A 3D causal convolution layer that pads the input tensor to ensure causality in CogVideoX Model.

    Args:
        in_channels (`int`): Number of channels in the input tensor.
        out_channels (`int`): Number of output channels produced by the convolution.
        kernel_size (`int` or `Tuple[int, int, int]`): Kernel size of the convolutional kernel.
        stride (`int`, defaults to `1`): Stride of the convolution.
        dilation (`int`, defaults to `1`): Dilation rate of the convolution.
        pad_mode (`str`, defaults to `"constant"`): Padding mode.
    in_channelsout_channelsr   stridedilationpad_modec           	         t         |           t        |t              r|fdz  }|\  }}}	|dz
  }
|dz
  dz  }|	dz
  dz  }|| _        || _        || _        |
| _        |||||
df| _        d| j                  | j
                  f| _	        d| _
        || _        t        |t              r|n|ddf}|ddf}t        |||||| j                  dk(  rdn| j                  d      | _        y )Nr   r   r   r   	replicatezeros)r=   r>   r   r?   r@   paddingpadding_mode)r'   __init__
isinstancer    rA   
height_pad	width_padtime_padtime_causal_paddingconst_padding_conv3dtemporal_dimtime_kernel_sizetupler   conv)r)   r=   r>   r   r?   r@   rA   rO   height_kernel_sizewidth_kernel_sizerK   rI   rJ   r1   s                r2   rG   zCogVideoXCausalConv3d.__init__Q   s    	k3'&.1,KBM?,.? $a'(1,2
&*q0	 $" $-y*jRZ\]#^ %&$H! 0%fe461a.a#'#%#+5A4;T;T 
	r:   inputs
conv_cacher   c                     | j                   dk(  r$t        j                  || j                  d      }|S | j                  }|dkD  r5||gn|d d d d d df   g|dz
  z  }t        j                  ||gz   d      }|S )NrC   )moder   r   r   )rA   FpadrL   rO   r!   r%   )r)   rT   rU   r   cached_inputss        r2   fake_context_parallel_forwardz3CogVideoXCausalConv3d.fake_context_parallel_forward}   s     ==K'UU64#;#;+NF 	 //KQ0:0FVTUWXZ\[\Z\T\M]L^bmpqbqLr=F8#;Cr:   c                     | j                  ||      }| j                  dk(  rd }n*|d d d d | j                   dz   d f   j                         }| j	                  |      }||fS )NrC   r   )r[   rA   rO   clonerQ   )r)   rT   rU   r0   s       r2   r(   zCogVideoXCausalConv3d.forward   si    33FJG==K'J1t'<'<&<q&@&B BCIIKJ6"z!!r:   )r   r   constantN)r3   r4   r5   r6   r    r   r   strrG   r!   r7   r   r[   r(   r8   r9   s   @r2   r<   r<   E   s    	  "*
*
 *
 3c3m 445	*

 *
 *
 *
Z JN
ll
080F
	
	"ell 	"8N 	"Z_ZfZf 	"r:   r<   c            
            e Zd ZdZ	 ddededef fdZ	 ddej                  dej                  dee	e
ej                  f      d	ej                  fd
Z xZS )CogVideoXSpatialNorm3Dao  
    Spatially conditioned normalization as defined in https://huggingface.co/papers/2209.09002. This implementation is
    specific to 3D-video like data.

    CogVideoXSafeConv3d is used instead of nn.Conv3d to avoid OOM in CogVideoX Model.

    Args:
        f_channels (`int`):
            The number of channels for input to group normalization layer, and output of the spatial norm layer.
        zq_channels (`int`):
            The number of channels for the quantized vector as described in the paper.
        groups (`int`):
            Number of groups to separate the channels into for group normalization.
    
f_channelszq_channelsgroupsc                     t         |           t        j                  ||dd      | _        t        ||dd      | _        t        ||dd      | _        y )Nư>T)num_channels
num_groupsepsaffiner   )r   r?   )r'   rG   nn	GroupNorm
norm_layerr<   conv_yconv_b)r)   rc   rd   re   r1   s       r2   rG   zCogVideoXSpatialNorm3D.__init__   sP     	,,J6W[dhi+KQR[\]+KQR[\]r:   fzqrU   r   c                    i }|xs i }|j                   d   dkD  r|j                   d   dz  dk(  r|d d d d d df   |d d d d dd f   }}|j                   dd  |j                   dd  }}|d d d d d df   |d d d d dd f   }
}	t        j                  |	|      }	t        j                  |
|      }
t        j                  |	|
gd      }n$t        j                  ||j                   dd        }| j                  ||j                  d            \  }|d<   | j                  ||j                  d            \  }|d<   | j                  |      }||z  |z   }||fS )	Nr   r   )sizer   ro   rU   rp   )	r   rX   interpolater!   r%   ro   getrp   rn   )r)   rq   rr   rU   new_conv_cachef_firstf_restf_first_sizef_rest_sizez_firstz_restro   rp   norm_fnew_fs                  r2   r(   zCogVideoXSpatialNorm3D.forward   s\    %2
771:>aggaj1n11bqbk1Q12X;VG(/bc(:FLL<M+L ArrlBq!QRxLVGmmG,?G]]6<FGV,!4Br5B+/;;rjnnU]F^;+_(x(+/;;rjnnU]F^;+_(x(#&(n$$r:   )    r_   )r3   r4   r5   r6   r    rG   r!   r7   r   r   r`   r(   r8   r9   s   @r2   rb   rb      s    & 		^	^ 	^ 		^ bf%%#(<<%=Ed3PUP\P\K\F]=^%	%r:   rb   c                       e Zd ZdZ	 	 	 	 	 	 	 	 	 ddedee   dededededed	ed
ee   def fdZ		 	 	 dde
j                  dee
j                     dee
j                     deeee
j                  f      de
j                  f
dZ xZS )CogVideoXResnetBlock3Da   
    A 3D ResNet block used in the CogVideoX model.

    Args:
        in_channels (`int`):
            Number of input channels.
        out_channels (`int`, *optional*):
            Number of output channels. If None, defaults to `in_channels`.
        dropout (`float`, defaults to `0.0`):
            Dropout rate.
        temb_channels (`int`, defaults to `512`):
            Number of time embedding channels.
        groups (`int`, defaults to `32`):
            Number of groups to separate the channels into for group normalization.
        eps (`float`, defaults to `1e-6`):
            Epsilon value for normalization layers.
        non_linearity (`str`, defaults to `"swish"`):
            Activation function to use.
        conv_shortcut (bool, defaults to `False`):
            Whether or not to use a convolution shortcut.
        spatial_norm_dim (`int`, *optional*):
            The dimension to use for spatial norm if it is to be used instead of group norm.
        pad_mode (str, defaults to `"first"`):
            Padding mode.
    r=   r>   dropouttemb_channelsre   rj   non_linearityconv_shortcutspatial_norm_dimrA   c                    t         |           |xs |}|| _        || _        t	        |      | _        || _        |	| _        |	;t        j                  |||      | _
        t        j                  |||      | _        n&t        ||	|      | _
        t        ||	|      | _        t        ||d|
      | _        |dkD  rt        j                  ||      | _        t        j"                  |      | _        t        ||d|
      | _        | j                  | j                  k7  r7| j                  rt        ||d|
      | _        y t+        ||ddd      | _        y y )	N)rh   ri   rj   )rc   rd   re   r   )r=   r>   r   rA   r   )in_featuresout_featuresr   )r=   r>   r   r?   rE   )r'   rG   r=   r>   r   nonlinearityuse_conv_shortcutr   rl   rm   norm1norm2rb   r<   conv1Linear	temb_projDropoutr   conv2r   r   )r)   r=   r>   r   r   re   rj   r   r   r   rA   r1   s              r2   rG   zCogVideoXResnetBlock3D.__init__   sV    	#2{&(*=9!. 0#;6WZ[DJ<FX[\DJ/&,DJ
 0',DJ +#,AX`

 1YY=|\DNzz'**$<QYa

 t000%%%: +,TU`h&" &9 +,TU^_ij&" 1r:   rT   tembrr   rU   r   c                 B   i }|xs i }|}|*| j                  |||j                  d            \  }|d<   n| j                  |      }| j                  |      }| j                  ||j                  d            \  }|d<   |/|| j	                  | j                  |            d d d d d d d f   z   }|*| j                  |||j                  d            \  }|d<   n| j                  |      }| j                  |      }| j                  |      }| j                  ||j                  d            \  }|d<   | j                  | j                  k7  rF| j                  r)| j                  ||j                  d            \  }|d<   n| j                  |      }||z   }||fS )Nr   rv   r   r   r   r   )r   rx   r   r   r   r   r   r   r=   r>   r   r   )r)   rT   r   rr   rU   ry   hidden_statess          r2   r(   zCogVideoXResnetBlock3D.forward  s    %2
>59ZZr^h^l^lmt^uZ5v2M>'2 JJ}5M))-815MV`VdVdelVm1n.~g.)DNN4;L;LT;R,STUWXZ^`dfjTj,kkM>59ZZr^h^l^lmt^uZ5v2M>'2 JJ}5M))-8]315MV`VdVdelVm1n.~g.t000%%:>:L:Lz~~o'F ;M ;77 ++F3%.n,,r:   )	N           r   rg   swishFNfirstNNN)r3   r4   r5   r6   r    r   floatr`   boolrG   r!   r7   r   r(   r8   r9   s   @r2   r   r      s   : '+ $#*.:: sm: 	:
 : : : : : #3-: :~ (,%)8<)-)- u||$)- U\\"	)-
 T#u||"345)- 
)-r:   r   c                       e Zd ZdZdZ	 	 	 	 	 	 	 	 	 ddedededededed	ed
ededededef fdZ		 	 	 dde
j                  dee
j                     dee
j                     deeee
j                  f      de
j                  f
dZ xZS )CogVideoXDownBlock3Da  
    A downsampling block used in the CogVideoX model.

    Args:
        in_channels (`int`):
            Number of input channels.
        out_channels (`int`, *optional*):
            Number of output channels. If None, defaults to `in_channels`.
        temb_channels (`int`, defaults to `512`):
            Number of time embedding channels.
        num_layers (`int`, defaults to `1`):
            Number of resnet layers.
        dropout (`float`, defaults to `0.0`):
            Dropout rate.
        resnet_eps (`float`, defaults to `1e-6`):
            Epsilon value for normalization layers.
        resnet_act_fn (`str`, defaults to `"swish"`):
            Activation function to use.
        resnet_groups (`int`, defaults to `32`):
            Number of groups to separate the channels into for group normalization.
        add_downsample (`bool`, defaults to `True`):
            Whether or not to use a downsampling layer. If not used, output dimension would be same as input dimension.
        compress_time (`bool`, defaults to `False`):
            Whether or not to downsample across temporal dimension.
        pad_mode (str, defaults to `"first"`):
            Padding mode.
    Tr=   r>   r   r   
num_layers
resnet_epsresnet_act_fnresnet_groupsadd_downsampledownsample_paddingcompress_timerA   c                 @   t         |           g }t        |      D ]-  }|dk(  r|n|}|j                  t	        ||||||||             / t        j                  |      | _        d | _        |	r(t        j                  t        |||
|      g      | _        d| _
        y )Nr   )r=   r>   r   r   re   rj   r   rA   rE   r   F)r'   rG   r#   r&   r   rl   
ModuleListresnetsdownsamplersr   gradient_checkpointing)r)   r=   r>   r   r   r   r   r   r   r   r   r   rA   r   r-   
in_channelr1   s                   r2   rG   zCogVideoXDownBlock3D.__init__j  s     	z" 	A()QLJNN& *!-#"/(""/%		 }}W-  ")$l<N^k!D ',#r:   r   r   rr   rU   r   c           
         i }|xs i }t        | j                        D ]v  \  }}d| }t        j                         r7| j                  r+| j                  |||||j                  |            \  }||<   V |||||j                  |            \  }||<   x | j                  | j                  D ]
  }	 |	|      } ||fS )z3Forward method of the `CogVideoXDownBlock3D` class.resnet_rv   )	enumerater   r!   is_grad_enabledr   _gradient_checkpointing_funcrx   r   )
r)   r   r   rr   rU   ry   r-   resnetconv_cache_keydownsamplers
             r2   r(   zCogVideoXDownBlock3D.forward  s     %2
"4<<0 	IAv&qc]N$$&4+F+F@D@a@a!NN>2A=~n= AG!4
~8VA=~n=	  (#00 ; +M :; n,,r:   )	r   r   rg   r   r   Tr   Fr   r   )r3   r4   r5   r6    _supports_gradient_checkpointingr    r   r`   r   rG   r!   r7   r   r   r(   r8   r9   s   @r2   r   r   K  s   8 (,$  $#"##-,-, -, 	-,
 -, -, -, -, -, -,  -, -, -,d (,%)8< -|| - u||$ - U\\"	 -
 T#u||"345 - 
 -r:   r   c                       e Zd ZdZdZ	 	 	 	 	 	 	 ddedededededed	ed
ee   def fdZ		 	 	 dde
j                  dee
j                     dee
j                     deeee
j                  f      de
j                  f
dZ xZS )CogVideoXMidBlock3Da  
    A middle block used in the CogVideoX model.

    Args:
        in_channels (`int`):
            Number of input channels.
        temb_channels (`int`, defaults to `512`):
            Number of time embedding channels.
        dropout (`float`, defaults to `0.0`):
            Dropout rate.
        num_layers (`int`, defaults to `1`):
            Number of resnet layers.
        resnet_eps (`float`, defaults to `1e-6`):
            Epsilon value for normalization layers.
        resnet_act_fn (`str`, defaults to `"swish"`):
            Activation function to use.
        resnet_groups (`int`, defaults to `32`):
            Number of groups to separate the channels into for group normalization.
        spatial_norm_dim (`int`, *optional*):
            The dimension to use for spatial norm if it is to be used instead of group norm.
        pad_mode (str, defaults to `"first"`):
            Padding mode.
    Tr=   r   r   r   r   r   r   r   rA   c
                     t         |           g }
t        |      D ]%  }|
j                  t	        |||||||||		             ' t        j                  |
      | _        d| _        y )N)	r=   r>   r   r   re   rj   r   r   rA   F)	r'   rG   r#   r&   r   rl   r   r   r   )r)   r=   r   r   r   r   r   r   r   rA   r   _r1   s               r2   rG   zCogVideoXMidBlock3D.__init__  sr     	z" 	ANN& +!,#"/("%5"/%
	 }}W-&+#r:   r   r   rr   rU   r   c           
      6   i }|xs i }t        | j                        D ]v  \  }}d| }t        j                         r7| j                  r+| j                  |||||j                  |            \  }||<   V |||||j                  |            \  }||<   x ||fS )z2Forward method of the `CogVideoXMidBlock3D` class.r   rv   )r   r   r!   r   r   r   rx   )	r)   r   r   rr   rU   ry   r-   r   r   s	            r2   r(   zCogVideoXMidBlock3D.forward  s     %2
"4<<0 
	IAv&qc]N$$&4+F+F@D@a@aM4Z^^N5SA=~n= AG!4
~8VA=~n=
	 n,,r:   )r   r   rg   r   r   Nr   r   )r3   r4   r5   r6   r   r    r   r`   r   rG   r!   r7   r   r(   r8   r9   s   @r2   r   r     s    0 (,$  $*.,, , 	,
 , , , , #3-, ,H (,%)8<-||- u||$- U\\"	-
 T#u||"345- 
-r:   r   c                       e Zd ZdZ	 	 	 	 	 	 	 	 	 	 ddededededededed	ed
ededededef fdZ	 	 	 dde	j                  dee	j                     dee	j                     deeee	j                  f      de	j                  f
dZ xZS )CogVideoXUpBlock3Da=  
    An upsampling block used in the CogVideoX model.

    Args:
        in_channels (`int`):
            Number of input channels.
        out_channels (`int`, *optional*):
            Number of output channels. If None, defaults to `in_channels`.
        temb_channels (`int`, defaults to `512`):
            Number of time embedding channels.
        dropout (`float`, defaults to `0.0`):
            Dropout rate.
        num_layers (`int`, defaults to `1`):
            Number of resnet layers.
        resnet_eps (`float`, defaults to `1e-6`):
            Epsilon value for normalization layers.
        resnet_act_fn (`str`, defaults to `"swish"`):
            Activation function to use.
        resnet_groups (`int`, defaults to `32`):
            Number of groups to separate the channels into for group normalization.
        spatial_norm_dim (`int`, defaults to `16`):
            The dimension to use for spatial norm if it is to be used instead of group norm.
        add_upsample (`bool`, defaults to `True`):
            Whether or not to use a upsampling layer. If not used, output dimension would be same as input dimension.
        compress_time (`bool`, defaults to `False`):
            Whether or not to downsample across temporal dimension.
        pad_mode (str, defaults to `"first"`):
            Padding mode.
    r=   r>   r   r   r   r   r   r   r   add_upsampleupsample_paddingr   rA   c                 B   t         |           g }t        |      D ].  }|dk(  r|n|}|j                  t	        ||||||||	|	             0 t        j                  |      | _        d | _        |
r(t        j                  t        ||||      g      | _        d| _
        y )Nr   )	r=   r>   r   r   re   rj   r   r   rA   r   F)r'   rG   r#   r&   r   rl   r   r   
upsamplersr   r   )r)   r=   r>   r   r   r   r   r   r   r   r   r   r   rA   r   r-   r   r1   s                    r2   rG   zCogVideoXUpBlock3D.__init__2  s      	z" 	A()QLJNN& *!-#"/(""/%5%
	  }}W- mm'$l<L\iDO ',#r:   r   r   rr   rU   r   c           
         i }|xs i }t        | j                        D ]v  \  }}d| }t        j                         r7| j                  r+| j                  |||||j                  |            \  }||<   V |||||j                  |            \  }||<   x | j                  | j                  D ]
  }	 |	|      } ||fS )z1Forward method of the `CogVideoXUpBlock3D` class.r   rv   )r   r   r!   r   r   r   rx   r   )
r)   r   r   rr   rU   ry   r-   r   r   	upsamplers
             r2   r(   zCogVideoXUpBlock3D.forwardc  s     %2
"4<<0 	IAv&qc]N$$&4+F+F@D@a@a!NN>2A=~n= AG!4
~8VA=~n=	  ??&!__ 9	 )- 89 n,,r:   )
r   r   rg   r   r      Tr   Fr   r   )r3   r4   r5   r6   r    r   r`   r   rG   r!   r7   r   r   r(   r8   r9   s   @r2   r   r     s   F  $ "! !#/,/, /, 	/,
 /, /, /, /, /, /, /, /, /, /,h (,%)8< -|| - u||$ - U\\"	 -
 T#u||"345 - 
 -r:   r   c                       e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 	 ddededeedf   deedf   ded	ed
ededededef fdZ		 	 dde
j                  dee
j                     deeee
j                  f      de
j                  fdZ xZS )CogVideoXEncoder3DaG  
    The `CogVideoXEncoder3D` layer of a variational autoencoder that encodes its input into a latent representation.

    Args:
        in_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        out_channels (`int`, *optional*, defaults to 3):
            The number of output channels.
        down_block_types (`Tuple[str, ...]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
            The types of down blocks to use. See `~diffusers.models.unet_2d_blocks.get_down_block` for available
            options.
        block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
            The number of output channels for each block.
        act_fn (`str`, *optional*, defaults to `"silu"`):
            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
        layers_per_block (`int`, *optional*, defaults to 2):
            The number of layers per block.
        norm_num_groups (`int`, *optional*, defaults to 32):
            The number of groups for normalization.
    Tr=   r>   down_block_types.block_out_channelslayers_per_blockact_fnnorm_epsnorm_num_groupsr   rA   temporal_compression_ratioc                    t         |           t        t        j                  |            }t        ||d   d|
      | _        t        j                  g       | _	        |d   }t        |      D ]d  \  }}|}||   }|t        |      dz
  k(  }||k  }|dk(  rt        ||d|	||||| |
      }nt        d      | j                  j                  |       f t        |d   d|	d	||||

      | _        t        j"                  ||d   d      | _        t        j&                         | _        t        |d   d	|z  d|
      | _        d| _        y )Nr   r   r   rA   r   r   )
r=   r>   r   r   r   r   r   r   r   r   zEInvalid `down_block_type` encountered. Must be `CogVideoXDownBlock3D`r   )r=   r   r   r   r   r   r   rA   rg   )rj   F)r'   rG   r    nplog2r<   conv_inrl   r   down_blocksr   r$   r   
ValueErrorr&   r   	mid_blockrm   norm_outSiLUconv_actconv_outr   )r)   r=   r>   r   r   r   r   r   r   r   rA   r   temporal_compress_leveloutput_channelr-   down_block_typeinput_channelis_final_blockr   
down_blockr1   s                       r2   rG   zCogVideoXEncoder3D.__init__  s{   & 	 #&bgg.H&I"J,[:LQ:O]^iqr==, ,A."+,<"= 	0A*M/2N#&8"9A"==N 77M"881 -!/"##/'"("1'5#5"/
 !!hii##J/-	02 -*2. )	
 _6H6LRVW	-r"A$4!h
 ',#r:   sampler   rU   r   c           
         i }|xs i }| j                  ||j                  d            \  }|d<   t        j                         r| j                  rt        | j                        D ]4  \  }}d| }| j                  |||d|j                  |            \  }||<   6 | j                  | j                  ||d|j                  d            \  }|d<   nlt        | j                        D ]*  \  }}d| } |||d|j                  |            \  }||<   , | j                  ||d|j                  d            \  }|d<   | j                  |      }| j                  |      }| j                  ||j                  d            \  }|d<   ||fS )z5The forward method of the `CogVideoXEncoder3D` class.r   rv   down_block_Nr   r   )r   rx   r!   r   r   r   r   r   r   r   r   r   )	r)   r   r   rU   ry   r   r-   r   r   s	            r2   r(   zCogVideoXEncoder3D.forward  s    %2
37<<S]SaSabkSl<3m0~i0  "t'B'B!*4+;+;!< :#.qc!2@D@a@a!NN>2A=~n= :>9Z9Z{+:6M>+6 "+4+;+;!< :#.qc!2@J!4z~~n/MA=~n= :>tTjnn[6Q :H :6M>+6
 m4m448MM-\f\j\jku\vM4w1~j1n,,r:   )r   r   r   r   r   r         r   r   r   silurg   r   r   r   r   NNr3   r4   r5   r6   r   r    r   r`   r   rG   r!   r7   r   r   r(   r8   r9   s   @r2   r   r     s   * (,$ -
 /C !!,-#G,G, G,  S/	G, "#s(OG, G, G, G, G, G,  !G," %*#G,X (,8<	4-4- u||$4- T#u||"345	4-
 
4-r:   r   c                       e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 	 ddededeedf   deedf   ded	ed
ededededef fdZ		 	 dde
j                  dee
j                     deeee
j                  f      de
j                  fdZ xZS )CogVideoXDecoder3Da@  
    The `CogVideoXDecoder3D` layer of a variational autoencoder that decodes its latent representation into an output
    sample.

    Args:
        in_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        out_channels (`int`, *optional*, defaults to 3):
            The number of output channels.
        up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
            The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options.
        block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
            The number of output channels for each block.
        act_fn (`str`, *optional*, defaults to `"silu"`):
            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
        layers_per_block (`int`, *optional*, defaults to 2):
            The number of layers per block.
        norm_num_groups (`int`, *optional*, defaults to 32):
            The number of groups for normalization.
    Tr=   r>   up_block_types.r   r   r   r   r   r   rA   r   c                    t         |           t        t        |            }t	        ||d   d|
      | _        t        |d   dd|||||
      | _        t        j                  g       | _
        |d   }t        t        j                  |            }t        |      D ]k  \  }}|}||   }|t        |      dz
  k(  }||k  }|dk(  rt!        ||d|	|dz   ||||| ||
      }|}nt#        d	      | j                  j%                  |       m t'        |d
   ||      | _        t        j*                         | _        t	        |d
   |d|
      | _        d| _        y )Nr   r   r   r   )r=   r   r   r   r   r   r   rA   r   r   )r=   r>   r   r   r   r   r   r   r   r   r   rA   zAInvalid `up_block_type` encountered. Must be `CogVideoXUpBlock3D`r   )re   F)r'   rG   listreversedr<   r   r   r   rl   r   	up_blocksr    r   r   r   r$   r   r   r&   rb   r   r   r   r   r   )r)   r=   r>   r   r   r   r   r   r   r   rA   r   reversed_block_out_channelsr   r   r-   up_block_typeprev_output_channelr   r   up_blockr1   s                        r2   rG   zCogVideoXDecoder3D.__init__6  s   & 	&*84F+G&H#,4Q7QQY

 -3A6 )(	
 r*4Q7"%bgg.H&I"J ). 9 	,A}"08;N#&8"9A"==N 77M 44- 3!/"##/!3'"("1%0%3!3"/% '5# !deeNN!!(+3	,6 //J2/NP[dst	-'+\qS[
 ',#r:   r   r   rU   r   c           
      J   i }|xs i }| j                  ||j                  d            \  }|d<   t        j                         r| j                  r| j                  | j                  ||||j                  d            \  }|d<   t        | j                        D ]4  \  }}d| }| j                  |||||j                  |            \  }||<   6 nm| j                  ||||j                  d            \  }|d<   t        | j                        D ]+  \  }}d| } |||||j                  |            \  }||<   - | j                  |||j                  d            \  }|d<   | j                  |      }| j                  ||j                  d            \  }|d<   ||fS )z5The forward method of the `CogVideoXDecoder3D` class.r   rv   r   	up_block_r   r   )r   rx   r!   r   r   r   r   r   r   r   r   r   )	r)   r   r   rU   ry   r   r-   r   r   s	            r2   r(   zCogVideoXDecoder3D.forward  s    %2
37<<S]SaSabkSl<3m0~i0  "t'B'B9=9Z9Z{+:6M>+6  )8 8#,QC@D@a@a!NN>2A=~n= :>tV
{8S :H :6M>+6
  )8 8#,QC@H!4JNN><ZA=~n= 59MM6jnnZ.H 5B 5
1~j1 m448MM-\f\j\jku\vM4w1~j1n,,r:   )r   r   r   r   r   r   r   r   r   rg   r   r   r   r   r   r   r9   s   @r2   r   r     s   * (,$ +
 /C !!,-#N,N, N, c3h	N, "#s(ON, N, N, N, N, N,  !N," %*#N,f (,8<	5-5- u||$5- T#u||"345	5-
 
5-r:   r   c            ,           e Zd ZdZdZdgZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d6dededee	   dee	   d	ee   d
edede	de
dede
dedede
dee
   deee
      deee
      de
dededef* fd       Z	 	 	 	 d7dee   dee   dee
   dee
   ddf
d Zd8d!Zd8d"Zd8d#Zd$ej&                  dej&                  fd%Ze	 d9d$ej&                  d&edeeee   f   fd'       Zd9d(ej&                  d&edeeej&                  f   fd)Zed9d(ej&                  d&edeeej&                  f   fd*       Zd+ej&                  d,ej&                  d-edej&                  fd.Zd+ej&                  d,ej&                  d-edej&                  fd/Zd$ej&                  dej&                  fd0Zd9d(ej&                  d&edeeej&                  f   fd1Z 	 	 	 d:d2ej&                  d3ed&ed4eejB                     deej&                  ej&                  f   f
d5Z" xZ#S );AutoencoderKLCogVideoXa  
    A VAE model with KL loss for encoding images into latents and decoding latent representations into images. Used in
    [CogVideoX](https://github.com/THUDM/CogVideo).

    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
    for all models (such as downloading or saving).

    Parameters:
        in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
        out_channels (int,  *optional*, defaults to 3): Number of channels in the output.
        down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
            Tuple of downsample block types.
        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
            Tuple of upsample block types.
        block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
            Tuple of block output channels.
        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
        sample_size (`int`, *optional*, defaults to `32`): Sample input size.
        scaling_factor (`float`, *optional*, defaults to `1.15258426`):
            The component-wise standard deviation of the trained latent space computed using the first batch of the
            training set. This is used to scale the latent space to have unit variance when training the diffusion
            model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
            diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
            / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
            Synthesis with Latent Diffusion Models](https://huggingface.co/papers/2112.10752) paper.
        force_upcast (`bool`, *optional*, default to `True`):
            If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE
            can be fine-tuned / trained to a lower range without losing too much precision in which case `force_upcast`
            can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix
    Tr   Nr=   r>   r   r   r   latent_channelsr   r   r   r   r   sample_heightsample_widthscaling_factorshift_factorlatents_meanlatents_stdforce_upcastuse_quant_convuse_post_quant_convinvert_scale_latentsc                 f   t         |           t        |||||||	|
|	      | _        t	        |||||||	|
|	      | _        |rt        d|z  d|z  d      nd | _        |rt        ||d      nd | _        d| _	        d| _
        d| _        d| _        |dz  | _        |dz  | _        t        | j                  dt!        | j"                  j$                        dz
  z  z        | _        t        | j                  dt!        | j"                  j$                        dz
  z  z        | _        d| _        d| _        y )	N)	r=   r>   r   r   r   r   r   r   r   )	r=   r>   r   r   r   r   r   r   r   r   r   F   gUUUUUU?g?)r'   rG   r   encoderr   decoderr   
quant_convpost_quant_convuse_slicing
use_tilingnum_latent_frames_batch_sizenum_sample_frames_batch_sizetile_sample_min_heighttile_sample_min_widthr    r$   configr   tile_latent_min_heighttile_latent_min_widthtile_overlap_factor_heighttile_overlap_factor_width)r)   r=   r>   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r1   s                         r2   rG   zAutoencoderKLCogVideoX.__init__  sX   F 	)#(-1-+'A

 *'%)1-+'A

 Yg-a,.>L@PRSTlpUh2<qQnr $ -.),-) '4q&8#%1Q%6"&)''1T[[5S5S1TWX1X+YZ'
# &))C)CqSQUQ\Q\QoQoMpstMtGu)v%w"
 +0').&r:   r  r  r  r  r   c                    d| _         |xs | j                  | _        |xs | j                  | _        t        | j                  dt	        | j
                  j                        dz
  z  z        | _        t        | j                  dt	        | j
                  j                        dz
  z  z        | _        |xs | j                  | _	        |xs | j                  | _
        y)a  
        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger images.

        Args:
            tile_sample_min_height (`int`, *optional*):
                The minimum height required for a sample to be separated into tiles across the height dimension.
            tile_sample_min_width (`int`, *optional*):
                The minimum width required for a sample to be separated into tiles across the width dimension.
            tile_overlap_factor_height (`int`, *optional*):
                The minimum amount of overlap between two consecutive vertical tiles. This is to ensure that there are
                no tiling artifacts produced across the height dimension. Must be between 0 and 1. Setting a higher
                value might cause more tiles to be processed leading to slow down of the decoding process.
            tile_overlap_factor_width (`int`, *optional*):
                The minimum amount of overlap between two consecutive horizontal tiles. This is to ensure that there
                are no tiling artifacts produced across the width dimension. Must be between 0 and 1. Setting a higher
                value might cause more tiles to be processed leading to slow down of the decoding process.
        Tr   r   N)r  r  r  r    r$   r  r   r  r  r  r  )r)   r  r  r  r  s        r2   enable_tilingz$AutoencoderKLCogVideoX.enable_tilingC  s    4 &<&[@[@[#%:%Xd>X>X"&)''1T[[5S5S1TWX1X+YZ'
# &))C)CqSQUQ\Q\QoQoMpstMtGu)v%w"*D*gHgHg')B)ddFdFd&r:   c                     d| _         y)z
        Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
        decoding in one step.
        FN)r  r)   s    r2   disable_tilingz%AutoencoderKLCogVideoX.disable_tilingg  s    
  r:   c                     d| _         y)z
        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
        TNr  r  s    r2   enable_slicingz%AutoencoderKLCogVideoX.enable_slicingn  s    
  r:   c                     d| _         y)z
        Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
        decoding in one step.
        FNr  r  s    r2   disable_slicingz&AutoencoderKLCogVideoX.disable_slicingu  s    
 !r:   xc                    |j                   \  }}}}}| j                  r/|| j                  kD  s|| j                  kD  r| j	                  |      S | j
                  }t        ||z  d      }d }	g }
t        |      D ]s  }||z  }||z  |dk(  rdn|z   }||dz   z  |z   }|d d d d ||f   }| j                  ||	      \  }}	| j                  | j                  |      }|
j                  |       u t        j                  |
d      }
|
S )Nr   r   rv   r   r   )r   r  r  r  tiled_encoder  maxr#   r
  r  r&   r!   r%   )r)   r#  
batch_sizerh   
num_framesheightwidthframe_batch_sizenum_batchesrU   encr-   remaining_framesstart_frame	end_framex_intermediates                   r2   _encodezAutoencoderKLCogVideoX._encode|  s'   >?gg;
L*fe??(B(B BftOjOjFj$$Q''<< *(88!<
{# 	'A),<<*Q.qAv!CSTK(AE25EEIq![%::;N)-nQ[)\&NJ*!%!@JJ~&	' ii#
r:   return_dictc                 (   | j                   rU|j                  d   dkD  rC|j                  d      D cg c]  }| j                  |       }}t	        j
                  |      }n| j                  |      }t        |      }|s|fS t        |      S c c}w )a  
        Encode a batch of images into latents.

        Args:
            x (`torch.Tensor`): Input batch of images.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.

        Returns:
                The latent representations of the encoded videos. If `return_dict` is True, a
                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
        r   r   )latent_dist)r  r   splitr2  r!   r%   r   r   )r)   r#  r3  x_sliceencoded_slicesh	posteriors          r2   encodezAutoencoderKLCogVideoX.encode  s      
QCD771:Ndll73NNN		.)AQA03	<"y99 Os   Bzc                 0   |j                   \  }}}}}| j                  r1|| j                  kD  s|| j                  kD  r| j	                  ||      S | j
                  }t        ||z  d      }	d }
g }t        |	      D ]s  }||z  }||z  |dk(  rdn|z   }||dz   z  |z   }|d d d d ||f   }| j                  | j                  |      }| j                  ||
      \  }}
|j                  |       u t        j                  |d      }|s|fS t        |      S )N)r3  r   r   rv   r   r   r   )r   r  r  r  tiled_decoder  r&  r#   r  r  r&   r!   r%   r   )r)   r<  r3  r'  rh   r(  r)  r*  r+  r,  rU   decr-   r.  r/  r0  z_intermediates                    r2   _decodezAutoencoderKLCogVideoX._decode  s>   >?gg;
L*fe??(B(B BftOjOjFj$$QK$@@<<*(88!<
{# 	'A),<<*Q.qAv!CSTK(AE25EEIq![%::;N##/!%!5!5n!E)-nQ[)\&NJJJ~&	' ii#6MC((r:   c                 :   | j                   r_|j                  d   dkD  rM|j                  d      D cg c]  }| j                  |      j                   }}t        j                  |      }n| j                  |      j                  }|s|fS t        |      S c c}w )a  
        Decode a batch of images.

        Args:
            z (`torch.Tensor`): Input batch of latent vectors.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.

        Returns:
            [`~models.vae.DecoderOutput`] or `tuple`:
                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
                returned.
        r   r   r>  )r  r   r6  rB  r   r!   r%   r   )r)   r<  r3  z_slicedecoded_slicesdecodeds         r2   decodezAutoencoderKLCogVideoX.decode  s     
QJK''RS*Uwdll73::UNUii/Gll1o,,G:G,, Vs   "Babblend_extentc           	         t        |j                  d   |j                  d   |      }t        |      D ]J  }|d d d d d d | |z   d d f   d||z  z
  z  |d d d d d d |d d f   ||z  z  z   |d d d d d d |d d f<   L |S )Nr   r   minr   r#   )r)   rH  rI  rJ  ys        r2   blend_vzAutoencoderKLCogVideoX.blend_v  s    1771:qwwqz<@|$ 	A Aq<-!*;Q!>?1q<GWCWX[\]^`acdfgij]j[kL \  AaAq!m	 r:   c                    t        |j                  d   |j                  d   |      }t        |      D ]J  }|d d d d d d d d | |z   f   d||z  z
  z  |d d d d d d d d |f   ||z  z  z   |d d d d d d d d |f<   L |S )Nr   r   rL  )r)   rH  rI  rJ  r#  s        r2   blend_hzAutoencoderKLCogVideoX.blend_h  s    1771:qwwqz<@|$ 	A Aq!l]Q->!>?1q<GWCWX[\]^`acdfgij]j[kL \  AaAq!m	 r:   c                 V   |j                   \  }}}}}t        | j                  d| j                  z
  z        }t        | j                  d| j
                  z
  z        }t        | j                  | j                  z        }	t        | j                  | j
                  z        }
| j                  |	z
  }| j                  |
z
  }| j                  }g }t        d||      D ]  }g }t        d||      D ]  }t        ||z  d      }d}g }t        |      D ]  }||z  }||z  |dk(  rdn|z   }||dz   z  |z   }|dddd||||| j                  z   ||| j                  z   f   }| j                  ||      \  }}| j                  | j                  |      }|j                  |        |j                  t        j                  |d              |j                  |        g }t!        |      D ]  \  }}g }t!        |      D ]g  \  }}|dkD  r| j#                  ||dz
     |   ||	      }|dkD  r| j%                  ||dz
     ||
      }|j                  |ddddddd|d|f          i |j                  t        j                  |d              t        j                  |d      }|S )	a  Encode a batch of images using a tiled encoder.

        When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
        steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
        different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
        tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
        output, but they should be much less noticeable.

        Args:
            x (`torch.Tensor`): Input batch of videos.

        Returns:
            `torch.Tensor`:
                The latent representation of the encoded videos.
        r   r   Nrv   r   r   r   r   )r   r    r  r  r  r  r  r  r  r#   r&  r
  r  r&   r!   r%   r   rO  rQ  )r)   r#  r'  rh   r(  r)  r*  overlap_heightoverlap_widthblend_extent_heightblend_extent_widthrow_limit_heightrow_limit_widthr+  rowsr-   rowjr,  rU   timekr.  r/  r0  tileresult_rows
result_rowr-  s                                r2   r%  z#AutoencoderKLCogVideoX.tiled_encode  s   " ?@gg;
L*feT88A@_@_<_`aD66!d>\>\:\]^!$"="=@_@_"_` !;!;d>\>\!\]669LL447II<< q&.1 	AC1e]3 3 "*0@"@!D!
{+ &A'14D'D$"2Q"6qAv!K["\K 0AE :=M MI#I-A ; ;;;A : :::	<D (,||DZ|'P$D*2#t4KK%&  

599Tq12/30 KK5	8 o 
	=FAsJ$S> V4 q5<<QUA>QRDq5<<AE
D:LMD!!$q!Q0A1A0ACSOCS'S"TUV uyy;<
	= ii+
r:   c                 t   |j                   \  }}}}}t        | j                  d| j                  z
  z        }t        | j                  d| j
                  z
  z        }	t        | j                  | j                  z        }
t        | j                  | j
                  z        }| j                  |
z
  }| j                  |z
  }| j                  }g }t        d||      D ]  }g }t        d||	      D ]  }t        ||z  d      }d}g }t        |      D ]  }||z  }||z  |dk(  rdn|z   }||dz   z  |z   }|dddd||||| j                  z   ||| j                  z   f   }| j                  | j                  |      }| j                  ||      \  }}|j                  |        |j                  t        j                  |d              |j                  |        g }t!        |      D ]  \  }}g }t!        |      D ]g  \  }}|dkD  r| j#                  ||dz
     |   ||
      }|dkD  r| j%                  ||dz
     ||      }|j                  |ddddddd|d|f          i |j                  t        j                  |d              t        j                  |d      }|s|fS t'        |	      S )
a  
        Decode a batch of images using a tiled decoder.

        Args:
            z (`torch.Tensor`): Input batch of latent vectors.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.

        Returns:
            [`~models.vae.DecoderOutput`] or `tuple`:
                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
                returned.
        r   r   Nrv   r   r   r   r   r>  )r   r    r  r  r  r  r  r  r  r#   r&  r  r  r&   r!   r%   r   rO  rQ  r   )r)   r<  r3  r'  rh   r(  r)  r*  rS  rT  rU  rV  rW  rX  r+  rY  r-   rZ  r[  r,  rU   r\  r]  r.  r/  r0  r^  r_  r`  r@  s                                 r2   r?  z#AutoencoderKLCogVideoX.tiled_decodeA  s   0 ?@gg;
L*feT88A@_@_<_`aD66!d>\>\:\]^!$"="=@_@_"_` !;!;d>\>\!\]669LL447II<< q&.1 	AC1e]3 3!*0@"@!D!
{+ &A'14D'D$"2Q"6qAv!K["\K 0AE :=M MI#I-A ; ;;;A : :::	<D ++7#33D9'+||DZ|'P$D*KK%&  

599Tq12+3, KK1	4 o 
	=FAsJ$S> V4 q5<<QUA>QRDq5<<AE
D:LMD!!$q!Q0A1A0ACSOCS'S"TUV uyy;<
	= ii+6MC((r:   r   sample_posterior	generatorc                     |}| j                  |      j                  }|r|j                  |      }n|j                         }| j	                  |      j                  }|s|fS t        |      S )N)rc  r>  )r;  r5  r   rW   rG  r   )	r)   r   rb  r3  rc  r#  r:  r<  r@  s	            r2   r(   zAutoencoderKLCogVideoX.forward  sf     KKN..	  9 5A Akk!n##6MC((r:   )r   r   r   r   r   r   r   r   rg   r   r   i  i  gYi1p?NNNTFFF)NNNN)r   N)T)FTN)$r3   r4   r5   r6   r   _no_split_modulesr	   r    r   r`   r   r   r   rG   r  r  r   r"  r!   r7   r2  r   r   r   r   r;  r   rB  rG  rO  rQ  r%  r?  	Generatorr(   r8   r9   s   @r2   r   r     s   > (,$12 (
&
 *>! !!,-  *(,/3.2"$$)%*A_/_/ _/  *	_/ c
_/  "#J!_/" #_/$ %_/& '_/( )_/* +_/, %*-_/. /_/0 1_/2 3_/4 uo5_/6 uU|,7_/8 eEl+9_/: ;_/< =_/> "?_/@ #A_/ _/F 15/36:59"e ("e  (}"e %-UO	"e
 $,E?"e 
"eH  ! %,, 4 37::,0:	"E*F$GG	H: :6) )D )E-Y^YeYeJeDf )8 - -4 -5X]XdXdIdCe - -0 %,, c ell  %,, c ell Hell Hu|| HTQ)ell Q) Q)}^c^j^jOjIk Q)l "' /3)) ) 	)
 EOO,) 
u||U\\)	*)r:   r   )2typingr   r   r   r   numpyr   r!   torch.nnrl   torch.nn.functional
functionalrX   configuration_utilsr   r	   loaders.single_file_modelr
   utilsr   utils.accelerate_utilsr   activationsr   downsamplingr   modeling_outputsr   modeling_utilsr   
upsamplingr   vaer   r   
get_loggerr3   loggerConv3dr   Moduler<   rb   r   r   r   r   r   r   r    r:   r2   <module>r{     s    0 /      B ?  8 ( 0 2 ' , < 
		H	%*")) *>M"BII M"`0%RYY 0%f@-RYY @-Fn-299 n-bT-")) T-np- p-fU- U-p]- ]-@f)Z6L f)r:   