
    bi*                     X   d dl Z d dlmZmZmZ d dlZd dlmZ d dlmc m	Z
 ddlmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ  ej6                  e      Z G d dej<                        Z G d dej@                        Z! G d dej@                        Z" G d dej@                        Z# G d dej@                        Z$ G d dej@                        Z% G d dej@                        Z& G d dej@                        Z' G d dej@                        Z( G d d ee      Z)y)!    N)OptionalTupleUnion   )ConfigMixinregister_to_config)logging)apply_forward_hook   )get_activation)AutoencoderKLOutput)
ModelMixin   )DecoderOutputDiagonalGaussianDistributionc                        e Zd Z	 	 	 	 	 	 	 ddededeeeedf   f   deeeedf   f   deeeedf   f   deeeedf   f   ded	ed
ef fdZd Z	de
j                  de
j                  f fdZ xZS )EasyAnimateCausalConv3din_channelsout_channelskernel_size.stridepaddingdilationgroupsbiaspadding_modec
                    t        |t              r|n|fdz  }t        |      dk(  sJ d| d       t        |t              r|n|fdz  }t        |      dk(  sJ d| d       t        |t              r|n|fdz  }t        |      dk(  sJ d| d       |\  }
}}|\  | _        }}|\  }}}|
dz
  |z  }|It	        j
                  |dz
  |z  d|z
  z   dz        }t	        j
                  |dz
  |z  d|z
  z   dz        }nt        |t              r|x}}nt        sJ || _        t	        j
                  |
dz
  |z  d|z
  z   dz        | _	        d | _
        t        | 1  |||||df|||			       y )
Nr   z#Kernel size must be a 3-tuple, got z	 instead.zStride must be a 3-tuple, got z Dilation must be a 3-tuple, got r   r   r   )	r   r   r   r   r   r   r   r   r   )
isinstancetuplelent_stridemathceilintNotImplementedErrortemporal_paddingtemporal_padding_originprev_featuressuper__init__)selfr   r   r   r   r   r   r   r   r   t_ksh_ksw_ksh_stridew_stride
t_dilation
h_dilation
w_dilationt_padh_padw_pad	__class__s                        n/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/diffusers/models/autoencoders/autoencoder_kl_magvit.pyr*   z EasyAnimateCausalConv3d.__init__$   s    &0U%Ck+Z[I[;1$b(KK=Xa&bb$%fe46)a-6{aS#A&!SS)(E:8a8}!Y%EhZy#YY! 'dD,2)x-5*
J
 Z' ?IIqJ6!h,G1LMEIIqJ6!h,G1LME%##EE&&& !&'+yy4!8z2IQQY\2Z^_1_'`$! 	#%#u%% 	 
	
    c                     | ` d | _         y Nr(   r+   s    r8   _clear_conv_cachez)EasyAnimateCausalConv3d._clear_conv_cache^       !r9   hidden_statesreturnc           	      "   |j                   }| j                  t        j                  |dddd| j                  dfd      }|j                  |      }| j                          |d d d d | j                   d f   j                         | _        |j                  d      }g }d}|| j                  z   dz   |k  rat        | )  |d d d d ||| j                  z   dz   f         }|| j                  z  }|j                  |       || j                  z   dz   |k  rat        j                  |d      S | j                  dk(  r>t        j                  | j                  d d d d | j                  dz
   d f   |gd      }n#t        j                  | j                  |gd      }|j                  |      }| j                          |d d d d | j                   d f   j                         | _        |j                  d      }g }d}|| j                  z   dz   |k  rat        | )  |d d d d ||| j                  z   dz   f         }|| j                  z  }|j                  |       || j                  z   dz   |k  rat        j                  |d      S )Nr   	replicate)padmode)dtyper   r   dim)rF   r(   FrD   r&   tor>   clonesizer)   forwardr!   appendtorchconcat)r+   r@   rF   
num_framesoutputsioutr7   s          r8   rM   zEasyAnimateCausalConv3d.forwardb   s   ##%EE1a!6!6: M
 *,,5,9M ""$!.q!d6K6K5K5M/M!N!T!T!VD '++A.JGAd+++a/:=gomAq!a$BWBW>WZ[>[:[4[&\]T]]"s# d+++a/:= <<++ }}! %''10E0E0I.J.L(LM}]cd! !&d.@.@--PVW X),,5,9M ""$!.q!d6K6K5K5M/M!N!T!T!VD '++A.JGAd+++a/:=gomAq!a$BWBW>WZ[>[:[4[&\]T]]"s# d+++a/:= <<++r9   )r   r   r   r   r   Tzeros)__name__
__module____qualname__r$   r   r   boolstrr*   r>   rO   TensorrM   __classcell__r7   s   @r8   r   r   #   s    
 45.//001#8
8
 8
 3c3h/0	8

 c5c?*+8
 sE#s(O+,8
 U38_,-8
 8
 8
 8
t"/,U\\ /,ell /, /,r9   r   c                        e Zd Z	 	 	 	 	 	 ddedededededededef fd	Zd
ej                  dej                  fdZ
 xZS )EasyAnimateResidualBlock3Dr   r   non_linearitynorm_num_groupsnorm_epsspatial_group_normdropoutoutput_scale_factorc	                    t         	|           || _        t        j                  |||d      | _        t        |      | _        t        ||d      | _	        t        j                  |||d      | _
        t        j                  |      | _        t        ||d      | _        ||k7  r%t        j                  ||d      | _        || _        y t        j                          | _        || _        y )NT)
num_groupsnum_channelsepsaffiner   r   r   )r)   r*   re   nn	GroupNormnorm1r   nonlinearityr   conv1norm2Dropoutrd   conv2Conv3dshortcutIdentityrc   )
r+   r   r   r`   ra   rb   rc   rd   re   r7   s
            r8   r*   z#EasyAnimateResidualBlock3D.__init__   s     	#6  \\&$	

 +=9,[,TUV
\\_<]enrs
zz'*,\<UVW
,&IIk<QODM #5 KKMDM"4r9   r@   rA   c                 "   | j                  |      }| j                  ro|j                  d      }|j                  ddddd      j	                  dd      }| j                  |      }|j                  d|df      j                  ddddd      }n| j                  |      }| j                  |      }| j                  |      }| j                  ro|j                  d      }|j                  ddddd      j	                  dd      }| j                  |      }|j                  d|df      j                  ddddd      }n| j                  |      }| j                  |      }| j                  |      }| j                  |      }||z   | j                  z  S Nr   r   r   r      )ru   rc   rL   permuteflattenrn   	unflattenro   rp   rq   rd   rs   re   )r+   r@   ru   
batch_sizes       r8   rM   z"EasyAnimateResidualBlock3D.forward   s   ==/""&++A.J)11!Q1a@HHANM JJ}5M)33A
B7GHPP1aAM !JJ}5M))-8

=1""&++A.J)11!Q1a@HHANM JJ}5M)33A
B7GHPP1aAM !JJ}5M))-8]3

=1(D,D,DDDr9   )silu    ư>T              ?rV   rW   rX   r$   rZ   floatrY   r*   rO   r[   rM   r\   r]   s   @r8   r_   r_      s    
 $!#'%("5"5 "5 	"5
 "5 "5 !"5 "5 #"5HEU\\ Eell Er9   r_   c            	       j     e Zd Zd	dedededef fdZdej                  dej                  fdZ xZ	S )
EasyAnimateDownsampler3Dr   r   r   r   c                 L    t         |           t        ||||d      | _        y )Nr   )r   r   r   r   r   )r)   r*   r   conv)r+   r   r   r   r   r7   s        r8   r*   z!EasyAnimateDownsampler3D.__init__   s'    +#,K`fpq
	r9   r@   rA   c                 T    t        j                  |d      }| j                  |      }|S )N)r   r   r   r   )rI   rD   r   r+   r@   s     r8   rM   z EasyAnimateDownsampler3D.forward   s&    m\:		-0r9   )r   r   r   r   )
rV   rW   rX   r$   r   r*   rO   r[   rM   r\   r]   s   @r8   r   r      s@    
C 
s 
 
Z_ 
U\\ ell r9   r   c                   z     e Zd Z	 	 	 ddededededef
 fdZd Zdej                  d	ej                  fd
Z	 xZ
S )EasyAnimateUpsampler3Dr   r   r   temporal_upsamplerc   c                 ~    t         |           |xs |}|| _        || _        t	        |||      | _        d | _        y )N)r   r   r   )r)   r*   r   rc   r   r   r(   )r+   r   r   r   r   rc   r7   s         r8   r*   zEasyAnimateUpsampler3D.__init__   sG     	#2{!2"4+#,K
	 "r9   c                     | ` d | _         y r;   r<   r=   s    r8   r>   z(EasyAnimateUpsampler3D._clear_conv_cache   r?   r9   r@   rA   c                     t        j                  |dd      }| j                  |      }| j                  r;| j                  	|| _        |S t        j                  |d| j
                  sdnd      }|S )Nr   r   r   nearest)scale_factorrE   )r   r   r   	trilinear)rI   interpolater   r   r(   rc   r   s     r8   rM   zEasyAnimateUpsampler3D.forward   ss    m)R[\		-0!!!!)%2"  !"!!*,0,C,C!
 r9   )r   FT)rV   rW   rX   r$   rY   r*   r>   rO   r[   rM   r\   r]   s   @r8   r   r      sc    
 "'#'"" " 	"
  " !"&"U\\ ell r9   r   c                        e Zd Z	 	 	 	 	 	 	 	 	 ddedededededededed	ed
edef fdZdej                  dej                  fdZ
 xZS )EasyAnimateDownBlock3Dr   r   
num_layersact_fnra   rb   rc   rd   re   add_downsampleadd_temporal_downsamplec                    t         |           t        j                  g       | _        t        |      D ]7  }|dk(  r|n|}| j                  j                  t        ||||||||	             9 |
r%|r#t        ||dd      | _	        d| _
        d| _        y |
r%|s#t        ||dd      | _	        d| _
        d| _        y d | _	        d| _
        d| _        y )	Nr   r   r   r`   ra   rb   rc   rd   re   r   r   )r   r   r   r   r   )r)   r*   rl   
ModuleListconvsrangerN   r_   r   downsamplerspatial_downsample_factortemporal_downsample_factor)r+   r   r   r   r   ra   rb   rc   rd   re   r   r   rS   r7   s                r8   r*   zEasyAnimateDownBlock3D.__init__  s     	]]2&
z" 	A)*a+\KJJ* +!-"($3%'9#(;		 57l`ajstD-.D*./D+$;7l`ajstD-.D*./D+#D-.D*./D+r9   r@   rA   c                 r    | j                   D ]
  } ||      } | j                  | j                  |      }|S r;   )r   r   r+   r@   r   s      r8   rM   zEasyAnimateDownBlock3D.forward>  sA    JJ 	0D /M	0' ,,];Mr9   )	r   r   r   r   Tr   r   TTr   r]   s   @r8   r   r     s    
 !#'%(#(,+0+0 +0 	+0
 +0 +0 +0 !+0 +0 #+0 +0 "&+0ZU\\ ell r9   r   c                        e Zd Z	 	 	 	 	 	 	 	 	 ddedededededededed	ed
edef fdZdej                  dej                  fdZ
 xZS )EasyAnimateUpBlock3dr   r   r   r   ra   rb   rc   rd   re   add_upsampleadd_temporal_upsamplec                    t         |           t        j                  g       | _        t        |      D ]7  }|dk(  r|n|}| j                  j                  t        ||||||||	             9 |
rt        ||||      | _	        y d | _	        y )Nr   r   )r   rc   )
r)   r*   rl   r   r   r   rN   r_   r   	upsampler)r+   r   r   r   r   ra   rb   rc   rd   re   r   r   rS   r7   s                r8   r*   zEasyAnimateUpBlock3d.__init__G  s     	]]2&
z" 	A)*a+\KJJ* +!-"($3%'9#(;		 3"7#5	DN "DNr9   r@   rA   c                 r    | j                   D ]
  } ||      } | j                  | j                  |      }|S r;   )r   r   r   s      r8   rM   zEasyAnimateUpBlock3d.forwardq  s=    JJ 	0D /M	0>>% NN=9Mr9   )	r   r   r   r   Fr   r   TTr   r]   s   @r8   r   r   F  s    
 !#(%(!&*("(" (" 	("
 (" (" (" !(" (" #(" ("  $("TU\\ ell r9   r   c                        e Zd Z	 	 	 	 	 	 	 ddedededededededef fd	Zd
ej                  dej                  fdZ
 xZS )EasyAnimateMidBlock3dr   r   r   ra   rb   rc   rd   re   c	                    t         
|           ||nt        |dz  d      }t        j                  t        ||||||||      g      | _        t        |dz
        D ].  }	| j                  j                  t        ||||||||             0 y )Nry   r   r   r   )	r)   r*   minrl   r   r_   r   r   rN   )r+   r   r   r   ra   rb   rc   rd   re   _r7   s             r8   r*   zEasyAnimateMidBlock3d.__init__z  s     	-<-H/cR]abRbdfNg]]* +!,"($3%'9#(;	

 zA~& 	AJJ* +!,"($3%'9#(;		r9   r@   rA   c                 h     | j                   d   |      }| j                   dd  D ]
  } ||      } |S )Nr   r   )r   )r+   r@   resnets      r8   rM   zEasyAnimateMidBlock3d.forward  s?    %

1m4jjn 	2F"=1M	2r9   )r   r   r   r   Tr   r   r   r]   s   @r8   r   r   y  s     !#'%(** * 	*
 * * !* * #*XU\\ ell r9   r   c                        e Zd ZdZdZdddg dddd	dd
f	dededeedf   deedf   dededededef fdZ	de
j                  de
j                  fdZ xZS )EasyAnimateEncoderzp
    Causal encoder for 3D video-like data used in [EasyAnimate](https://huggingface.co/papers/2405.18991).
    Tr      SpatialDownBlock3DSpatialTemporalDownBlock3Dr   r            r   r   r   r   Fr   r   down_block_types.block_out_channelslayers_per_blockra   r   double_zrc   c
                    t         |           t        ||d   d      | _        t	        j
                  g       | _        |d   }
t        |      D ]|  \  }}|
}||   }
|t        |      dz
  k(  }|dk(  rt        ||
|||d|	| d	      }n)|d	k(  rt        ||
|||d|	| d
	      }nt        d|       | j                  j                  |       ~ t        |d   |||	|ddd      | _        |	| _        t	        j                  |d   |d      | _        t#        |      | _        |rd|z  n|}t        |d   |d      | _        d| _        y )Nr   r   rk   r   r   r   F)	r   r   r   r   ra   rb   rc   r   r   r   TUnknown up block type: rz   )r   r   r   rc   ra   rb   rd   re   rh   rg   ri   r   )r)   r*   r   conv_inrl   r   down_blocks	enumerater    r   
ValueErrorrN   r   	mid_blockrc   rm   conv_norm_outr   conv_actconv_outgradient_checkpointing)r+   r   r   r   r   r   ra   r   r   rc   output_channelsrS   down_block_typeinput_channelsis_final_block
down_blockconv_out_channelsr7   s                    r8   r*   zEasyAnimateEncoder.__init__  s   " 	 /{<Nq<Q_`a ==,,Q/"+,<"= 	0A,N03O#&8"9A"==N"663 .!0/!$3!'9'5#5,1

 !$@@3 .!0/!$3!'9'5#5,0

 !#:?:K!LMM##J/=	0B /*2.'1+ !	
 #5\\+B/&

 'v. 19A,l/0B20FHYghi&+#r9   r@   rA   c                 >   | j                  |      }| j                  D ]=  }t        j                         r| j                  r| j                  ||      }6 ||      }? | j                  |      }| j                  ro|j                  d      }|j                  ddddd      j                  dd      }| j                  |      }|j                  d|df      j                  ddddd      }n| j                  |      }| j                  |      }| j                  |      }|S rx   )r   r   rO   is_grad_enabledr   _gradient_checkpointing_funcr   rc   rL   r{   r|   r   r}   r   r   )r+   r@   r   r~   s       r8   rM   zEasyAnimateEncoder.forward  s   ]3** 	:J$$&4+F+F $ A A*m \ *= 9		: }5""&++A.J)11!Q1a@HHANM ..}=M)33A
B7GHPPQRTUWXZ[]^_M ..}=Mm4m4r9   rV   rW   rX   __doc__ _supports_gradient_checkpointingr$   r   rZ   rY   r*   rO   r[   rM   r\   r]   s   @r8   r   r     s     (,$ -
 /C !!#(R,R, R,  S/	R, "#s(OR, R, R, R, R, !R,hU\\ ell r9   r   c                        e Zd ZdZdZdddg dddd	d
fdededeedf   deedf   dedededef fdZ	de
j                  de
j                  fdZ xZS )EasyAnimateDecoderzp
    Causal decoder for 3D video-like data used in [EasyAnimate](https://huggingface.co/papers/2405.18991).
    Tr   r   SpatialUpBlock3DSpatialTemporalUpBlock3Dr   r   r   r   r   r   Fr   r   up_block_types.r   r   ra   r   rc   c	                    t         |           t        ||d   d      | _        t	        |d   |||ddd      | _        t        j                  g       | _        t        t        |            }	|	d   }
t        |      D ]  \  }}|
}|	|   }
|t        |      dz
  k(  }|dk(  rt        ||
|dz   ||d|| d	
	      }n,|dk(  rt        ||
|dz   ||d|| d
	      }nt        d|       | j                  j                  |        || _        t        j"                  |d   |d      | _        t'        |      | _        t        |d   |d      | _        d	| _        y )Nrz   r   rk   r   r   r   )r   r   r   ra   rb   rd   re   r   F)	r   r   r   r   ra   rb   rc   r   r   r   Tr   r   )r)   r*   r   r   r   r   rl   r   	up_blockslistreversedr   r    r   r   rN   rc   rm   r   r   r   r   r   )r+   r   r   r   r   r   ra   r   rc   reversed_block_out_channelsr   rS   up_block_typer   r   up_blockr7   s                   r8   r*   zEasyAnimateDecoder.__init__(  s     	 /{<Nr<R`ab /*2.'+ !
 r*&*84F+G&H#5a8 ). 9  	,A},N9!<O#&8"9A"==N  22/ .!0/!3!$3!'9%3!3*/
 "<</ .!0/!3!$3!'9%3!3*.
 !#:=/!JKKNN!!(+A 	,F #5\\+A.&

 'v. 00B10E|abc&+#r9   r@   rA   c                    | j                  |      }t        j                         r)| j                  r| j	                  | j
                  |      }n| j                  |      }| j                  D ]=  }t        j                         r| j                  r| j	                  ||      }6 ||      }? | j                  ro|j                  d      }|j                  ddddd      j                  dd      }| j                  |      }|j                  d|df      j                  ddddd      }n| j                  |      }| j                  |      }| j                  |      }|S rx   )r   rO   r   r   r   r   r   rc   rL   r{   r|   r   r}   r   r   )r+   r@   r   r~   s       r8   rM   zEasyAnimateDecoder.forward|  s<   ]3  "t'B'B ==dnnm\M NN=9M 	8H$$&4+F+F $ A A(M Z ( 7		8 ""&++A.J)11!Q1a@HHANM ..}=M)33A
B7GHPP1aAM !..}=Mm4m4r9   r   r]   s   @r8   r   r   !  s     (,$ +
 /C !!#(R,R, R, c3h	R, "#s(OR, R, R, R, !R,hU\\ ell r9   r   c                       e Zd ZdZdZedddg dg dg ddd	d
ddfdedededeedf   deedf   deedf   dededede	de
f fd       Zd Z	 	 	 	 	 	 d8dee   dee   dee   dee	   dee	   d ee	   d!dfd"Zd9d#Zd9d$Zd9d%Ze	 d:d&ej(                  d'e
d!eeee   f   fd(       Ze	 d:d&ej(                  d'e
d!eeee   f   fd)       Zd:d*ej(                  d'e
d!eeej(                  f   fd+Zed:d*ej(                  d'e
d!eeej(                  f   fd,       Zd-ej(                  d.ej(                  d/ed!ej(                  fd0Zd-ej(                  d.ej(                  d/ed!ej(                  fd1Zd:d&ej(                  d'e
d!efd2Zd:d*ej(                  d'e
d!eeej(                  f   fd3Z 	 	 	 d;d4ej(                  d5e
d'e
d6eejB                     d!eeej(                  f   f
d7Z" xZ#S )<AutoencoderKLMagvitaq  
    A VAE model with KL loss for encoding images into latents and decoding latent representations into images. This
    model is used in [EasyAnimate](https://huggingface.co/papers/2405.18991).

    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
    for all models (such as downloading or saving).
    Tr      r   r   r   r   r   r   g?r   latent_channelsr   r   .r   r   r   r   ra   scaling_factorrc   c                    t         |           t        ||||||	|d|	      | _        t	        ||||||	||      | _        t        j                  d|z  d|z  d      | _        t        j                  ||d      | _	        dt        |      dz
  z  | _        dt        |      dz
  z  | _        d| _        d| _        d| _        d| _        d| _        d| _        d	| _        d	| _        d| _        d
| _        d
| _        d| _        y )NT)	r   r   r   r   r   ra   r   r   rc   )r   r   r   r   r   ra   r   rc   r   r   rk   Fry   r   i  r   )r)   r*   r   encoderr   decoderrl   rt   
quant_convpost_quant_convr    spatial_compression_ratiotemporal_compression_ratiouse_slicing
use_tilinguse_framewise_encodinguse_framewise_decodingnum_sample_frames_batch_sizenum_latent_frames_batch_sizetile_sample_min_heighttile_sample_min_widthtile_sample_min_num_framestile_sample_stride_heighttile_sample_stride_widthtile_sample_stride_num_frames)r+   r   r   r   r   r   r   r   r   ra   r   rc   r7   s               r8   r*   zAutoencoderKLMagvit.__init__  s1   2 	 *#(-1-+1

 *'%)1-+1	
 ))A$7_9LZ[\!yy/WXY)*s3E/F/J)K&*+4F0G!0K*L' !
   ',#&+# -.),-) '*#%("*+' *-&(+%-.*r9   c                     | j                         D ]F  \  }}t        |t              r|j                          t        |t              s7|j                          H y r;   )named_modulesr   r   r>   r   )r+   namemodules      r8   r>   z%AutoencoderKLMagvit._clear_conv_cache  sL     ..0 	+LD&&"9:((*&"89((*		+r9   Nr   r   r   r   r   r  rA   c                 *   d| _         d| _        d| _        |xs | j                  | _        |xs | j                  | _        |xs | j
                  | _        |xs | j                  | _        |xs | j                  | _        |xs | j                  | _        y)aX  
        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger images.

        Args:
            tile_sample_min_height (`int`, *optional*):
                The minimum height required for a sample to be separated into tiles across the height dimension.
            tile_sample_min_width (`int`, *optional*):
                The minimum width required for a sample to be separated into tiles across the width dimension.
            tile_sample_stride_height (`int`, *optional*):
                The minimum amount of overlap between two consecutive vertical tiles. This is to ensure that there are
                no tiling artifacts produced across the height dimension.
            tile_sample_stride_width (`int`, *optional*):
                The stride between two consecutive horizontal tiles. This is to ensure that there are no tiling
                artifacts produced across the width dimension.
        TN)	r   r   r   r   r   r   r   r   r  )r+   r   r   r   r   r   r  s          r8   enable_tilingz!AutoencoderKLMagvit.enable_tiling  s    4 &*#&*#&<&[@[@[#%:%Xd>X>X"*D*gHgHg')B)ddFdFd&(@(aDDaDa%-J-pdNpNp*r9   c                     d| _         y)z
        Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
        decoding in one step.
        FN)r   r=   s    r8   disable_tilingz"AutoencoderKLMagvit.disable_tiling(  s    
  r9   c                     d| _         y)z
        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
        TNr   r=   s    r8   enable_slicingz"AutoencoderKLMagvit.enable_slicing/  s    
  r9   c                     d| _         y)z
        Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
        decoding in one step.
        FNr  r=   s    r8   disable_slicingz#AutoencoderKLMagvit.disable_slicing6  s    
 !r9   xreturn_dictc           
      D   | j                   rK|j                  d   | j                  kD  s|j                  d   | j                  kD  r| j	                  ||      S | j                  |ddddddddddf         }|g}t        d|j                  d   | j                        D ]C  }| j                  |dddd||| j                  z   ddddf         }|j                  |       E t        j                  |d      }| j                  |      }| j                          |S )a  
        Encode a batch of images into latents.

        Args:
            x (`torch.Tensor`): Input batch of images.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.

        Returns:
                The latent representations of the encoded images. If `return_dict` is True, a
                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
        rz   r  Nr   r   rG   )r   shaper   r   tiled_encoder   r   r   rN   rO   catr   r>   )r+   r  r  first_frameshrS   next_framesmomentss           r8   _encodezAutoencoderKLMagvit._encode=  s     ??d.I.I IQWWUW[[_[u[uMu$$QK$@@||AaBQB1n$56Nq!''!*d&G&GH 	"A,,qAq1t7X7X3X/XZ[]^)^'_`KHH[!	" IIaQ//!$ r9   c                 (   | j                   rU|j                  d   dkD  rC|j                  d      D cg c]  }| j                  |       }}t	        j
                  |      }n| j                  |      }t        |      }|s|fS t        |      S c c}w )a  
        Encode a batch of images into latents.

        Args:
            x (`torch.Tensor`): Input batch of images.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.

        Returns:
                The latent representations of the encoded videos. If `return_dict` is True, a
                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
        r   r   )latent_dist)r   r  splitr  rO   r  r   r   )r+   r  r  x_sliceencoded_slicesr  	posteriors          r8   encodezAutoencoderKLMagvit.encode[  s      
QCD771:Ndll73NNN		.)AQA03	<"y99 Os   Bzc           
         |j                   \  }}}}}| j                  | j                  z  }| j                  | j                  z  }	| j                  r7|j                   d   |kD  s|j                   d   |	kD  r| j                  ||      S | j                  |      }| j                  |d d d d d dd d d d f         }
|
g}t        d|j                   d   | j                        D ]C  }| j                  |d d d d ||| j                  z   d d d d f         }|j                  |       E t        j                  |d      }|s|fS t        |      S )Nrz   r  r  r   r   rG   sample)r  r   r   r   r   tiled_decoder   r   r   r   rN   rO   r  r   )r+   r#  r  r~   rh   rQ   heightwidthtile_latent_min_heighttile_latent_min_widthr  decrS   r  s                 r8   _decodezAutoencoderKLMagvit._decodew  sE   >?gg;
L*fe!%!<!<@^@^!^ $ : :d>\>\ \??.D DPRVkHk$$QK$@@  # ||AaBQB1n$56nq!''!*d&G&GH 	$A,,qAq1t7X7X3X/XZ[]^)^'_`KJJ{#	$ ii#6MC((r9   c                 Z   | j                   r_|j                  d   dkD  rM|j                  d      D cg c]  }| j                  |      j                   }}t        j                  |      }n| j                  |      j                  }| j                          |s|fS t        |      S c c}w )a  
        Decode a batch of images.

        Args:
            z (`torch.Tensor`): Input batch of latent vectors.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.

        Returns:
            [`~models.vae.DecoderOutput`] or `tuple`:
                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
                returned.
        r   r   r%  )	r   r  r  r-  r&  rO   r  r>   r   )r+   r#  r  z_slicedecoded_slicesdecodeds         r8   decodezAutoencoderKLMagvit.decode  s     
QJK''RS*Uwdll73::UNUii/Gll1o,,G :G,, Vs   "B(abblend_extentc           	         t        |j                  d   |j                  d   |      }t        |      D ]J  }|d d d d d d | |z   d d f   d||z  z
  z  |d d d d d d |d d f   ||z  z  z   |d d d d d d |d d f<   L |S )Nr   r   r   r  r   )r+   r3  r4  r5  ys        r8   blend_vzAutoencoderKLMagvit.blend_v  s    1771:qwwqz<@|$ 	A Aq<-!*;Q!>?1q<GWCWX[\]^`acdfgij]j[kL \  AaAq!m	 r9   c                    t        |j                  d   |j                  d   |      }t        |      D ]J  }|d d d d d d d d | |z   f   d||z  z
  z  |d d d d d d d d |f   ||z  z  z   |d d d d d d d d |f<   L |S )Nry   r   r7  )r+   r3  r4  r5  r  s        r8   blend_hzAutoencoderKLMagvit.blend_h  s    1771:qwwqz<@|$ 	A Aq!l]Q->!>?1q<GWCWX[\]^`acdfgij]j[kL \  AaAq!m	 r9   c                    |j                   \  }}}}}|| j                  z  }|| j                  z  }	| j                  | j                  z  }
| j                  | j                  z  }| j                  | j                  z  }| j
                  | j                  z  }|
|z
  }||z
  }g }t        d|| j                        D ],  }g }t        d|| j
                        D ]  }|d d d d d d ||| j                  z   ||| j                  z   f   }| j                  |d d d d ddd d d d f         }|g}t        d|| j                        D ]C  }| j                  |d d d d ||| j                  z   d d d d f         }|j                  |       E t        j                  |d      }| j                  |      }| j                          |j                  |        |j                  |       / g }t        |      D ]  \  }}g }t        |      D ]g  \  }}|dkD  r| j                  ||dz
     |   ||      }|dkD  r| j!                  ||dz
     ||      }|j                  |d d d d d d d |d |	f          i |j                  t        j                  |d              t        j                  |d      d d d d d d d |d |	f   }|S )Nr   r   r   rG   ry   r   )r  r   r   r   r   r   r   r   r   rN   rO   r  r   r>   r   r9  r;  )r+   r  r  r~   rh   rQ   r(  r)  latent_heightlatent_widthr*  r+  tile_latent_stride_heighttile_latent_stride_widthblend_heightblend_widthrowsrS   rowjtiler  tile_hkr  result_rows
result_rowr  s                               r8   r  z AutoencoderKLMagvit.tiled_encode  s   >?gg;
L*fe$"@"@@ > >>!%!<!<@^@^!^ $ : :d>\>\ \$($B$BdFdFd$d!#'#@#@DDbDb#b -0II+.FF q&$"@"@A 	AC1eT%B%BC !D7777D6666	8  $||DAqsAq,AB&q*d.O.OP /A"&,,tAq!a$BcBc>c:cefhi4i/j"kKMM+./ yyQ/t,&&(

4 #!$ KK)	* o 
	=FAsJ$S> P4 q5<<QUAlKDq5<<AE
D+FD!!$q!Q'M"NOP uyy;<
	= ))KQ/1a-,0VWr9   c                    |j                   \  }}}}}|| j                  z  }|| j                  z  }	| j                  | j                  z  }
| j                  | j                  z  }| j                  | j                  z  }| j
                  | j                  z  }| j                  | j                  z
  }| j                  | j
                  z
  }g }t        d||      D ]  }g }t        d||      D ]  }|d d d d d d |||
z   |||z   f   }| j                  |      }| j                  |d d d d d dd d d d f         }|g}t        d|| j                        D ]C  }| j                  |d d d d ||| j                  z   d d d d f         }|j                  |       E t        j                  |d      }| j                          |j                  |        |j                  |        g }t        |      D ]  \  }}g }t        |      D ]{  \  }}|dkD  r| j                  ||dz
     |   ||      }|dkD  r| j!                  ||dz
     ||      }|j                  |d d d d d d d | j                  d | j
                  f          } |j                  t        j                  |d              t        j                  |d      d d d d d d d |d |	f   }|s|fS t#        |      S )Nr   r   r   rG   ry   r   r%  )r  r   r   r   r   r   r   r   r   r   rN   rO   r  r>   r   r9  r;  r   )r+   r#  r  r~   rh   rQ   r(  r)  sample_heightsample_widthr*  r+  r?  r@  rA  rB  rC  rS   rD  rE  rF  r  tile_decrH  r  r1  rI  rJ  r,  s                                r8   r'  z AutoencoderKLMagvit.tiled_decode  s   >?gg;
L*fe!?!??t===!%!<!<@^@^!^ $ : :d>\>\ \$($B$BdFdFd$d!#'#@#@DDbDb#b 22T5S5SS0043P3PP q&";< 	AC1e%=> $222111	3 ++D1  $||DArr1a,@A(>q*d.O.OP 1A"&,,tAq!a$BcBc>c:cefhi4i/j"kKOOK01  ))H!4&&(

7#+$, KK1	2 o 
	=FAsJ$S> t4 q5<<QUAlKDq5<<AE
D+FD!!$q!Q0P$2P2P0PRqTXTqTqRq'q"rst uyy;<
	= ii+Aq!^m^]l],RS6MC((r9   r&  sample_posterior	generatorc                     |}| j                  |      j                  }|r|j                  |      }n|j                         }| j	                  |      j                  }|s|fS t        |      S )aa  
        Args:
            sample (`torch.Tensor`): Input sample.
            sample_posterior (`bool`, *optional*, defaults to `False`):
                Whether to sample from the posterior.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
        )rP  r%  )r"  r  r&  rE   r2  r   )	r+   r&  rO  r  rP  r  r!  r#  r,  s	            r8   rM   zAutoencoderKLMagvit.forward,  sf     KKN..	  9 5A Akk!n##6MC((r9   )NNNNNN)rA   N)T)FTN)$rV   rW   rX   r   r   r   r$   r   rZ   r   rY   r*   r>   r   r  r	  r  r  r
   rO   r[   r   r   r   r  r"  r   r-  r2  r9  r;  r  r'  	GeneratorrM   r\   r]   s   @r8   r   r     si    (,$ !.B-
+
 !"! &#'-T/T/ T/ 	T/
 "#s(OT/  S/T/ c3hT/$ %T/& 'T/( )T/* +T/, !-T/ T/l+ 15/34859489="q ("q  (}"q %-SM	"q
 $,E?"q #+5/"q (0"q 
"qH  ! 37,0	"E*F$GG	H : 37::,0:	"E*F$GG	H: :6) )D )E-Y^YeYeJeDf )4 - -4 -5X]XdXdIdCe - -2 %,, c ell  %,, c ell 2ell 2 2I\ 2h;)ell ;) ;)}^c^j^jOjIk ;)@ "' /3)) ) 	)
 EOO,) 
}ell*	+)r9   r   )*r"   typingr   r   r   rO   torch.nnrl   torch.nn.functional
functionalrI   configuration_utilsr   r   utilsr	   utils.accelerate_utilsr
   activationsr   modeling_outputsr   modeling_utilsr   vaer   r   
get_loggerrV   loggerrt   r   Moduler_   r   r   r   r   r   r   r   r    r9   r8   <module>rb     s      ) )     B  8 ( 2 ' < 
		H	%n,bii n,bCE CELryy %RYY %P3RYY 3l0299 0f1BII 1hq qhv vrl)*k l)r9   