
    bi`                        d dl Z d dlmZmZmZ d dlZd dlmZ ddlm	Z	m
Z
 ddlmZ ddlmZmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ  G d dej:                        Z G d dej:                        Z G d dej:                        Z  G d dej:                        Z! G d dej:                        Z" G d dej:                        Z# G d dee	      Z$d Z%y)    N)OptionalTupleUnion   )ConfigMixinregister_to_config)apply_forward_hook   )	AttentionSpatialNorm)DecoderOutputDiagonalGaussianDistribution)Downsample2D)AutoencoderKLOutput)
ModelMixin)ResnetBlock2D)
Upsample2Dc                        e Zd ZdZ	 	 	 	 	 	 ddedee   dedededed	ed
df fdZe	de
j                  d
e
j                  fd       Zde
j                  ded
e
j                  fdZ xZS )AllegroTemporalConvLayera
  
    Temporal convolutional layer that can be used for video (sequence of images) input. Code adapted from:
    https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/models/multi_modal/video_synthesis/unet_sd.py#L1016
    Nin_dimout_dimdropoutnorm_num_groups	up_sampledown_samplestridereturnc                 L   t         |           |xs |}t        |dz
  dz        x}}	d}
|| _        || _        |rat        j                  t        j                  ||      t        j                         t        j                  ||d||fdd||	f            | _
        n|rct        j                  t        j                  ||      t        j                         t        j                  ||dz  d||fd||	f            | _
        n_t        j                  t        j                  ||      t        j                         t        j                  ||d||f|
||	f            | _
        t        j                  t        j                  ||      t        j                         t        j                  |      t        j                  ||d||f|
||	f            | _        t        j                  t        j                  ||      t        j                         t        j                  |      t        j                  ||d||f|
||f            | _        t        j                  t        j                  ||      t        j                         t        j                  ||d||f|
||f            | _        y )	N   g      ?r   r
   )r
   r   r   )r   paddingr    r   )super__init__intr   r   nn
Sequential	GroupNormSiLUConv3dconv1Dropoutconv2conv3conv4)selfr   r   r   r   r   r   r   pad_hpad_wpad_t	__class__s              o/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/diffusers/models/autoencoders/autoencoder_kl_allegro.pyr#   z!AllegroTemporalConvLayer.__init__'   s(    	#VVaZ3.//&"_f5			&'Avv+>y[\^cejZklDJ
 _f5			&'A+66/BQPUW\L]^DJ _f5			&'Avv+>PUW\H]^DJ
 ]]LL'2GGIJJwIIgv66':UESXDYZ	

 ]]LL'2GGIJJwIIgv66':UESXDYZ	

 ]]LL'2GGIIIgv66':UESXDYZ

    hidden_statesc                     t        j                  | d d d d ddf   | fd      } t        j                  | | d d d d dd f   fd      } | S )Nr   r   r
   )dim)torchcat)r6   s    r4   _pad_temporal_dimz*AllegroTemporalConvLayer._pad_temporal_dim^   sP    		=Aqs#;]"KQRS		=-1bc	2J"KQRSr5   
batch_sizec                 .   |j                  d|df      j                  ddddd      }| j                  r|d d d d d d df   }n3| j                  r%|j	                  dd|j
                  d   dz        }n|}| j                  s| j                  r| j                  |      }n"| j                  |      }| j                  |      }| j                  r6|j                  dd      j                  dddddd	      j                  dd      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }||z   }|j                  ddddd      j                  dd      }|S )
Nr   r9   r
   r   r      )r8   output_size)r
   r9      )	unflattenpermuter   r   repeat_interleaveshaper*   r<   flattenr,   r-   r.   )r/   r6   r=   identitys       r4   forwardz AllegroTemporalConvLayer.forwardd   s   %//J3CDLLQPQSTVWYZ[$Q3Q3Y/H^^$66qa]M`M`abMcfgMg6hH$Ht~~ JJ}5M 22=AM JJ}5M>>)33Aw?GG1aQRTUWXYaabcefgM..}=

=1..}=

=1..}=

=1 =0%--aAq!<DDQJr5   )N            FFr   )__name__
__module____qualname____doc__r$   r   floatboolr#   staticmethodr:   Tensorr<   rH   __classcell__r3   s   @r4   r   r   !   s     "&!!5
5
 #5
 	5

 5
 5
 5
 5
 
5
n  %,,  
U\\ s u|| r5   r   c                        e Zd Z	 	 	 	 	 	 	 	 	 	 	 ddedededededededed	ed
edededef fdZdej                  dej                  fdZ
 xZS )AllegroDownBlock3Din_channelsout_channelsr   
num_layers
resnet_epsresnet_time_scale_shiftresnet_act_fnresnet_groupsresnet_pre_normoutput_scale_factorspatial_downsampletemporal_downsampledownsample_paddingc                    t         |           g }g }t        |      D ]M  }|dk(  r|n|}|j                  t	        ||d ||||||
|	
             |j                  t        ||d|             O t        j                  |      | _        t        j                  |      | _	        |rt        ||d|dd      | _
        || _        |r*t        j                  t        |d||d	      g      | _        y d | _        y )
Nr   
rW   rX   temb_channelsepsgroupsr   time_embedding_normnon_linearityr_   pre_norm皙?r   r   Tr   )r   r   r   r   op)use_convrX   r    name)r"   r#   rangeappendr   r   r%   
ModuleListresnets
temp_convstemp_convs_downadd_temp_downsampler   downsamplers)r/   rW   rX   r   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rs   rt   ir3   s                    r4   r#   zAllegroDownBlock3D.__init__   s     	
z" 	A)*a+\KNN +!-"&"(#(?"/(;, (  $1	!	2 }}W---
3#;lCdhqr$D  $7  " $t,Xjqu!D !%Dr5   r6   r   c                    |j                   d   }|j                  ddddd      j                  dd      }t        | j                  | j
                        D ]  \  }} ||d       } |||      } | j                  r| j                  ||      }| j                  | j                  D ]
  } ||      } |j                  d|df      j                  ddddd      }|S 	Nr   r
   r   r   r?   )temb)r=   r9   )
rE   rC   rF   ziprs   rt   rv   ru   rw   rB   )r/   r6   r=   resnet	temp_convdownsamplers         r4   rH   zAllegroDownBlock3D.forward   s    "((+
%--aAq!<DDQJ!$T\\4??!C 	LFI"=t<M%m
KM	L ## 00:0VM(#00 ; +M :; &//J3CDLLQPQSTVWYZ[r5   )rI   r   ư>defaultswishrJ   T      ?TFr   rK   rL   rM   r$   rO   strrP   r#   r:   rR   rH   rS   rT   s   @r4   rV   rV      s    
  '0$ $%(#'$)"#@%@% @% 	@%
 @% @% "%@% @% @% @% #@% !@% "@%  @%DU\\ ell r5   rV   c                        e Zd Z	 	 	 	 	 	 	 	 	 	 	 ddedededededededed	ed
edededee   f fdZde	j                  de	j                  fdZ xZS )AllegroUpBlock3DrW   rX   r   rY   rZ   r[   r\   r]   r^   r_   spatial_upsampletemporal_upsamplere   c                    t         |           g }g }t        |      D ]M  }|dk(  r|n|}|j                  t	        |||||||||
|	
             |j                  t        ||d|             O t        j                  |      | _        t        j                  |      | _	        || _
        |rt        ||d|dd      | _        |r(t        j                  t        |d|      g      | _        y d | _        y )	Nr   rd   rk   rl   Tr   )r   r   r   r   )rn   rX   )r"   r#   rp   rq   r   r   r%   rr   rs   rt   add_temp_upsampletemp_conv_upr   
upsamplers)r/   rW   rX   r   rY   rZ   r[   r\   r]   r^   r_   r   r   re   rs   rt   rx   input_channelsr3   s                     r4   r#   zAllegroUpBlock3D.__init__   s     	
z" 	A,-F[NNN .!-"/"(#(?"/(;, (  $1	#	4 }}W---
3!2 8lCbfop!D  mmZtbn-o,pqDO"DOr5   r6   r   c                    |j                   d   }|j                  ddddd      j                  dd      }t        | j                  | j
                        D ]  \  }} ||d       } |||      } | j                  r| j                  ||      }| j                  | j                  D ]
  } ||      } |j                  d|df      j                  ddddd      }|S rz   )
rE   rC   rF   r|   rs   rt   r   r   r   rB   )r/   r6   r=   r}   r~   	upsamplers         r4   rH   zAllegroUpBlock3D.forward  s    "((+
%--aAq!<DDQJ!$T\\4??!C 	LFI"=t<M%m
KM	L !! --m
-SM??&!__ 9	 )- 89 &//J3CDLLQPQSTVWYZ[r5   )rI   r   r   r   r   rJ   Tr   TFN)rK   rL   rM   r$   rO   r   rP   r   r#   r:   rR   rH   rS   rT   s   @r4   r   r      s    
  '0$ $%(!%"''+;#;# ;# 	;#
 ;# ;# "%;# ;# ;# ;# #;# ;#  ;#  };#zU\\ ell r5   r   c                        e Zd Z	 	 	 	 	 	 	 	 	 	 ddedededededededed	ed
ededef fdZdej                  dej                  fdZ
 xZS )AllegroMidBlock3DConvrW   re   r   rY   rZ   r[   r\   r]   r^   add_attentionattention_head_dimr_   c                 N   t         |           t        ||||||||||	
      g}t        ||d|      g}g }||}t	        |      D ]  }|
r7|j                  t        |||z  ||||dk(  r|nd |dk(  r|nd dddd             n|j                  d        |j                  t        ||||||||||	
             |j                  t        ||d|              t        j                  |      | _	        t        j                  |      | _
        t        j                  |      | _        y )Nrd   rk   rl   r   spatialT)
headsdim_headrescale_output_factorrf   r   spatial_norm_dimresidual_connectionbiasupcast_softmax_from_deprecated_attn_block)r"   r#   r   r   rp   rq   r   r%   rr   rs   rt   
attentions)r/   rW   re   r   rY   rZ   r[   r\   r]   r^   r   r   r_   rs   rt   r   _r3   s                    r4   r#   zAllegroMidBlock3DConv.__init__0  ss    	 '(+$$;+$7(
 % -	

 
%!,z" *	A!!#)-??!3.A&9PT]9]cg:QU^:^dh,0!'+48  !!$'NN +!,"/"(#(?"/(;, ($1	G*	X }}W---
3--
3r5   r6   r   c                    |j                   d   }|j                  ddddd      j                  dd      } | j                  d   |d       } | j                  d   ||      }t        | j                  | j                  dd  | j                  dd        D ]"  \  }}} ||      } ||d       } |||      }$ |j                  d|df      j                  ddddd      }|S rz   )rE   rC   rF   rs   rt   r|   r   rB   )r/   r6   r=   attnr}   r~   s         r4   rH   zAllegroMidBlock3DConv.forward  s    "((+
%--aAq!<DDQJ'QDA**=ZP'*4??DLL<Ldoo^_^`Na'b 	L#D&) /M"=t<M%m
KM	L
 &//J3CDLLQPQSTVWYZ[r5   )
rI   r   r   r   r   rJ   TTr   r   r   rT   s   @r4   r   r   /  s    
  '0$ $""#%([4[4 [4 	[4
 [4 [4 "%[4 [4 [4 [4 [4  [4 #[4zU\\ ell r5   r   c                        e Zd Zddddg dddddf	d	ed
edeedf   deedf   deedf   dedededef fdZdej                  dej                  fdZ
 xZS )AllegroEncoder3Dr   rV   rV   rV   rV            r   TTFFr
   rJ   siluTrW   rX   down_block_types.block_out_channelstemporal_downsample_blockslayers_per_blockr   act_fndouble_zc
                 "   t         |           t        j                  ||d   ddd      | _        t        j
                  |d   |d   dd      | _        t        j                  g       | _        |d   }
t        |      D ]a  \  }}|
}||   }
|t        |      dz
  k(  }|dk(  rt        |||
| ||   d	d||
	      }nt        d      | j                  j                  |       c t        |d   d	|dd|d   |d       | _        t        j                   |d   |d	      | _        t        j$                         | _        |	rd|z  n|}t        j
                  |d   |d   dd      | _        t        j                  |d   |dd      | _        d| _        y )Nr   r   r   kernel_sizer   r    r   r   r   r   r   r   )rW   rX   r   r    rV   r   )	rY   rW   rX   r`   ra   rZ   rb   r\   r]   zCInvalid `down_block_type` encountered. Must be `AllegroDownBlock3D`r9   r   rW   rZ   r\   r_   r[   r   r]   re   num_channels
num_groupsrf   r
   r!   F)r"   r#   r%   Conv2dconv_inr)   temp_conv_inrr   down_blocks	enumeratelenrV   
ValueErrorrq   r   	mid_blockr'   conv_norm_outr(   conv_acttemp_conv_outconv_outgradient_checkpointing)r/   rW   rX   r   r   r   r   r   r   r   output_channelrx   down_block_typeinput_channelis_final_block
down_blockconv_out_channelsr3   s                    r4   r#   zAllegroEncoder3D.__init__  s   " 	yyq!
 II*1-+A.!	
 ==, ,A."+,<"= 	0A*M/2N#&8"9A"==N"66// -!/+9'9(B1(E#'("("1

 !!fgg##J/)	0. /*2.  !$-1"5)	
  \\7I"7MZiost	08A,lYY'9"'=?QRT?UW`jst		"4R"8:KQXYZ&+#r5   sampler   c                    |j                   d   }|j                  ddddd      j                  dd      }| j                  |      }|j	                  d|df      j                  ddddd      }|}| j                  |      }||z   }t        j                         rL| j                  r@| j                  D ]  }| j                  ||      } | j                  | j                  |      }n*| j                  D ]
  } ||      } | j                  |      }|j                  ddddd      j                  dd      }| j                  |      }| j                  |      }|j	                  d|df      j                  ddddd      }|}| j                  |      }||z   }|j                  ddddd      j                  dd      }| j                  |      }|j	                  d|df      j                  ddddd      }|S Nr   r
   r   r   r?   r9   )rE   rC   rF   r   rB   r   r:   is_grad_enabledr   r   _gradient_checkpointing_funcr   r   r   r   r   )r/   r   r=   residualr   s        r4   rH   zAllegroEncoder3D.forward  s   \\!_
1aA.66q!<f%!!!j"%56>>q!Q1M""6*("  "t'B'B".. O
:::vNO 66t~~vNF #.. ,
#F+, ^^F+F 1aA.66q!<##F+v&!!!j"%56>>q!Q1M##F+("1aA.66q!<v&!!!j"%56>>q!Q1Mr5   rK   rL   rM   r$   r   r   rP   r#   r:   rR   rH   rS   rT   s   @r4   r   r     s     -
 /C7Q !!Q,Q, Q,  S/	Q, "#s(OQ, %*$)$4Q, Q, Q, Q, Q,f(ell (u|| (r5   r   c                        e Zd Zdddg dddddd	f	d
ededeedf   deedf   deedf   dedededef fdZdej                  dej                  fdZ
 xZS )AllegroDecoder3Dr?   r   r   r   r   r   FTTFr   r
   rJ   r   grouprW   rX   up_block_types.temporal_upsample_blocksr   r   r   r   	norm_typec
                    t         |           t        j                  ||d   ddd      | _        t        j
                  |d   |d   dd      | _        d | _        t        j                  g       | _	        |	dk(  r|nd }
t        |d   d	|d|	d
k(  rdn|	|d   ||
      | _        t        t        |            }|d   }t        |      D ]g  \  }}|}||   }|t        |      dz
  k(  }|dk(  rt        |dz   ||| ||   d	|||
|	
      }nt!        d      | j                  j#                  |       |}i |	dk(  rt%        |d   |
      | _        n t        j(                  |d   |d	      | _        t        j*                         | _        t        j
                  |d   |d   dd      | _        t        j                  |d   |dd      | _        d| _        y )Nr9   r   r   r   r   r   r!   r   r   r   r   r   r   r   )
rY   rW   rX   r   r   rZ   r\   r]   re   r[   z?Invalid `UP_block_type` encountered. Must be `AllegroUpBlock3D`r   F)r"   r#   r%   r   r   r)   r   r   rr   	up_blocksr   listreversedr   r   r   r   rq   r   r   r'   r(   r   r   r   r   )r/   rW   rX   r   r   r   r   r   r   r   re   reversed_block_out_channelsr   rx   up_block_typeprev_output_channelr   up_blockr3   s                     r4   r#   zAllegroDecoder3D.__init__  s   " 	yyr"
 II&8&<>PQS>TV_irsr*'0I'=4 /*2.  !1:g1EI91"5)'	
 '+84F+G&H#4Q7 ). 9 	1A}"08;N#&8"9A"==N 22+/!3 3!/)7%7&>q&A#"("1"/,5 !!bccNN!!(+"0/	14 	!!,-?-BM!RD!#;Ma;P]lrv!wD	YY'9!'<>PQR>SU^hqr		"4Q"7qRST&+#r5   r   r   c                 v   |j                   d   }|j                  ddddd      j                  dd      }| j                  |      }|j	                  d|df      j                  ddddd      }|}| j                  |      }||z   }t        t        | j                  j                                     j                  }t        j                         rL| j                  r@| j                  | j                  |      }| j                  D ]  }| j                  ||      } n;| j                  |      }|j!                  |      }| j                  D ]
  } ||      } |j                  ddddd      j                  dd      }| j#                  |      }| j%                  |      }|j	                  d|df      j                  ddddd      }|}| j'                  |      }||z   }|j                  ddddd      j                  dd      }| j)                  |      }|j	                  d|df      j                  ddddd      }|S r   )rE   rC   rF   r   rB   r   nextiterr   
parametersdtyper:   r   r   r   r   tor   r   r   r   )r/   r   r=   r   upscale_dtyper   s         r4   rH   zAllegroDecoder3D.forwardu  s'   \\!_
1aA.66q!<f%!!!j"%56>>q!Q1M""6*("T$..";";"=>?EE  "t'B'B66t~~vNF !NN M::8VLM
 ^^F+FYY}-F !NN *!&)* 1aA.66q!<##F+v&!!!j"%56>>q!Q1M##F+("1aA.66q!<v&!!!j"%56>>q!Q1Mr5   r   rT   s   @r4   r   r     s     +
 6P.B !! U,U, U, c3h	U, #(c	"2U, "#s(OU, U, U, U, U,n,ell ,u|| ,r5   r   c            "           e Zd ZdZdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d'dededeedf   deedf   deedf   d	ee	df   d
ee	df   dedededede
dede
de	ddf  fd       Zd(dZd(dZd(dZd(dZdej"                  dej"                  fdZe	 d)dej"                  de	deeee   f   fd       Zdej"                  dej"                  fdZed)dej"                  de	deeej"                  f   fd        Zdej"                  dej"                  fd!Zdej"                  dej"                  fd"Z	 	 	 d*d#ej"                  d$e	de	d%eej<                     deeej"                  f   f
d&Z xZ S )+AutoencoderKLAllegroa!  
    A VAE model with KL loss for encoding videos into latents and decoding latent representations into videos. Used in
    [Allegro](https://github.com/rhymes-ai/Allegro).

    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
    for all models (such as downloading or saving).

    Parameters:
        in_channels (int, defaults to `3`):
            Number of channels in the input image.
        out_channels (int, defaults to `3`):
            Number of channels in the output.
        down_block_types (`Tuple[str, ...]`, defaults to `("AllegroDownBlock3D", "AllegroDownBlock3D", "AllegroDownBlock3D", "AllegroDownBlock3D")`):
            Tuple of strings denoting which types of down blocks to use.
        up_block_types (`Tuple[str, ...]`, defaults to `("AllegroUpBlock3D", "AllegroUpBlock3D", "AllegroUpBlock3D", "AllegroUpBlock3D")`):
            Tuple of strings denoting which types of up blocks to use.
        block_out_channels (`Tuple[int, ...]`, defaults to `(128, 256, 512, 512)`):
            Tuple of integers denoting number of output channels in each block.
        temporal_downsample_blocks (`Tuple[bool, ...]`, defaults to `(True, True, False, False)`):
            Tuple of booleans denoting which blocks to enable temporal downsampling in.
        latent_channels (`int`, defaults to `4`):
            Number of channels in latents.
        layers_per_block (`int`, defaults to `2`):
            Number of resnet or attention or temporal convolution layers per down/up block.
        act_fn (`str`, defaults to `"silu"`):
            The activation function to use.
        norm_num_groups (`int`, defaults to `32`):
            Number of groups to use in normalization layers.
        temporal_compression_ratio (`int`, defaults to `4`):
            Ratio by which temporal dimension of samples are compressed.
        sample_size (`int`, defaults to `320`):
            Default latent size.
        scaling_factor (`float`, defaults to `0.13235`):
            The component-wise standard deviation of the trained latent space computed using the first batch of the
            training set. This is used to scale the latent space to have unit variance when training the diffusion
            model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
            diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
            / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
            Synthesis with Latent Diffusion Models](https://huggingface.co/papers/2112.10752) paper.
        force_upcast (`bool`, default to `True`):
            If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE
            can be fine-tuned / trained to a lower range without losing too much precision in which case `force_upcast`
            can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix
    TrW   rX   r   .r   r   r   r   latent_channelsr   r   r   temporal_compression_ratiosample_sizescaling_factorforce_upcastr   Nc                    t         |           t        ||||||	|
|d	      | _        t	        ||||||	||
      | _        t        j                  d|z  d|z  d      | _        t        j                  ||d      | _	        d| _
        d| _        dt        |      dz
  z  | _        d| _        d| _        d	| _        d
}|||f| _        || j                  z
  || j                  z
  || j                   z
  f| _        y )NT)	rW   rX   r   r   r   r   r   r   r   )rW   rX   r   r   r   r   r   r   r
   r   F   x   P      )r"   r#   r   encoderr   decoderr%   r   
quant_convpost_quant_convuse_slicing
use_tilingr   spatial_compression_ratiotile_overlap_ttile_overlap_htile_overlap_wkernelr   )r/   rW   rX   r   r   r   r   r   r   r   r   r   r   r   r   r   sample_framesr3   s                    r4   r#   zAutoencoderKLAllegro.__init__  s   : 	'#(-'A1-+

 ('%)%=1-+	
 ))A$7_9LaP!yy/1M
 !)*s3E/F/J)K&! $k;?D///$---$---
r5   c                     d| _         y)a  
        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger images.
        TNr   r/   s    r4   enable_tilingz"AutoencoderKLAllegro.enable_tiling  s     r5   c                     d| _         y)z
        Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
        decoding in one step.
        FNr  r  s    r4   disable_tilingz#AutoencoderKLAllegro.disable_tiling&  s    
  r5   c                     d| _         y)z
        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
        TNr   r  s    r4   enable_slicingz#AutoencoderKLAllegro.enable_slicing-  s    
  r5   c                     d| _         y)z
        Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
        decoding in one step.
        FNr  r  s    r4   disable_slicingz$AutoencoderKLAllegro.disable_slicing4  s    
 !r5   xc                 R    | j                   r| j                  |      S t        d      )Nz5Encoding without tiling has not been implemented yet.)r   tiled_encodeNotImplementedError)r/   r  s     r4   _encodezAutoencoderKLAllegro._encode;  (     ??$$Q''!"YZZr5   return_dictc                 (   | j                   rU|j                  d   dkD  rC|j                  d      D cg c]  }| j                  |       }}t	        j
                  |      }n| j                  |      }t        |      }|s|fS t        |      S c c}w )a  
        Encode a batch of videos into latents.

        Args:
            x (`torch.Tensor`):
                Input batch of videos.
            return_dict (`bool`, defaults to `True`):
                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.

        Returns:
                The latent representations of the encoded videos. If `return_dict` is True, a
                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
        r   r   )latent_dist)r   rE   splitr  r:   r;   r   r   )r/   r  r  x_sliceencoded_slicesh	posteriors          r4   encodezAutoencoderKLAllegro.encodeC  s    " 
QCD771:Ndll73NNN		.)AQA03	<"y99 Os   Bzc                 R    | j                   r| j                  |      S t        d      )Nz5Decoding without tiling has not been implemented yet.)r   tiled_decoder  )r/   r  s     r4   _decodezAutoencoderKLAllegro._decode`  r  r5   c                    | j                   rU|j                  d   dkD  rC|j                  d      D cg c]  }| j                  |       }}t	        j
                  |      }n| j                  |      }|s|fS t        |      S c c}w )a  
        Decode a batch of videos.

        Args:
            z (`torch.Tensor`):
                Input batch of latent vectors.
            return_dict (`bool`, defaults to `True`):
                Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.

        Returns:
            [`~models.vae.DecoderOutput`] or `tuple`:
                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
                returned.
        r   r   r   )r   rE   r  r!  r:   r;   r   )r/   r  r  z_slicedecoded_slicesdecodeds         r4   decodezAutoencoderKLAllegro.decodeh  sv      
QCD771:Ndll73NNNii/Gll1oG:G,, Os   Bc                 

   d}| j                   }| j                  j                  }|j                  \  }}}}}	t	        j
                  || j                  d   z
  | j                  d   z        dz   }
t	        j
                  || j                  d   z
  | j                  d   z        dz   }t	        j
                  |	| j                  d   z
  | j                  d   z        dz   }d}|j                  |
|z  |z  d| j                  j                  z  | j                  d   |z  | j                  d   |z  | j                  d   |z  f      }|j                  ||| j                  d   | j                  d   | j                  d   f      }t        |
      D ]  }t        |      D ]u  }t        |      D ]c  }|| j                  d   z  || j                  d   z  | j                  d   z   }}|| j                  d   z  || j                  d   z  | j                  d   z   }}|| j                  d   z  || j                  d   z  | j                  d   z   }}|d d d d ||||||f   }||||z  <   ||z  |dz
  k(  s||
|z  |z  dz
  k(  r| j                  |      }||
|z  |z  dz
  k(  r ||z  |dz
  k7  r|d ||z  dz    ||||z  z
  d  n||||z
  dz   |dz    |j                  ||| j                  d   | j                  d   | j                  d   f      }|dz  }f x  |j                  |d| j                  j                  z  ||z  ||z  |	|z  f      }| j                  d   |z  | j                  d   |z  | j                  d   |z  f}| j                  d   |z  | j                  d   |z  | j                  d   |z  f}|d   |d   z
  |d   |d   z
  |d   |d   z
  f}t        |
      D ]  }||d   z  ||d   z  |d   z   }}t        |      D ]  }||d   z  ||d   z  |d   z   }}t        |      D ]u  }||d   z  ||d   z  |d   z   }}t        ||
|d   f|||d   f|||d   f|||z  |z  ||z  z   |z      j                  d            }|d d d d ||||||fxx   |z  cc<   w   |j                  ddddd      j                  dd      }| j!                  |      }|j#                  d|df      j                  ddddd      }|S Nr   r   r
   r   r?   r9   )r   configr   rE   mathfloorr  r   	new_zerosr   rp   r   _prepare_for_blend	unsqueezerC   rF   r   rB   )r/   r  local_batch_sizersrtr=   r   
num_framesheightwidthoutput_num_framesoutput_heightoutput_widthcountoutput_latentvae_batch_inputrx   jkn_startn_endh_starth_endw_startw_end
video_cubelatentoutput_kerneloutput_strideoutput_overlaplatent_means                                  r4   r  z!AutoencoderKLAllegro.tiled_encode  s   ++[[33>?gg;
L*fe JJ
T[[^(Ct{{ST~'UVYZZ

FT[[^$;t{{1~#MNQRRzz54;;q>#9T[[^"KLqP!M1L@DKK///A"$A"$A"$
 ++'7t{{ST~W[WbWbcdWegkgrgrstgu&vw() 	A=) |, A%&Q%7T[[^9KdkkZ[n9\UG%&Q%7T[[^9KdkkZ[n9\UG%&Q%7T[[^9KdkkZ[n9\UG!"1awu}#T!UJ@JOE,<$<=  004Dq4HH $5$E$TWX$XX!%o!> "%6%F%UXY%YY %(8 8<Lq<P PPVWuY^aqYqtuYuPvM%%:J2J*J*LMV\M%2B*BQ*FQRS*+++-|T[[^T[[YZ^]a]h]hij]kl+ QJE5	< T[[888*:JFVXLZ_ceZef
 A",dkk!n.BDKKPQNVXDXXA",dkk!n.BDKKPQNVXDXX!}Q//!}Q//!}Q//
 () 	]Aq!111}Q7G3G-XYJZ3ZUG=) 
]!"]1%5!5q=;K7Km\]N^7^|, ]A%&q)9%91}Q?O;OR_`aRb;bUG"4-~a/@AM>!+<=L.*;<%a-&7,&F\IY&Y\]&]^hhijk	#K 1awu}LMQ\\M]
]	] 1aA.66q!<(!!!j"%56>>q!Q1Mr5   c                 	   d}| j                   }| j                  j                  }| j                  d   |z  | j                  d   |z  | j                  d   |z  f}| j                  d   |z  | j                  d   |z  | j                  d   |z  f}|j
                  \  }}}	}
}|j                  ddddd      j                  dd      }| j                  |      }|j                  d|df      j                  ddddd      }t        j                  |	|d   z
  |d   z        dz   }t        j                  |
|d   z
  |d   z        dz   }t        j                  ||d   z
  |d   z        dz   }d}|j                  ||z  |z  | j                  j                  | j                  d   | j                  d   | j                  d   f      }|j                  |||d   |d   |d   f      }t        |      D ]  }t        |      D ]  }t        |      D ]  }||d   z  ||d   z  |d   z   }}||d   z  ||d   z  |d   z   }}||d   z  ||d   z  |d   z   }}|d d d d ||||||f   }||||z  <   ||z  |dz
  k(  s|||z  |z  dz
  k(  rl| j                  |      }|||z  |z  dz
  k(  r ||z  |dz
  k7  r|d ||z  dz    ||||z  z
  d  n||||z
  dz   |dz    |j                  |||d   |d   |d   f      }|dz  }   |j                  || j                  j                  |	|z  |
|z  ||z  f      }| j                  d   | j                  d   z
  | j                  d   | j                  d   z
  | j                  d   | j                  d   z
  f}t        |      D ]  }|| j                  d   z  || j                  d   z  | j                  d   z   }}t        |      D ]  }|| j                  d   z  || j                  d   z  | j                  d   z   }}t        |      D ]  }|| j                  d   z  || j                  d   z  | j                  d   z   }}t!        |||d   f|||d   f|||d   f|||z  |z  ||z  z   |z      j#                  d            }|d d d d ||||||fxx   |z  cc<      |j                  ddddd      j%                         }|S r)  )r   r*  r   r  r   rE   rC   rF   r   rB   r+  r,  r-  rX   rp   r   r.  r/  
contiguous) r/   r  r0  r1  r2  latent_kernellatent_strider=   r   r3  r4  r5  r6  r7  r8  r9  decoded_videosr;  rx   r<  r=  r>  r?  r@  rA  rB  rC  current_latentcurrent_videovideovideo_overlapout_video_blends                                    r4   r   z!AutoencoderKLAllegro.tiled_decode  s   ++[[33A",dkk!n.BDKKPQNVXDXXA",dkk!n.BDKKPQNVXDXX>?gg;
L*fe IIaAq!$,,Q2  #KKJ+,44Q1aC JJ
]15E(EWXIY'YZ]^^

F]1-=$=qAQ#QRUVVzz5=+;#;}Q?O"OPSTT!M1L@((AAA
 ++|]1-=}Q?OQ^_`Qab
 () 	A=) |, A%&q)9%91}Q?O;OR_`aRb;bUG%&q)9%91}Q?O;OR_`aRb;bUG%&q)9%91}Q?O;OR_`aRb;bUG%&q!WU]GEM7SX='X%YN@NOE,<$<=  004Dq4HH $5$E$TWX$XX(,_(E "%6%F%UXY%YY %(8 8<Lq<P PQ^ >%*:":Q">RN55;K3K+K+MN XeN53C+Ca+G%RS)T*+++-|]1=M}]^O_anopaqr+ QJE9	@ Z)A)A:PR?TZ]_T_afikaklmKKNT[[^+KKNT[[^+KKNT[[^+
 () 	`AQ/T[[^1CdkkRSn1TUG=) 
`!"T[[^!3QQ5G$++VW.5X|, `A%&Q%7T[[^9KdkkZ[n9\UG&8-}Q/?@M=+;<L-*:;&q='8<'G!lJZ'Z]^'^_iijkl	'O !Qwu}gemKLP__L`
`	` aAq!,779r5   r   sample_posterior	generatorc                     |}| j                  |      j                  }|r|j                  |      }n|j                         }| j	                  |      j                  }|s|fS t        |      S )a  
        Args:
            sample (`torch.Tensor`): Input sample.
            sample_posterior (`bool`, *optional*, defaults to `False`):
                Whether to sample from the posterior.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
            generator (`torch.Generator`, *optional*):
                PyTorch random number generator.
        )rU  r#  )r  r  r   moder'  r   )	r/   r   rT  r  rU  r  r  r  decs	            r4   rH   zAutoencoderKLAllegro.forward.  sf    " KKN..	  9 5A Akk!n##6MC((r5   )r   r   r   r   r   r   r   r?   r
   r   rJ   r?   i@  gp=
ף?T)r   N)T)FTN)!rK   rL   rM   rN    _supports_gradient_checkpointingr   r$   r   r   rP   rO   r#   r  r	  r  r  r:   rR   r  r	   r   r   r   r  r!  r   r'  r  r   r   	GeneratorrH   rS   rT   s   @r4   r   r     s   +Z (,$ -
+
 /C7Q5O  !!,- $!5G
G
 G
  S/	G
 c3hG
  "#s(O!G
" %*$)$4#G
$ #(c	"2%G
& 'G
( )G
* +G
, -G
. %*/G
0 1G
2 3G
4 5G
6 
7G
 G
R  ![ [%,, [ 37::,0:	"E*F$GG	H: :8[ [%,, [ - -4 -5X]XdXdIdCe - -2Qell Qu|| QfWell Wu|| Wx "' /3)) ) 	)
 EOO,) 
}ell*	+)r5   r   c                    | \  }}}|\  }}}	|\  }
}}|dkD  r|dkD  rx|d d d d d|d d d d f   t        j                  d|      j                         j                  |j                        |z  j                  |dd      z  |d d d d d|d d d d f<   ||dz
  k  r}|d d d d | d d d d d f   dt        j                  d|      j                         j                  |j                        |z  z
  j                  |dd      z  |d d d d | d d d d d f<   |dkD  rw|d d d d d d d|	d d f   t        j                  d|	      j                         j                  |j                        |	z  j                  |	d      z  |d d d d d d d|	d d f<   ||dz
  k  r||d d d d d d |	 d d d f   dt        j                  d|	      j                         j                  |j                        |	z  z
  j                  |	d      z  |d d d d d d |	 d d d f<   |
dkD  rg|d d d d d d d d d|f   t        j                  d|      j                         j                  |j                        |z  z  |d d d d d d d d d|f<   |
|dz
  k  rl|d d d d d d d d | d f   dt        j                  d|      j                         j                  |j                        |z  z
  z  |d d d d d d d d | d f<   |S )Nr   r   )r:   arangerO   r   devicereshape)n_paramh_paramw_paramr  nn_max	overlap_nr  h_max	overlap_hww_max	overlap_ws                r4   r.  r.  M  s   !Aui!Aui!Aui1}q5)*1a9a+B)CQ	*00255ahh?)KgiA&*'AaAiKA%& uqy=)*1a)a+B)CELLI.44699!((CiOOgiA&*'AaYJKA%& 	1u%&q!Q)Q'>%?LLI&,,.11!((;iG
')Q
& !Q1Y;
!" 	519}%&q!Q
Q'>%?Q	*00255ahh?)KK
')Q
& !QI:;
!" 	1u%&q!Q1Y;'>%?LLI&,,.11!((;iG&
!Q1a	k
!" 	519}%&q!QI:;'>%?Q	*00255ahh?)KK&
!Q1yjk
!" Hr5   )&r+  typingr   r   r   r:   torch.nnr%   configuration_utilsr   r   utils.accelerate_utilsr	   attention_processorr   r   autoencoders.vaer   r   downsamplingr   modeling_outputsr   modeling_utilsr   r}   r   
upsamplingr   Moduler   rV   r   r   r   r   r   r.   r5   r4   <module>rv     s      ) )   B 8 8 J ' 2 ' " #bryy bJT TnOryy OdlBII l^|ryy |~Dryy DNf):{ f)Rr5   