
    biS                    $   d dl mZ d dlmZmZmZmZmZmZ d dl	Z	d dl
mZ d dlZ	ddlmZmZ ddlmZ ddlmZ ddlmZmZmZmZmZ dd	lmZmZ dd
lmZ ddlm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+m,Z,  e,jZ                  e.      Z/d Z0e G d de+             Z1 G d dee      Z2 G d deee      Z3	 	 	 	 	 	 	 	 	 ddZ4	 	 	 	 	 	 	 	 d dZ5 G d dejl                        Z7 G d dejl                        Z8 G d dejl                        Z9y)!    )	dataclass)AnyDictListOptionalTupleUnionN   )ConfigMixinregister_to_config)UNet2DConditionLoadersMixin)get_activation)ADDED_KV_ATTENTION_PROCESSORSCROSS_ATTENTION_PROCESSORSAttentionProcessorAttnAddedKVProcessorAttnProcessor)TimestepEmbedding	Timesteps)
ModelMixin)Downsample2DResnetBlock2D
Upsample2D)Transformer2DModel)DownBlock2D	UpBlock2D)UNet2DConditionOutput)
BaseOutputloggingc                    | j                   d   }|-|j                  |df      }t        j                  |||gd      }|j	                  |dd      }|j	                  |dd      }t        j                  || |gd      } | |fS )Nr      dim)shapenew_onestorchconcatexpand)hidden_statesattention_mask	sos_token	eos_token
batch_sizenew_attn_mask_steps         k/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/diffusers/pipelines/audioldm2/modeling_audioldm2.pyadd_special_tokensr1   /   s    $$Q'J!+44j!_E'9>K]&^dfg   Q3I  Q3ILL)]I!FANM.((    c                   X    e Zd ZU dZej
                  ed<   dZeej                     ed<   y)AudioLDM2ProjectionModelOutputa  
    Args:
    Class for AudioLDM2 projection layer's outputs.
        hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states obtained by linearly projecting the hidden-states for each of the text
             encoders and subsequently concatenating them together.
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices, formed by concatenating the attention masks
             for the two text encoders together. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
    r*   Nr+   )
__name__
__module____qualname____doc__r'   Tensor__annotations__r+   r   
LongTensor r2   r0   r4   r4   >   s(     <<15NHU--.5r2   r4   c            
            e Zd ZdZe	 	 d fd	       Z	 	 	 	 d	deej                     deej                     deej                     deej                     fdZ
 xZS )
AudioLDM2ProjectionModela  
    A simple linear projection model to map two text embeddings to a shared latent space. It also inserts learned
    embedding vectors at the start and end of each text embedding sequence respectively. Each variable appended with
    `_1` refers to that corresponding to the second text encoder. Otherwise, it is from the first.

    Args:
        text_encoder_dim (`int`):
            Dimensionality of the text embeddings from the first text encoder (CLAP).
        text_encoder_1_dim (`int`):
            Dimensionality of the text embeddings from the second text encoder (T5 or VITS).
        langauge_model_dim (`int`):
            Dimensionality of the text embeddings from the language model (GPT2).
    c                    t         |           t        j                  ||      | _        t        j                  ||      | _        t        j                  t        j                  |            | _	        t        j                  t        j                  |            | _
        t        j                  t        j                  |            | _        t        j                  t        j                  |            | _        || _        | j                  ;t        j                  j                  t        j                  d||f            | _        y y )Nr!   )super__init__nnLinear
projectionprojection_1	Parameterr'   ones	sos_embed	eos_embedsos_embed_1eos_embed_1use_learned_position_embeddingzeroslearnable_positional_embedding)selftext_encoder_dimtext_encoder_1_dimlangauge_model_dimrL   max_seq_length	__class__s         r0   rA   z!AudioLDM2ProjectionModel.__init__a   s     	))$46HIII&8:LM ejj1C&DEejj1C&DE<<

3E(FG<<

3E(FG.L+ ..:27((2D2DQ 2NCD3D/ ;r2   r*   hidden_states_1r+   attention_mask_1c                 F   | j                  |      }t        ||| j                  | j                        \  }}| j                  1|j                  ddd      | j                  z   j                  ddd      }| j                  |      }t        ||| j                  | j                        \  }}t        j                  ||gd      }|||j                  |d d       }n|||j                  |d d       }||t        j                  ||gd      }nd }t        ||      S )N)r,   r-   r      r!   r#   r"   )r*   r+   )rD   r1   rH   rI   rL   permuterN   rE   rJ   rK   r'   catr&   r4   )rO   r*   rU   r+   rV   s        r0   forwardz AudioLDM2ProjectionModel.forward~   sB    6(:>T^^t~~)
%~
 ..:.66q!Q?$BeBeennoprsuvwO++O<,>-9I9IUYUeUe-
))
 		=/"BJ !&6&B-66bq8IKN',<,D-668KM%*:*F"YY8H'IrRN!N-')
 	
r2   )NN)NNNN)r5   r6   r7   r8   r   rA   r   r'   r9   r;   r[   __classcell__rT   s   @r0   r>   r>   R   s      (, < 1526597;&
-&
 "%,,/&
 !!1!12	&

 #5#3#34&
r2   r>   c            F       H    e Zd ZdZdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d8dee   dedededede	e
   d	ee
   d
e	e
   deee	e   f   de	e   deee	e   f   dedede
dee   dedeee	e   f   deee	e   f   deee	e   f   deeee	e   f      dedee
   dee   dede
de
dee   dee
   dee
   d ee   d!ed"ed#ee   d$efD fd%       Zed&ee
ef   fd'       Zd(eeee
ef   f   fd)Zd* Zd+ Z	 	 	 	 	 	 	 	 d9d,ej,                  d-eej,                  eef   d.ej,                  d/eej,                     d0eej,                     d1eej,                     d2eee
ef      d3eej,                     d4ed5eej,                     d6eej,                     d&eee	f   fd7Z xZS ):AudioLDM2UNet2DConditionModela  
    A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
    shaped output. Compared to the vanilla [`UNet2DConditionModel`], this variant optionally includes an additional
    self-attention layer in each Transformer block, as well as multiple cross-attention layers. It also allows for up
    to two cross-attention embeddings, `encoder_hidden_states` and `encoder_hidden_states_1`.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
    for all models (such as downloading or saving).

    Parameters:
        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
            Height and width of input/output sample.
        in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample.
        out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
        flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
            Whether to flip the sin to cos in the time embedding.
        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
            The tuple of downsample blocks to use.
        mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
            Block type for middle of UNet, it can only be `UNetMidBlock2DCrossAttn` for AudioLDM2.
        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
            The tuple of upsample blocks to use.
        only_cross_attention (`bool` or `Tuple[bool]`, *optional*, default to `False`):
            Whether to include self-attention in the basic transformer blocks, see
            [`~models.attention.BasicTransformerBlock`].
        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
            The tuple of output channels for each block.
        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
            If `None`, normalization and activation layers is skipped in post-processing.
        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
        cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
            The dimension of the cross attention features.
        transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
        num_attention_heads (`int`, *optional*):
            The number of attention heads. If not defined, defaults to `attention_head_dim`
        resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
            for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
        class_embed_type (`str`, *optional*, defaults to `None`):
            The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
            `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
        num_class_embeds (`int`, *optional*, defaults to `None`):
            Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
            class conditioning with `class_embed_type` equal to `None`.
        time_embedding_type (`str`, *optional*, defaults to `positional`):
            The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
        time_embedding_dim (`int`, *optional*, defaults to `None`):
            An optional override for the dimension of the projected time embedding.
        time_embedding_act_fn (`str`, *optional*, defaults to `None`):
            Optional activation function to use only once on the time embeddings before they are passed to the rest of
            the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`.
        timestep_post_act (`str`, *optional*, defaults to `None`):
            The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
        time_cond_proj_dim (`int`, *optional*, defaults to `None`):
            The dimension of `cond_proj` layer in the timestep embedding.
        conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer.
        conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer.
        projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when
            `class_embed_type="projection"`. Required when `class_embed_type="projection"`.
        class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
            embeddings with the class embeddings.
    Tsample_sizein_channelsout_channelsflip_sin_to_cos
freq_shiftdown_block_typesmid_block_typeup_block_typesonly_cross_attentionblock_out_channelslayers_per_blockdownsample_paddingmid_block_scale_factoract_fnnorm_num_groupsnorm_epscross_attention_dimtransformer_layers_per_blockattention_head_dimnum_attention_headsuse_linear_projectionclass_embed_typenum_class_embedsupcast_attentionresnet_time_scale_shifttime_embedding_typetime_embedding_dimtime_embedding_act_fntimestep_post_acttime_cond_proj_dimconv_in_kernelconv_out_kernel%projection_class_embeddings_input_dimclass_embeddings_concatc#                    t         7|           || _        |t        d      |xs |}t	        |      t	        |      k7  rt        d| d| d      t	        |
      t	        |      k7  rt        d|
 d| d      t        |	t              s)t	        |	      t	        |      k7  rt        d|	 d| d      t        |t              s)t	        |      t	        |      k7  rt        d| d| d      t        |t              s)t	        |      t	        |      k7  rt        d	| d| d      t        |t              r)t	        |      t	        |      k7  rt        d
| d| d      t        |t              s)t	        |      t	        |      k7  rt        d| d| d      |dz
  dz  }#t        j                  ||
d   ||#      | _        |dk(  r'|xs |
d   dz  }$t        |
d   ||      | _        |
d   }%nt        | d      t        |%|$|||      | _        ||t        j                   ||$      | _        n|dk(  rt        |%|$|      | _        nz|dk(  rt        j$                  |$|$      | _        nY|dk(  r|!t        d      t        |!|$      | _        n5|dk(  r)|!t        d      t        j&                  |!|$      | _        nd | _        |d | _        nt+        |      | _        t        j,                  g       | _        t        j,                  g       | _        t        |	t              r|	gt	        |      z  }	t        |t              r|ft	        |      z  }t        |t              r|ft	        |      z  }t        |t              r|gt	        |      z  }t        |t              r|gt	        |      z  }|"r|$dz  }&n|$}&|
d   }'t3        |      D ]  \  }(})|'}*|
|(   }'|(t	        |
      dz
  k(  }+t5        |)fi d||(   d||(   d|*d|'d|&d |+ d!|d"|d#|d$||(   d%||(   d&|d'|d(|	|(   d)|d*|},| j.                  j7                  |,        |d+k(  r)t9        |d,   |
d,   |&|||||d,   |d,   |||-      | _        nt        d.| d/      d| _        t        t?        |
            }-t        t?        |            }.t        t?        |            }/t        t?        |            }0t        t?        |            }1t        t?        |	            }	|-d   }'t3        |      D ]  \  }(}2|(t	        |
      dz
  k(  }+|'}3|-|(   }'|-tA        |(dz   t	        |
      dz
           }*|+sd0}4| xj<                  dz  c_        nd1}4tC        |2fi d|/|(   dz   d|1|(   d|*d|'d2|3d|&d3|4d!|d"|d#|d$|0|(   d%|.|(   d'|d(|	|(   d)|d*|}5| j0                  j7                  |5       |'}3 |1t        jD                  |
d   ||4      | _#        t+        |      | _$        nd | _#        d | _$        | dz
  dz  }6t        j                  |
d   || |6      | _%        y )5Na#  At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19.z\Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: z. `up_block_types`: .zbMust provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: z. `down_block_types`: zfMust provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: zdMust provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: zbMust provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: zdMust provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: z^Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: r!   rX   r   )kernel_sizepadding
positional   z6 does not exist. Please make sure to use `positional`.)rm   post_act_fncond_proj_dimtimestep)rm   identityrD   zX`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be setsimple_projectionz_`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set
num_layersrq   ra   rb   temb_channelsadd_downsample
resnet_epsresnet_act_fnresnet_groupsrp   rs   rk   rt   rh   rw   rx   UNetMidBlock2DCrossAttnr"   )rq   ra   r   r   r   output_scale_factorrx   rp   rs   r   rt   rw   zunknown mid_block_type : z4. Should be `UNetMidBlock2DCrossAttn` for AudioLDM2.TFprev_output_channeladd_upsample)num_channels
num_groupseps)&r@   rA   r`   
ValueErrorlen
isinstanceboolintlistrB   Conv2dconv_inr   	time_projr   time_embedding	Embeddingclass_embeddingIdentityrC   time_embed_actr   
ModuleListdown_blocks	up_blocks	enumerateget_down_blockappendr   	mid_blocknum_upsamplersreversedminget_up_block	GroupNormconv_norm_outconv_actconv_out)8rO   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   conv_in_paddingtime_embed_dimtimestep_input_dimblocks_time_embed_dimoutput_channelidown_block_typeinput_channelis_final_block
down_blockreversed_block_out_channelsreversed_num_attention_headsreversed_layers_per_blockreversed_cross_attention_dim%reversed_transformer_layers_per_blockup_block_typer   r   up_blockconv_out_paddingrT   s8                                                          r0   rA   z&AudioLDM2UNet2DConditionModel.__init__   s	   V 	&* v  2G5G  C$77no  oA  AU  Vd  Ue  ef  g  !"c*:&;;t  vH  uI  I_  `p  _q  qr  s  .5#>R:SWZ[kWl:lx  zN  yO  Oe  fv  ew  wx  y  -s3<O8PTWXhTi8iv  xK  wL  Lb  cs  bt  tu  v  ,c2s;M7NRUVfRg7gt  vH  uI  I_  `p  _q  qr  s  )40S9L5MQTUeQf5fv  xK  wL  Lb  cs  bt  tu  v  *C0S9I5JcRbNc5cp  rB  qC  CY  Zj  Yk  kl  m 
 *A-!3yy+A.NTc

 ,./L3Ea3H13LN&'9!'<ozZDN!3A!6 344jkll/),
 #(8(D#%<<0@.#QD +#45G`f#gD +#%;;~~#ND -4< n  $55Z\j#kD !444< u  $&99-RTb#cD #'D  ("&D"01F"GD==,r**D1$8#9C@P<Q#Q )3/#6"83?O;P"P)3/#6"83?O;P"P&, 01C8H4II2C8,H+ICP`La+a(" %3Q$6!$2! ,A."+,<"= 	0A*M/2N#&8"9A"==N'+A. .J!-L *	
 , 4 $21 $ % . %8$: %8$: $6 '< &:!%<  "2!" )@#J& ##J/1	06 664-I"-M.r23#$$:(?$7$;$7$;-&;!1DN +N+;;op 
   '+84F+G&H#'+H5H,I'J$$(2B)C$D!'+H5H,I'J$04X>Z5[0\-#H-A$BC4Q7 ). 9 "	1A}#&8"9A"==N"08;N7AE3GYCZ]^C^8_`M "###q(#$#4Q7!; .SST-U *	
 , %8 4 * $ % . %A$C %A$C '< &:!%<  "2!" )@#H& NN!!(+"0E"	1J &!#/2T\"D +62DM "&D DM+a/A5		q!<_Vf
r2   returnc                     i }dt         dt        j                  j                  dt        t         t
        f   ffd| j                         D ]  \  }} |||        |S )z
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model with
            indexed by its weight name.
        namemodule
processorsc                     t        |d      r|j                         ||  d<   |j                         D ]  \  }} |  d| ||        |S )Nget_processor
.processorr   )hasattrr   named_children)r   r   r   sub_namechildfn_recursive_add_processorss        r0   r   zRAudioLDM2UNet2DConditionModel.attn_processors.<locals>.fn_recursive_add_processors!  sd    v/282F2F2H
dV:./#)#8#8#: U%+tfAhZ,@%TU r2   )strr'   rB   Moduler   r   r   )rO   r   r   r   r   s       @r0   attn_processorsz-AudioLDM2UNet2DConditionModel.attn_processors  sm     
	c 	588?? 	X\]`bt]tXu 	 !//1 	BLD&'fjA	B r2   	processorc           	      T   t        | j                  j                               }t        |t              r,t        |      |k7  rt        dt        |       d| d| d      dt        dt        j                  j                  ffd| j                         D ]  \  }} |||        y)	a4  
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        z>A dict of processors was passed, but the number of processors z0 does not match the number of attention layers: z. Please make sure to pass z processor classes.r   r   c                     t        |d      rEt        |t              s|j                  |       n#|j                  |j	                  |  d             |j                         D ]  \  }} |  d| ||        y )Nset_processorr   r   )r   r   dictr   popr   )r   r   r   r   r   fn_recursive_attn_processors        r0   r   zUAudioLDM2UNet2DConditionModel.set_attn_processor.<locals>.fn_recursive_attn_processorE  sx    v/!)T2((3(($z7J)KL#)#8#8#: T%+tfAhZ,@%STr2   N)r   r   keysr   r   r   r   r'   rB   r   r   )rO   r   countr   r   r   s        @r0   set_attn_processorz0AudioLDM2UNet2DConditionModel.set_attn_processor0  s     D((--/0i&3y>U+BPQTU^Q_P` a005w6QRWQXXkm 
	Tc 	T588?? 	T !//1 	ALD&'fi@	Ar2   c           	      j   t        d | j                  j                         D              rt               }nmt        d | j                  j                         D              rt	               }n8t        dt        t        | j                  j                                            | j                  |       y)ze
        Disables custom attention processors and sets the default attention implementation.
        c              3   @   K   | ]  }|j                   t        v   y wN)rT   r   .0procs     r0   	<genexpr>zKAudioLDM2UNet2DConditionModel.set_default_attn_processor.<locals>.<genexpr>W  s     i4t~~!>>i   c              3   @   K   | ]  }|j                   t        v   y wr   )rT   r   r   s     r0   r   zKAudioLDM2UNet2DConditionModel.set_default_attn_processor.<locals>.<genexpr>Y  s     h$#==hr   zOCannot call `set_default_attn_processor` when attention processors are of type N)	allr   valuesr   r   r   nextiterr   )rO   r   s     r0   set_default_attn_processorz8AudioLDM2UNet2DConditionModel.set_default_attn_processorS  s     i4K_K_KfKfKhii,.Ih$J^J^JeJeJghh%Iabfgklp  mA  mA  mH  mH  mJ  hK  cL  bM  N  		*r2   c                    	
 g 
dt         j                  j                  f
fd| j                         D ]
  } |        t	        
      }|dk(  r
D cg c]  }|dz  	 }}n|dk(  r|dgz  }t        |t              s||gz  n|}t	        |      t	        
      k7  r=t        dt	        |       d| j                   d	t	        
       d
t	        
       d	      t        t	        |            D ]&  }||   }
|   }|||kD  st        d| d| d       dt         j                  j                  dt        t           f	fd	t        t        |            }| j                         D ]  } 	||        yc c}w )a  
        Enable sliced attention computation.

        When this option is enabled, the attention module splits the input tensor in slices to compute attention in
        several steps. This is useful for saving some memory in exchange for a small decrease in speed.

        Args:
            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
                When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
                `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
                must be a multiple of `slice_size`.
        r   c                     t        | d      rj                  | j                         | j                         D ]
  } |        y Nset_attention_slice)r   r   sliceable_head_dimchildren)r   r   $fn_recursive_retrieve_sliceable_dimssliceable_head_dimss     r0   r   z_AudioLDM2UNet2DConditionModel.set_attention_slice.<locals>.fn_recursive_retrieve_sliceable_dimss  s@    v45#**6+D+DE* <4U;<r2   autorX   maxr!   zYou have provided z, but z has zH different attention layers. Make sure to match `len(slice_size)` to be r   Nzsize z has to be smaller or equal to 
slice_sizec                     t        | d      r| j                  |j                                | j                         D ]  } ||        y r   )r   r   r   r   )r   r   r    fn_recursive_set_attention_slices      r0   r   z[AudioLDM2UNet2DConditionModel.set_attention_slice.<locals>.fn_recursive_set_attention_slice  sE    v45**:>>+;<* D0
CDr2   )r'   rB   r   r   r   r   r   r   configranger   r   r   )rO   r   r   num_sliceable_layersr$   r   sizereversed_slice_sizer   r   r   s           @@@r0   r   z1AudioLDM2UNet2DConditionModel.set_attention_slicec  s    !	< 	< mmo 	9F08	9  ##67 /BBs#(BJB5 -3J@J:W[@\)ZL8bl
z?c"566$S_$5VDKK=cReNfMg hQQTUhQiPjjkm 
 s:' 	VAa=D%a(CD3J 5.McURS!TUU		V	DUXX__ 	DRVWZR[ 	D #8J#78mmo 	JF,V5HI	J= Cs   E;sampler   encoder_hidden_statesclass_labelstimestep_condr+   cross_attention_kwargsencoder_attention_maskreturn_dictencoder_hidden_states_1encoder_attention_mask_1c                 p	   d| j                   z  d}d}t        fd|j                  dd D              rt        j	                  d       d}|2d|j                  |j                        z
  d	z  }|j                  d      }|2d|j                  |j                        z
  d	z  }|j                  d      }|2d|j                  |j                        z
  d	z  }|j                  d      }|}t        j                  |      s|j                  j                  d
k(  }|j                  j                  dk(  }t        |t              r%|s|rt        j                  nt        j                  }n$|s|rt        j                   nt        j"                  }t        j$                  |g||j                        }n6t'        |j                        dk(  r|d   j                  |j                        }|j)                  |j                  d         }| j+                  |      }|j                  |j                        }| j-                  ||      }d}| j.                  |t1        d      | j2                  j4                  dk(  r-| j+                  |      }|j                  |j                        }| j/                  |      j                  |j                        }| j2                  j6                  rt        j8                  ||gd      }n||z   }|||z   n|}| j:                  | j;                  |      }| j=                  |      }|f}| j>                  D ]@  }tA        |d      r |jB                  r ||||||||
|      \  }}n |||      \  }}||z  }B | jD                  | jE                  |||||||
|      }tG        | jH                        D ]  \  }}|t'        | jH                        dz
  k(  }|t'        |jJ                         d }|dt'        |jJ                          }|s|r|d   j                  dd }tA        |d      r|jB                  r ||||||||||
|
      } |||||      } | jL                  r"| jM                  |      }| jO                  |      }| jQ                  |      }|	s|fS tS        |      S )a  
        The [`AudioLDM2UNet2DConditionModel`] forward method.

        Args:
            sample (`torch.Tensor`):
                The noisy input tensor with the following shape `(batch, channel, height, width)`.
            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
            encoder_hidden_states (`torch.Tensor`):
                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
            encoder_attention_mask (`torch.Tensor`):
                A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
                `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
                which adds large negative values to the attention scores corresponding to "discard" tokens.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
                tuple.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
            encoder_hidden_states_1 (`torch.Tensor`, *optional*):
                A second set of encoder hidden states with shape `(batch, sequence_length_2, feature_dim_2)`. Can be
                used to condition the model on a different set of embeddings to `encoder_hidden_states`.
            encoder_attention_mask_1 (`torch.Tensor`, *optional*):
                A cross-attention mask of shape `(batch, sequence_length_2)` is applied to `encoder_hidden_states_1`.
                If `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
                which adds large negative values to the attention scores corresponding to "discard" tokens.

        Returns:
            [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
                If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned,
                otherwise a `tuple` is returned where the first element is the sample tensor.
        rX   FNc              3   .   K   | ]  }|z  d k7    yw)r   Nr<   )r   sdefault_overall_up_factors     r0   r   z8AudioLDM2UNet2DConditionModel.forward.<locals>.<genexpr>  s     Maq,,1Ms   z9Forward upsample size to force interpolation output size.Tr!   g     mpsnpu)dtypedevicer   )r  z9class_labels should be provided when num_class_embeds > 0r   r"   r#   has_cross_attention)r*   tembr  r+   r  r  r
  r  )r*   r  )r  r+   r  r  r
  r  )
r*   r  res_hidden_states_tupler  r  upsample_sizer+   r  r
  r  )r*   r  r  r  )r  )*r   anyr%   loggerinfotor  	unsqueezer'   	is_tensorr  typer   floatfloat32float64int32int64tensorr   r)   r   r   r   r   r   ru   r   rZ   r   r   r   r   r  r   r   r   resnetsr   r   r   r   )rO   r  r   r  r  r  r+   r  r  r	  r
  r  forward_upsample_sizer  	timestepsis_mpsis_npur  t_embembaug_emb	class_embdown_block_res_samplesdownsample_blockres_samplesr   upsample_blockr   r  s                               @r0   r[   z%AudioLDM2UNet2DConditionModel.forward  s   b %&t':':$:! !&M6<<;LMMKKST$(! %
  ."3"3FLL"AAXMN+55a8N "-&'*@*C*CFLL*Q&QU]%]"%;%E%Ea%H"#/(),D,G,G,U(UYa'a$'?'I'I!'L$ 	y) ]]''50F]]''50F(E**0F(.&u{{i[fmmTI!Q&!$**6==9I $$V\\!_5	y)
 v||,!!%7+# !\]]{{++z9#~~l;  ,V\\B,,\:==FLL=QI{{22iii 0b9Io&2cGm*%%c*C f% #) $ 0 0 	2')>?DTDhDh&6"(*?#1+A+A,C-E	'# '7VRU&V#"k1"	2$ >>%^^&;-'='=(?)A $ 	F "+4>>!: 	A~#dnn"5"99N0#n6L6L2M1M1OPK%;<Zs>CYCY?Z>Z%[" "&; 6r : @ @ D~'<=.BdBd'"(,7*?+A"/#1+A,C-E ("(sKgt3	< ''/F]]6*Fv&9$F33r2   )"Nr   r   Tr   )CrossAttnDownBlock2Dr3  r3  r   r   )r   CrossAttnUpBlock2Dr4  r4  F)i@  i     r5  rX   r!   r!   silu    gh㈵>r5  r!      NFNNFdefaultr   NNNNr
   r
   NF)NNNNNTNN)r5   r6   r7   r8    _supports_gradient_checkpointingr   r   r   r   r   r   r	   r   rA   propertyr   r   r   r   r   r   r'   r9   r   r   r[   r\   r]   s   @r0   r_   r_      s   EN (,$ &* $(
 )B%t9>)?34"#())+6:?@56@D&+*.*.!&'0#/,0/3+/,0 ?C(-Qb
c]b
 b
 	b

 b
 b
  *b
 !b
 c
b
 $D%+$56b
  "#J!b
"  U3Z0#b
$  %b
& !&'b
( )b
* "#+b
, -b
. #3c
?3/b
0 ',CsO&<1b
2 "#uSz/23b
4 &eCsO&<=5b
6  $7b
8 #3-9b
: #3-;b
< =b
> "%?b
@ !Ab
B %SMCb
D  (}Eb
F $C=Gb
H %SMIb
J Kb
L Mb
N 08}Ob
P "&Qb
 b
H	 c+=&=!>  0 AE2Dd3PbKbFc2c,d  AF+ ?JL 040415;?9= :>;?Q4Q4 eS01Q4  %||	Q4
 u||,Q4  -Q4 !.Q4 !)c3h 8Q4 !) 6Q4 Q4 "*%,,!7Q4 #+5<<"8Q4 
$e+	,Q4r2   r_   c                    | j                  d      r| dd  n| } | dk(  rt        ||||||||
||
      S | dk(  rF|t        d      t        di d|d|d	|d
|d|d|d|d|d|
d|d|d|	d|d|d|d|S t        |  d      )NUNetRes   r   )
r   ra   rb   r   r   r   r   r   rk   rx   r3  z>cross_attention_dim must be specified for CrossAttnDownBlock2Dr   rq   ra   rb   r   r   r   r   r   rk   rp   rs   rt   rh   rw   rx    does not exist.r<   )
startswithr   r   r3  )r   r   ra   rb   r   r   r   r   rq   rs   r   rp   rk   rt   rh   rw   rx   s                    r0   r   r   x  s/   & .=-G-G	-Roab)XgO-'!#%')!''1$;
 	
 
2	2&]^^# 
!
)E
 $
 &	

 (
 *
 "
 (
 (
  2
 !4
 !4
 #8
 "6
 .
  %<!
 	
$ ((89
::r2   c                    | j                  d      r| dd  n| } | dk(  rt        ||||||||||
      S | dk(  rF|t        d      t        di d|d|	d	|d
|d|d|d|d|d|d|d|d|
d|d|d|d|S t        |  d      )Nr=  r>  r   )
r   ra   rb   r   r   r   r   r   r   rx   r4  z<cross_attention_dim must be specified for CrossAttnUpBlock2Dr   rq   ra   rb   r   r   r   r   r   r   rp   rs   rt   rh   rw   rx   r?  r<   )r@  r   r   r4  )r   r   ra   rb   r   r   r   r   r   rq   rs   r   rp   rt   rh   rw   rx   s                    r0   r   r     s.   & *7)A)A))LM!"%R_M#!#% 3'%!''$;
 	
 
.	.&[\\! 
!
)E
 $
 &	

 !4
 (
 &
 "
 (
 (
 !4
 !4
 #8
 "6
 .
  %<!
 	
$ &67
88r2   c                   f    e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddedededededededed	ed
edef fdZ	 	 	 	 	 	 	 ddej                  de
ej                     de
ej                     de
ej                     de
eeef      de
ej                     de
ej                     de
ej                     fdZ xZS )r3  ra   rb   r   dropoutr   rq   r   rx   r   r   resnet_pre_normc                    t         |           g }g }d| _        || _        t	        |t
              r|f}t	        |t        t        f      r(t        |      dkD  rt        d| dt        |             || _
        t        |      D ]y  }|dk(  r|n|}|j                  t        |||||
|||	||
             t        t        |            D ]3  }|j                  t        |||z  ||||   |
|||||   dnd
             5 { t        j                   |      | _        t        j                   |      | _        |r1t        j                   t'        |d||d	
      g      | _        d| _        y d | _        d| _        y )NTr   Only up to 4 cross-attention layers are supported. Ensure that the length of cross-attention dims is less than or equal to 4. Got cross-attention dims  of length r   
ra   rb   r   r   groupsrC  time_embedding_normnon_linearityr   pre_normFra   r   rp   rn   rt   rh   rw   double_self_attentionop)use_convrb   r   r   )r@   rA   r  rs   r   r   r   tupler   r   rp   r   r   r   r   rB   r   
attentionsr&  r   downsamplersgradient_checkpointing)rO   ra   rb   r   rC  r   rq   r   rx   r   r   rD  rs   rp   r   rk   r   rt   rh   rw   r&  rR  r   jrT   s                           r0   rA   zCrossAttnDownBlock2D.__init__  s   , 	
#' #6 )3/#6"8)D%=9cBU>VYZ>ZMM`Laalmp  rE  nF  mGH  $7 z" 	A)*a+\KNN +!-"/"(#(?"/(;, 3234 !!&+$(;;$0#?,?,B(5.C-A)96I!6L6TdZ_!	> --
3}}W- " $t,Xjqu!D ',# !%D&+#r2   r*   r  r  r+   r  r  r
  r  c	                 :   d}	t        | j                        }
t        | j                        |
z  }||n|}||n|}t        |
      D ]#  }t	        j
                         r| j                  r| j                  | j                  |   ||      }t        | j                        D ]O  \  }}|
|dk  r|}|}n|
|dkD  r|}|}nd }d }| j                  | j                  ||z  |z      ||d d |||      d   }Q nr | j                  |   ||      }t        | j                        D ]D  \  }}|
|dk  r|}|}n|
|dkD  r|}|}nd }d } | j                  ||z  |z      ||||d      d   }F |	|fz   }	& | j                  | j                  D ]
  } ||      } |	|fz   }	||	fS )Nr<   r!   r   Fr+   r  r  r	  )r   r&  rR  r   r'   is_grad_enabledrT  _gradient_checkpointing_funcr   rp   rS  )rO   r*   r  r  r+   r  r  r
  r  output_statesr   num_attention_per_layerr   idxrp   forward_encoder_hidden_statesforward_encoder_attention_maskdownsamplers                     r0   r[   zCrossAttnDownBlock2D.forward@  sB    &
"%doo"6*"D (?'J#Pe 	  )@(K$Qg 	! z" +	=A$$&4+F+F $ A A$,,q/S`bf g09$:R:R0S ,C,*63!88M59O6,8S1W8O59Q68<59=6$($E$E,C(Cc(IJ%5.&6	% 	%M* !0Qt D09$:R:R0S ,C,*63!88M59O6,8S1W8O59Q68<59=6$VDOOA8O4ORU4U$V%'5.K/M$)% %M$ *],<<MW+	=Z (#00 ; +M :; *],<<Mm++r2   )        r!   r!   ư>r9  swishr7  Tr!   r5        ?r!   TFFFNNNNNNNr5   r6   r7   r   r   r   r   rA   r'   r9   r   r   r   r[   r\   r]   s   @r0   r3  r3    sm    ,- '0$ $ #")S,S, S, 	S,
 S, S, '*S, S, "%S, S, S, S,p (,8<15;?9=:>;?I,||I, u||$I,  (5	I,
 !.I, !)c3h 8I, !) 6I, "*%,,!7I, #+5<<"8I,r2   r3  c                   t    e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 ddedededededededed	ed
ef fdZ	 	 	 	 	 	 	 ddej                  de
ej                     de
ej                     de
ej                     de
eeef      de
ej                     de
ej                     de
ej                     dej                  fdZ xZS )r   ra   r   rC  r   rq   r   rx   r   r   rD  c                    t         |           d| _        || _        |	|	nt	        |dz  d      }	t        |t              r|f}t        |t        t        f      r(t        |      dkD  rt        d| dt        |             || _        t        |||||	|||||

      g}g }t        |      D ]o  }t        t        |            D ]2  }|j                  t        |||z  ||||   |	||||   dnd	             4 |j                  t        |||||	|||||

             q t!        j"                  |      | _        t!        j"                  |      | _        d| _        y )	NTr   r7  rF  rG  rH  F)ra   r   rp   rn   rt   rw   rN  )r@   rA   r  rs   r   r   r   r   rQ  r   r   rp   r   r   r   r   rB   r   rR  r&  rT  )rO   ra   r   rC  r   rq   r   rx   r   r   rD  rs   r   rp   rt   rw   r&  rR  r   rU  rT   s                       r0   rA   z UNetMidBlock2DCrossAttn.__init__  s   $ 	#' #6 )6)BK[\L\^`Ha)3/#6"8)D%=9cBU>VYZ>ZMM`Laalmp  rE  nF  mGH  $7  '(+$$;+$7(
 
z" 	A3234 !!&+#'::$/#?,?,B(5.C)96I!6L6TdZ_
 NN +!,"/"(#(?"/(;,	< --
3}}W-&+#r2   r*   r  r  r+   r  r  r
  r  r   c	                 6    | j                   d   ||      }t        | j                        t        | j                         dz
  z  }	||n|}||n|}t        t        | j                   dd              D ]#  }
t	        j
                         r| j                  rt        | j                        D ]O  \  }}|
|dk  r|}|}n|
|dkD  r|}|}nd }d }| j                  | j                  |
|	z  |z      ||d d |||      d   }Q | j                  | j                   |
dz      ||      }t        | j                        D ]D  \  }}|
|dk  r|}|}n|
|dkD  r|}|}nd }d } | j                  |
|	z  |z      ||||d      d   }F  | j                   |
dz      ||      }& |S )Nr   r!   FrW  )
r&  r   rR  r   r'   rX  rT  r   rp   rY  )rO   r*   r  r  r+   r  r  r
  r  r[  r   r\  rp   r]  r^  s                  r0   r[   zUNetMidBlock2DCrossAttn.forward  s$    (Qt<"%doo"63t||;Lq;P"Q (?'J#Pe 	  )@(K$Qg 	! s4<<+,- *	IA$$&4+F+F09$:R:R0S ,C,*63!88M59O6,8S1W8O59Q68<59=6$($E$E,C(Cc(IJ%5.&6	% 	%M( !% A A$,,qSTuBUWdfj k09$:R:R0S ,C,*63!88M59O6,8S1W8O59Q68<59=6$VDOOA8O4ORU4U$V%'5.K/M$)% %M$ !4QU 3M4 HU*	IX r2   )r`  r!   r!   ra  r9  rb  r7  Tr!   rc  r5  FFrd  re  r]   s   @r0   r   r     sh   
 ,- '0$ $ #!S,S, S, 	S,
 S, '*S, S, "%S, S, S, S,p (,8<15;?9=:>;?A||A u||$A  (5	A
 !.A !)c3h 8A !) 6A "*%,,!7A #+5<<"8A 
Ar2   r   c                       e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddedededededededed	ed
ededef fdZ	 	 	 	 	 	 	 	 ddej                  de
ej                  df   deej                     deej                     deeeef      dee   deej                     deej                     deej                     deej                     fdZ xZS )r4  ra   rb   r   r   rC  r   rq   r   rx   r   r   rD  c                    t         |           g }g }d| _        || _        t	        |t
              r|f}t	        |t        t        f      r(t        |      dkD  rt        d| dt        |             || _
        t        |      D ]  }||dz
  k(  r|n|}|dk(  r|n|}|j                  t        ||z   ||||||	|
||
             t        t        |            D ]3  }|j                  t        |||z  ||||   ||||||   dnd	
             5  t        j                   |      | _        t        j                   |      | _        |r/t        j                   t'        |d|
      g      | _        d| _        y d | _        d| _        y )NTr   rF  rG  r!   r   rH  FrM  )rP  rb   )r@   rA   r  rs   r   r   r   rQ  r   r   rp   r   r   r   r   rB   r   rR  r&  r   
upsamplersrT  )rO   ra   rb   r   r   rC  r   rq   r   rx   r   r   rD  rs   rp   r   r   rt   rh   rw   r&  rR  r   res_skip_channelsresnet_in_channelsrU  rT   s                             r0   rA   zCrossAttnUpBlock2D.__init__'  s   , 	
#' #6 )3/#6"8)D%=9cBU>VYZ>ZMM`Laalmp  rE  nF  mGH  $7 z"  	A01Z!^0C,89Q!4LNN 25F F!-"/"(#(?"/(;, 3234 !!&+$(;;$0#?,?,B(5.C-A)96I!6L6TdZ_% 	B --
3}}W- mmZtbn-o,pqDO ',# #DO&+#r2   r*   r  .r  r  r  r  r+   r  r
  r  c                 b   t        | j                        }t        | j                        |z  }|	|	n|}	|	|
n|}
t        |      D ]@  }|d   }|d d }t	        j
                  ||gd      }t	        j                         r| j                  r| j                  | j                  |   ||      }t        | j                        D ]O  \  }}|
|dk  r|}|}n|
|dkD  r|	}|
}nd }d }| j                  | j                  ||z  |z      ||d d |||      d   }Q  | j                  |   ||      }t        | j                        D ]D  \  }}|
|dk  r|}|}n|
|dkD  r|	}|
}nd }d } | j                  ||z  |z      ||||d      d   }F C | j                  | j                  D ]  } |||      } |S )Nr"   r!   r#   r   FrW  )r   r&  rR  r   r'   rZ   rX  rT  rY  r   rp   rk  )rO   r*   r  r  r  r  r  r+   r  r
  r  r   r[  r   res_hidden_statesr\  rp   r]  r^  	upsamplers                       r0   r[   zCrossAttnUpBlock2D.forwardx  sI    &
"%doo"6*"D (?'J#Pe 	  )@(K$Qg 	! z" .	A 7 ;&=cr&B#!II}6G&HaPM$$&4+F+F $ A A$,,q/S`bf g09$:R:R0S ,C,*63!88M59O6,8S1W8O59Q68<59=6$($E$E,C(Cc(IJ%5.&6	% 	%M* !0Qt D09$:R:R0S ,C,*63!88M59O6,8S1W8O59Q68<59=6$VDOOA8O4ORU4U$V%'5.K/M$)% %M=.	` ??&!__ H	 )- GH r2   )r`  r!   r!   ra  r9  rb  r7  Tr!   r5  rc  TFFF)NNNNNNNN)r5   r6   r7   r   r   r   r   rA   r'   r9   r   r   r   r   r[   r\   r]   s   @r0   r4  r4  &  s    ,- '0$ $ #")O,O, O, !	O,
 O, O, O, '*O, O, "%O, O, O, O,j (,8<;?'+159=:>;?K||K "'u||S'8!9K u||$	K
  (5K !)c3h 8K  }K !.K !) 6K "*%,,!7K #+5<<"8Kr2   r4  )	r!   NNNNFFFr9  )r!   NNNFFFr9  ):dataclassesr   typingr   r   r   r   r   r	   r'   torch.nnrB   torch.utils.checkpointconfiguration_utilsr   r   loadersr   models.activationsr   models.attention_processorr   r   r   r   r   models.embeddingsr   r   models.modeling_utilsr   models.resnetr   r   r   "models.transformers.transformer_2dr   models.unets.unet_2d_blocksr   r   models.unets.unet_2d_conditionr   utilsr   r   
get_loggerr5   r  r1   r4   r>   r_   r   r   r   r3  r   r4  r<   r2   r0   <module>r     s   " : :    B 2 0  0 D D D A C ( 
		H	%) 6Z 6 6&R
z; R
jN4J=X N4t "#%#6;F "#%#69r_,299 _,DWbii Wt] ]r2   