
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F

from ..utils import deprecate, logging
from ..utils.import_utils import is_torch_npu_available, is_torch_xla_available, is_xformers_available
from ..utils.torch_utils import maybe_allow_in_graph
from .activations import GEGLU, GELU, ApproximateGELU, FP32SiLU, LinearActivation, SwiGLU
from .attention_processor import Attention, AttentionProcessor, JointAttnProcessor2_0
from .embeddings import SinusoidalPositionalEmbedding
from .normalization import AdaLayerNorm, AdaLayerNormContinuous, AdaLayerNormZero, RMSNorm, SD35AdaLayerNormZeroX


if is_xformers_available():
    import xformers.ops as xops
else:
    xops = None


logger = logging.get_logger(__name__)
class AttentionMixin:
    @property
    def attn_processors(self) -> Dict[str, AttentionProcessor]:
        r"""
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model,
            indexed by its weight name.
        """
        processors = {}

        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
            if hasattr(module, "get_processor"):
                processors[f"{name}.processor"] = module.get_processor()

            for sub_name, child in module.named_children():
                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)

            return processors

        for name, module in self.named_children():
            fn_recursive_add_processors(name, module, processors)

        return processors

    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]) -> None:
        r"""
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        """
        count = len(self.attn_processors.keys())

        if isinstance(processor, dict) and len(processor) != count:
            raise ValueError(
                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
            )

        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
            if hasattr(module, "set_processor"):
                if not isinstance(processor, dict):
                    module.set_processor(processor)
                else:
                    module.set_processor(processor.pop(f"{name}.processor"))

            for sub_name, child in module.named_children():
                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)

        for name, module in self.named_children():
            fn_recursive_attn_processor(name, module, processor)

    def fuse_qkv_projections(self):
        """
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
        are fused. For cross-attention modules, key and value projection matrices are fused.
        """
        for _, attn_processor in self.attn_processors.items():
            if "Added" in str(attn_processor.__class__.__name__):
                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")

        for module in self.modules():
            if isinstance(module, AttentionModuleMixin):
                module.fuse_projections()

    def unfuse_qkv_projections(self):
        """Disables the fused QKV projection if enabled.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>
        """
        for module in self.modules():
            if isinstance(module, AttentionModuleMixin):
                module.unfuse_projections()
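# Illustrative usage sketch (added for clarity; not part of the upstream module). It shows how a model
# class that inherits from `AttentionMixin` exposes and swaps its attention processors. The `model`
# argument is a hypothetical module tree whose attention layers implement `get_processor`/`set_processor`.
def _example_attention_mixin_usage(model) -> None:
    # Inspect every attention processor, keyed by its weight name (e.g. "blocks.0.attn.processor").
    processors = model.attn_processors
    # Either pass a single processor for all layers, or a dict covering every attention layer.
    model.set_attn_processor(processors)
    # Fuse QKV projections for inference, then restore the separate projections.
    model.fuse_qkv_projections()
    model.unfuse_qkv_projections()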
class AttentionModuleMixin:
    _default_processor_cls = None
    _available_processors = []
    fused_projections = False

    def set_processor(self, processor: "AttentionProcessor") -> None:
        r"""
        Set the attention processor to use.

        Args:
            processor (`AttnProcessor`):
                The attention processor to use.
        """
        # If the current processor is an `nn.Module` and the new one is not, the old (possibly trained)
        # weights have to be removed from `self._modules`.
        if (
            hasattr(self, "processor")
            and isinstance(self.processor, torch.nn.Module)
            and not isinstance(processor, torch.nn.Module)
        ):
            logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}")
            self._modules.pop("processor")

        self.processor = processor

    def get_processor(self, return_deprecated_lora: bool = False) -> "AttentionProcessor":
        r"""
        Get the attention processor in use.

        Args:
            return_deprecated_lora (`bool`, *optional*, defaults to `False`):
                Set to `True` to return the deprecated LoRA attention processor.

        Returns:
            "AttentionProcessor": The attention processor in use.
        """
        if not return_deprecated_lora:
            return self.processor

    def set_attention_backend(self, backend: str) -> None:
        from .attention_dispatch import AttentionBackendName

        available_backends = {x.value for x in AttentionBackendName.__members__.values()}
        if backend not in available_backends:
            raise ValueError(f"`backend={backend}` must be one of the following: " + ", ".join(available_backends))

        backend = AttentionBackendName(backend.lower())
        self.processor._attention_backend = backend

    def set_use_npu_flash_attention(self, use_npu_flash_attention: bool) -> None:
        r"""
        Set whether to use NPU flash attention from `torch_npu` or not.

        Args:
            use_npu_flash_attention (`bool`): Whether to use NPU flash attention or not.
        """
        if use_npu_flash_attention:
            if not is_torch_npu_available():
                raise ImportError("torch_npu is not available")
            self.set_attention_backend("_native_npu")

    def set_use_xla_flash_attention(
        self,
        use_xla_flash_attention: bool,
        partition_spec: Optional[Tuple[Optional[str], ...]] = None,
        is_flux=False,
    ) -> None:
        r"""
        Set whether to use XLA flash attention from `torch_xla` or not.

        Args:
            use_xla_flash_attention (`bool`):
                Whether to use pallas flash attention kernel from `torch_xla` or not.
            partition_spec (`Tuple[]`, *optional*):
                Specify the partition specification if using SPMD. Otherwise None.
            is_flux (`bool`, *optional*, defaults to `False`):
                Whether the model is a Flux model.
        """
        if use_xla_flash_attention:
            if not is_torch_xla_available():
                raise ImportError("torch_xla is not available")
            self.set_attention_backend("_native_xla")

    def set_use_memory_efficient_attention_xformers(
        self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
    ) -> None:
        r"""
        Set whether to use memory efficient attention from `xformers` or not.

        Args:
            use_memory_efficient_attention_xformers (`bool`):
                Whether to use memory efficient attention from `xformers` or not.
            attention_op (`Callable`, *optional*):
                The attention operation to use. Defaults to `None` which uses the default attention operation from
                `xformers`.
        """
        if use_memory_efficient_attention_xformers:
            if not is_xformers_available():
                raise ModuleNotFoundError(
                    "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
                    " xformers",
                    name="xformers",
                )
            elif not torch.cuda.is_available():
                raise ValueError(
                    "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is"
                    " only available for GPU "
                )
            else:
                try:
                    # Make sure we can run the memory efficient attention
                    dtype = None
                    if attention_op is not None:
                        op_fw, op_bw = attention_op
                        dtype, *_ = op_fw.SUPPORTED_DTYPES
                    q = torch.randn((1, 2, 40), device="cuda", dtype=dtype)
                    _ = xops.memory_efficient_attention(q, q, q)
                except Exception as e:
                    raise e

            self.set_attention_backend("xformers")

    @torch.no_grad()
    def fuse_projections(self):
        """
        Fuse the query, key, and value projections into a single projection for efficiency.
        """
        # Skip if already fused
        if getattr(self, "fused_projections", False):
            return

        device = self.to_q.weight.data.device
        dtype = self.to_q.weight.data.dtype

        if hasattr(self, "is_cross_attention") and self.is_cross_attention:
            # Fuse the key and value projections for cross-attention.
            concatenated_weights = torch.cat([self.to_k.weight.data, self.to_v.weight.data])
            in_features = concatenated_weights.shape[1]
            out_features = concatenated_weights.shape[0]

            self.to_kv = nn.Linear(in_features, out_features, bias=self.use_bias, device=device, dtype=dtype)
            self.to_kv.weight.copy_(concatenated_weights)
            if hasattr(self, "use_bias") and self.use_bias:
                concatenated_bias = torch.cat([self.to_k.bias.data, self.to_v.bias.data])
                self.to_kv.bias.copy_(concatenated_bias)
        else:
            # Fuse the query, key, and value projections for self-attention.
            concatenated_weights = torch.cat([self.to_q.weight.data, self.to_k.weight.data, self.to_v.weight.data])
            in_features = concatenated_weights.shape[1]
            out_features = concatenated_weights.shape[0]

            self.to_qkv = nn.Linear(in_features, out_features, bias=self.use_bias, device=device, dtype=dtype)
            self.to_qkv.weight.copy_(concatenated_weights)
            if hasattr(self, "use_bias") and self.use_bias:
                concatenated_bias = torch.cat([self.to_q.bias.data, self.to_k.bias.data, self.to_v.bias.data])
                self.to_qkv.bias.copy_(concatenated_bias)

        # Handle the added projections used by joint text-image attention (e.g. SD3/Flux style blocks).
        if (
            getattr(self, "add_q_proj", None) is not None
            and getattr(self, "add_k_proj", None) is not None
            and getattr(self, "add_v_proj", None) is not None
        ):
            concatenated_weights = torch.cat(
                [self.add_q_proj.weight.data, self.add_k_proj.weight.data, self.add_v_proj.weight.data]
            )
            in_features = concatenated_weights.shape[1]
            out_features = concatenated_weights.shape[0]

            self.to_added_qkv = nn.Linear(
                in_features, out_features, bias=self.added_proj_bias, device=device, dtype=dtype
            )
            self.to_added_qkv.weight.copy_(concatenated_weights)
            if self.added_proj_bias:
                concatenated_bias = torch.cat(
                    [self.add_q_proj.bias.data, self.add_k_proj.bias.data, self.add_v_proj.bias.data]
                )
                self.to_added_qkv.bias.copy_(concatenated_bias)

        self.fused_projections = True

    @torch.no_grad()
    def unfuse_projections(self):
        """
        Unfuse the query, key, and value projections back to separate projections.
        """
        # Skip if not fused
        if not getattr(self, "fused_projections", False):
            return

        # Remove the fused projection layers
        if hasattr(self, "to_qkv"):
            delattr(self, "to_qkv")
        if hasattr(self, "to_kv"):
            delattr(self, "to_kv")
        if hasattr(self, "to_added_qkv"):
            delattr(self, "to_added_qkv")

        self.fused_projections = False

    def set_attention_slice(self, slice_size: int) -> None:
        r"""
        Set the slice size for attention computation.

        Args:
            slice_size (`int`):
                The slice size for attention computation.
        """
        if hasattr(self, "sliceable_head_dim") and slice_size is not None and slice_size > self.sliceable_head_dim:
            raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.")

        # Choose a sliced processor when slicing is requested, otherwise fall back to the default processor class.
        processor = None
        if slice_size is not None:
            processor = self._get_compatible_processor("sliced")
        if processor is None:
            processor = self._default_processor_cls()
        self.set_processor(processor)

    def batch_to_head_dim(self, tensor: torch.Tensor) -> torch.Tensor:
        r"""
        Reshape the tensor from `[batch_size, seq_len, dim]` to `[batch_size // heads, seq_len, dim * heads]`.

        Args:
            tensor (`torch.Tensor`): The tensor to reshape.

        Returns:
            `torch.Tensor`: The reshaped tensor.
        """
        head_size = self.heads
        batch_size, seq_len, dim = tensor.shape
        tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
        tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
        return tensor

    def head_to_batch_dim(self, tensor: torch.Tensor, out_dim: int = 3) -> torch.Tensor:
        r"""
        Reshape the tensor for multi-head attention processing.

        Args:
            tensor (`torch.Tensor`): The tensor to reshape.
            out_dim (`int`, *optional*, defaults to `3`): The output dimension of the tensor.

        Returns:
            `torch.Tensor`: The reshaped tensor.
        """
        head_size = self.heads
        if tensor.ndim == 3:
            batch_size, seq_len, dim = tensor.shape
            extra_dim = 1
        else:
            batch_size, extra_dim, seq_len, dim = tensor.shape

        tensor = tensor.reshape(batch_size, seq_len * extra_dim, head_size, dim // head_size)
        tensor = tensor.permute(0, 2, 1, 3)

        if out_dim == 3:
            tensor = tensor.reshape(batch_size * head_size, seq_len * extra_dim, dim // head_size)

        return tensor

    def get_attention_scores(
        self, query: torch.Tensor, key: torch.Tensor, attention_mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        r"""
        Compute the attention scores.

        Args:
            query (`torch.Tensor`): The query tensor.
            key (`torch.Tensor`): The key tensor.
            attention_mask (`torch.Tensor`, *optional*): The attention mask to use.

        Returns:
            `torch.Tensor`: The attention probabilities/scores.
        """
        dtype = query.dtype
        if self.upcast_attention:
            query = query.float()
            key = key.float()

        if attention_mask is None:
            baddbmm_input = torch.empty(
                query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device
            )
            beta = 0
        else:
            baddbmm_input = attention_mask
            beta = 1

        attention_scores = torch.baddbmm(
            baddbmm_input, query, key.transpose(-1, -2), beta=beta, alpha=self.scale
        )
        del baddbmm_input

        if self.upcast_softmax:
            attention_scores = attention_scores.float()

        attention_probs = attention_scores.softmax(dim=-1)
        del attention_scores

        attention_probs = attention_probs.to(dtype)

        return attention_probs

    def prepare_attention_mask(
        self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3
    ) -> torch.Tensor:
        r"""
        Prepare the attention mask for the attention computation.

        Args:
            attention_mask (`torch.Tensor`): The attention mask to prepare.
            target_length (`int`): The target length of the attention mask.
            batch_size (`int`): The batch size for repeating the attention mask.
            out_dim (`int`, *optional*, defaults to `3`): Output dimension.

        Returns:
            `torch.Tensor`: The prepared attention mask.
        """
        head_size = self.heads
        if attention_mask is None:
            return attention_mask

        current_length = attention_mask.shape[-1]
        if current_length != target_length:
            if attention_mask.device.type == "mps":
                # MPS does not support padding beyond the input tensor's dimension, so the padding
                # tensor is constructed manually and concatenated instead.
                padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length)
                padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device)
                attention_mask = torch.cat([attention_mask, padding], dim=2)
            else:
                attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)

        if out_dim == 3:
            if attention_mask.shape[0] < batch_size * head_size:
                attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
        elif out_dim == 4:
            attention_mask = attention_mask.unsqueeze(1)
            attention_mask = attention_mask.repeat_interleave(head_size, dim=1)

        return attention_mask

    def norm_encoder_hidden_states(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        r"""
        Normalize the encoder hidden states.

        Args:
            encoder_hidden_states (`torch.Tensor`): Hidden states of the encoder.

        Returns:
            `torch.Tensor`: The normalized encoder hidden states.
        """
        assert self.norm_cross is not None, "self.norm_cross must be defined to call self.norm_encoder_hidden_states"

        if isinstance(self.norm_cross, nn.LayerNorm):
            encoder_hidden_states = self.norm_cross(encoder_hidden_states)
        elif isinstance(self.norm_cross, nn.GroupNorm):
            # GroupNorm normalizes along the channel dimension and expects (N, C, *) input, so the
            # hidden dimension is moved into the channel position and back afterwards.
            encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
            encoder_hidden_states = self.norm_cross(encoder_hidden_states)
            encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
        else:
            assert False

        return encoder_hidden_states


def _chunked_feed_forward(ff: nn.Module, hidden_states: torch.Tensor, chunk_dim: int, chunk_size: int):
    # "feed_forward_chunk_size" can be used to save memory
    if hidden_states.shape[chunk_dim] % chunk_size != 0:
        raise ValueError(
            f"`hidden_states` dimension to be chunked: {hidden_states.shape[chunk_dim]} has to be divisible by chunk"
            f" size: {chunk_size}. Make sure to set an appropriate `chunk_size` when calling"
            " `unet.enable_forward_chunking`."
        )

    num_chunks = hidden_states.shape[chunk_dim] // chunk_size
    ff_output = torch.cat(
        [ff(hid_slice) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)],
        dim=chunk_dim,
    )
    return ff_output
@maybe_allow_in_graph
class GatedSelfAttentionDense(nn.Module):
    r"""
    A gated self-attention dense layer that combines visual features and object features.

    Parameters:
        query_dim (`int`): The number of channels in the query.
        context_dim (`int`): The number of channels in the context.
        n_heads (`int`): The number of heads to use for attention.
        d_head (`int`): The number of channels in each head.
    """

    def __init__(self, query_dim: int, context_dim: int, n_heads: int, d_head: int):
        super().__init__()

        # A linear projection is needed so that the object features can be concatenated with the visual features.
        self.linear = nn.Linear(context_dim, query_dim)

        self.attn = Attention(query_dim=query_dim, heads=n_heads, dim_head=d_head)
        self.ff = FeedForward(query_dim, activation_fn="geglu")

        self.norm1 = nn.LayerNorm(query_dim)
        self.norm2 = nn.LayerNorm(query_dim)

        self.register_parameter("alpha_attn", nn.Parameter(torch.tensor(0.0)))
        self.register_parameter("alpha_dense", nn.Parameter(torch.tensor(0.0)))

        self.enabled = True

    def forward(self, x: torch.Tensor, objs: torch.Tensor) -> torch.Tensor:
        if not self.enabled:
            return x

        n_visual = x.shape[1]
        objs = self.linear(objs)

        x = x + self.alpha_attn.tanh() * self.attn(self.norm1(torch.cat([x, objs], dim=1)))[:, :n_visual, :]
        x = x + self.alpha_dense.tanh() * self.ff(self.norm2(x))

        return x
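# Illustrative sketch (added for clarity; not part of the upstream module): the gated self-attention
# layer mixes visual tokens with projected object/grounding tokens and returns only the visual positions.
def _example_gated_self_attention() -> None:
    layer = GatedSelfAttentionDense(query_dim=64, context_dim=32, n_heads=4, d_head=16)
    x = torch.randn(1, 16, 64)    # visual features
    objs = torch.randn(1, 4, 32)  # object features, projected to `query_dim` internally
    out = layer(x, objs)          # shape: (1, 16, 64)
    assert out.shape == x.shape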
@maybe_allow_in_graph
class JointTransformerBlock(nn.Module):
    r"""
    A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.

    Reference: https://huggingface.co/papers/2403.03206

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the
            processing of `context` conditions.
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        context_pre_only: bool = False,
        qk_norm: Optional[str] = None,
        use_dual_attention: bool = False,
    ):
        super().__init__()

        self.use_dual_attention = use_dual_attention
        self.context_pre_only = context_pre_only
        context_norm_type = "ada_norm_continous" if context_pre_only else "ada_norm_zero"

        if use_dual_attention:
            self.norm1 = SD35AdaLayerNormZeroX(dim)
        else:
            self.norm1 = AdaLayerNormZero(dim)

        if context_norm_type == "ada_norm_continous":
            self.norm1_context = AdaLayerNormContinuous(
                dim, dim, elementwise_affine=False, eps=1e-6, bias=True, norm_type="layer_norm"
            )
        elif context_norm_type == "ada_norm_zero":
            self.norm1_context = AdaLayerNormZero(dim)
        else:
            raise ValueError(
                f"Unknown context_norm_type: {context_norm_type}, currently only support `ada_norm_continous`,"
                " `ada_norm_zero`"
            )

        if hasattr(F, "scaled_dot_product_attention"):
            processor = JointAttnProcessor2_0()
        else:
            raise ValueError(
                "The current PyTorch version does not support the `scaled_dot_product_attention` function."
            )

        self.attn = Attention(
            query_dim=dim,
            cross_attention_dim=None,
            added_kv_proj_dim=dim,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            out_dim=dim,
            context_pre_only=context_pre_only,
            bias=True,
            processor=processor,
            qk_norm=qk_norm,
            eps=1e-6,
        )

        if use_dual_attention:
            self.attn2 = Attention(
                query_dim=dim,
                cross_attention_dim=None,
                dim_head=attention_head_dim,
                heads=num_attention_heads,
                out_dim=dim,
                bias=True,
                processor=processor,
                qk_norm=qk_norm,
                eps=1e-6,
            )
        else:
            self.attn2 = None

        self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")

        if not context_pre_only:
            self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
            self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
        else:
            self.norm2_context = None
            self.ff_context = None

        # let chunk size default to None
        self._chunk_size = None
        self._chunk_dim = 0

    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
        # Sets chunk feed-forward
        self._chunk_size = chunk_size
        self._chunk_dim = dim

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: torch.FloatTensor,
        temb: torch.FloatTensor,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
    ):
        joint_attention_kwargs = joint_attention_kwargs or {}

        if self.use_dual_attention:
            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp, norm_hidden_states2, gate_msa2 = self.norm1(
                hidden_states, emb=temb
            )
        else:
            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)

        if self.context_pre_only:
            norm_encoder_hidden_states = self.norm1_context(encoder_hidden_states, temb)
        else:
            norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
                encoder_hidden_states, emb=temb
            )

        # Attention.
        attn_output, context_attn_output = self.attn(
            hidden_states=norm_hidden_states,
            encoder_hidden_states=norm_encoder_hidden_states,
            **joint_attention_kwargs,
        )

        # Process attention outputs for the `hidden_states`.
        attn_output = gate_msa.unsqueeze(1) * attn_output
        hidden_states = hidden_states + attn_output

        if self.use_dual_attention:
            attn_output2 = self.attn2(hidden_states=norm_hidden_states2, **joint_attention_kwargs)
            attn_output2 = gate_msa2.unsqueeze(1) * attn_output2
            hidden_states = hidden_states + attn_output2

        norm_hidden_states = self.norm2(hidden_states)
        norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
        if self._chunk_size is not None:
            # "feed_forward_chunk_size" can be used to save memory
            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
        else:
            ff_output = self.ff(norm_hidden_states)
        ff_output = gate_mlp.unsqueeze(1) * ff_output

        hidden_states = hidden_states + ff_output

        # Process attention outputs for the `encoder_hidden_states`.
        if self.context_pre_only:
            encoder_hidden_states = None
        else:
            context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
            encoder_hidden_states = encoder_hidden_states + context_attn_output

            norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
            norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
            if self._chunk_size is not None:
                # "feed_forward_chunk_size" can be used to save memory
                context_ff_output = _chunked_feed_forward(
                    self.ff_context, norm_encoder_hidden_states, self._chunk_dim, self._chunk_size
                )
            else:
                context_ff_output = self.ff_context(norm_encoder_hidden_states)
            encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output

        return encoder_hidden_states, hidden_states
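# Illustrative sketch (added for clarity; not part of the upstream module): the MMDiT-style block
# processes image tokens and text (context) tokens jointly and returns the updated
# (encoder_hidden_states, hidden_states) pair. All sizes below are arbitrary example values.
def _example_joint_transformer_block() -> None:
    block = JointTransformerBlock(dim=64, num_attention_heads=4, attention_head_dim=16, context_pre_only=False)
    hidden_states = torch.randn(1, 16, 64)         # image tokens
    encoder_hidden_states = torch.randn(1, 8, 64)  # text/context tokens
    temb = torch.randn(1, 64)                      # pooled conditioning embedding
    encoder_hidden_states, hidden_states = block(hidden_states, encoder_hidden_states, temb)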
@maybe_allow_in_graph
class BasicTransformerBlock(nn.Module):
    r"""
    A basic Transformer block.

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
        num_embeds_ada_norm (`int`, *optional*):
            The number of diffusion steps used during training. See `Transformer2DModel`.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Configure if the attentions should contain a bias parameter.
        only_cross_attention (`bool`, *optional*):
            Whether to use only cross-attention layers. In this case two cross attention layers are used.
        double_self_attention (`bool`, *optional*):
            Whether to use two self-attention layers. In this case no cross attention layers are used.
        upcast_attention (`bool`, *optional*):
            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
        norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
            Whether to use learnable elementwise affine parameters for normalization.
        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
            The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
        final_dropout (`bool` *optional*, defaults to False):
            Whether to apply a final dropout after the last feed-forward layer.
        attention_type (`str`, *optional*, defaults to `"default"`):
            The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
        positional_embeddings (`str`, *optional*, defaults to `None`):
            The type of positional embeddings to apply to.
        num_positional_embeddings (`int`, *optional*, defaults to `None`):
            The maximum number of positional embeddings to apply.
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        dropout=0.0,
        cross_attention_dim: Optional[int] = None,
        activation_fn: str = "geglu",
        num_embeds_ada_norm: Optional[int] = None,
        attention_bias: bool = False,
        only_cross_attention: bool = False,
        double_self_attention: bool = False,
        upcast_attention: bool = False,
        norm_elementwise_affine: bool = True,
        norm_type: str = "layer_norm",
        norm_eps: float = 1e-5,
        final_dropout: bool = False,
        attention_type: str = "default",
        positional_embeddings: Optional[str] = None,
        num_positional_embeddings: Optional[int] = None,
        ada_norm_continous_conditioning_embedding_dim: Optional[int] = None,
        ada_norm_bias: Optional[int] = None,
        ff_inner_dim: Optional[int] = None,
        ff_bias: bool = True,
        attention_out_bias: bool = True,
    ):
        super().__init__()
        self.dim = dim
        self.num_attention_heads = num_attention_heads
        self.attention_head_dim = attention_head_dim
        self.dropout = dropout
        self.cross_attention_dim = cross_attention_dim
        self.activation_fn = activation_fn
        self.attention_bias = attention_bias
        self.double_self_attention = double_self_attention
        self.norm_elementwise_affine = norm_elementwise_affine
        self.positional_embeddings = positional_embeddings
        self.num_positional_embeddings = num_positional_embeddings
        self.only_cross_attention = only_cross_attention

        # We keep these boolean flags for backward-compatibility.
        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
        self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
        self.use_layer_norm = norm_type == "layer_norm"
        self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous"

        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
            raise ValueError(
                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
            )

        self.norm_type = norm_type
        self.num_embeds_ada_norm = num_embeds_ada_norm

        if positional_embeddings and (num_positional_embeddings is None):
            raise ValueError(
                "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
            )

        if positional_embeddings == "sinusoidal":
            self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
        else:
            self.pos_embed = None

        # Define 3 blocks. Each block has its own normalization layer.
        # 1. Self-Attn
        if norm_type == "ada_norm":
            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
        elif norm_type == "ada_norm_zero":
            self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
        elif norm_type == "ada_norm_continuous":
            self.norm1 = AdaLayerNormContinuous(
                dim, ada_norm_continous_conditioning_embedding_dim, norm_elementwise_affine, norm_eps, ada_norm_bias,
                "rms_norm",
            )
        else:
            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)

        self.attn1 = Attention(
            query_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
            upcast_attention=upcast_attention,
            out_bias=attention_out_bias,
        )

        # 2. Cross-Attn
        if cross_attention_dim is not None or double_self_attention:
            if norm_type == "ada_norm":
                self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
            elif norm_type == "ada_norm_continuous":
                self.norm2 = AdaLayerNormContinuous(
                    dim, ada_norm_continous_conditioning_embedding_dim, norm_elementwise_affine, norm_eps,
                    ada_norm_bias, "rms_norm",
                )
            else:
                self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)

            self.attn2 = Attention(
                query_dim=dim,
                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
                heads=num_attention_heads,
                dim_head=attention_head_dim,
                dropout=dropout,
                bias=attention_bias,
                upcast_attention=upcast_attention,
                out_bias=attention_out_bias,
            )  # is self-attn if encoder_hidden_states is none
        else:
            if norm_type == "ada_norm_single":
                self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
            else:
                self.norm2 = None
            self.attn2 = None

        # 3. Feed-forward
        if norm_type == "ada_norm_continuous":
            self.norm3 = AdaLayerNormContinuous(
                dim, ada_norm_continous_conditioning_embedding_dim, norm_elementwise_affine, norm_eps, ada_norm_bias,
                "layer_norm",
            )
        elif norm_type in ["ada_norm_zero", "ada_norm", "layer_norm"]:
            self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
        elif norm_type == "layer_norm_i2vgen":
            self.norm3 = None

        self.ff = FeedForward(
            dim,
            dropout=dropout,
            activation_fn=activation_fn,
            final_dropout=final_dropout,
            inner_dim=ff_inner_dim,
            bias=ff_bias,
        )

        # 4. Fuser
        if attention_type == "gated" or attention_type == "gated-text-image":
            self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)

        # 5. Scale-shift table (PixArt-style `ada_norm_single`).
        if norm_type == "ada_norm_single":
            self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)

        # let chunk size default to None
        self._chunk_size = None
        self._chunk_dim = 0

    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
        # Sets chunk feed-forward
        self._chunk_size = chunk_size
        self._chunk_dim = dim

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        timestep: Optional[torch.LongTensor] = None,
        cross_attention_kwargs: Dict[str, Any] = None,
        class_labels: Optional[torch.LongTensor] = None,
        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
    ) -> torch.Tensor:
        if cross_attention_kwargs is not None:
            if cross_attention_kwargs.get("scale", None) is not None:
                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

        # Notice that normalization is always applied before the real computation in the following blocks.
        # 0. Self-Attention
        batch_size = hidden_states.shape[0]

        if self.norm_type == "ada_norm":
            norm_hidden_states = self.norm1(hidden_states, timestep)
        elif self.norm_type == "ada_norm_zero":
            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
                hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
            )
        elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]:
            norm_hidden_states = self.norm1(hidden_states)
        elif self.norm_type == "ada_norm_continuous":
            norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
        elif self.norm_type == "ada_norm_single":
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
                self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
            ).chunk(6, dim=1)
            norm_hidden_states = self.norm1(hidden_states)
            norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
        else:
            raise ValueError("Incorrect norm used")

        if self.pos_embed is not None:
            norm_hidden_states = self.pos_embed(norm_hidden_states)

        # 1. Prepare GLIGEN inputs
        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
        gligen_kwargs = cross_attention_kwargs.pop("gligen", None)

        attn_output = self.attn1(
            norm_hidden_states,
            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
            attention_mask=attention_mask,
            **cross_attention_kwargs,
        )

        if self.norm_type == "ada_norm_zero":
            attn_output = gate_msa.unsqueeze(1) * attn_output
        elif self.norm_type == "ada_norm_single":
            attn_output = gate_msa * attn_output

        hidden_states = attn_output + hidden_states
        if hidden_states.ndim == 4:
            hidden_states = hidden_states.squeeze(1)

        # 1.2 GLIGEN Control
        if gligen_kwargs is not None:
            hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])

        # 3. Cross-Attention
        if self.attn2 is not None:
            if self.norm_type == "ada_norm":
                norm_hidden_states = self.norm2(hidden_states, timestep)
            elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]:
                norm_hidden_states = self.norm2(hidden_states)
            elif self.norm_type == "ada_norm_single":
                # For `ada_norm_single`, norm2 is not applied before cross-attention.
                norm_hidden_states = hidden_states
            elif self.norm_type == "ada_norm_continuous":
                norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
            else:
                raise ValueError("Incorrect norm")

            if self.pos_embed is not None and self.norm_type != "ada_norm_single":
                norm_hidden_states = self.pos_embed(norm_hidden_states)

            attn_output = self.attn2(
                norm_hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                **cross_attention_kwargs,
            )
            hidden_states = attn_output + hidden_states

        # 4. Feed-forward
        if self.norm_type == "ada_norm_continuous":
            norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"])
        elif not self.norm_type == "ada_norm_single":
            norm_hidden_states = self.norm3(hidden_states)

        if self.norm_type == "ada_norm_zero":
            norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]

        if self.norm_type == "ada_norm_single":
            norm_hidden_states = self.norm2(hidden_states)
            norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp

        if self._chunk_size is not None:
            # "feed_forward_chunk_size" can be used to save memory
            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
        else:
            ff_output = self.ff(norm_hidden_states)

        if self.norm_type == "ada_norm_zero":
            ff_output = gate_mlp.unsqueeze(1) * ff_output
        elif self.norm_type == "ada_norm_single":
            ff_output = gate_mlp * ff_output

        hidden_states = ff_output + hidden_states
        if hidden_states.ndim == 4:
            hidden_states = hidden_states.squeeze(1)

        return hidden_states
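# Illustrative sketch (added for clarity; not part of the upstream module): a plain self-attention
# BasicTransformerBlock with feed-forward chunking enabled through `set_chunk_feed_forward`.
def _example_basic_transformer_block() -> None:
    block = BasicTransformerBlock(dim=64, num_attention_heads=4, attention_head_dim=16)
    block.set_chunk_feed_forward(chunk_size=32, dim=1)
    hidden_states = torch.randn(1, 128, 64)
    out = block(hidden_states)  # shape: (1, 128, 64)
    assert out.shape == hidden_states.shape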
class LuminaFeedForward(nn.Module):
    r"""
    A feed-forward layer.

    Parameters:
        hidden_size (`int`):
            The dimensionality of the hidden layers in the model. This parameter determines the width of the model's
            hidden representations.
        intermediate_size (`int`): The intermediate dimension of the feedforward layer.
        multiple_of (`int`, *optional*): Value to ensure the hidden dimension is a multiple of this value.
        ffn_dim_multiplier (`float`, *optional*): Custom multiplier for the hidden dimension. Defaults to None.
    """

    def __init__(
        self,
        dim: int,
        inner_dim: int,
        multiple_of: Optional[int] = 256,
        ffn_dim_multiplier: Optional[float] = None,
    ):
        super().__init__()
        inner_dim = int(2 * inner_dim / 3)
        # custom dim factor multiplier
        if ffn_dim_multiplier is not None:
            inner_dim = int(ffn_dim_multiplier * inner_dim)
        inner_dim = multiple_of * ((inner_dim + multiple_of - 1) // multiple_of)

        self.linear_1 = nn.Linear(dim, inner_dim, bias=False)
        self.linear_2 = nn.Linear(inner_dim, dim, bias=False)
        self.linear_3 = nn.Linear(dim, inner_dim, bias=False)
        self.silu = FP32SiLU()

    def forward(self, x):
        return self.linear_2(self.silu(self.linear_1(x)) * self.linear_3(x))
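# Illustrative sketch (added for clarity; not part of the upstream module): LuminaFeedForward is a
# SwiGLU-style MLP; the effective inner dimension is rescaled and rounded up to a multiple of `multiple_of`.
def _example_lumina_feed_forward() -> None:
    ff = LuminaFeedForward(dim=64, inner_dim=256, multiple_of=32)
    out = ff(torch.randn(1, 16, 64))  # shape: (1, 16, 64)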
@maybe_allow_in_graph
class TemporalBasicTransformerBlock(nn.Module):
    r"""
    A basic Transformer block for video like data.

    Parameters:
        dim (`int`): The number of channels in the input and output.
        time_mix_inner_dim (`int`): The number of channels for temporal attention.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
    """

    def __init__(
        self,
        dim: int,
        time_mix_inner_dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        cross_attention_dim: Optional[int] = None,
    ):
        super().__init__()
        self.is_res = dim == time_mix_inner_dim

        self.norm_in = nn.LayerNorm(dim)

        # Define 3 blocks. Each block has its own normalization layer.
        # 1. Self-Attn
        self.ff_in = FeedForward(dim, dim_out=time_mix_inner_dim, activation_fn="geglu")

        self.norm1 = nn.LayerNorm(time_mix_inner_dim)
        self.attn1 = Attention(
            query_dim=time_mix_inner_dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            cross_attention_dim=None,
        )

        # 2. Cross-Attn
        if cross_attention_dim is not None:
            self.norm2 = nn.LayerNorm(time_mix_inner_dim)
            self.attn2 = Attention(
                query_dim=time_mix_inner_dim,
                cross_attention_dim=cross_attention_dim,
                heads=num_attention_heads,
                dim_head=attention_head_dim,
            )  # is self-attn if encoder_hidden_states is none
        else:
            self.norm2 = None
            self.attn2 = None

        # 3. Feed-forward
        self.norm_out = nn.LayerNorm(time_mix_inner_dim)
        self.ff = FeedForward(time_mix_inner_dim, activation_fn="geglu")

        # let chunk size default to None
        self._chunk_size = None
        self._chunk_dim = None

    def set_chunk_feed_forward(self, chunk_size: Optional[int], **kwargs):
        # Sets chunk feed-forward
        self._chunk_size = chunk_size
        # chunk dim should be hardcoded to 1 to have better speed vs. memory trade-off
        self._chunk_dim = 1

    def forward(
        self,
        hidden_states: torch.Tensor,
        num_frames: int,
        encoder_hidden_states: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        # Notice that normalization is always applied before the real computation in the following blocks.
        # 0. Self-Attention
        batch_frames, seq_length, channels = hidden_states.shape
        batch_size = batch_frames // num_frames

        hidden_states = hidden_states[None, :].reshape(batch_size, num_frames, seq_length, channels)
        hidden_states = hidden_states.permute(0, 2, 1, 3)
        hidden_states = hidden_states.reshape(batch_size * seq_length, num_frames, channels)

        residual = hidden_states
        hidden_states = self.norm_in(hidden_states)

        if self._chunk_size is not None:
            hidden_states = _chunked_feed_forward(self.ff_in, hidden_states, self._chunk_dim, self._chunk_size)
        else:
            hidden_states = self.ff_in(hidden_states)

        if self.is_res:
            hidden_states = hidden_states + residual

        norm_hidden_states = self.norm1(hidden_states)
        attn_output = self.attn1(norm_hidden_states, encoder_hidden_states=None)
        hidden_states = attn_output + hidden_states

        # 3. Cross-Attention
        if self.attn2 is not None:
            norm_hidden_states = self.norm2(hidden_states)
            attn_output = self.attn2(norm_hidden_states, encoder_hidden_states=encoder_hidden_states)
            hidden_states = attn_output + hidden_states

        # 4. Feed-forward
        norm_hidden_states = self.norm_out(hidden_states)

        if self._chunk_size is not None:
            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
        else:
            ff_output = self.ff(norm_hidden_states)

        if self.is_res:
            hidden_states = ff_output + hidden_states
        else:
            hidden_states = ff_output

        hidden_states = hidden_states[None, :].reshape(batch_size, seq_length, num_frames, channels)
        hidden_states = hidden_states.permute(0, 2, 1, 3)
        hidden_states = hidden_states.reshape(batch_size * num_frames, seq_length, channels)

        return hidden_states


@maybe_allow_in_graph
class SkipFFTransformerBlock(nn.Module):
    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        kv_input_dim: int,
        kv_input_dim_proj_use_bias: bool,
        dropout=0.0,
        cross_attention_dim: Optional[int] = None,
        attention_bias: bool = False,
        attention_out_bias: bool = True,
    ):
        super().__init__()
        if kv_input_dim != dim:
            self.kv_mapper = nn.Linear(kv_input_dim, dim, kv_input_dim_proj_use_bias)
        else:
            self.kv_mapper = None

        self.norm1 = RMSNorm(dim, 1e-06)

        self.attn1 = Attention(
            query_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
            cross_attention_dim=cross_attention_dim,
            out_bias=attention_out_bias,
        )

        self.norm2 = RMSNorm(dim, 1e-06)

        self.attn2 = Attention(
            query_dim=dim,
            cross_attention_dim=cross_attention_dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
            out_bias=attention_out_bias,
        )

    def forward(self, hidden_states, encoder_hidden_states, cross_attention_kwargs):
        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}

        if self.kv_mapper is not None:
            encoder_hidden_states = self.kv_mapper(F.silu(encoder_hidden_states))

        norm_hidden_states = self.norm1(hidden_states)

        attn_output = self.attn1(
            norm_hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            **cross_attention_kwargs,
        )

        hidden_states = attn_output + hidden_states

        norm_hidden_states = self.norm2(hidden_states)

        attn_output = self.attn2(
            norm_hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            **cross_attention_kwargs,
        )

        hidden_states = attn_output + hidden_states

        return hidden_states
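# Illustrative sketch (added for clarity; not part of the upstream module): the temporal block attends
# across frames, so the input is laid out as (batch_size * num_frames, seq_len, dim) and reshaped internally.
def _example_temporal_transformer_block() -> None:
    block = TemporalBasicTransformerBlock(dim=64, time_mix_inner_dim=64, num_attention_heads=4, attention_head_dim=16)
    hidden_states = torch.randn(2 * 8, 16, 64)  # 2 videos, 8 frames each, 16 spatial tokens
    out = block(hidden_states, num_frames=8)    # shape: (16, 16, 64)
    assert out.shape == hidden_states.shape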
@maybe_allow_in_graph
class FreeNoiseTransformerBlock(nn.Module):
    r"""
    A FreeNoise Transformer block.

    Parameters:
        dim (`int`):
            The number of channels in the input and output.
        num_attention_heads (`int`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`):
            The number of channels in each head.
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability to use.
        cross_attention_dim (`int`, *optional*):
            The size of the encoder_hidden_states vector for cross attention.
        activation_fn (`str`, *optional*, defaults to `"geglu"`):
            Activation function to be used in feed-forward.
        num_embeds_ada_norm (`int`, *optional*):
            The number of diffusion steps used during training. See `Transformer2DModel`.
        attention_bias (`bool`, defaults to `False`):
            Configure if the attentions should contain a bias parameter.
        only_cross_attention (`bool`, defaults to `False`):
            Whether to use only cross-attention layers. In this case two cross attention layers are used.
        double_self_attention (`bool`, defaults to `False`):
            Whether to use two self-attention layers. In this case no cross attention layers are used.
        upcast_attention (`bool`, defaults to `False`):
            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
        norm_elementwise_affine (`bool`, defaults to `True`):
            Whether to use learnable elementwise affine parameters for normalization.
        norm_type (`str`, defaults to `"layer_norm"`):
            The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
        final_dropout (`bool` defaults to `False`):
            Whether to apply a final dropout after the last feed-forward layer.
        attention_type (`str`, defaults to `"default"`):
            The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
        positional_embeddings (`str`, *optional*):
            The type of positional embeddings to apply to.
        num_positional_embeddings (`int`, *optional*, defaults to `None`):
            The maximum number of positional embeddings to apply.
        ff_inner_dim (`int`, *optional*):
            Hidden dimension of feed-forward MLP.
        ff_bias (`bool`, defaults to `True`):
            Whether or not to use bias in feed-forward MLP.
        attention_out_bias (`bool`, defaults to `True`):
            Whether or not to use bias in attention output project layer.
        context_length (`int`, defaults to `16`):
            The maximum number of frames that the FreeNoise block processes at once.
        context_stride (`int`, defaults to `4`):
            The number of frames to be skipped before starting to process a new batch of `context_length` frames.
        weighting_scheme (`str`, defaults to `"pyramid"`):
            The weighting scheme to use for weighting averaging of processed latent frames. As described in the
            Equation 9. of the [FreeNoise](https://huggingface.co/papers/2310.15169) paper, "pyramid" is the default
            setting used.
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        dropout: float = 0.0,
        cross_attention_dim: Optional[int] = None,
        activation_fn: str = "geglu",
        num_embeds_ada_norm: Optional[int] = None,
        attention_bias: bool = False,
        only_cross_attention: bool = False,
        double_self_attention: bool = False,
        upcast_attention: bool = False,
        norm_elementwise_affine: bool = True,
        norm_type: str = "layer_norm",
        norm_eps: float = 1e-5,
        final_dropout: bool = False,
        positional_embeddings: Optional[str] = None,
        num_positional_embeddings: Optional[int] = None,
        ff_inner_dim: Optional[int] = None,
        ff_bias: bool = True,
        attention_out_bias: bool = True,
        context_length: int = 16,
        context_stride: int = 4,
        weighting_scheme: str = "pyramid",
    ):
        super().__init__()
        self.dim = dim
        self.num_attention_heads = num_attention_heads
        self.attention_head_dim = attention_head_dim
        self.dropout = dropout
        self.cross_attention_dim = cross_attention_dim
        self.activation_fn = activation_fn
        self.attention_bias = attention_bias
        self.double_self_attention = double_self_attention
        self.norm_elementwise_affine = norm_elementwise_affine
        self.positional_embeddings = positional_embeddings
        self.num_positional_embeddings = num_positional_embeddings
        self.only_cross_attention = only_cross_attention

        self.set_free_noise_properties(context_length, context_stride, weighting_scheme)

        # We keep these boolean flags for backward-compatibility.
        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
        self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
        self.use_layer_norm = norm_type == "layer_norm"
        self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous"

        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
            raise ValueError(
                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
            )

        self.norm_type = norm_type
        self.num_embeds_ada_norm = num_embeds_ada_norm

        if positional_embeddings and (num_positional_embeddings is None):
            raise ValueError(
                "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
            )

        if positional_embeddings == "sinusoidal":
            self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
        else:
            self.pos_embed = None

        # Define 3 blocks. Each block has its own normalization layer.
        # 1. Self-Attn
        self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)

        self.attn1 = Attention(
            query_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
            upcast_attention=upcast_attention,
            out_bias=attention_out_bias,
        )

        # 2. Cross-Attn
        if cross_attention_dim is not None or double_self_attention:
            self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)

            self.attn2 = Attention(
                query_dim=dim,
                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
                heads=num_attention_heads,
                dim_head=attention_head_dim,
                dropout=dropout,
                bias=attention_bias,
                upcast_attention=upcast_attention,
                out_bias=attention_out_bias,
            )  # is self-attn if encoder_hidden_states is none

        # 3. Feed-forward
        self.ff = FeedForward(
            dim,
            dropout=dropout,
            activation_fn=activation_fn,
            final_dropout=final_dropout,
            inner_dim=ff_inner_dim,
            bias=ff_bias,
        )

        self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)

        # let chunk size default to None
        self._chunk_size = None
        self._chunk_dim = 0

    def _get_frame_indices(self, num_frames: int) -> List[Tuple[int, int]]:
        frame_indices = []
        for i in range(0, num_frames - self.context_length + 1, self.context_stride):
            window_start = i
            window_end = min(num_frames, i + self.context_length)
            frame_indices.append((window_start, window_end))
        return frame_indices

    def _get_frame_weights(self, num_frames: int, weighting_scheme: str = "pyramid") -> List[float]:
        if weighting_scheme == "flat":
            weights = [1.0] * num_frames

        elif weighting_scheme == "pyramid":
            if num_frames % 2 == 0:
                # num_frames = 4 => [1, 2, 2, 1]
                mid = num_frames // 2
                weights = list(range(1, mid + 1))
                weights = weights + weights[::-1]
            else:
                # num_frames = 5 => [1, 2, 3, 2, 1]
                mid = (num_frames + 1) // 2
                weights = list(range(1, mid))
                weights = weights + [mid] + weights[::-1]

        elif weighting_scheme == "delayed_reverse_sawtooth":
            if num_frames % 2 == 0:
                # num_frames = 4 => [0.01, 2, 2, 1]
                mid = num_frames // 2
                weights = [0.01] * (mid - 1) + [mid]
                weights = weights + list(range(mid, 0, -1))
            else:
                # num_frames = 5 => [0.01, 0.01, 3, 2, 1]
                mid = (num_frames + 1) // 2
                weights = [0.01] * mid
                weights = weights + list(range(mid, 0, -1))

        else:
            raise ValueError(f"Unsupported value for weighting_scheme={weighting_scheme}")

        return weights

    def set_free_noise_properties(
        self, context_length: int, context_stride: int, weighting_scheme: str = "pyramid"
    ) -> None:
        self.context_length = context_length
        self.context_stride = context_stride
        self.weighting_scheme = weighting_scheme

    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0) -> None:
        # Sets chunk feed-forward
        self._chunk_size = chunk_size
        self._chunk_dim = dim

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        cross_attention_kwargs: Dict[str, Any] = None,
        *args,
        **kwargs,
    ) -> torch.Tensor:
        if cross_attention_kwargs is not None:
            if cross_attention_kwargs.get("scale", None) is not None:
                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}

        # hidden_states: [B x H x W, F, C]
        device = hidden_states.device
        dtype = hidden_states.dtype

        num_frames = hidden_states.size(1)
        frame_indices = self._get_frame_indices(num_frames)
        frame_weights = self._get_frame_weights(self.context_length, self.weighting_scheme)
        frame_weights = torch.tensor(frame_weights, device=device, dtype=dtype).unsqueeze(0).unsqueeze(-1)
        is_last_frame_batch_complete = frame_indices[-1][1] == num_frames

        # Handle out-of-bounds case if num_frames isn't perfectly divisible by context_length
        if not is_last_frame_batch_complete:
            if num_frames < self.context_length:
                raise ValueError(f"Expected {num_frames=} to be greater or equal than {self.context_length=}")
            last_frame_batch_length = num_frames - frame_indices[-1][1]
            frame_indices.append((num_frames - self.context_length, num_frames))

        num_times_accumulated = torch.zeros((1, num_frames, 1), device=device)
        accumulated_values = torch.zeros_like(hidden_states)

        for i, (frame_start, frame_end) in enumerate(frame_indices):
            # Every window processed here has exactly `context_length` frames, so the per-frame weights
            # can be broadcast over the sliced accumulator.
            weights = torch.ones_like(num_times_accumulated[:, frame_start:frame_end])
            weights *= frame_weights

            hidden_states_chunk = hidden_states[:, frame_start:frame_end]

            # Notice that normalization is always applied before the real computation in the following blocks.
            # 1. Self-Attention
            norm_hidden_states = self.norm1(hidden_states_chunk)

            if self.pos_embed is not None:
                norm_hidden_states = self.pos_embed(norm_hidden_states)

            attn_output = self.attn1(
                norm_hidden_states,
                encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
                attention_mask=attention_mask,
                **cross_attention_kwargs,
            )

            hidden_states_chunk = attn_output + hidden_states_chunk
            if hidden_states_chunk.ndim == 4:
                hidden_states_chunk = hidden_states_chunk.squeeze(1)

            # 2. Cross-Attention
            if self.attn2 is not None:
                norm_hidden_states = self.norm2(hidden_states_chunk)

                if self.pos_embed is not None and self.norm_type != "ada_norm_single":
                    norm_hidden_states = self.pos_embed(norm_hidden_states)

                attn_output = self.attn2(
                    norm_hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    attention_mask=encoder_attention_mask,
                    **cross_attention_kwargs,
                )
                hidden_states_chunk = attn_output + hidden_states_chunk

            if i == len(frame_indices) - 1 and not is_last_frame_batch_complete:
                accumulated_values[:, -last_frame_batch_length:] += (
                    hidden_states_chunk[:, -last_frame_batch_length:] * weights[:, -last_frame_batch_length:]
                )
                num_times_accumulated[:, -last_frame_batch_length:] += weights[:, -last_frame_batch_length:]
            else:
                accumulated_values[:, frame_start:frame_end] += hidden_states_chunk * weights
                num_times_accumulated[:, frame_start:frame_end] += weights

        # Average the overlapping windows by the number of accumulated (weighted) contributions per frame.
        hidden_states = torch.cat(
            [
                torch.where(num_times_split > 0, accumulated_split / num_times_split, accumulated_split)
                for accumulated_split, num_times_split in zip(
                    accumulated_values.split(self.context_length, dim=1),
                    num_times_accumulated.split(self.context_length, dim=1),
                )
            ],
            dim=1,
        ).to(dtype)

        # 3. Feed-forward
        norm_hidden_states = self.norm3(hidden_states)

        if self._chunk_size is not None:
            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
        else:
            ff_output = self.ff(norm_hidden_states)

        hidden_states = ff_output + hidden_states
        if hidden_states.ndim == 4:
            hidden_states = hidden_states.squeeze(1)

        return hidden_states
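# Illustrative sketch (added for clarity; not part of the upstream module): FreeNoise processes long
# videos in overlapping windows of `context_length` frames, advancing by `context_stride`, and averages
# the overlaps with the chosen frame weights.
def _example_free_noise_windows() -> None:
    block = FreeNoiseTransformerBlock(
        dim=64, num_attention_heads=4, attention_head_dim=16, context_length=16, context_stride=4
    )
    # For 24 frames the sliding windows are [(0, 16), (4, 20), (8, 24)].
    frame_indices = block._get_frame_indices(24)
    # "pyramid" weights ramp up towards the window center and back down.
    frame_weights = block._get_frame_weights(16, "pyramid")
    assert len(frame_indices) == 3 and len(frame_weights) == 16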
class FeedForward(nn.Module):
    r"""
    A feed-forward layer.

    Parameters:
        dim (`int`): The number of channels in the input.
        dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
        mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
        final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
    """

    def __init__(
        self,
        dim: int,
        dim_out: Optional[int] = None,
        mult: int = 4,
        dropout: float = 0.0,
        activation_fn: str = "geglu",
        final_dropout: bool = False,
        inner_dim=None,
        bias: bool = True,
    ):
        super().__init__()
        if inner_dim is None:
            inner_dim = int(dim * mult)
        dim_out = dim_out if dim_out is not None else dim

        if activation_fn == "gelu":
            act_fn = GELU(dim, inner_dim, bias=bias)
        if activation_fn == "gelu-approximate":
            act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
        elif activation_fn == "geglu":
            act_fn = GEGLU(dim, inner_dim, bias=bias)
        elif activation_fn == "geglu-approximate":
            act_fn = ApproximateGELU(dim, inner_dim, bias=bias)
        elif activation_fn == "swiglu":
            act_fn = SwiGLU(dim, inner_dim, bias=bias)
        elif activation_fn == "linear-silu":
            act_fn = LinearActivation(dim, inner_dim, bias=bias, activation="silu")

        self.net = nn.ModuleList([])
        # project in
        self.net.append(act_fn)
        # project dropout
        self.net.append(nn.Dropout(dropout))
        # project out
        self.net.append(nn.Linear(inner_dim, dim_out, bias=bias))
        # FF as used in Vision Transformer, MLP-Mixer and related networks have a final dropout
        if final_dropout:
            self.net.append(nn.Dropout(dropout))

    def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor:
        if len(args) > 0 or kwargs.get("scale", None) is not None:
            deprecation_message = (
                "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise"
                " an error in the future. `scale` should directly be passed while calling the underlying pipeline"
                " component i.e., via `cross_attention_kwargs`."
            )
            deprecate("scale", "1.0.0", deprecation_message)
        for module in self.net:
            hidden_states = module(hidden_states)
        return hidden_states
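# Illustrative sketch (added for clarity; not part of the upstream module): FeedForward with the default
# GEGLU activation keeps the channel dimension unless `dim_out` is given.
def _example_feed_forward() -> None:
    ff = FeedForward(dim=64, mult=4, activation_fn="geglu")
    out = ff(torch.randn(1, 16, 64))  # shape: (1, 16, 64)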