
    bi                    
   d dl Z d dlmZmZmZmZ d dlZd dlZd dl	m
c mZ d dlm
Z
 ddlmZ ddlmZmZ ddlmZ 	 	 	 	 dd
ej(                  dedededededej(                  fdZ	 	 	 	 ddedeeeeef   f   dedededeej2                     dedej(                  fdZ	 	 ddedeeeeef   f   dedededej8                  fdZ	 	 	 	 	 	 ddeej2                     defdZddZddZ 	 ddZ!d  Z"d! Z# G d" d#e
jH                        Z% G d$ d%e
jH                        Z& G d& d'e
jH                        Z' G d( d)e
jH                        Z(	 	 	 	 	 dd+ed,ed-ed.eeeef      deej2                     deej(                  eej(                  ej(                  f   f   fd/Z)	 	 	 dd0eeeef   d+edeej2                     deej(                  eej(                  ej(                  f   f   fd1Z*	 ddeej2                     defd2Z+dd3Z,dd4Z-dd5Z.d6d	ddd*ej^                  fd7ed8eej8                  ef   d+efd9Z0	 	 	 dd:ej(                  d;eej(                  eej(                     f   d,ed<ed=edeej(                  ej(                  f   fd>Z1d:ej(                  fd?Z2 G d@ dAe
jH                        Z3 G dB dCe
jH                        Z4 G dD dEe
jH                        Z5 G dF dGe
jH                        Z6 G dH dIe
jH                        Z7 G dJ dKe
jH                        Z8 G dL dMe
jH                        Z9 G dN dOe
jH                        Z: G dP dQe
jH                        Z; G dR dSe
jH                        Z< G dT dUe
jH                        Z= G dV dWe
jH                        Z> G dX dYe
jH                        Z? G dZ d[e
jH                        Z@ G d\ d]e
jH                        ZA G d^ d_e
jH                        ZB G d` dae
jH                        ZC G db dce
jH                        ZD G dd dee
jH                        ZE G df dge
jH                        ZF G dh die
jH                        ZG G dj dke
jH                        ZH G dl dme
jH                        ZI G dn doe
jH                        ZJdp ZK G dq dre
jH                        ZL G ds dte
jH                        ZM G du dve
jH                        ZN G dw dxe
jH                        ZO G dy dze
jH                        ZP G d{ d|e
jH                        ZQ G d} d~e
jH                        ZR G d de
jH                        ZS G d de
jH                        ZT G d de
jH                        ZUy)    N)ListOptionalTupleUnion)nn   )	deprecate   )FP32SiLUget_activation)	AttentionF	timestepsembedding_dimflip_sin_to_cosdownscale_freq_shiftscale
max_periodreturnc                    t        | j                        dk(  sJ d       |dz  }t        j                  |       t	        j
                  d|t        j                  | j                        z  }|||z
  z  }t	        j                  |      }| dddf   j                         |dddf   z  }||z  }t	        j                  t	        j                  |      t	        j                  |      gd      }|r+t	        j                  |dd|df   |ddd|f   gd      }|dz  dk(  r*t        j                  j                  j                  |d	      }|S )
a&  
    This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.

    Args
        timesteps (torch.Tensor):
            a 1-D Tensor of N indices, one per batch element. These may be fractional.
        embedding_dim (int):
            the dimension of the output.
        flip_sin_to_cos (bool):
            Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False)
        downscale_freq_shift (float):
            Controls the delta between frequencies between dimensions
        scale (float):
            Scaling factor applied to the embeddings.
        max_period (int):
            Controls the maximum frequency of the embeddings
    Returns
        torch.Tensor: an [N x dim] Tensor of positional embeddings.
    r
   zTimesteps should be a 1d-arrayr   r   )startenddtypedeviceNdim)r   r
   r   r   )lenshapemathlogtorcharangefloat32r   expfloatcatsincosr   
functionalpad)	r   r   r   r   r   r   half_dimexponentembs	            V/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/diffusers/models/embeddings.pyget_timestep_embeddingr/      s7   6 y1$F&FF$!H$$u||XU]]9;K;K( H 8&::;H
))H
C
AtG

"
"
$s47|
3C #+C ))UYYs^UYYs^4"
=C iiQ	\*C9H9,=>BG qAhh!!%%c<8J          ?	embed_dimspatial_sizetemporal_sizespatial_interpolation_scaletemporal_interpolation_scaler   output_typec                 H   |dk(  rt        | ||||      S | dz  dk7  rt        d      t        |t              r||f}d| z  dz  }| dz  }t	        j
                  |d   |t        j                        |z  }	t	        j
                  |d   |t        j                        |z  }
t	        j                  |
|	d	
      }t	        j                  |d      }|j                  dd|d   |d   g      }t        ||d      }t	        j
                  ||t        j                        |z  }t        ||d      }|dddddf   }|j                  |d|j                  d   |z        }|dddddf   }|j                  |d   |d   z  d      }t	        j                  ||gd      }|S )a  
    Creates 3D sinusoidal positional embeddings.

    Args:
        embed_dim (`int`):
            The embedding dimension of inputs. It must be divisible by 16.
        spatial_size (`int` or `Tuple[int, int]`):
            The spatial dimension of positional embeddings. If an integer is provided, the same size is applied to both
            spatial dimensions (height and width).
        temporal_size (`int`):
            The temporal dimension of positional embeddings (number of frames).
        spatial_interpolation_scale (`float`, defaults to 1.0):
            Scale factor for spatial grid interpolation.
        temporal_interpolation_scale (`float`, defaults to 1.0):
            Scale factor for temporal grid interpolation.

    Returns:
        `torch.Tensor`:
            The 3D sinusoidal positional embeddings of shape `[temporal_size, spatial_size[0] * spatial_size[1],
            embed_dim]`.
    np)r2   r3   r4   r5   r6      r   "`embed_dim` must be divisible by 4   r
   r   r   xyindexingr   r   ptr7   Nr   output_sizer   )_get_3d_sincos_pos_embed_np
ValueError
isinstanceintr!   r"   r#   meshgridstackreshape!get_2d_sincos_pos_embed_from_grid!get_1d_sincos_pos_embed_from_gridrepeat_interleaver   concat)r2   r3   r4   r5   r6   r   r7   embed_dim_spatialembed_dim_temporalgrid_hgrid_wgridpos_embed_spatialgrid_tpos_embed_temporal	pos_embeds                   r.   get_3d_sincos_pos_embedrY   Q   s   < d*%'(C)E
 	
 1}=>>,$$l3I*"a \\,q/&NQllF\\,q/&NQllF>>&&48D;;t#D<<A|AQ@AD9:KT_cd \\-emmLOkkF:;Mvcgh *$1*5);;1*;*A*A!*D}*T <  ,AtQJ7+==Q,q/)q >  02CD"MIr0   c                 X   d}t        dd|d       | dz  dk7  rt        d      t        |t              r||f}d	| z  dz  }| dz  }t	        j
                  |d
   t        j                        |z  }t	        j
                  |d   t        j                        |z  }	t	        j                  |	|      }
t	        j                  |
d      }
|
j                  dd
|d
   |d   g      }
t        ||
      }t	        j
                  |t        j                        |z  }t        ||      }|t        j                  ddddf   }t	        j                  ||d      }|ddt        j                  ddf   }t	        j                  ||d   |d
   z  d
      }t	        j                  ||gd      }|S )a  
    Creates 3D sinusoidal positional embeddings.

    Args:
        embed_dim (`int`):
            The embedding dimension of inputs. It must be divisible by 16.
        spatial_size (`int` or `Tuple[int, int]`):
            The spatial dimension of positional embeddings. If an integer is provided, the same size is applied to both
            spatial dimensions (height and width).
        temporal_size (`int`):
            The temporal dimension of positional embeddings (number of frames).
        spatial_interpolation_scale (`float`, defaults to 1.0):
            Scale factor for spatial grid interpolation.
        temporal_interpolation_scale (`float`, defaults to 1.0):
            Scale factor for temporal grid interpolation.

    Returns:
        `np.ndarray`:
            The 3D sinusoidal positional embeddings of shape `[temporal_size, spatial_size[0] * spatial_size[1],
            embed_dim]`.
    z`get_3d_sincos_pos_embed` uses `torch` and supports `device`. `from_numpy` is no longer required.  Pass `output_type='pt' to use the new version now.output_type=='np'0.33.0Fstandard_warnr:   r   r;   r<   r
   r   axisr   Nr   )r	   rF   rG   rH   r9   r"   r#   rI   rJ   rK   rL   rM   newaxisrepeatconcatenate)r2   r3   r4   r5   r6   deprecation_messagerP   rQ   rR   rS   rT   rU   rV   rW   rX   s                  r.   rE   rE      s   :	? 
 !8-@PUV1}=>>,$$l3I*"a YY|Abjj9<WWFYY|Abjj9<WWF;;vv&D88Dq!D<<A|AQ@AD9:KTR YY}BJJ7:VVF:;MvV *"**a*:;		"3]K+Arzz1,<=#5|AVW7X_`a 24EFRPIr0   c                 j   |dk(  r"d}t        dd|d       t        | |||||      S t        |t              r||f}t	        j
                  |d   |t        j                  	      |d   |z  z  |z  }	t	        j
                  |d
   |t        j                  	      |d
   |z  z  |z  }
t	        j                  |
|	d      }t	        j                  |d      }|j                  dd
|d
   |d   g      }t        | ||      }|r3|dkD  r.t	        j                  t	        j                  || g      |gd      }|S )a  
    Creates 2D sinusoidal positional embeddings.

    Args:
        embed_dim (`int`):
            The embedding dimension.
        grid_size (`int`):
            The size of the grid height and width.
        cls_token (`bool`, defaults to `False`):
            Whether or not to add a classification token.
        extra_tokens (`int`, defaults to `0`):
            The number of extra tokens to add.
        interpolation_scale (`float`, defaults to `1.0`):
            The scale of the interpolation.

    Returns:
        pos_embed (`torch.Tensor`):
            Shape is either `[grid_size * grid_size, embed_dim]` if not using cls_token, or `[1 + grid_size*grid_size,
            embed_dim]` if using cls_token
    r9   `get_2d_sincos_pos_embed` uses `torch` and supports `device`. `from_numpy` is no longer required.  Pass `output_type='pt' to use the new version now.r[   r\   Fr]   )r2   	grid_size	cls_tokenextra_tokensinterpolation_scale	base_sizer   r=   r
   r>   r?   r   r   rB   )r	   get_2d_sincos_pos_embed_nprG   rH   r!   r"   r#   rI   rJ   rK   rL   rO   zeros)r2   rh   ri   rj   rk   rl   r   r7   re   rR   rS   rT   rX   s                r.   get_2d_sincos_pos_embedro      sT   < dC 	
 	%x1DTYZ)% 3
 	
 )S!	*	 	Yq\&FQ<)#	%
	  	Yq\&FQ<)#	%
	 
 >>&&48D;;t#D<<Ay|Yq\:;D1)T{[I\A%LL%++|Y.G"H)!TZ[\	r0   c                     |dk(  rd}t        dd|d       t        | |      S | dz  d	k7  rt        d
      t        | dz  |d	   |      }t        | dz  |d   |      }t	        j
                  ||gd      }|S )aG  
    This function generates 2D sinusoidal positional embeddings from a grid.

    Args:
        embed_dim (`int`): The embedding dimension.
        grid (`torch.Tensor`): Grid of positions with shape `(H * W,)`.

    Returns:
        `torch.Tensor`: The 2D sinusoidal positional embeddings with shape `(H * W, embed_dim)`
    r9   z`get_2d_sincos_pos_embed_from_grid` uses `torch` and supports `device`. `from_numpy` is no longer required.  Pass `output_type='pt' to use the new version now.r[   r\   Fr]   )r2   rT   r   r    embed_dim must be divisible by 2rB   r
   r   )r	   $get_2d_sincos_pos_embed_from_grid_nprF   rM   r!   rO   )r2   rT   r7   re   emb_hemb_wr-   s          r.   rL   rL   !  s     dC 	
 	%x1DTYZ3
 	
 1};<< .i1nd1gS^_E-i1nd1gS^_E
,,u~1
-CJr0   c                 2   |dk(  rd}t        dd|d       t        | |      S | dz  d	k7  rt        d
      t        j                  | dz  |j
                  t        j                        }|| dz  z  }dd|z  z  }|j                  d      }t        j                  ||      }t        j                  |      }t        j                  |      }t        j                  ||gd      }	|r1t        j                  |	dd| dz  df   |	ddd| dz  f   gd      }	|	S )a*  
    This function generates 1D positional embeddings from a grid.

    Args:
        embed_dim (`int`): The embedding dimension `D`
        pos (`torch.Tensor`): 1D tensor of positions with shape `(M,)`

    Returns:
        `torch.Tensor`: Sinusoidal positional embeddings of shape `(M, D)`.
    r9   z`get_1d_sincos_pos_embed_from_grid` uses `torch` and supports `device`. `from_numpy` is no longer required.  Pass `output_type='pt' to use the new version now.r[   z0.34.0Fr]   )r2   posr   r   rq   r=          @r1   '  r   r
   r   N)r	   $get_1d_sincos_pos_embed_from_grid_nprF   r!   r"   r   float64rK   outerr'   r(   rO   r&   )
r2   rv   r7   r   re   omegaoutemb_sinemb_cosr-   s
             r.   rM   rM   B  s    dC 	
 	%x1DTYZ3iSQQ1};<<LLa

%--PE	Y_E%,E
++b/C
++c5
!CiinGiinG
,,)q
1C iiQ	Q 0013q:JIN:J7J3KLRSTJr0   c                    t        |t              r||f}t        j                  |d   t        j                        |d   |z  z  |z  }t        j                  |d   t        j                        |d   |z  z  |z  }t        j
                  ||      }t        j                  |d      }|j                  dd|d   |d   g      }t        | |      }	|r3|dkD  r.t        j                  t        j                  || g      |	gd      }	|	S )a  
    Creates 2D sinusoidal positional embeddings.

    Args:
        embed_dim (`int`):
            The embedding dimension.
        grid_size (`int`):
            The size of the grid height and width.
        cls_token (`bool`, defaults to `False`):
            Whether or not to add a classification token.
        extra_tokens (`int`, defaults to `0`):
            The number of extra tokens to add.
        interpolation_scale (`float`, defaults to `1.0`):
            The scale of the interpolation.

    Returns:
        pos_embed (`np.ndarray`):
            Shape is either `[grid_size * grid_size, embed_dim]` if not using cls_token, or `[1 + grid_size*grid_size,
            embed_dim]` if using cls_token
    r   r_   r
   r`   r   )rG   rH   r9   r"   r#   rI   rJ   rK   rr   rd   rn   )
r2   rh   ri   rj   rk   rl   rR   rS   rT   rX   s
             r.   rm   rm   k  s    . )S!	*	YYy|2::6)A,:RSViiFYYy|2::6)A,:RSViiF;;vv&D88Dq!D<<Ay|Yq\:;D4YEI\A%NNBHHlI-F$G#SZ[\	r0   c                     | dz  dk7  rt        d      t        | dz  |d         }t        | dz  |d         }t        j                  ||gd      }|S )aC  
    This function generates 2D sinusoidal positional embeddings from a grid.

    Args:
        embed_dim (`int`): The embedding dimension.
        grid (`np.ndarray`): Grid of positions with shape `(H * W,)`.

    Returns:
        `np.ndarray`: The 2D sinusoidal positional embeddings with shape `(H * W, embed_dim)`
    r   r   rq   r
   r`   )rF   ry   r9   rd   )r2   rT   rs   rt   r-   s        r.   rr   rr     s_     1};<< 1aaIE0aaIE
..%a
0CJr0   c                 r   | dz  dk7  rt        d      t        j                  | dz  t        j                        }|| dz  z  }dd|z  z  }|j	                  d      }t        j
                  d	||      }t        j                  |      }t        j                  |      }t        j                  ||gd
      }|S )a,  
    This function generates 1D positional embeddings from a grid.

    Args:
        embed_dim (`int`): The embedding dimension `D`
        pos (`numpy.ndarray`): 1D tensor of positions with shape `(M,)`

    Returns:
        `numpy.ndarray`: Sinusoidal positional embeddings of shape `(M, D)`.
    r   r   rq   r_   rw   r1   rx   r   zm,d->mdr
   r`   )	rF   r9   r"   rz   rK   einsumr'   r(   rd   )r2   rv   r|   r}   r~   r   r-   s          r.   ry   ry     s     1};<<IIi1nBJJ7E	Y_E%,E
++b/C
))IsE
*CffSkGffSkG
..'7+!
4CJr0   c                   F     e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 d fd	Zd Zd Z xZS )
PatchEmbeda  
    2D Image to Patch Embedding with support for SD3 cropping.

    Args:
        height (`int`, defaults to `224`): The height of the image.
        width (`int`, defaults to `224`): The width of the image.
        patch_size (`int`, defaults to `16`): The size of the patches.
        in_channels (`int`, defaults to `3`): The number of input channels.
        embed_dim (`int`, defaults to `768`): The output dimension of the embedding.
        layer_norm (`bool`, defaults to `False`): Whether or not to use layer normalization.
        flatten (`bool`, defaults to `True`): Whether or not to flatten the output.
        bias (`bool`, defaults to `True`): Whether or not to use bias.
        interpolation_scale (`float`, defaults to `1`): The scale of the interpolation.
        pos_embed_type (`str`, defaults to `"sincos"`): The type of positional embedding.
        pos_embed_max_size (`int`, defaults to `None`): The maximum size of the positional embedding.
    c                 b   t         |           ||z  ||z  z  }|| _        || _        || _        t        j                  ||||f||      | _        |rt        j                  |dd      | _	        nd | _	        || _
        ||z  ||z  c| _        | _        ||z  | _        |	| _        |r|}nt        |dz        }|
d | _        y |
dk(  r\t#        ||| j                  | j                  d      }|rd	nd}| j%                  d
|j'                         j)                  d      |       y t+        d|
       )Nkernel_sizestridebiasFgư>)elementwise_affineeps      ?sincosrA   )rl   rk   r7   TrX   r   
persistentzUnsupported pos_embed_type: )super__init__flatten
layer_normpos_embed_max_sizer   Conv2dproj	LayerNormnorm
patch_sizeheightwidthrl   rk   rH   rX   ro   register_bufferr%   	unsqueezerF   )selfr   r   r   in_channelsr2   r   r   r   rk   pos_embed_typer   num_patchesrh   rX   r   	__class__s                   r.   r   zPatchEmbed.__init__  sI    	+0CD$"4IIZ0HQ[bf
	 Y5dSDIDI$"(J"68KTZ:-#6  *IK,-I!!DNx'/..$($<$< I "4J  ioo.?.I.I!.LYc d;N;KLMMr0   c                 6   | j                   t        d      || j                  z  }|| j                  z  }|| j                   kD  rt        d| d| j                    d      || j                   kD  rt        d| d| j                    d      | j                   |z
  dz  }| j                   |z
  dz  }| j                  j	                  d| j                   | j                   d	      }|dd|||z   |||z   ddf   }|j	                  dd	|j
                  d	         }|S )
z2Crops positional embeddings for SD3 compatibility.Nz.`pos_embed_max_size` must be set for cropping.zHeight (z/) cannot be greater than `pos_embed_max_size`: .zWidth (r   r
   r   )r   rF   r   rX   rK   r   )r   r   r   topleftspatial_pos_embeds         r.   cropped_pos_embedzPatchEmbed.cropped_pos_embed  sJ   ""*MNN4??*(D+++6("QRVRiRiQjjkl  4***% OPTPgPgOhhij  &&/A5''%/A5 NN221d6M6MtOfOfhjk-asV|1CTDSXLEXZ[.[\-55a=N=T=TUW=XY  r0   c                 R   | j                   |j                  dd  \  }}n8|j                  d   | j                  z  |j                  d   | j                  z  }}| j                  |      }| j                  r!|j	                  d      j                  dd      }| j                  r| j                  |      }| j                  |j                  |j                        S | j                   r| j                  ||      }n| j                  |k7  s| j                  |k7  rht        | j                  j                  d   ||f| j                  | j                   |j"                  d      }|j%                         j'                  d      }n| j                  }||z   j                  |j                        S )Nr   r   r
   rA   )r2   rh   rl   rk   r   r7   r   )r   r   r   r   r   	transposer   r   rX   tor   r   r   r   ro   rl   rk   r   r%   r   )r   latentr   r   rX   s        r.   forwardzPatchEmbed.forward#  s_   ""."LL-MFE"LL,?bAQUYUdUdAdEF6"<<^^A&00A6F??YYv&F>>!99V\\**""..vu=I{{f$

e(;3"nn2226%uo"nn(,(@(@!== $	 &OO-77:	 NN	"&&v||44r0   )   r      r<      FTTr
   r   N)__name__
__module____qualname____doc__r   r   r   __classcell__r   s   @r.   r   r     s<    & 5Nn!.5r0   r   c                   *     e Zd ZdZd fd	Zd Z xZS )LuminaPatchEmbedaz  
    2D Image to Patch Embedding with support for Lumina-T2X

    Args:
        patch_size (`int`, defaults to `2`): The size of the patches.
        in_channels (`int`, defaults to `4`): The number of input channels.
        embed_dim (`int`, defaults to `768`): The output dimension of the embedding.
        bias (`bool`, defaults to `True`): Whether or not to use bias.
    c                 v    t         |           || _        t        j                  ||z  |z  ||      | _        y )Nin_featuresout_featuresr   )r   r   r   r   Linearr   )r   r   r   r2   r   r   s        r.   r   zLuminaPatchEmbed.__init__N  s8    $II"Z/+="
	r0   c                 R   |j                  |d   j                        }| j                  x}}|j                         \  }}}}||z  ||z  }
}	|j	                  |||	||
|      j                  dddddd      }|j                  d      }| j                  |      }|j                  dd      }t        j                  |j                  d   |j                  d   t        j                  |j                        }||||fg|z  |d|	d|
f   j                  dd      j                  d      fS )	a  
        Patchifies and embeds the input tensor(s).

        Args:
            x (List[torch.Tensor] | torch.Tensor): The input tensor(s) to be patchified and embedded.

        Returns:
            Tuple[torch.Tensor, torch.Tensor, List[Tuple[int, int]], torch.Tensor]: A tuple containing the patchified
            and embedded tensor(s), the mask indicating the valid patches, the original image size(s), and the
            frequency tensor(s).
        r   r   r:   r
   r<      r   r   N)r   r   r   sizeviewpermuter   r   r!   onesr   int32r   )r   x	freqs_cispatch_heightpatch_width
batch_sizechannelr   r   height_tokenswidth_tokensmasks               r.   r   zLuminaPatchEmbed.forwardW  s%    LL1-	%)__4{-.VVX*
GVU&,&<e{>R|FF:w|\S^_ggq!Q1
 IIaLIIaLIIaOzz!''!*aggajAHHU e_
*n}nm|m34<<QBLLQO	
 	
r0   )r   r:   r   Tr   r   r   r   r   r   r   r   s   @r.   r   r   C  s    

r0   r   c            !           e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddedee   dededededed	ed
edededededededdf  fdZ	 dd	eded
edeej                     dej                  f
dZdej                  dej                  fdZ xZS )CogVideoXPatchEmbedNr   patch_size_tr   r2   text_embed_dimr   sample_widthsample_heightsample_framestemporal_compression_ratiomax_text_seq_lengthr5   r6   use_positional_embeddings!use_learned_positional_embeddingsr   c                    t         |           || _        || _        || _        || _        || _        |	| _        |
| _        || _	        || _
        || _        || _        || _        |"t        j                  ||||f||      | _        n$t        j"                  ||z  |z  |z  |      | _        t        j"                  ||      | _        |s|r*|}| j'                  |||	      }| j)                  d||       y y )Nr   pos_embeddingr   )r   r   r   r   r2   r   r   r   r   r   r5   r6   r   r   r   r   r   r   	text_proj_get_positional_embeddingsr   )r   r   r   r   r2   r   r   r   r   r   r   r   r5   r6   r   r   r   r   r   s                     r.   r   zCogVideoXPatchEmbed.__init__z  s   $ 	$("*(**D'#6 +F(,H))B&1R.		YZ4LU_fjDI
 		+
":Z"G,"VXabDI>9=$(I:J ;;M<YfgM  -J W )Jr0   r   c           	         || j                   z  }|| j                   z  }|dz
  | j                  z  dz   }||z  |z  }t        | j                  ||f|| j                  | j
                  |d      }	|	j                  dd      }	|	j                  d| j                  |z   | j                  d      }
|
j                  d d | j                  d f   j                  |	       |
S )Nr
   rA   )r   r7   r   Frequires_grad)r   r   rY   r2   r5   r6   r   	new_zerosr   datacopy_)r   r   r   r   r   post_patch_heightpost_patch_widthpost_time_compression_framesr   r   joint_pos_embeddings              r.   r   z.CogVideoXPatchEmbed._get_positional_embeddings  s     *T__<'4??:(5(9d>]>]']`a'a$'*::=YY/NN01(,,--
 &--a3+55t''+5t~~UZ 6 
 	  D$<$<$>!>?EEmT""r0   text_embedsimage_embedsc           
         | j                  |      }|j                  \  }}}}}| j                  z|j                  d|||      }| j	                  |      } |j
                  ||g|j                  dd  }|j                  d      j                  dd      }|j                  dd      }n| j                  }| j                  }	|j                  ddddd      }|j                  |||	z  |	||z  |||z  ||      }|j                  ddddd	ddd
      j                  dd	      j                  dd      }| j	                  |      }t        j                  ||gd      j                         }
| j                  s| j                  r| j                  r)| j                  |k7  s| j                   |k7  rt#        d      |dz
  | j$                  z  dz   }| j                   |k7  s| j                  |k7  s| j&                  |k7  r | j)                  ||||
j*                        }n| j,                  }|j/                  |
j0                        }|
|z   }
|
S )a7  
        Args:
            text_embeds (`torch.Tensor`):
                Input text embeddings. Expected shape: (batch_size, seq_length, embedding_dim).
            image_embeds (`torch.Tensor`):
                Input image embeddings. Expected shape: (batch_size, num_frames, channels, height, width).
        Nr   r
   r<   r   r   r:   r         r   a   It is currently not possible to generate videos at a different resolution that the defaults. This should only be the case with 'THUDM/CogVideoX-5b-I2V'.If you think this is incorrect, please open an issue at https://github.com/huggingface/diffusers/issues.r   r_   )r   r   r   rK   r   r   r   r   r   r   r!   r&   
contiguousr   r   r   r   rF   r   r   r   r   r   r   r   )r   r   r   r   
num_frameschannelsr   r   pp_tembedspre_time_compression_framesr   s                r.   r   zCogVideoXPatchEmbed.forward  sf    nn[1:F:L:L7
J&%$'//HfeLL99\2L,<,,Z]lFXFXYZY[F\]L'//2<<QBL'//15LA##C'//1aA>L'//J#-sFaKEQJPQS[L (//1aAq!QGOOPQSTU]]^_abcL99\2L,'Q

*, 	 ))T-S-S554;L;LPU;UY]YkYkouYu  
 ,6>T=\=\*\_`*`' ""f,$$-%%)DD $ ? ?E#>v}} !@ ! !% 2 2),,6<<,@Mm+Fr0   )r   Nr   i     TZ   <   1   r:      g      ?r1   TTN)r   r   r   rH   r   boolr%   r   r!   r   Tensorr   r   r   r   s   @r.   r   r   y  sJ    &*"*+#&-2.1*.26!/X/X sm/X 	/X
 /X /X /X /X /X /X %(/X !/X &+/X ',/X $(/X  ,0!/X" 
#/Xd ko# #03#DG#QYZ_ZfZfQg#	#285<< 8u|| 8r0   r   c                        e Zd Z	 	 	 	 	 ddededededef
 fdZdej                  dej                  d	ej                  fd
Z xZS )CogView3PlusPatchEmbedr   hidden_sizer   text_hidden_sizer   c                 h   t         |           || _        || _        || _        || _        || _        t        j                  ||dz  z  |      | _	        t        j                  ||      | _
        t        |||d      }|j                  |||      }| j                  d|j                         d       y )Nr   rA   )rl   r7   rX   Fr   )r   r   r   r  r   r  r   r   r   r   r   ro   rK   r   r%   )r   r   r  r   r  r   rX   r   s          r.   r   zCogView3PlusPatchEmbed.__init__   s     	&&$ 0"4IIkJM9;G	 #3[A++7IW[
	 %%&8:LkZ	[)//*;Nr0   hidden_statesencoder_hidden_statesr   c                    |j                   \  }}}}|| j                  z  dk7  s|| j                  z  dk7  rt        d      || j                  z  }|| j                  z  }|j                  |||| j                  || j                        }|j	                  dddddd      j                         }|j                  |||z  || j                  z  | j                  z        }| j                  |      }| j                  |      }t        j                  ||gd      }|j                   d   }| j                  d |d |f   j                  ||z  d	      }t        j                  || j                  f|j                  |j                  
      }	t        j                  |	|gd      d   }
||
z   j!                  |j                        S )Nr   z0Height and width must be divisible by patch sizer   r:   r
   r<   r   r   r   r   )N.)r   r   rF   r   r   r   r   r   r!   r&   rX   rK   rn   r  r   r   r   )r   r  r  r   r   r   r   text_lengthimage_pos_embedtext_pos_embedrX   s              r.   r   zCogView3PlusPatchEmbed.forward  s   -:-@-@*
GVUDOO#q(EDOO,Cq,HOPP4??*(%**:wY^`d`o`op%--aAq!Q?JJL%**:v~wQUQ`Q`G`cgcrcrGrs 		-0 $/D E		#8-"HaP ,11!4..&&5&9AA&5.RTU$**+?3H3HQ`QgQg
 II~?QG	R		)--m.A.ABBr0   )r   i 
  r   r      	r   r   r   rH   r   r!   r   r   r   r   s   @r.   r  r    s|      $"%OO O 	O
 O  O4CU\\ C%,, C[`[g[g Cr0   r  Tthetause_real	grid_typemax_sizec	                     |durt        d      |dk(  r|\  }	}
|\   t        j                  |	d   |
d   dz
  z  z  |t        j                        }t        j                  |	d   |
d    dz
  z   z   |t        j                        }t        j                  |t        j                        }t        j                  ddz
  z  z  |t        j                        }n|dk(  r}|\  }}|\   t        j                  ||t        j                        }t        j                  ||t        j                        }t        j                  |t        j                        }nt        d      | d	z  }| d
z  dz  }| d
z  dz  }t        |||d      }t        |||d      }t        |||d      } fd}|\  }}|\  }}|\  }}|dk(  r|d |d }}|d |d }}|d  |d  }} ||||      } ||||      }||fS )a  
    RoPE for video tokens with 3D structure.

    Args:
    embed_dim: (`int`):
        The embedding dimension size, corresponding to hidden_size_head.
    crops_coords (`Tuple[int]`):
        The top-left and bottom-right coordinates of the crop.
    grid_size (`Tuple[int]`):
        The grid size of the spatial positional embedding (height, width).
    temporal_size (`int`):
        The size of the temporal dimension.
    theta (`float`):
        Scaling factor for frequency computation.
    grid_type (`str`):
        Whether to use "linspace" or "slice" to compute grids.

    Returns:
        `torch.Tensor`: positional embedding with shape `(temporal_size * grid_size[0] * grid_size[1], embed_dim/2)`.
    TzJ `use_real = False` is not currently supported for get_3d_rotary_pos_embedlinspacer   r
   r=   slicez%Invalid value passed for `grid_type`.r:      r<   )r  r  c                 &   | d d d d d d f   j                  dd      } |d d d d d d f   j                  dd      }|d d d d d d f   j                  dd      }t        j                  | ||gd      }|j                  z  z  d      }|S )Nr   r   )expandr!   r&   r   )freqs_tfreqs_hfreqs_wfreqsgrid_size_hgrid_size_wr4   s       r.   combine_time_height_widthz:get_3d_rotary_pos_embed.<locals>.combine_time_height_width{  s    !T4*+22["
 $4*+222{B
 $a*+22;B
 		gw'R
 

K'+5r
 r0   N)rF   r!   r  r#   r"   get_1d_rotary_pos_embed)!r2   crops_coordsrh   r4   r  r  r  r  r   r   stoprR   rS   rV   max_hmax_wdim_tdim_hdim_wr  r  r  r  t_cost_sinh_cosh_sinw_cosw_sinr(   r'   r  r  s!      `                           @@r.   get_3d_rotary_pos_embedr,  7  sD   > teffJ"t#, [!Hd1gq1K?U[chcpcp
 !Hd1gq1K?U[chcpcp
 mF%--P} 12]BMZ`hmhuhu
 
g	u#, [eF%--HeF%--HmF%--P@AA NENQENQE &eV54PG%eV54PG%eV54PG& LE5LE5LE5G^m,eN].Cu\k*E,;,?u\k*E,;,?u
#E5%
8C
#E5%
8C8Or0   rk   c                     |\  }}|\  }	}
|\  }}}t        j                  d||dz
  z  |z  ||t         j                        }t        j                  |d   |d   |	dz
  z  |	z  |	|t         j                        }t        j                  |d   |d   |
dz
  z  |
z  |
|t         j                        }| dz  }| dz  }| dz  }t        |||z  |dd      }t        |||z  |dd      }t        |||z  |dd      }||||||fS )Nr   r
   r=   r<   TF)r  r  repeat_interleave_real)r!   r  r#   r  )r2   r  rh   r4   rk   r  r   r   r   r  r  interpolation_scale_tinterpolation_scale_hinterpolation_scale_wrV   rR   rS   r#  r$  r%  r  r  r  s                          r.   get_3d_rotary_pos_embed_allegror2    sT    KE4(KJ]G02G^^	=MA-.>V\didqdqF ^^a$q'[1_-;[QW_d_l_lF ^^a$q'[1_-;[QW_d_l_lF
 NENENE &v--UTbgG &v--UTbgG &v--UTbgG GWfff<<r0   c                    |dk(  r d}t        dd|d       t        | |||      S |\  }}t        j                  |d   |d   |d   d	z
  z  |d   z  |d   |t        j                  
      }	t        j                  |d	   |d	   |d	   d	z
  z  |d	   z  |d	   |t        j                  
      }
t        j
                  |
|	d      }t        j                  |d      }|j                  dd	g|j                  d	d       }t        | ||      }|S )ak  
    RoPE for image tokens with 2d structure.

    Args:
    embed_dim: (`int`):
        The embedding dimension size
    crops_coords (`Tuple[int]`)
        The top-left and bottom-right coordinates of the crop.
    grid_size (`Tuple[int]`):
        The grid size of the positional embedding.
    use_real (`bool`):
        If True, return real part and imaginary part separately. Otherwise, return complex numbers.
    device: (`torch.device`, **optional**):
        The device used to create tensors.

    Returns:
        `torch.Tensor`: positional embedding with shape `( grid_size * grid_size, embed_dim/2)`.
    r9   rg   r[   r\   Fr]   )r2   r  rh   r  r   r
   r=   r>   r?   r   r   Nr  )
r	   _get_2d_rotary_pos_embed_npr!   r  r#   rI   rJ   rK   r   !get_2d_rotary_pos_embed_from_grid)r2   r  rh   r  r   r7   re   r   r   rR   rS   rT   rX   s                r.   get_2d_rotary_pos_embedr7    s2   * dC 	
 	%x1DTYZ*%	
 	
 KE4^^a$q'Yq\A-.1=y|TZbgboboF ^^a$q'Yq\A-.1=y|TZbgboboF >>&&48D;;t#D<<A/

12/0D1)THUIr0   c                    |\  }}t        j                  |d   |d   |d   dt         j                        }t        j                  |d   |d   |d   dt         j                        }t        j                  ||      }t        j                  |d      }|j                  ddg|j                  dd       }t        | ||      }	|	S )	a  
    RoPE for image tokens with 2d structure.

    Args:
    embed_dim: (`int`):
        The embedding dimension size
    crops_coords (`Tuple[int]`)
        The top-left and bottom-right coordinates of the crop.
    grid_size (`Tuple[int]`):
        The grid size of the positional embedding.
    use_real (`bool`):
        If True, return real part and imaginary part separately. Otherwise, return complex numbers.

    Returns:
        `torch.Tensor`: positional embedding with shape `( grid_size * grid_size, embed_dim/2)`.
    r   F)endpointr   r
   r`   r   Nr4  )r9   r  r#   rI   rJ   rK   r   r6  )
r2   r  rh   r  r   r   rR   rS   rT   rX   s
             r.   r5  r5    s    " KE4[[q47IaL5PRPZPZ[F[[q47IaL5PRPZPZ[F;;vv&D88Dq!D<<A/

12/0D1)THUIr0   c                 `   | dz  dk(  sJ t        | dz  |d   j                  d      |      }t        | dz  |d   j                  d      |      }|rBt        j                  |d   |d   gd      }t        j                  |d   |d   gd      }||fS t        j                  ||gd      }|S )a  
    Get 2D RoPE from grid.

    Args:
    embed_dim: (`int`):
        The embedding dimension size, corresponding to hidden_size_head.
    grid (`np.ndarray`):
        The grid of the positional embedding.
    use_real (`bool`):
        If True, return real part and imaginary part separately. Otherwise, return complex numbers.

    Returns:
        `torch.Tensor`: positional embedding with shape `( grid_size * grid_size, embed_dim/2)`.
    r:   r   r   r   r4  r
   r   )r  rK   r!   r&   )r2   rT   r  rs   rt   r(   r'   r-   s           r.   r6  r6    s     q=A $QQ+hE $QQ+hE iiq58,!4iiq58,!4CxiiA.
r0   c                 V   | dz  dk(  sJ t        | dz  |||      }t        | dz  |||      }|j                  |d| dz  d      j                  d|dd      }|j                  d|| dz  d      j                  |ddd      }t        j                  ||gd      j                  d      }|S )as  
    Get 2D RoPE from grid.

    Args:
    embed_dim: (`int`):
        The embedding dimension size, corresponding to hidden_size_head.
    grid (`np.ndarray`):
        The grid of the positional embedding.
    linear_factor (`float`):
        The linear factor of the positional embedding, which is used to scale the positional embedding in the linear
        layer.
    ntk_factor (`float`):
        The ntk factor of the positional embedding, which is used to scale the positional embedding in the ntk layer.

    Returns:
        `torch.Tensor`: positional embedding with shape `( grid_size * grid_size, embed_dim/2)`.
    r:   r   r   )linear_factor
ntk_factorr
   r   r   )r  r   rc   r!   r&   r   )r2   len_hlen_wr<  r=  rs   rt   r-   s           r.   get_2d_rotary_pos_embed_luminar@  7  s    $ q=A#Q]zE $Q]zE JJuaa3::1eQJEJJq%a3::5!QJE
))UEN
+
3
3A
6CJr0        @r   rv   c           	      (   | dz  dk(  sJ t        |t              rt        j                  |      }t        |t        j
                        rt        j                  |      }||z  }d|t        j                  d| d||j                        | z  z  z  |z  }t        j                  ||      }|j                  j                  dk(  }	|	r|j                         }|r|r|j                         j                  dd|j                  d   dz        j                         }
|j                         j                  dd|j                  d   dz        j                         }|
|fS |rt        j                  |j                         |j                         gd	      j                         }
t        j                  |j                         |j                         gd	      j                         }|
|fS t        j                   t        j"                  |      |      }|S )
a  
    Precompute the frequency tensor for complex exponentials (cis) with given dimensions.

    This function calculates a frequency tensor with complex exponentials using the given dimension 'dim' and the end
    index 'end'. The 'theta' parameter scales the frequencies. The returned tensor contains complex values in complex64
    data type.

    Args:
        dim (`int`): Dimension of the frequency tensor.
        pos (`np.ndarray` or `int`): Position indices for the frequency tensor. [S] or scalar
        theta (`float`, *optional*, defaults to 10000.0):
            Scaling factor for frequency computation. Defaults to 10000.0.
        use_real (`bool`, *optional*):
            If True, return real part and imaginary part separately. Otherwise, return complex numbers.
        linear_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor for the context extrapolation. Defaults to 1.0.
        ntk_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor for the NTK-Aware RoPE. Defaults to 1.0.
        repeat_interleave_real (`bool`, *optional*, defaults to `True`):
            If `True` and `use_real`, real part and imaginary part are each interleaved with themselves to reach `dim`.
            Otherwise, they are concateanted with themselves.
        freqs_dtype (`torch.float32` or `torch.float64`, *optional*, defaults to `torch.float32`):
            the dtype of the frequency tensor.
    Returns:
        `torch.Tensor`: Precomputed frequency tensor with complex exponentials. [S, D/2]
    r   r   r1   r   npur
   rC   r   r   )rG   rH   r!   r"   r9   ndarray
from_numpyr   r{   typer%   r(   rN   r   r'   r&   polar	ones_like)r   rv   r  r  r<  r=  r.  freqs_dtyper  is_npu	freqs_cos	freqs_sinr   s                r.   r  r  X  s   H 7a<<#sll3#rzz"s#JEuaa{3::VY\\]^ann 
 KKU#E\\%'F*IIK11!TUYZHZ1[aac	IIK11!TUYZHZ1[aac	)##	IIuyy{EIIK8bAGGI	IIuyy{EIIK8bAGGI	)## KK 6>	r0   r   r   use_real_unbind_dimsequence_dimc                 J   |r|\  }}|dk(  r|ddddddf   }|ddddddf   }n/|dk(  r|ddddddf   }|ddddddf   }nt        d| d      |j                  | j                        |j                  | j                        }}|dk(  r_ | j                  g | j                  dd dd j                  d      \  }}t        j                  | |gd      j                  d      }	nd|d	k(  rP | j                  g | j                  dd dd j                  d	      \  }}t        j                  | |gd      }	nt        d
| d      | j                         |z  |	j                         |z  z   j                  | j                        }
|
S t        j                   | j                         j                  g | j                  dd dd       }	|j                  d      }t        j                  |	|z        j                  d      }|j                  |       S )a3  
    Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
    to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
    reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
    tensors contain rotary embeddings and are returned as real tensors.

    Args:
        x (`torch.Tensor`):
            Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply
        freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
    r   Nr
   z`sequence_dim=z` but should be 1 or 2.r   r   r<   r   z`use_real_unbind_dim=z` but should be -1 or -2.)rF   r   r   rK   r   unbindr!   rJ   r   r&   r%   r   view_as_complexr   view_as_realtype_as)r   r   r  rM  rN  r(   r'   x_realx_imag	x_rotatedr}   x_outs               r.   apply_rotary_embrX    s!   * S1dD!Q&'CdD!Q&'CQdAtQ&'CdAtQ&'C~l^;RSTT66!((#SVVAHH%5S"$&QYY<<b<!<CCBGNFFfWf$52>FFqII B&&QYY<<a<<CCBGNFF		F7F"3<I45H4IIbcddwwy3!2S!88<<QWWE
 ))*;!'')*;*;*QQWWSb\*Q2*Qq*QR	''*	""9y#89AA!D}}Qr0   c                     d }|\  \  }}\  }}\  }}	| j                  dd      \  }
}} ||
|d   ||      }
 |||d   ||      } |||d   ||	      }t        j                  |
||gd      } | S )Nc                    t        j                  ||      d d d d d d d f   }t        j                  ||      d d d d d d d f   }| dd | j                  d   dz  f   | d| j                  d   dz  d f   }}t        j                  | |fd      }| j                         |z  |j                         |z  z   j                  | j                        S )N.r   r   r   )F	embeddingr   r!   r&   r%   r   r   )tokensrv   r(   r'   x1x2tokens_rotateds          r.   apply_1d_ropez/apply_rotary_emb_allegro.<locals>.apply_1d_rope  s    kk#s#AtQM2kk#s#AtQM24v||B/14445vc6<<PRCSWXCXCZ>Z7[BRC9"5$~';';'='CCGGUUr0   r<   r   r   r   r
   r   )chunkr!   r&   )r   r   	positionsra  r&  r'  r(  r)  r*  r+  thws                r.   apply_rotary_emb_allegrorg    s    V 6?2NUENUENUEggaRg GAq!a1ue4Aa1ue4Aa1ue4A		1a)$AHr0   c                   N     e Zd Z	 	 	 	 	 ddededededee   f
 fdZd	dZ xZS )
TimestepEmbeddingr   time_embed_dimact_fnout_dimpost_act_fnc                 B   t         	|           t        j                  |||      | _        |t        j                  ||d      | _        nd | _        t        |      | _        ||}n|}t        j                  |||      | _        |d | _	        y t        |      | _	        y )NFr   )
r   r   r   r   linear_1	cond_projr   actlinear_2post_act)
r   r   rj  rk  rl  rm  cond_proj_dimsample_proj_biastime_embed_dim_outr   s
            r.   r   zTimestepEmbedding.__init__  s     			+~?OP$YY}kNDN!DN!&)!(!/		.2DFVW DM*;7DMr0   c                     ||| j                  |      z   }| j                  |      }| j                  | j                  |      }| j                  |      }| j                  | j	                  |      }|S r   )rq  rp  rr  rs  rt  )r   sample	conditions      r.   r   zTimestepEmbedding.forward  sl     dnnY77Fv&88XXf%Fv&==$]]6*Fr0   )siluNNNTr   )	r   r   r   rH   strr   r   r   r   r   s   @r.   ri  ri    sS    
 %)88 8 	8
 8 c]8@r0   ri  c            	       j     e Zd Zd	dedededef fdZdej                  dej                  fdZ	 xZ
S )
	Timestepsnum_channelsr   r   r   c                 Z    t         |           || _        || _        || _        || _        y r   )r   r   r  r   r   r   )r   r  r   r   r   r   s        r.   r   zTimesteps.__init__  s-    (.$8!
r0   r   r   c                 v    t        || j                  | j                  | j                  | j                        }|S )N)r   r   r   )r/   r  r   r   r   )r   r   t_embs      r.   r   zTimesteps.forward  s8    & 00!%!:!:**
 r0   )r
   )r   r   r   rH   r   r%   r   r!   r   r   r   r   s   @r.   r~  r~    sA    S 4 W\ eh  %,, r0   r~  c                   6     e Zd ZdZ	 ddedef fdZd Z xZS )GaussianFourierProjectionz-Gaussian Fourier embeddings for noise levels.embedding_sizer   c                 6   t         |           t        j                  t	        j
                  |      |z  d      | _        || _        || _        |rH| `t        j                  t	        j
                  |      |z  d      | _	        | j                  | _        | `	y y )NFr   )
r   r   r   	Parameterr!   randnweightr    r   W)r   r  r   set_W_to_weightr    r   r   s         r.   r   z"GaussianFourierProjection.__init__,  s~     	ll5;;~#>#FV[\.\\%++n"="EUZ[DF&&DK r0   c                    | j                   rt        j                   |      }|d d d f   | j                  d d d f   z  dz  t        j                  z  }| j
                  rAt        j                  t        j                  |      t        j                  |      gd      }|S t        j                  t        j                  |      t        j                  |      gd      }|S )Nr   r   r   )	r    r!   r  r9   pir   r&   r(   r'   )r   r   x_projr}   s       r.   r   z!GaussianFourierProjection.forward;  s    88		!A1d7dkk$'22Q6>))UYYv.		&0ABKC 
 ))UYYv.		&0ABKC
r0   )   r1   TTF)	r   r   r   r   rH   r%   r   r   r   r   s   @r.   r  r  )  s'    7 ns!05
r0   r  c                   4     e Zd ZdZddedef fdZd Z xZS )SinusoidalPositionalEmbeddinga[  Apply positional information to a sequence of embeddings.

    Takes in a sequence of embeddings with shape (batch_size, seq_length, embed_dim) and adds positional embeddings to
    them

    Args:
        embed_dim: (int): Dimension of the positional embedding.
        max_seq_length: Maximum sequence length to apply positional embeddings

    r2   max_seq_lengthc                    t         |           t        j                  |      j	                  d      }t        j
                  t        j                  d|d      t        j                  d       |z  z        }t        j                  d||      }t        j                  ||z        |dd d dd df<   t        j                  ||z        |dd d dd df<   | j                  d|       y )Nr
   r   r   rA  pe)r   r   r!   r"   r   r$   r   r    rn   r'   r(   r   )r   r2   r  positiondiv_termr  r   s         r.   r   z&SinusoidalPositionalEmbedding.__init__T  s    <</99!<99U\\!Y:txx?P>PS\>\]^[[NI68h#671aA:8h#671aA:T2&r0   c                 V    |j                   \  }}}|| j                  d d d |f   z   }|S r   )r   r  )r   r   _
seq_lengths       r.   r   z%SinusoidalPositionalEmbedding.forward]  s2    77:q;J;''r0   )    r   r   r   r   rH   r   r   r   r   s   @r.   r  r  H  s     	'# 's 'r0   r  c                   :     e Zd ZdZdedededef fdZd Z xZS )ImagePositionalEmbeddingsa  
    Converts latent image classes into vector embeddings. Sums the vector embeddings with positional embeddings for the
    height and width of the latent space.

    For more details, see figure 10 of the dall-e paper: https://huggingface.co/papers/2102.12092

    For VQ-diffusion:

    Output vector embeddings are used as input for the transformer.

    Note that the vector embeddings for the transformer are different than the vector embeddings from the VQVAE.

    Args:
        num_embed (`int`):
            Number of embeddings for the latent pixels embeddings.
        height (`int`):
            Height of the latent image i.e. the number of height embeddings.
        width (`int`):
            Width of the latent image i.e. the number of width embeddings.
        embed_dim (`int`):
            Dimension of the produced vector embeddings. Used for the latent pixel, height, and width embeddings.
    	num_embedr   r   r2   c                 8   t         |           || _        || _        || _        || _        t        j                  | j                  |      | _        t        j                  | j                  |      | _	        t        j                  | j                  |      | _
        y r   )r   r   r   r   r  r2   r   	Embeddingr-   
height_emb	width_emb)r   r  r   r   r2   r   s        r.   r   z"ImagePositionalEmbeddings.__init__{  sn     	
""<<	:,,t{{I>djj)<r0   c                 V   | j                  |      }| j                  t        j                  | j                  |j
                        j                  d| j                              }|j                  d      }| j                  t        j                  | j                  |j
                        j                  d| j                              }|j                  d      }||z   }|j                  d| j                  | j                  z  d      }||d d d |j                  d   d d f   z   }|S )Nr   r
   r   r   )r-   r  r!   r"   r   r   r   r   r  r   r   )r   indexr-   r  r  pos_embs         r.   r   z!ImagePositionalEmbeddings.forward  s    hhuo__U\\$++ell%S%X%XYZ\`\g\g%hi
  ))!,
NN5<<

5<<#P#U#UVWY]YcYc#de	 ''*	y( ,,q$++

":B?GA~1~q011
r0   r  r   s   @r.   r  r  c  s8    .== = 	=
 =$r0   r  c                   L     e Zd ZdZ fdZddZddej                  fdZ xZ	S )LabelEmbeddinga7  
    Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.

    Args:
        num_classes (`int`): The number of classes.
        hidden_size (`int`): The size of the vector embeddings.
        dropout_prob (`float`): The probability of dropping a label.
    c                     t         |           |dkD  }t        j                  ||z   |      | _        || _        || _        y Nr   )r   r   r   r  embedding_tablenum_classesdropout_prob)r   r  r  r  use_cfg_embeddingr   s        r.   r   zLabelEmbedding.__init__  sB    (1,!||K:K,K[Y&(r0   c                     |<t        j                  |j                  d   |j                        | j                  k  }nt        j
                  |dk(        }t        j                  || j                  |      }|S )zB
        Drops labels to enable classifier-free guidance.
        r   r   r
   )r!   randr   r   r  tensorwherer  )r   labelsforce_drop_idsdrop_idss       r.   
token_dropzLabelEmbedding.token_drop  sa     !zz&,,q/&--H4K\K\\H||Na$78HXt'7'7@r0   r  c                     | j                   dkD  }| j                  r|s|| j                  ||      }| j                  |      }|S r  )r  trainingr  r  )r   r  r  use_dropout
embeddingss        r.   r   zLabelEmbedding.forward  sF    ''!+MMk~/I__V^<F))&1
r0   r   )
r   r   r   r   r   r  r!   
LongTensorr   r   r   s   @r.   r  r    s$    )	e.. r0   r  c            	       r     e Zd Z	 	 	 	 d	dedededef fdZdej                  dej                  fdZ xZS )
TextImageProjectionr   image_embed_dimcross_attention_dimnum_image_text_embedsc                     t         |           || _        t        j                  || j                  |z        | _        t        j                  ||      | _        y r   )r   r   r  r   r   r   r   )r   r   r  r  r  r   s        r.   r   zTextImageProjection.__init__  sJ     	%:"IIot7Q7QTg7gh>3FGr0   r   r   c                     |j                   d   }| j                  |      }|j                  || j                  d      }| j	                  |      }t        j                  ||gd      S )Nr   r   r
   r   )r   r   rK   r  r   r!   r&   )r   r   r   r   image_text_embedss        r.   r   zTextImageProjection.forward  sh     &&q)
 !--l;-55j$B\B\^`a nn[1yy+[9qAAr0   )   r   r   
   r  r   s   @r.   r  r    s_     #"#&%'HH H !	H
  #H
B5<< 
Bu|| 
Br0   r  c                   T     e Zd Z	 	 	 ddededef fdZdej                  fdZ xZS )ImageProjectionr  r  r  c                     t         |           || _        t        j                  || j                  |z        | _        t        j                  |      | _        y r   )r   r   r  r   r   r   r   r   )r   r  r  r  r   s       r.   r   zImageProjection.__init__  sH     	%:"IIot7Q7QTg7ghLL!45	r0   r   c                     |j                   d   }| j                  |j                  | j                  j                  j                              }|j                  || j                  d      }| j                  |      }|S )Nr   r   )r   r   r   r  r   rK   r  r   )r   r   r   s      r.   r   zImageProjection.forward  sn    !''*
 ((9J9J9Q9Q9W9W)XY#++J8R8RTVWyy.r0   )r   r   r  r  r   s   @r.   r  r    s?      ##&%'	
6
6 !
6  #	
6ELL r0   r  c                   @     e Zd Zd fd	Zdej
                  fdZ xZS )IPAdapterFullImageProjectionc                     t         |           ddlm}  |||dd      | _        t        j                  |      | _        y Nr
   FeedForwardgelu)multactivation_fn)r   r   	attentionr  ffr   r   r   )r   r  r  r  r   s       r.   r   z%IPAdapterFullImageProjection.__init__  s6    *o/BZ`aLL!45	r0   r   c                 B    | j                  | j                  |            S r   )r   r  )r   r   s     r.   r   z$IPAdapterFullImageProjection.forward  s    yy.//r0   )r  r  r   r   r   r   r!   r   r   r   r   s   @r.   r  r    s    60ELL 0r0   r  c                   @     e Zd Zd fd	Zdej
                  fdZ xZS )IPAdapterFaceIDImageProjectionc                     t         |           ddlm} || _        || _         ||||z  |d      | _        t        j                  |      | _	        y r  )
r   r   r  r  
num_tokensr  r  r   r   r   )r   r  r  r  r  r  r   s         r.   r   z'IPAdapterFaceIDImageProjection.__init__  sK    *$#6 o/BZ/OVZjpqLL!45	r0   r   c                     | j                  |      }|j                  d| j                  | j                        }| j	                  |      S )Nr   )r  rK   r  r  r   )r   r   r   s      r.   r   z&IPAdapterFaceIDImageProjection.forward  s:    GGL!IIb$//4+C+CDyy|r0   )r  r  r
   r
   r  r   s   @r.   r  r    s    6ELL r0   r  c                   (     e Zd Zd fd	ZddZ xZS )CombinedTimestepLabelEmbeddingsc                     t         |           t        ddd      | _        t	        d|      | _        t        |||      | _        y )Nr  Tr
   r  r   r   r   rj  )r   r   r~  	time_projri  timestep_embedderr  class_embedder)r   r  r   class_dropout_probr   s       r.   r   z(CombinedTimestepLabelEmbeddings.__init__  sA    "T`ab!2sS`!a,[-I[\r0   c                     | j                  |      }| j                  |j                  |            }| j                  |      }||z   }|S Nr_   )r  r  r   r  )r   timestepclass_labelshidden_dtypetimesteps_projtimesteps_embconditionings          r.   r   z'CombinedTimestepLabelEmbeddings.forward  sN    1..~/@/@|/@/TU**<8$|3r0   )g?r   r   r   r   r   r   r   r   s   @r.   r  r    s    ]r0   r  c                   $     e Zd Z fdZd Z xZS )"CombinedTimestepTextProjEmbeddingsc                     t         |           t        ddd      | _        t	        d|      | _        t        ||d      | _        y Nr  Tr   r  r  r{  rk  )r   r   r~  r  ri  r  PixArtAlphaTextProjectiontext_embedderr   r   pooled_projection_dimr   s      r.   r   z+CombinedTimestepTextProjEmbeddings.__init__*  sB    "T`ab!2sS`!a67Lmdjkr0   c                     | j                  |      }| j                  |j                  |j                              }| j	                  |      }||z   }|S r  )r  r  r   r   r  )r   r  pooled_projectionr  r  pooled_projectionsr  s          r.   r   z*CombinedTimestepTextProjEmbeddings.forward1  sX    1..~/@/@GXG^G^/@/_`!//0AB$'99r0   r  r   s   @r.   r  r  )  s    lr0   r  c                   $     e Zd Z fdZd Z xZS )*CombinedTimestepGuidanceTextProjEmbeddingsc                     t         |           t        ddd      | _        t	        d|      | _        t	        d|      | _        t        ||d      | _        y r  )	r   r   r~  r  ri  r  guidance_embedderr  r  r  s      r.   r   z3CombinedTimestepGuidanceTextProjEmbeddings.__init__=  sR    "T`ab!2sS`!a!2sS`!a67Lmdjkr0   c                 ,   | j                  |      }| j                  |j                  |j                              }| j                  |      }| j	                  |j                  |j                              }||z   }| j                  |      }	||	z   }
|
S r  )r  r  r   r   r  r  )r   r  guidancer  r  r  guidance_projguidance_embtime_guidance_embr  r  s              r.   r   z2CombinedTimestepGuidanceTextProjEmbeddings.forwardE  s    1..~/@/@GXG^G^/@/_`x0--m.>.>EVE\E\.>.]^)L8!//0AB(+==r0   r  r   s   @r.   r  r  <  s    lr0   r  c                        e Zd Zddedededef fdZdej                  dej                  dej                  d	ej                  d
ej                  dej                  fdZ xZ	S )&CogView3CombinedTimestepSizeEmbeddingsr   condition_dimr  timesteps_dimc                     t         |           t        |dd      | _        t        |dd      | _        t        ||      | _        t        ||d      | _        y )NTr   r  r  r{  r  )	r   r   r~  r  condition_projri  r  r  condition_embedder)r   r   r  r  r  r   s        r.   r   z/CogView3CombinedTimestepSizeEmbeddings.__init__U  sV    "tjkl']TXopq!2}]j!k";<QS`io"pr0   r  original_sizetarget_sizecrop_coordsr  r   c                 ^   | j                  |      }| j                  |j                               j                  |j	                  d      d      }| j                  |j                               j                  |j	                  d      d      }| j                  |j                               j                  |j	                  d      d      }	t        j                  |||	gd      }
| j                  |j                  |            }| j                  |
j                  |            }||z   }|S )Nr   r   r
   r   r_   )
r  r  r   r   r   r!   r&   r  r   r  )r   r  r	  r
  r  r  r  original_size_projcrop_coords_projtarget_size_projr  r  condition_embr  s                 r.   r   z.CogView3CombinedTimestepSizeEmbeddings.forward]  s    1!001F1F1HINN}OaOabcOdfhi..{/B/B/DEJJ;K[K[\]K^`bc..{/B/B/DEJJ;K[K[\]K^`bc $68HJZ#[abc..~/@/@|/@/TU//0A0A0A0UV$}4r0   )r  )
r   r   r   rH   r   r!   r   r   r   r   r   s   @r.   r  r  T  s    qc q# qVY qjm q,, || \\	
 \\ kk 
r0   r  c            	       8     e Zd Zddedededef fdZd Z xZS )HunyuanDiTAttentionPoolspacial_dimr2   	num_heads
output_dimc                 ~   t         |           t        j                  t	        j
                  |dz   |      |dz  z        | _        t        j                  ||      | _        t        j                  ||      | _	        t        j                  ||      | _
        t        j                  ||xs |      | _        || _        y Nr
   r   )r   r   r   r  r!   r  positional_embeddingr   k_projq_projv_projc_projr  )r   r  r2   r  r  r   s        r.   r   z HunyuanDiTAttentionPool.__init__x  s    $&LL[1_i1X[dfi[i1i$j!ii	95ii	95ii	95ii	:+BC"r0   c           
      T   |j                  ddd      }t        j                  |j                  dd      |gd      }|| j                  d d d d d f   j                  |j                        z   }t        j                  di d|d d d|d	|d
|j                  d   d| j                  d| j                  j                  d| j                  j                  d| j                  j                  dd dt        j                  | j                  j                  | j                  j                  | j                  j                  g      dd dd ddddd| j                   j                  d| j                   j                  ddd| j"                  dd\  }}|j%                  d      S )Nr
   r   r   Tr   keepdimr   querykeyvalueembed_dim_to_checkr   r  q_proj_weightk_proj_weightv_proj_weightin_proj_weightin_proj_biasbias_kbias_vadd_zero_attnF	dropout_pout_proj_weightout_proj_biasuse_separate_proj_weightr  need_weights )r   r!   r&   meanr  r   r   r[  multi_head_attention_forwardr   r  r  r  r  r  r   r  r  squeeze)r   r   r  s      r.   r   zHunyuanDiTAttentionPool.forward  s   IIaAIIqvv!Tv2A6A>))!T1*588AA-- 
BQ%

 
  !wwr{	

 nn
 ++,,
 ++,,
 ++,,
  
 DKK$4$4dkk6F6FHXHX#YZ
 
 
  
 
 !KK..
  ++**!
" &*#
$ ]]%
& '
1* yy|r0   r   r   r   r   rH   r   r   r   r   s   @r.   r  r  u  s*    #C #C #C #UX #r0   r  c                   0     e Zd Z	 	 	 	 d fd	ZddZ xZS )-HunyuanCombinedTimestepTextSizeStyleEmbeddingc                 F   t         |           t        ddd      | _        t	        d|      | _        t        ddd      | _        t        ||d|      | _        || _	        |r$t        j                  d|      | _        d	|z   |z   }n|}t        ||d
z  |d      | _        y )Nr  Tr   r  r  r  )r  r  r
      r:   	silu_fp32)r   r  r   rk  )r   r   r~  r  ri  r  	size_projr  pooler"use_style_cond_and_image_meta_sizer   r  style_embedderr  extra_embedder)r   r   r  seq_lenr  r=  extra_in_dimr   s          r.   r   z6HunyuanCombinedTimestepTextSizeStyleEmbedding.__init__  s     	"T`ab!2sS`!a"T`ab-(ABW

 3U/-"$,,q-"@D"]25JJL0L7$%)&	
r0   c                    | j                  |      }| j                  |j                  |            }| j                  |      }| j                  rp| j                  |j                  d            }|j                  |      }|j                  dd      }| j                  |      }	t        j                  |||	gd      }
nt        j                  |gd      }
|| j                  |
      z   }|S )Nr_   r   r9  r
   r   )r  r  r   r<  r=  r;  r   r>  r!   r&   r?  )r   r  r  image_meta_sizestyler  r  r  r  style_embedding
extra_condr  s               r.   r   z5HunyuanCombinedTimestepTextSizeStyleEmbedding.forward  s    1..~/@/@|/@/TU "[[)>?22"nn_-A-A"-EFO-00|0DO-222w?O #11%8O $6#Y_`aJ$6#7Q?J$t':'::'FFr0   )r  r     Tr   r  r   s   @r.   r7  r7    s     # +/ 
Dr0   r7  c                   &     e Zd Zd fd	Zd Z xZS )&LuminaCombinedTimestepCaptionEmbeddingc                     t         |           t        |dd      | _        t	        ||      | _        t        j                  t        j                  |      t        j                  ||d            | _
        y )NT        r  r  ro  )r   r   r~  r  ri  r  r   
Sequentialr   r   caption_embedder)r   r  r  frequency_embedding_sizer   s       r.   r   z/LuminaCombinedTimestepCaptionEmbedding.__init__  sh    "14^a
 "3?Whs!t "LL,-II#!
r0   c                 Z   | j                  |      }| j                  |j                  |j                              }|j	                         j                  d      }||z  j                  d      |j                  d      z  }|j                  |      }| j                  |      }||z   }	|	S Nr_   r   r
   r   )r  r  r   r   r%   r   sumrM  )
r   r  caption_featcaption_mask	time_freq
time_embedcaption_mask_floatcaption_feats_poolcaption_embedr  s
             r.   r   z.LuminaCombinedTimestepCaptionEmbedding.forward  s    NN8,	++ILL|?Q?QL,RS
 *//1;;B?*-??DDDKN`NdNdijNdNkk/22<@--.@A!M1r0   )r   rG  r  r  r   s   @r.   rI  rI    s    
"r0   rI  c                        e Zd Z	 	 ddedededededdf fdZ	 dd	ej                  d
ej                  dej                  deej                     fdZ
 xZS )%MochiCombinedTimestepCaptionEmbeddingr   r  r   rj  num_attention_headsr   Nc                     t         |           t        |dd      | _        t	        ||      | _        t        |||      | _        t        j                  ||      | _
        y )NTrK  r  r  )r[  r2   r  )r   r   r~  r  ri  r  MochiAttentionPoolr<  r   r   caption_proj)r   r   r  r   rj  r[  r   s         r.   r   z.MochiCombinedTimestepCaptionEmbedding.__init__  s\     	"PTkno!2~^k!l( 3~Zg
 IIn6KLr0   r  r  encoder_attention_maskr  c                     | j                  |      }| j                  |j                  |            }| j                  ||      }| j	                  |      }||z   }	|	|fS r  )r  r  r   r<  r^  )
r   r  r  r_  r  r  time_embr  r^  r  s
             r.   r   z-MochiCombinedTimestepCaptionEmbedding.forward  sg     NN8,	))),,\,*JK![[)>@VW(()>?"44\))r0   )r  r  r   )r   r   r   rH   r   r!   r  r   r   r   r   r   r   s   @r.   rZ  rZ    s     "#$MM  #M 	M
 M !M 
M, /3*""*  %||* !&	*
 u{{+*r0   rZ  c                   4     e Zd Zddededef fdZd Z xZS )TextTimeEmbeddingencoder_dimrj  r  c                     t         |           t        j                  |      | _        t        ||      | _        t        j                  ||      | _        t        j                  |      | _	        y r   )
r   r   r   r   norm1AttentionPoolingpoolr   r   norm2)r   rd  rj  r  r   s       r.   r   zTextTimeEmbedding.__init__   sN    \\+.
$Y<	IIk>:	\\.1
r0   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r   )rf  rh  r   ri  )r   r  s     r.   r   zTextTimeEmbedding.forward'  s@    

=1		-0		-0

=1r0   )@   r5  r   s   @r.   rc  rc    s"    2C 2 2 2r0   rc  c                   f     e Zd Zddededef fdZdej                  dej                  fdZ xZS )	TextImageTimeEmbeddingr   r  rj  c                     t         |           t        j                  ||      | _        t        j
                  |      | _        t        j                  ||      | _        y r   )r   r   r   r   r   r   	text_norm
image_proj)r   r   r  rj  r   s       r.   r   zTextImageTimeEmbedding.__init__0  sB    >>Bn5))O^Dr0   r   r   c                 r    | j                  |      }| j                  |      }| j                  |      }||z   S r   )r   ro  rp  )r   r   r   time_text_embedstime_image_embedss        r.   r   zTextImageTimeEmbedding.forward6  s>    >>+6>>*:; !OOL9 #333r0   )r   r   r9  r  r   s   @r.   rm  rm  /  s=    Es E3 E^a E45<< 4u|| 4r0   rm  c                   J     e Zd Zddedef fdZdej                  fdZ xZS )ImageTimeEmbeddingr  rj  c                     t         |           t        j                  ||      | _        t        j
                  |      | _        y r   )r   r   r   r   rp  r   
image_normr   r  rj  r   s      r.   r   zImageTimeEmbedding.__init__B  s0    ))O^D,,~6r0   r   c                 J    | j                  |      }| j                  |      }|S r   )rp  rw  )r   r   rs  s      r.   r   zImageTimeEmbedding.forwardG  s'     OOL9 OO,=>  r0   r   r9  r  r   s   @r.   ru  ru  A  s&    7 73 7
!ELL !r0   ru  c                   b     e Zd Zddedef fdZdej                  dej                  fdZ xZS )ImageHintTimeEmbeddingr  rj  c                 N   t         |           t        j                  ||      | _        t        j
                  |      | _        t        j                  t        j                  dddd      t        j                         t        j                  dddd      t        j                         t        j                  ddddd      t        j                         t        j                  dddd      t        j                         t        j                  ddddd      t        j                         t        j                  dddd      t        j                         t        j                  dd	ddd      t        j                         t        j                  d	d
dd            | _
        y )Nr<   r   r
   )paddingr  r   )r~  r   `   r  r:   )r   r   r   r   rp  r   rw  rL  r   SiLUinput_hint_blockrx  s      r.   r   zImageHintTimeEmbedding.__init__O  s   ))O^D,,~6 "IIaQ*GGIIIb"a+GGIIIb"a15GGIIIb"a+GGIIIb"a15GGIIIb"a+GGIIIb#q!A6GGIIIc1a+!
r0   r   hintc                 p    | j                  |      }| j                  |      }| j                  |      }||fS r   )rp  rw  r  )r   r   r  rs  s       r.   r   zImageHintTimeEmbedding.forwarde  s<     OOL9 OO,=>$$T* $&&r0   rz  r  r   s   @r.   r|  r|  N  s1    
 
3 
,'ELL ' 'r0   r|  c                   &     e Zd Zd fd	Zd Z xZS )rg  c                    t         |           || _        t        j                  t        j                  d|      |dz  z        | _        t        j                  ||| j                        | _	        t        j                  ||| j                        | _
        t        j                  ||| j                        | _        || _        || j                  z  | _        y )Nr
   r   r_   )r   r   r   r   r  r!   r  r  r   r  r  r  r  dim_per_head)r   r  r2   r   r   s       r.   r   zAttentionPooling.__init__p  s    
$&LLQ	1JYX[^1[$\!ii	9DJJGii	9DJJGii	9DJJG"%7r0   c                     |j                         \  }} fd}|j                  dd       j                  j                  |j                        z   }t        j                  ||gd      } | j                  |            } | j                  |            } | j                  |            }dt        j                  t        j                   j                              z  }	t        j                  d||	z  ||	z        }
t        j                  |
j                         d      j!                  |
j                        }
t        j                  d|
|      }|j#                  dd      j%                  dd	      }|d d d
d d f   S )Nc                     | j                  dj                  j                        } | j                  dd      } | j	                  j                  z  dj                        } | j                  dd      } | S )Nr   r
   r   )r   r  r  r   rK   )r   bsr   s    r.   r   z'AttentionPooling.forward.<locals>.shape}  sg    r2t~~t/@/@AAAq!A		"t~~-r43D3DEAAq!AHr0   r
   Tr  r   zbct,bcs->btsr   zbts,bcs->bctr   r   )r   r2  r  r   r   r!   r&   r  r  r  r   sqrtr  r   softmaxr%   rF  rK   r   )r   r   lengthr   r   class_tokenqkvr   r  ar  s   `           @r.   r   zAttentionPooling.forwardz  sA   FFHFE		 ffDf1D4M4M4P4PQRQXQX4YYII{A&A. $++k*+$++a.!$++a.! DIIdii(9(9:;;na%iUCv||~26;;FLLI LL3 IIb"a **1a0Aqzr0   r   r  r   s   @r.   rg  rg  m  s    8"r0   rg  c            	            e Zd Z	 ddededee   ddf fdZeddd	ej                  d
ej                  dej                  fd       Z	d	ej                  d
ej                  dej                  fdZ xZS )r]  Nr[  r2   r  r   c                    t         |           |xs || _        || _        t	        j
                  |d|z        | _        t	        j
                  ||      | _        t	        j
                  || j                        | _        y )Nr   )	r   r   r  r[  r   r   to_kvto_qto_out)r   r[  r2   r  r   s       r.   r   zMochiAttentionPool.__init__  sb     	$1	#6 YYy!i-8
IIi3	ii	4??;r0   Fr  r   r   c                ^   | j                  d      |j                  d      k(  sJ | j                  d      |j                  d      k(  sJ |dddddf   j                  | j                        }||j                  dd      j	                  d      z  }| |z  j                  d|      }|S )a6  
        Pool tokens in x using mask.

        NOTE: We assume x does not require gradients.

        Args:
            x: (B, L, D) tensor of tokens.
            mask: (B, L) boolean tensor indicating which tokens are not padding.

        Returns:
            pooled: (B, D) tensor of pooled tokens.
        r
   r   Nr_   Tr  )min)r   r   r   rQ  clamp)r   r   r  pooleds       r.   pool_tokenszMochiAttentionPool.pool_tokens  s     vvayDIIaL(((vvayDIIaL(((Aq$J"""1dhh1dh399a9@@d(Aw7r0   c                    |j                  d      }|ddddddf   j                         }t        j                  |dd      }| j	                  ||d      }t        j                  ||gd      }| j                  |      }| j                  |ddd	f         }|| j                  z  }|j                  dd| j                  |f      }|j                  dd
      }|j                  d      \  }	}
|j                  d| j                  |f      }|j                  d      }t        j                  ||	|
|d      }|j                  d      j!                  dd      }| j#                  |      }|S )aP  
        Args:
            x (`torch.Tensor`):
                Tensor of shape `(B, S, D)` of input tokens.
            mask (`torch.Tensor`):
                Boolean ensor of shape `(B, S)` indicating which tokens are not padding.

        Returns:
            `torch.Tensor`:
                `(B, D)` tensor of pooled tokens.
        r   N)r
   r   T)r"  r  r
   r   r   r<   rK  )	attn_maskr,  )r   r   r[  r*   r  r!   r&   r  r  r[  	unflattenr   rP  r   scaled_dot_product_attentionr4  r   r  )r   r   r   Dr  x_poolkvr  head_dimr  r  s              r.   r   zMochiAttentionPool.forward  sV    FF1I D$)*//1	EE)V48	 !!!T4!8 IIvqkq) ZZ]IIa1g 000\\!a!9!98DE\\!Qyy|1KKD44h?@KKN **1aiSVW IIaL  A&KKNr0   r   )r   r   r   rH   r   r   staticmethodr!   r   r  
BoolTensorr   r   r   s   @r.   r]  r]    s    
 %)	< < < SM	<
 
< DI u|| 5<< ell  (* *U-=-= *%,, *r0   r]  c                    |j                   dd \  }}dt        j                  |       | z  z  }|d   j                  |j                  |j
                        }||j                  d      z  }t        j                  |j                         |j                         fd      }|j                  dd	d
dd      j                  ||| dz  dz        }|S )z
    Args:
        embed_dim: int
        box: a 3-D tensor [B x N x 4] representing the bounding boxes for GLIGEN pipeline
    Returns:
        [B x N x embed_dim] tensor of positional embeddings
    Nr   d   )NNNr=   r   r   r   r
   r<   r:   )r   r!   r"   r   r   r   r   rJ   r'   r(   r   rK   )r2   boxr   	num_boxesr-   s        r.   #get_fourier_embeds_from_boundingboxr    s      IIbqMJ	
%,,y)I5
6C


"
"#**CII
"
FC
b!
!C
++swwy#''),"
5C
++aAq!
$
,
,ZIPQMTUDU
VCJr0   c                   2     e Zd Zd fd	Z	 	 	 	 	 ddZ xZS )GLIGENTextBoundingboxProjectionc           
      8   t         |           || _        || _        || _        |dz  dz  | _        t        |t              r|d   }|dk(  rt        j                  t        j                  | j                  | j
                  z   d      t        j                         t        j                  dd      t        j                         t        j                  d|            | _        t        j                  j                  t        j                  | j                  g            | _        n|dk(  rt        j                  t        j                  | j                  | j
                  z   d      t        j                         t        j                  dd      t        j                         t        j                  d|            | _        t        j                  t        j                  | j                  | j
                  z   d      t        j                         t        j                  dd      t        j                         t        j                  d|            | _        t        j                  j                  t        j                  | j                  g            | _        t        j                  j                  t        j                  | j                  g            | _        t        j                  j                  t        j                  | j
                  g            | _        y )Nr   r:   r   	text-only   z
text-image)r   r   positive_lenrl  fourier_embedder_dimposition_dimrG   tupler   rL  r   r  linearsr!   r  rn   null_positive_featurelinears_textlinears_imagenull_text_featurenull_image_featurenull_position_feature)r   r  rl  feature_typefourier_freqsr   s        r.   r   z(GLIGENTextBoundingboxProjection.__init__  s   ($1!)A-1gu%ajG;&==		$++d.?.??E			#s#			#w'DL */););EKKIZIZH[<\)]D&\) "		$++d.?.??E			#s#			#w'!D "$		$++d.?.??E			#s#			#w'"D &+XX%7%7TEVEVDW8X%YD"&+hh&8&8dFWFWEX9Y&ZD#%*XX%7%7TEVEVDW8X%Y"r0   c                    |j                  d      }t        | j                  |      }| j                  j	                  ddd      }	||z  d|z
  |	z  z   }|U| j
                  j	                  ddd      }
||z  d|z
  |
z  z   }| j                  t        j                  ||gd            }|S |j                  d      }|j                  d      }| j                  j	                  ddd      }| j                  j	                  ddd      }||z  d|z
  |z  z   }||z  d|z
  |z  z   }| j                  t        j                  ||gd            }| j                  t        j                  ||gd            }t        j                  ||gd      }|S )Nr   r
   r   )r   r  r  r  r   r  r  r!   r&   r  r  r  r  )r   boxesmaskspositive_embeddingsphrases_masksimage_masksphrases_embeddingsimage_embeddingsxyxy_embedding	xyxy_nullpositive_nullobjs	text_null
image_null	objs_text
objs_images                   r.   r   z'GLIGENTextBoundingboxProjection.forward0  s    # =T=V=VX]^ ..33Aq"=	 (%/1u9	2II * 66;;Aq"EM #6"=Um@["[<<		+>*OUW XYD&  *33B7M%//3K ..33Aq"=I0055aB?J "4m!Cq=GX\eFe!e/+=[T^@^^))%))5G4X^`*abI++EII7G6X^`,abJ99i4!<Dr0   )r  r  )NNNNNr  r   s   @r.   r  r    s     'ZZ !0r0   r  c                   0     e Zd ZdZddef fdZd Z xZS ))PixArtAlphaCombinedTimestepSizeEmbeddingsz
    For PixArt-Alpha.

    Reference:
    https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L164C9-L168C29
    use_additional_conditionsc                     t         |           || _        t        ddd      | _        t        d|      | _        || _        |r8t        ddd      | _        t        d|      | _	        t        d|      | _
        y y )Nr  Tr   r  r  )r   r   outdimr~  r  ri  r  r  additional_condition_projresolution_embedderaspect_ratio_embedder)r   r   size_emb_dimr  r   s       r.   r   z2PixArtAlphaCombinedTimestepSizeEmbeddings.__init__k  sy    ""T`ab!2sS`!a)B&$-6CY]tu-vD*'8SYe'fD$):s[g)hD& %r0   c                    | j                  |      }| j                  |j                  |            }| j                  r| j	                  |j                               j                  |      }| j                  |      j                  |d      }| j	                  |j                               j                  |      }	| j                  |	      j                  |d      }	|t        j                  ||	gd      z   }
|
S |}
|
S rP  )r  r  r   r  r  r   r  rK   r  r!   r&   )r   r  
resolutionaspect_ratior   r  r  r  resolution_embaspect_ratio_embr  s              r.   r   z1PixArtAlphaCombinedTimestepSizeEmbeddings.forwardx  s    1..~/@/@|/@/TU))!;;J<N<N<PQTTUabN!55nEMMjZ\]N#==l>R>R>TUXXYef#99:JKSST^`bc(599nFV5W]^+__L  )Lr0   F)r   r   r   r   r   r   r   r   r   s   @r.   r  r  c  s    it ir0   r  c                   *     e Zd ZdZd fd	Zd Z xZS )r  z
    Projects caption embeddings. Also handles dropout for classifier-free guidance.

    Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
    c                 d   t         |           ||}t        j                  ||d      | _        |dk(  rt        j
                  d      | _        nB|dk(  rt        j                         | _        n#|dk(  rt               | _        nt        d|       t        j                  ||d      | _
        y )	NTr   	gelu_tanhtanh)approximater{  r:  zUnknown activation function: )r   r   r   r   rp  GELUact_1r  r   rF   rs  )r   r   r  r   rk  r   s        r.   r   z"PixArtAlphaTextProjection.__init__  s    &L		kZ^_[ V4DJvDJ{"!DJ<VHEFF		k[_`r0   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )rp  r  rs  )r   captionr  s      r.   r   z!PixArtAlphaTextProjection.forward  s2    g.

=1m4r0   )Nr  r   r   s   @r.   r  r    s    ar0   r  c                   D     e Zd Z	 	 	 	 d	dededededdf
 fdZd Z xZS )
!IPAdapterPlusImageProjectionBlock
embed_dimsdim_headheads	ffn_ratior   Nc           
      0   t         |           ddlm} t	        j
                  |      | _        t	        j
                  |      | _        t        |||d      | _	        t	        j                  t	        j
                  |       |||d|d            | _        y )Nr
   r  F)	query_dimr  r  out_biasr  r  r  r   )r   r   r  r  r   r   ln0ln1r   attnrL  r  )r   r  r  r  r  r  r   s         r.   r   z*IPAdapterPlusImageProjectionBlock.__init__  sz     	*<<
+<<
+ 	
	 --LL$
Jf9[`a
r0   c                     | j                  |      }| j                  |      }t        j                  ||gd      }| j	                  ||      |z   }| j                  |      |z   }|S )Nr   r   )r  r  r!   r&   r  r  )r   r   latentsresidualr  s        r.   r   z)IPAdapterPlusImageProjectionBlock.forward  sd     $((7# %		+@'*JPR S))G%:;hF'''"W,r0   )r   rk  r   r:   )r   r   r   rH   r%   r   r   r   r   s   @r.   r  r    sK     

 
 	

 
 

.r0   r  c                        e Zd ZdZ	 	 	 	 	 	 	 	 ddededededededed	ed
df fdZdej                  d
ej                  fdZ	 xZ
S )IPAdapterPlusImageProjectiona  Resampler of IP-Adapter Plus.

    Args:
        embed_dims (int): The feature dimension. Defaults to 768. output_dims (int): The number of output channels,
        that is the same
            number of the channels in the `unet.config.cross_attention_dim`. Defaults to 1024.
        hidden_dims (int):
            The number of hidden channels. Defaults to 1280. depth (int): The number of blocks. Defaults
        to 8. dim_head (int): The number of head channels. Defaults to 64. heads (int): Parallel attention heads.
        Defaults to 16. num_queries (int):
            The number of queries. Defaults to 8. ffn_ratio (float): The expansion ratio
        of feedforward network hidden
            layer channels. Defaults to 4.
    r  output_dimshidden_dimsdepthr  r  num_queriesr  r   Nc	                    t         
|           t        j                  t	        j
                  d||      |dz  z        | _        t        j                  ||      | _        t        j                  ||      | _	        t        j                  |      | _        t        j                  t        |      D 	cg c]  }	t        ||||       c}	      | _        y c c}	w r  )r   r   r   r  r!   r  r  r   proj_inproj_outr   norm_out
ModuleListranger  layers)r   r  r  r  r  r  r  r  r  r  r   s             r.   r   z%IPAdapterPlusImageProjection.__init__  s     	||EKK;$L{\_O_$_`yy[9		+{;[1mmafglamn\].{HeYWn
ns   1Cr   c                     | j                   j                  |j                  d      dd      }| j                  |      }| j                  D ]  }|} ||||      } | j                  |      }| j                  |      S )zForward pass.

        Args:
            x (torch.Tensor): Input Tensor.
        Returns:
            torch.Tensor: Output Tensor.
        r   r
   )r  rc   r   r  r  r  r  )r   r   r  blockr  s        r.   r   z$IPAdapterPlusImageProjection.forward  sv     ,,%%affQiA6LLO[[ 	2EHAw1G	2 --(}}W%%r0   )r   r     r:   rk  r   r  r:   r   r   r   r   rH   r%   r   r!   r   r   r   r   s   @r.   r  r    s    " 

 
 	

 
 
 
 
 
 

.& &%,, &r0   r  c                        e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 ddededededededed	ed
edededdf fdZdej                  dej                  fdZ	 xZ
S )"IPAdapterFaceIDPlusImageProjectiona  FacePerceiverResampler of IP-Adapter Plus.

    Args:
        embed_dims (int): The feature dimension. Defaults to 768. output_dims (int): The number of output channels,
        that is the same
            number of the channels in the `unet.config.cross_attention_dim`. Defaults to 1024.
        hidden_dims (int):
            The number of hidden channels. Defaults to 1280. depth (int): The number of blocks. Defaults
        to 8. dim_head (int): The number of head channels. Defaults to 64. heads (int): Parallel attention heads.
        Defaults to 16. num_tokens (int): Number of tokens num_queries (int): The number of queries. Defaults to 8.
        ffn_ratio (float): The expansion ratio of feedforward network hidden
            layer channels. Defaults to 4.
        ffproj_ratio (float): The expansion ratio of feedforward network hidden
            layer channels (for ID embeddings). Defaults to 4.
    r  r  r  id_embeddings_dimr  r  r  r  r  r  ffproj_ratior   Nc                    t         |           ddlm} || _        || _        d | _        d| _        d| _         ||||z  d|      | _	        t        j                  |      | _        t        j                  ||      | _        t        j                  ||      | _        t        j                  |      | _        t        j"                  t%        |      D cg c]  }t'        ||||
       c}      | _        y c c}w )Nr
   r  Fr1   r  )r  r  )r   r   r  r  r  r2   clip_embedsshortcutshortcut_scaler   r   r   r   r   r  r  r  r   r  r  r  )r   r  r  r  r	  r  r  r  r  r  r  r
  r  r  r   s                 r.   r   z+IPAdapterFaceIDPlusImageProjection.__init__	  s     	*$#! 1:
3JZ`gst	LL,	yyj9		*k:[1mm`efk`lm[\.z8UIVm
ms   C5	id_embedsc                 0   |j                  | j                  j                        }| j                  |      }|j	                  d| j
                  | j                        }| j                  |      }|}| j                  | j                        }|j	                  d|j                  d   |j                  d         }| j                  D ]  }|} ||||      } | j                  |      }| j                  |      }| j                  r|| j                  |z  z   }|S )zForward pass.

        Args:
            id_embeds (torch.Tensor): Input Tensor (ID embeds).
        Returns:
            torch.Tensor: Output Tensor.
        r   r   r<   )r   r  r   r   rK   r  r2   r   r  r   r  r  r  r  r  )r   r  r  r  r   r  r  r}   s           r.   r   z*IPAdapterFaceIDPlusImageProjection.forward5	  s     LL!1!1!7!78	IIi(	%%b$//4>>J	IIi(	ll4#3#34K$5$5a$8+:K:KA:NO[[ 	2EHAw1G	2 --(mmG$==d11C77C
r0   )r   r   r  r  r:   rk  r   r:   r  r:   r   r  r   s   @r.   r  r  	  s    $ !$!
!
 !
 	!

 !
 !
 !
 !
 !
 !
 !
 !
 
!
F %,, r0   r  c                        e Zd ZdZ	 	 	 	 ddededededdf
 fdZd	ej                  d
ej                  dej                  dej                  fdZ xZ	S )!IPAdapterTimeImageProjectionBlocka  Block for IPAdapterTimeImageProjection.

    Args:
        hidden_dim (`int`, defaults to 1280):
            The number of hidden channels.
        dim_head (`int`, defaults to 64):
            The number of head channels.
        heads (`int`, defaults to 20):
            Parallel attention heads.
        ffn_ratio (`int`, defaults to 4):
            The expansion ratio of feedforward network hidden layer channels.
    
hidden_dimr  r  r  r   Nc                 t   t         |           ddlm} t	        j
                  |      | _        t	        j
                  |      | _        t        ||||dd      | _	         |||d|d      | _
        t	        j                         | _        t	        j                  |d|z        | _        t	        j
                  |      | _        dt!        j"                  t!        j"                  |            z  | j                  _        | j                  j'                          d | j                  _        d | j                  _        y )Nr
   r  F)r  r  r  r  r   r  r  r  r:   )r   r   r  r  r   r   r  r  r   r  r  r  
adaln_silur   
adaln_proj
adaln_normr   r  r   fuse_projectionsto_kto_v)r   r  r  r  r  r  r   s         r.   r   z*IPAdapterTimeImageProjectionBlock.__init___	  s     	*<<
+<<
+  *
	 j*FQZafg '')))JJ?,,z2 dii		((;<<				""$				r0   r   r  timestep_embc                    | j                  | j                  |            }|j                  dd      \  }}}}|}	| j                  |      }| j	                  |      d|dddf   z   z  |dddf   z   }|j
                  d   }
| j                  j                  |      }t        j                  ||fd      }| j                  j                  |      j                  dd      \  }}|j
                  d   }|| j                  j                  z  }|j                  |
d| j                  j                  |      j                  dd      }|j                  |
d| j                  j                  |      j                  dd      }|j                  |
d| j                  j                  |      j                  dd      }|| j                  j                  z  || j                  j                  z  j                  dd      z  }t        j                  |j!                         d      j#                  |j$                        }||z  }|j                  dd      j'                  |
d| j                  j                  |z        } | j                  j(                  d   |      } | j                  j(                  d   |      }||	z   }|}	| j+                  |      d|dddf   z   z  |dddf   z   }| j-                  |      |	z   S )	aA  Forward pass.

        Args:
            x (`torch.Tensor`):
                Image features.
            latents (`torch.Tensor`):
                Latent features.
            timestep_emb (`torch.Tensor`):
                Timestep embedding.

        Returns:
            `torch.Tensor`: Output latent features.
        r:   r
   r   Nr   r   r   r   )r  r  rb  r  r  r   r  r  r!   r&   r  r  r   r   r   r  r%   rF  r   rK   r  r  r  )r   r   r  r  r-   	shift_msa	scale_msa	shift_mlp	scale_mlpr  r   r   kv_inputr!  r"  	inner_dimr  r  s                     r.   r   z)IPAdapterTimeImageProjectionBlock.forward	  s     oodool;<58YYqaY5H2	9i HHQK((7#q9QW+='=>1d7ASS]]1%
		w'99a\r2YY__X.44QB4?
UIIbM			/

:r499??HEOOPQSTUhhz2tyyAKKAqQ

:r499??HEOOPQSTU$))//)cDIIOO.C-N-NrSU-VVv||~26;;FLLI5.##Aq)11*b$))//T\B\]%$))""1%g.%$))""1%g.H$ //'*a)AtG2D.DE	RSUYRYHZZwww(**r0   )r  rk     r:   )
r   r   r   r   rH   r   r!   r   r   r   r   s   @r.   r  r  Q	  s       	
  
B1+ 1+ 1+ELL 1+]b]i]i 1+r0   r  c                        e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 ddededededededed	ed
edededdf fdZdej                  dej                  de	ej                  ej                  f   fdZ
 xZS )IPAdapterTimeImageProjectiona  Resampler of SD3 IP-Adapter with timestep embedding.

    Args:
        embed_dim (`int`, defaults to 1152):
            The feature dimension.
        output_dim (`int`, defaults to 2432):
            The number of output channels.
        hidden_dim (`int`, defaults to 1280):
            The number of hidden channels.
        depth (`int`, defaults to 4):
            The number of blocks.
        dim_head (`int`, defaults to 64):
            The number of head channels.
        heads (`int`, defaults to 20):
            Parallel attention heads.
        num_queries (`int`, defaults to 64):
            The number of queries.
        ffn_ratio (`int`, defaults to 4):
            The expansion ratio of feedforward network hidden layer channels.
        timestep_in_dim (`int`, defaults to 320):
            The number of input channels for timestep embedding.
        timestep_flip_sin_to_cos (`bool`, defaults to True):
            Flip the timestep embedding order to `cos, sin` (if True) or `sin, cos` (if False).
        timestep_freq_shift (`int`, defaults to 0):
            Controls the timestep delta between frequencies between dimensions.
    r2   r  r  r  r  r  r  r  timestep_in_dimtimestep_flip_sin_to_costimestep_freq_shiftr   Nc                    t         |           t        j                  t	        j
                  d||      |dz  z        | _        t        j                  ||      | _        t        j                  ||      | _	        t        j                  |      | _        t        j                  t        |      D cg c]  }t        ||||       c}      | _        t!        |	|
|      | _        t%        |	|d      | _        y c c}w )Nr
   r   r{  r  )r   r   r   r  r!   r  r  r   r  r  r   r  r   r  r  r  r~  r  ri  time_embedding)r   r2   r  r  r  r  r  r  r  r&  r'  r(  r  r   s                r.   r   z%IPAdapterTimeImageProjection.__init__	  s     	||EKK;
$KjZ]o$]^yyJ7		*j9Z0mm`efk`lm[\.z8UIVm
 #?4LNab/TZ[ ns   1C8r   r  c                    | j                  |      j                  |j                        }| j                  |      }| j                  j                  |j                  d      dd      }| j                  |      }||dddf   z   }| j                  D ]  } ||||      } | j                  |      }| j                  |      }||fS )a#  Forward pass.

        Args:
            x (`torch.Tensor`):
                Image features.
            timestep (`torch.Tensor`):
                Timestep in denoising process.
        Returns:
            `Tuple`[`torch.Tensor`, `torch.Tensor`]: The pair (latents, timestep_emb).
        r_   r   r
   N)r  r   r   r*  r  rc   r   r  r  r  r  )r   r   r  r  r  r  s         r.   r   z$IPAdapterTimeImageProjection.forward	  s     ~~h/222A**<8,,%%affQiA6LLOQW%%[[ 	6EAw5G	6 --(--($$r0   )i  i	  r  r:   rk  r#  rk  r:   i@  Tr   )r   r   r   r   rH   r   r   r!   r   r   r   r   r   s   @r.   r%  r%  	  s    : ")-#$\\ \ 	\
 \ \ \ \ \ \ #'\ !\ 
\2% % %%V[VbVbHbBc %r0   r%  c                        e Zd Zdeeej                     eej                     f   f fdZe	de
fd       Zdeej                     fdZ xZS )MultiIPAdapterImageProjectionIPAdapterImageProjectionLayersc                 V    t         |           t        j                  |      | _        y r   )r   r   r   r   image_projection_layers)r   r.  r   s     r.   r   z&MultiIPAdapterImageProjection.__init__
  s     ')}}5S'T$r0   r   c                 ,    t        | j                        S )zNumber of IP-Adapters loaded.)r   r0  )r   s    r.   num_ip_adaptersz-MultiIPAdapterImageProjection.num_ip_adapters
  s     4//00r0   r   c                 F   g }t        |t              s#d}t        dd|d       |j                  d      g}t	        |      t	        | j
                        k7  r-t        dt	        |       dt	        | j
                               t        || j
                        D ]  \  }}|j                  d	   |j                  d   }}|j                  ||z  f|j                  d
d  z         } ||      }|j                  ||f|j                  dd  z         }|j                  |        |S )NzYou have passed a tensor as `image_embeds`.This is deprecated and will be removed in a future release. Please make sure to update your script to pass `image_embeds` as a list of tensors to suppress this warning.zimage_embeds not a list1.0.0Fr]   r
   zGimage_embeds must have the same length as image_projection_layers, got z and r   r   )rG   listr	   r   r   r0  rF   zipr   rK   append)r   r   projected_image_embedsre   image_embedimage_projection_layerr   
num_imagess           r.   r   z%MultiIPAdapterImageProjection.forward
  s\   !#
 ,-@   /:M]bc(22156L|D$@$@ AAYZ]^jZkYllqruvz  wS  wS  sT  rU  V  47|TEaEa3b 	7/K/%0%6%6q%9;;L;LQ;O
J%--zJ/F.H;K\K\]^]_K`.`aK0=K%--z:.FIZIZ[\[]I^.^_K"))+6	7 &%r0   )r   r   r   r   r   r   Moduler   r   propertyrH   r2  r!   r   r   r   r   s   @r.   r-  r-  
  s_    UuT"))_eTVT]T]N^=^7_ U 1 1 1&D$6 &r0   r-  c                       e Zd Zd Zy)FluxPosEmbedc                 <    d}t        dd|       ddlm}  ||i |S )NzImporting and using `FluxPosEmbed` from `diffusers.models.embeddings` is deprecated. Please import it from `diffusers.models.transformers.transformer_flux`.r?  r4  r
   )r?  )r	   transformers.transformer_fluxr?  )clsargskwargsre   r?  s        r.   __new__zFluxPosEmbed.__new__/
  s+     }.'+>??T,V,,r0   N)r   r   r   rE  r1  r0   r.   r?  r?  .
  s    -r0   r?  )Fr
   r
   rx   )r1   r1   Nr9   )r1   r1   )Fr   r1   r   Nr9   )r9   )r9   F)Fr   r1   r   )rx   Tr  NN))r1   r1   r1   rx   N)TNr9   )Tr  )Tr   r   )Vr   typingr   r   r   r   numpyr9   r!   torch.nn.functionalr   r)   r[  utilsr	   activationsr   r   attention_processorr   r   rH   r   r%   r/   r   r|  rY   rD  rE   ro   rL   rM   rm   rr   ry   r<  r   r   r   r  r,  r2  r7  r5  r6  r@  r#   r  rX  rg  ri  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r7  rI  rZ  rc  rm  ru  r|  rg  r]  r  r  r  r  r  r  r  r  r%  r-  r?  r1  r0   r.   <module>rL     sY    / /       1 * ""#3||33 3  	3
 3 3 \\3t *-*-%)GGU38_,-G G "'	G
 #(G U\\"G G \\G\ *-*-??U38_,-? ? "'	?
 #(? ZZ?J %)A U\\"A AHB&T _a#L,8}5 }5@3
ryy 3
lC")) CL5CRYY 5Cz *.%)b
 b b b uS#X'b U\\"b 5<<u||U\\9::;bT 7F%)(=
 ueU23(= (= U\\"(= 5<<u||U\\9::;(=X rv/?G?U/kn/d8DH @	@	rzz3	@ @L !6 ||6 U\\5#6676  6  	6 
 6  5<<%&6 r $-		 -`		 &		 >BII 6>		 >B!RYY !HB")) B6bii .	0299 	0RYY  bii & & 0RYY B%bii %P:BII :zRYY D *BII  *F		  4RYY 4$
! 
!'RYY '>/ryy /dO Od*Zbii Zz"		 "J		 :		 B8&299 8&vM M``+		 `+HN%299 N%b%&BII %&P-299 -r0   