
import functools
import math

import flax.linen as nn
import jax
import jax.numpy as jnp


def _query_chunk_attention(query, key, value, precision, key_chunk_size: int = 4096):
    """Multi-head dot product attention with a limited number of queries."""
    num_kv, num_heads, k_features = key.shape[-3:]
    v_features = value.shape[-1]
    key_chunk_size = min(key_chunk_size, num_kv)
    query = query / jnp.sqrt(k_features)

    @functools.partial(jax.checkpoint, prevent_cse=False)
    def summarize_chunk(query, key, value):
        attn_weights = jnp.einsum("...qhd,...khd->...qhk", query, key, precision=precision)

        max_score = jnp.max(attn_weights, axis=-1, keepdims=True)
        max_score = jax.lax.stop_gradient(max_score)
        exp_weights = jnp.exp(attn_weights - max_score)

        exp_values = jnp.einsum("...vhf,...qhv->...qhf", value, exp_weights, precision=precision)
        max_score = jnp.einsum("...qhk->...qh", max_score)

        return (exp_values, exp_weights.sum(axis=-1), max_score)

    def chunk_scanner(chunk_idx):
        # slice out the next chunk of keys and values
        key_chunk = jax.lax.dynamic_slice(
            operand=key,
            start_indices=[0] * (key.ndim - 3) + [chunk_idx, 0, 0],  # [...,k,h,d]
            slice_sizes=list(key.shape[:-3]) + [key_chunk_size, num_heads, k_features],  # [...,k,h,d]
        )
        value_chunk = jax.lax.dynamic_slice(
            operand=value,
            start_indices=[0] * (value.ndim - 3) + [chunk_idx, 0, 0],  # [...,v,h,d]
            slice_sizes=list(value.shape[:-3]) + [key_chunk_size, num_heads, v_features],  # [...,v,h,d]
        )

        return summarize_chunk(query, key_chunk, value_chunk)

    chunk_values, chunk_weights, chunk_max = jax.lax.map(f=chunk_scanner, xs=jnp.arange(0, num_kv, key_chunk_size))

    # renormalize every chunk against the global running maximum before combining
    global_max = jnp.max(chunk_max, axis=0, keepdims=True)
    max_diffs = jnp.exp(chunk_max - global_max)

    chunk_values *= jnp.expand_dims(max_diffs, axis=-1)
    chunk_weights *= max_diffs

    all_values = chunk_values.sum(axis=0)
    all_weights = jnp.expand_dims(chunk_weights, -1).sum(axis=0)

    return all_values / all_weights


def jax_memory_efficient_attention(
    query, key, value, precision=jax.lax.Precision.HIGHEST, query_chunk_size: int = 1024, key_chunk_size: int = 4096
):
    r"""
    Flax Memory-efficient multi-head dot product attention. https://huggingface.co/papers/2112.05682v2
    https://github.com/AminRezaei0x443/memory-efficient-attention

    Args:
        query (`jnp.ndarray`): (batch..., query_length, head, query_key_depth_per_head)
        key (`jnp.ndarray`): (batch..., key_value_length, head, query_key_depth_per_head)
        value (`jnp.ndarray`): (batch..., key_value_length, head, value_depth_per_head)
        precision (`jax.lax.Precision`, *optional*, defaults to `jax.lax.Precision.HIGHEST`):
            numerical precision for the computation
        query_chunk_size (`int`, *optional*, defaults to 1024):
            chunk size used to split the query array; it must divide `query_length` without remainder
        key_chunk_size (`int`, *optional*, defaults to 4096):
            chunk size used to split the key and value arrays; it must divide `key_value_length` without remainder

    Returns:
        (`jnp.ndarray`) with shape of (batch..., query_length, head, value_depth_per_head)
    """
    num_q, num_heads, q_features = query.shape[-3:]

    def chunk_scanner(chunk_idx, _):
        # slice out the next chunk of queries
        query_chunk = jax.lax.dynamic_slice(
            operand=query,
            start_indices=([0] * (query.ndim - 3)) + [chunk_idx, 0, 0],  # [...,q,h,d]
            slice_sizes=list(query.shape[:-3]) + [min(query_chunk_size, num_q), num_heads, q_features],  # [...,q,h,d]
        )

        return (
            chunk_idx + query_chunk_size,  # unused, ignore it
            _query_chunk_attention(
                query=query_chunk, key=key, value=value, precision=precision, key_chunk_size=key_chunk_size
            ),
        )

    _, res = jax.lax.scan(
        f=chunk_scanner,
        init=0,
        xs=None,
        length=math.ceil(num_q / query_chunk_size),
    )

    return jnp.concatenate(res, axis=-3)  # fuse the chunked result back
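

# --- Usage sketch (illustrative, not part of the diffusers API) ---------------
# A minimal, hedged example of calling `jax_memory_efficient_attention` directly.
# It assumes the (batch, length, heads, head_dim) layout documented above and that
# the chosen chunk sizes divide the sequence lengths evenly. The helper name and
# all shapes below are arbitrary choices for this sketch; the naive einsum is only
# a sanity reference, not part of the library.
def _example_memory_efficient_attention():
    rng = jax.random.PRNGKey(0)
    q_rng, k_rng, v_rng = jax.random.split(rng, 3)
    batch, q_len, kv_len, heads, head_dim = 2, 256, 512, 8, 64

    query = jax.random.normal(q_rng, (batch, q_len, heads, head_dim), dtype=jnp.float32)
    key = jax.random.normal(k_rng, (batch, kv_len, heads, head_dim), dtype=jnp.float32)
    value = jax.random.normal(v_rng, (batch, kv_len, heads, head_dim), dtype=jnp.float32)

    # Chunked attention: queries are processed 128 at a time, keys/values 256 at a time,
    # so the full (q_len x kv_len) score matrix is never materialized at once.
    chunked = jax_memory_efficient_attention(query, key, value, query_chunk_size=128, key_chunk_size=256)

    # Naive reference: softmax(QK^T / sqrt(d)) V with the full attention matrix in memory.
    scores = jnp.einsum("bqhd,bkhd->bhqk", query, key, precision=jax.lax.Precision.HIGHEST) / jnp.sqrt(head_dim)
    probs = jax.nn.softmax(scores, axis=-1)
    reference = jnp.einsum("bhqk,bkhd->bqhd", probs, value, precision=jax.lax.Precision.HIGHEST)

    assert chunked.shape == (batch, q_len, heads, head_dim)
    assert jnp.allclose(chunked, reference, atol=1e-4)
    return chunked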
yy!112	  FAs ??3R((r    c                       e Zd ZU dZeed<   dZeed<   dZeed<   dZe	ed<   d	Z
eed
<   d	Zeed<   ej                  Zej                  ed<   d Zd Zd ZddZy)FlaxAttentiona  
    A Flax multi-head attention module as described in: https://huggingface.co/papers/1706.03762

    Parameters:
        query_dim (:obj:`int`):
            Input hidden states dimension
        heads (:obj:`int`, *optional*, defaults to 8):
            Number of heads
        dim_head (:obj:`int`, *optional*, defaults to 64):
            Hidden states dimension inside each head
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
            enable memory efficient attention https://huggingface.co/papers/2112.05682
        split_head_dim (`bool`, *optional*, defaults to `False`):
            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`

    	query_dim   heads@   dim_head        dropoutFuse_memory_efficient_attentionsplit_head_dimdtypec                    | j                   | j                  z  }| j                   dz  | _        t        j                  |d| j
                  d      | _        t        j                  |d| j
                  d      | _        t        j                  |d| j
                  d      | _        t        j                  | j                  | j
                  d      | _
        t        j                  | j                  	      | _        y )
Ng      Fto_q)use_biasr_   nameto_kto_vto_out_0)r_   rc   rate)rZ   rX   scalennDenser_   r   r   r   rV   	proj_attnDropoutr\   dropout_layerself	inner_dims     r   setupzFlaxAttention.setup   s    MMDJJ.	]]D(
 XXi%tzzPVW
88ITZZfUXXi%tzzPVW
$..

TZZT\\:r    c                     |j                   \  }}}| j                  }|j                  |||||z        }t        j                  |d      }|j                  ||z  |||z        }|S N)r         r"   r*   rX   reshaper   	transposerp   tensor
batch_sizeseq_lendim	head_sizes         r   reshape_heads_to_batch_dimz(FlaxAttention.reshape_heads_to_batch_dim   se    #)<< 
GSJJ	
GYy@PQv|4
Y 6	AQRr    c                     |j                   \  }}}| j                  }|j                  ||z  |||      }t        j                  |d      }|j                  ||z  |||z        }|S rt   rw   rz   s         r   reshape_batch_dim_to_headsz(FlaxAttention.reshape_batch_dim_to_heads   sd    #)<< 
GSJJ	
i 7GSQv|4
i 7#	/Rr    Nc                    ||n|}| j                  |      }| j                  |      }| j                  |      }| j                  r|j                  d   }t        j                  ||d| j                  | j                  f      }t        j                  ||d| j                  | j                  f      }	t        j                  ||d| j                  | j                  f      }
n3| j                  |      }| j                  |      }	| j                  |      }
| j                  r|j                  ddd      }|	j                  ddd      }	|
j                  ddd      }
|j                  d   }|dz  dk(  rt        |dz        }n9|dz  dk(  rt        |dz        }n"|dz  dk(  rt        |dz        }nt        |      }t        ||	|
|d	
      }|j                  ddd      }| j                  |      }n| j                  rt        j                  d|	|      }nt        j                  d||	      }|| j                   z  }t#        j$                  || j                  rdnd      }| j                  rWt        j                  d||
      }|j                  d   }t        j                  ||d| j                  | j                  z  f      }n(t        j                  d||
      }| j                  |      }| j'                  |      }| j)                  ||      S )Nr   r   rv   ru   r   rY         i @  )rE   r   zb t n h, b f n h -> b n f tzb i d, b j d->b i jr   zb n f t, b t n h -> b f n hzb i j, b j d -> b i ddeterministic)r   r   r   r^   r*   r   rx   rX   rZ   r   r]   ry   intrS   r   r   ri   rj   softmaxrl   rn   )rp   hidden_statescontextr   
query_projkey_proj
value_projbquery_states
key_statesvalue_statesflatten_latent_dimrE   attention_scoresattention_probss                  r   __call__zFlaxAttention.__call__   s   #*?-ZZ.
88G$ZZ(
##A&A;;zAr4::t}}3UVLX2tzz4==/QRJ;;zAr4::t}}3UVL:::FL88BJ:::FL..'11!Q:L#--aA6J'11!Q:L
 ".!3!3B!7!B&!+#&'9B'>#? #b(A-#&'9B'>#? #a'1,#&'9A'=#> #&'9#: :j,IYjrM *33Aq!<M ;;MJM ""#&::.KZYe#f #&::.C\S]#^ /$**< jj)9dFYFY_`aO "" #

+H/[g h!''* #MAr4::PTP]P]C];^ _ #

+BOUa b $ ? ? N}5!!-}!MMr    )NT)__name__
__module____qualname____doc__r   __annotations__rX   rZ   r\   floatr]   boolr^   r   float32r_   rr   r   r   r    r    r   rU   rU   z   sg    , NE3NHcGU+0"D0 ND {{E399"
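

# --- Usage sketch (illustrative, not part of the diffusers API) ---------------
# A minimal, hedged example of using `FlaxAttention` as a standalone Flax module.
# The helper name and all shapes/hyperparameters below are arbitrary choices for
# this sketch; cross-attention is shown by passing a separate `context` sequence.
def _example_flax_attention():
    attn = FlaxAttention(query_dim=320, heads=8, dim_head=40, dropout=0.0)

    rng = jax.random.PRNGKey(0)
    hidden_states = jnp.ones((2, 64, 320), dtype=jnp.float32)  # (batch, tokens, query_dim)
    context = jnp.ones((2, 77, 768), dtype=jnp.float32)        # e.g. text-encoder hidden states

    # Parameters are shaped by the inputs seen at init time: here the key/value
    # projections are built for the 768-dim context, i.e. cross-attention.
    params = attn.init(rng, hidden_states, context, deterministic=True)
    out = attn.apply(params, hidden_states, context, deterministic=True)

    assert out.shape == hidden_states.shape  # output is projected back to query_dim
    return out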
;<Nr    rU   c                       e Zd ZU dZeed<   eed<   eed<   dZeed<   dZe	ed<   e
j                  Ze
j                  ed	<   dZe	ed
<   dZe	ed<   d ZddZy)FlaxBasicTransformerBlockau  
    A Flax transformer block layer with `GLU` (Gated Linear Unit) activation function as described in:
    https://huggingface.co/papers/1706.03762


    Parameters:
        dim (:obj:`int`):
            Inner hidden states dimension
        n_heads (:obj:`int`):
            Number of heads
        d_head (:obj:`int`):
            Hidden states dimension inside each head
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        only_cross_attention (`bool`, defaults to `False`):
            Whether to only apply cross attention.
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
            enable memory efficient attention https://huggingface.co/papers/2112.05682
        split_head_dim (`bool`, *optional*, defaults to `False`):
            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
    r~   n_headsd_headr[   r\   Fonly_cross_attentionr_   r]   r^   c           	         t        | j                  | j                  | j                  | j                  | j
                  | j                  | j                        | _        t        | j                  | j                  | j                  | j                  | j
                  | j                  | j                        | _	        t        | j                  | j                  | j                        | _        t        j                  d| j                        | _        t        j                  d| j                        | _        t        j                  d| j                        | _        t        j"                  | j                        | _        y )Nr_   )r~   r\   r_   h㈵>)epsilonr_   rg   )rU   r~   r   r   r\   r]   r^   r_   attn1attn2FlaxFeedForwardffrj   	LayerNormnorm1norm2norm3rm   rn   rp   s    r   rr   zFlaxBasicTransformerBlock.setup  s    "HHLLKKLL//**

 #HHLLKKLL//**

 "dhhDJJW\\$djjA
\\$djjA
\\$djjA
ZZT\\:r    c                    |}| j                   r$| j                  | j                  |      ||      }n"| j                  | j                  |      |      }||z   }|}| j                  | j	                  |      ||      }||z   }|}| j                  | j                  |      |      }||z   }| j                  ||      S Nr   )r   r   r   r   r   r   r   rn   )rp   r   r   r   residuals        r   r   z"FlaxBasicTransformerBlock.__call__2  s     $$ JJtzz-'@'YfJgM JJtzz-'@P]J^M%0 !

4::m#<gUb
c%0 !

= 9W%0!!-}!MMr    NT)r   r   r   r   r   r   r\   r   r   r   r   r   r_   r]   r^   rr   r   r   r    r   r   r      s_    2 
HLKGU!&$&{{E399"+0"D0 ND ;6Nr    r   c                       e Zd ZU dZeed<   eed<   eed<   dZeed<   dZeed<   d	Z	e
ed
<   d	Ze
ed<   ej                  Zej                  ed<   d	Ze
ed<   d	Ze
ed<   d ZddZy)FlaxTransformer2DModela  
    A Spatial Transformer layer with Gated Linear Unit (GLU) activation function as described in:
    https://huggingface.co/papers/1506.02025


    Parameters:
        in_channels (:obj:`int`):
            Input number of channels
        n_heads (:obj:`int`):
            Number of heads
        d_head (:obj:`int`):
            Hidden states dimension inside each head
        depth (:obj:`int`, *optional*, defaults to 1):
            Number of transformers block
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        use_linear_projection (`bool`, defaults to `False`): tbd
        only_cross_attention (`bool`, defaults to `False`): tbd
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
            enable memory efficient attention https://huggingface.co/papers/2112.05682
        split_head_dim (`bool`, *optional*, defaults to `False`):
            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
    in_channelsr   r   rv   depthr[   r\   Fuse_linear_projectionr   r_   r]   r^   c                 0   t        j                  dd      | _        | j                  | j                  z  }| j
                  r't        j                  || j                        | _        n)t        j                  |ddd| j                        | _        t        | j                        D cg c][  }t        || j                  | j                  | j                  | j                  | j                  | j                  | j                         ] c}| _        | j
                  r't        j                  || j                        | _        n)t        j                  |ddd| j                        | _        t        j&                  | j                  	      | _        y c c}w )
N    r   )
num_groupsr   r   )rv   rv   VALID)kernel_sizestridespaddingr_   )r\   r   r_   r]   r^   rg   )rj   	GroupNormnormr   r   r   rk   r_   proj_inConvranger   r   r\   r   r]   r^   transformer_blocksproj_outrm   rn   )rp   rq   rH   s      r   rr   zFlaxTransformer2DModel.setupo  s/   LLB=	LL4;;.	%%88ITZZ@DL77"jjDL& 4::&#
  &%)%>%>jj/3/R/R#22	#
 %%HHYdjjADMGG"jjDM  ZZT\\:3#
s   )A Fc                    |j                   \  }}}}|}| j                  |      }| j                  r(|j                  |||z  |      }| j	                  |      }n'| j	                  |      }|j                  |||z  |      }| j
                  D ]  }	 |	|||      } | j                  r&| j                  |      }|j                  ||||      }n%|j                  ||||      }| j                  |      }||z   }| j                  ||      S r   )r*   r   r   rx   r   r   r   rn   )
rp   r   r   r   batchheightwidthchannelsr   transformer_blocks
             r   r   zFlaxTransformer2DModel.__call__  s   )6)<)<&vuh 		-0%%)11%%RM LL7M LL7M)11%%RM!%!8!8 	c-mWTabM	c %% MM-8M)11%QM)11%QM MM-8M%0!!-}!MMr    Nr   )r   r   r   r   r   r   r   r\   r   r   r   r   r   r   r_   r]   r^   rr   r   r   r    r   r   r   H  su    6 LKE3NGU"'4'!&$&{{E399"+0"D0 ND (;TNr    r   c                   n    e Zd ZU dZeed<   dZeed<   ej                  Z
ej                  ed<   d Zd	dZy)
r   a  
    Flax module that encapsulates two Linear layers separated by a non-linearity. It is the counterpart of PyTorch's
    [`FeedForward`] class, with the following simplifications:
    - The activation function is currently hardcoded to a gated linear unit from:
    https://huggingface.co/papers/2002.05202
    - `dim_out` is equal to `dim`.
    - The number of hidden dimensions is hardcoded to `dim * 4` in [`FlaxGELU`].

    Parameters:
        dim (:obj:`int`):
            Inner hidden states dimension
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
    r~   r[   r\   r_   c                     t        | j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )Nr   )	FlaxGEGLUr~   r\   r_   net_0rj   rk   net_2r   s    r   rr   zFlaxFeedForward.setup  s:     txxtzzB
XXdhhdjj9
r    c                 N    | j                  ||      }| j                  |      }|S r   )r   r   )rp   r   r   s      r   r   zFlaxFeedForward.__call__  s(    

=
N

=1r    Nr   r   r   r   r   r   r   r\   r   r   r   r_   rr   r   r   r    r   r   r     s4    " 
HGU{{E399":r    r   c                   n    e Zd ZU dZeed<   dZeed<   ej                  Z
ej                  ed<   d Zd	dZy)
r   a  
    Flax implementation of a Linear layer followed by the variant of the gated linear unit activation function from
    https://huggingface.co/papers/2002.05202.

    Parameters:
        dim (:obj:`int`):
            Input hidden states dimension
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
    r~   r[   r\   r_   c                     | j                   dz  }t        j                  |dz  | j                        | _        t        j
                  | j                        | _        y )Nr   ru   r   rg   )r~   rj   rk   r_   projrm   r\   rn   ro   s     r   rr   zFlaxGEGLU.setup  s>    HHqL	HHY]$**=	ZZT\\:r    c                     | j                  |      }t        j                  |dd      \  }}| j                  |t	        j
                  |      z  |      S )Nru   r   r   )r   r   splitrn   rj   gelu)rp   r   r   hidden_linearhidden_gelus        r   r   zFlaxGEGLU.__call__  sL    		-0%(YY}aa%H"{!!-"''+2F"FVc!ddr    Nr   r   r   r    r   r   r     s5     
HGU{{E399";
er    r   )r   )r6   rO   
flax.linenlinenrj   r   	jax.numpynumpyr   r   rD   r   	PrecisionHIGHESTrS   ModulerU   r   r   r   r   r   r    r   <module>r      s       
 0$ 0$h "%!2!2!:!:TXpt-)NQ-)jm-)`wNBII wNtQN		 QNhgNRYY gNTbii De		 er    