
    ukiGY                       d Z ddlmZ ddlZddlZddlmZ ddlZddlmZ ddl	m
Z ddlmZ ddlmZ ddlZddlZd e ej*                   ej,                  d	            j.                        z  Z ej2                  d
d
       G d d             Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d"dZ	 	 	 	 d#dZ ej:                  ej<                  g d       ej:                  ej>                  g d      ddejA                         dddddddf
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d$d              Z!	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d%dZ"d&dZ# ejH                  d      	 	 	 	 d'd       Z%	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d(dZ&	 	 	 	 	 	 	 	 	 	 	 	 	 	 d)d Z'e!jQ                  e"e'        ej:                  ej>                  ddg      	 	 d*	 	 	 d+d!       Z)y),z<Module containing fused attention forward and backward pass.    )annotationsN)Any)lax)pallas)tritongfffffffloat32T)frozenslotsc                      e Zd ZU dZded<   ded<   dZded<   dZded<   dZded	<   dZded
<   e	d        Z
edd       Zy)
BlockSizesa  
  Tile sizes parameterizing the attention kernel. These block sizes
  should be tuned for the model and hardware for optimal performance.

  Attributes:
    block_q: Block size along Q sequence length for forward kernel.
    block_k: Block size along KV sequence length for forward kernel.
    block_kv: Block size along KV sequence length for forward kernel.
    block_q_dkv: Block size along Q sequence length for dKV backward kernel.
    block_kv_dkv: Block size along KV sequence length for dKV backward kernel.
    block_q_dq: Block size along Q sequence length for dQ backward kernel.
    block_kv_dq: Block size along KV sequence length for dQ backward kernel.
  intblock_qblock_kN
int | Noneblock_q_dkvblock_kv_dkv
block_q_dqblock_kv_dqc                $    t        dddddd      S )N       )r   r   r   r   r   r   )r   )clss    d/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/jax/experimental/pallas/ops/gpu/attention.pyget_defaultzBlockSizes.get_default7   s!         c                    | j                   | j                  | j                  | j                  g}t	        d |D              S )zfReturns True if all backward blocks are specified for the fused

    dq and dk/dv backwards pass.
    c              3  $   K   | ]  }|d u 
 y wN ).0bs     r   	<genexpr>z1BlockSizes.has_backward_blocks.<locals>.<genexpr>O   s     6q}6s   )r   r   r   r   all)selfbackward_blockss     r   has_backward_blockszBlockSizes.has_backward_blocksB   s@     		O 6o666r   )returnbool)__name__
__module____qualname____doc____annotations__r   r   r   r   classmethodr   propertyr&   r   r   r   r   r       sa     ,, +z !,
!*j +z   7 7r   r   sm_scalecausalc                  j                   d   }t        j                  d      | j                   d   }t        j                  t        j
                        t        d      z
  }t        j                  t        j
                        }t        j                  |ft        j
                        }t        j                  z        }t        j                  |      |	k  d d d f   t        j                  | d      d n|   fd}r#t        j                  dz   z  z   dz
        }nt        j                  |      }t        j                  d|||||f      \  }}}||d d d f   z  }|
r |
d   }|t        j                  |      z   |d	<   t        j                   |j"                  d d d |j                   d   f   |j%                  |j&                        
       y )Nr   dtypeinf        maskotherc                D   |\  }}}t        j                  | z        }t        j                  j                  |d d f   d      }t        j
                  |j                        }t        j                  t        j                        }!dk7  r|!z  }||z  }s d }	  |   }
t        |
      }	rc"z  t        j                        z   }| z  t        j                        z   }|d d d f   |d d d f   k\  }|	|nt        j                  |	|      }	t        j                  |	|t              }t        j                   |d      }t        j"                  ||      }t        j$                  ||z
        }||z  }t        j$                  ||d d d f   z
        }|j'                  d      }||z   }|d d d f   |z  }t        j                  #j                  |d d f         }t        j
                  |j)                  |j*                        |      }||z   }|||fS )Nr7   r8         ?r3   axisr9   )pldsliceplgpuloadatdotTmathlog2esegment_maskjnparangelogical_andwhereDEFAULT_MASK_VALUEmaxmaximumexp2sumastyper5   )$start_kcarryo_prevm_prevl_prevcurr_k_slicekqkqk_scaler9   kv_segment_idsspan_qspan_kcausal_maskm_currm_next
correctionl_prev_corrs_currl_currl_nexto_prev_corrvo_curro_nextr   r   r1   	head_maskk_refqq_segment_idssegment_ids_refr0   start_qv_refs$                            r   bodyz mha_forward_kernel.<locals>.bodyw   s   "FFF99Ww.8L

588L!O,9CHA	133B yy H2~(h(NB ,d		$(6M>:	7"SZZ%887"SZZ%88QWoa8<KS__T;-O 	 99T212bWWRb!F[[(F&6/*Jv%KXX
VAtG_F ZZRZ F6!FQW%.K

588L!O,9=AVVFMM!''*A.F6!F66!!r      .r?   )shaper@   
program_idrK   zerosr   floatrA   rL   rB   rC   r   divcdiv	fori_looprH   storerD   rT   r5   )q_refrn   rs   rq   o_refr0   r1   r   r   head_dimresidual_refsseq_lenhead_dim_paddedm_il_iocurr_q_slicert   upper_boundlse_refrm   ro   rp   rr   s    ``` ````           @@@@r   mha_forward_kernelr   R   s    KKN'MM!'KKO/ 			'-e<#		'-#	ii/*#++>!
 7W,g6,zz/*X5tQw?)jjYc2!%d?<+H 0" 0"b '''Wq[1G;a?IK'''7+KadQSMB+!S#
 s1d7|!AG#&GCL++ehhq-AGGBK-'(!((5;;*?iPr   c                   t        j                  | d      } |j                  dk(  rt        j                  |d      }nt        j                  |d      }t        j                  | |      j	                  t         j
                        S )Nr3   r=   ru   r   )rK   expand_dimsndimequalrT   bool_)rp   r^   s     r   rJ   rJ      sb    
 //-b9-A__^!<N__^!<N	=.	1	8	8	CCr   )
               	   
            )nondiff_argnums)
r0   r1   block_sizesbackward_pass_impl	num_warps
num_stagesgrid	interpretdebugreturn_residuals)static_argnamesr<   Fr      c                "   ~| j                   \  }}}}|j                   d   }t        |j                  |      }t        |j                  |      }t	        j
                  |      }| j                   d   |j                   d   k7  s| j                   d   |j                   d   k7  r3t        d| j                   d|j                   d|j                   d      ||z  dk7  rt        d|d	|      ||z  dk7  rt        d
|d|      |
}|t	        j                  ||      ||f}|}|	|dk  rdnd}t        j                  t        |||||      }t	        j                  d |d |fd       t	        j                  d |d |fd       t	        j                  d |d |fd       g}|j                  |d nt	        j                  d |fd              | g}t	        j                  d |d |fd       g}|r`|j                  t        j                  |||ft        j                                |j                  t	        j                  d d |fd               t	        j"                  ||||t%        j&                  ||	      |||d	      | |||      }|r|S |d   S )Nru   r3   zSThis kernel expects q, k, and v to have the same head dimension, but found q.shape=z
, k.shape=z
, v.shape=.r   z
q_seq_len=z must be a multiple of block_q=zkv_seq_len=z must be a multiple of block_k=@   r   r   )r0   r   r   r   r1   c                    || |dfS Nr   r   ijr[   s      r   <lambda>zmha.<locals>.<lambda>      Aq!Q< r   c                    |d|dfS r   r   _r   r[   s      r   r   zmha.<locals>.<lambda>
  r   r   c                    |d|dfS r   r   r   s      r   r   zmha.<locals>.<lambda>  r   r   c                
    |dfS r   r   r   s      r   r   zmha.<locals>.<lambda>  s
    QF r   c                    || |dfS r   r   r   s      r   r   zmha.<locals>.<lambda>  s    Q1aL r   )rv   r5   c                    ||| fS r   r   r   s      r   r   zmha.<locals>.<lambda>  s    Q1I r   r   r   mha_forwardr   in_specs	out_specscompiler_params	out_shaper   r   name)rv   minr   r   r@   next_power_of_2
ValueErrorr{   	functoolspartialr   	BlockSpecappendjaxShapeDtypeStructrK   r   pallas_callrB   CompilerParams)ro   r[   rj   segment_idsr0   r1   r   r   r   r   r   r   r   r   
batch_size	q_seq_len	num_headsr   
kv_seq_lenr   r   r   grid_
num_warps_kernelr   r   r   outs                                r   mhar      s   D /0ww,*iHwwqz*##Y/'##Z0'&&x0/ggbkQWWR[ aggbkQWWR[&@
'';aggZ{!''1	6  A

	|#C7*E
FF'Q

}$DG:F
GG %
]WWY(*i@E*"n!J/(%,g&.v?&
 	llD'49/1llD*dO</1llD*dO</1( //		 <<z*,BC
 c)||T7D/B8: ;)S))9i0E F
dD'*,EFH	**:7	 q![	# !,c!f,r   c                b    t        | |||||||||	|
||d      \  }}| |||||f}|r||fn|}||fS )NT)r   r0   r1   r   r   r   r   r   r   r   r   )r   )ro   r[   rj   r   r0   r1   r   r   r   r   r   r   r   r   r   lse	residualsrets                     r   _mha_forwardr   *  s^      AqkHK$6$iu"&((#s !QS#.)&c
C#	ir   c                ,   t        j                  | j                  d         |k  d d d f   }t        j                  | |d      }t        j                  ||d      }t        j
                  ||z  d      }|j                  |j                        |d<   y )Nr3   r7   r8   ru   r=   .)rK   rL   rv   rB   rC   rS   rT   r5   )out_refdout_ref	delta_refr   rm   r   dodeltas           r   _preprocess_backward_kernelr   E  su    zz'--+,x7qA)jjy4!zz(#6"
''!b&q
!%<<	0)C.r   preprocess_backwardc                   | j                   \  }}}}	t        j                  |	      }
t        j                  |j                   |j
                        } t        j                  t        j                  t        |	      t        j                  ||      ||ft        j                  d |d |
fd       t        j                  d |d |
fd       gt        j                  d d |fd       t        j                  dd      |||d		      | |      }|S )
N)r   c                    || |dfS r   r   r   s      r   r   z&_preprocess_backward.<locals>.<lambda>Z      1a| r   c                    || |dfS r   r   r   s      r   r   z&_preprocess_backward.<locals>.<lambda>\  r   r   c                    ||| fS r   r   r   s      r   r   z&_preprocess_backward.<locals>.<lambda>^  s    Q1I r   r      r   mha_preprocess_backwardr   )rv   r@   r   r   r   r5   r   r   r   r   r{   r   rB   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   s                r   _preprocess_backwardr   O  s     .1YY**gy(&&x0/""399cii8)"..3hGGGGW%z9=
,,gt_=35
,,gt_=35 dD'24MN**Q1E$ %  
,r   c                   !"#$%&'()* ~ j                   d   }j                   d   }t        j                  d      }t        j                  |z        } j                   d   }t	        j
                  |gt        j                        }t	        j
                  |gt        j                        }t	        j                  |      |k  d d d f   "t        j                  j                  |d d f   "d      *t        j                  j                  |d d f   "d      #|z  t	        j                        z   (d n|   $"#$ (*fd}rt        j                  |z        nd}t        j                  |t        j                  |      |||f      \  }}t        j                  |
j                  d d d |j                   d   f   |j!                  |
j"                        "       t        j                  |	j                  d d d |j                   d   f   |j!                  |	j"                        "       t        j                  d      }t        j$                  |z        }|z  t	        j                        z   )t	        j
                  |gt        j                        }t        j                   j                  |d d f   "d      &d n|   '|   %t        j                  j                  |d d f   "d      !|     !"%&')fd	}rt        j                  |d
z   z        }nt        j                  |      }t        j                  d|||      }t        j                  |j                  d d d |j                   d   f   |j!                  |j"                        "       y )Nr   r   r3   r4   r7   r8   c                   |\  }}t        j                  | z        }t        j                  j                  |d d f   d      }t        j
                  |j                        }t        j                  t        j                        }dk7  r|z  }||z  }szd }|   }	t        |	      }rH| z  t        j                        z   }
|
d d d f   d d d f   k\  }||nt        j                  ||      }t        j                  ||t              }|   }|   }t        j                  j                  |d d f   d      }t        j                   ||d d d f   z
        }|t        j
                  |j#                  |j$                        j                  |      z   }t        j&                  ft        j(                        |d d d f   z
  }|t        j
                  |j                        z   }||z  }dk7  r|z  }|t        j
                  |j#                  j$                        j                  |      z   }||fS Nr7   r8   r<   r4   )r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rR   rT   r5   rx   r   ) rr   rV   dvdkr   ro   r\   r]   r9   rp   r_   ra   r   dir   pdpdsr   r   r1   r   do_scaled_refrm   r[   r^   r   r~   rq   r0   r`   rj   s                      r   inner_loop_dkdvz,mha_backward_kernel.<locals>.inner_loop_dkdv  s   FB99W{2K@L

588L!O,9CHA	133Byy H2~(h(NB,d		$'5M>:	;&K)@@QWoa8<KS__T;-O 	 99T212b
,
C	<	 B	q)	
B 	c!T'l"#A	bffQXXbhh'))2.	.B	K.ckk	BR4[	PB	bffRo	B	
RB3=b	bffRYYu{{+--q1	1Br6Mr   r?   c                   t        j                  | z        }t        j                  j                  |d d f   d      }t        j                  j                  |d d f   d      }t        j
                  |j                        }t        j                  t        j                        }dk7  r|z  }||z  }szd }|   }t        |      }rH| z  t        j                        z   }	d d d f   |	d d d f   k\  }
||
nt        j                  ||
      }t        j                  ||t              }t        j                   |d d d f   z
        }t        j"                  ft        j$                        d d d f   z
  }|t        j
                  |j                        z   }||z  }dk7  r|z  }|t        j
                  |j'                  |j(                        |      j'                  |j(                        z   }|S r   )r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rR   rx   r   rT   r5   )rU   dqrZ   r[   rj   r\   r]   r9   r^   r`   ra   r   r   r   r   r   r1   r   r   rm   rn   r   ro   rp   rq   r0   r_   rs   s                 r   inner_loop_dqz*mha_backward_kernel.<locals>.inner_loop_dq  s   99W{2K@L

588L!O,9CHA

588L!O,9CHA	133Byy H2~(h(NB,d		$(6M>:	;&K)@@QWoa8<KS__T;-O 	 99T212bc!T'l"#A	J,CKK	@2ag;	NB	bffRo	B	
RB3=b	bffRYYqww'+22288<	<BIr   ru   )rv   r@   rw   rA   rK   rx   r   rL   rB   rC   rD   r   rz   r|   r{   r}   rT   r5   r   )+r~   rn   rs   rq   r   r   r   r   dq_refdk_refdv_refr0   r1   r   r   r   r   r   r   r   rU   rZ   r   r   r   r   lower_boundrr   r   r   r   r   r   r   rm   r[   r^   r   ro   rp   r`   r_   rj   s+   ```` ```   ``````               @@@@@@@@@@@r   mha_backward_kernelr   k  s)   . kk!n){{1~* MM!'7\1<@,KKO/
yy,0D"
yy,0D"zz/*X5tQw?)jj,/*#F!jj,/*#F!\!CJJ|$<<&%d?<+H ( ( (T AG,.<A+==2779k2Ob"X&"b ++ii>RXXb\>!"BIIfll$;) ++ii>RXXb\>!"BIIfll$;) MM!'w+Z8,Z#**Z"88&
yy*o.ckkB"jj,/*#F!%d?<+H  	#zz-""<?3)3O""" " "H ''7Q;*4kBK''*k2K
}}Q]R9"++ii>RXXb\>!"BIIfll$;)r   c                   |	rt        d      |
\  }}}}}}~~~	|dk(  r< t        j                  t        j                  t
        | |      ||||      d   |      S |dk(  r5|j                  st        d      |j                  \  }}}}|j                  d   }t        |j                  |      }t        |j                  |      }t        |j                  |      }t        |j                  |      }t        |j                  |      }t        j                  |      }||z  ||z  k7  rt        d      t!        ||||||      }t        j"                  |j                  |j$                        t        j"                  |j                  |j$                        t        j"                  |j                  |j$                        g}t        j&                  d |d |fd       t        j&                  d |d |fd	       t        j&                  d |d |fd
       t        j&                  d |d |fd       t        j&                  d |d |fd       t        j&                  d d |fd       t        j&                  d d |fd       g}||j)                  dd        n)|j)                  dt        j&                  d |fd              ||t        j*                  ||      f}|} | ||z  dk  s||z  dk  rd} nd}  t        j,                  t        j                  t.        | ||||||      |||t        j&                  d |d |fd       t        j&                  d |d |fd       t        j&                  d |d |fd       gd||t1        j2                  | d      	      ||||||||      \  }!}"}#nt        d|       |!j5                  |j$                        |"|#d fS )NzDKernel differentiation is not supported if return_residuals is True.xla)r0   r1   ru   r   z%Backward block sizes must all be set.zdq_seq_len and kv_seq_len must be divided into the same number of blocks for the fused backward pass.c                    | d|dfS r   r   r   r   r   s      r   r   z_mha_backward.<locals>.<lambda><      aAq\ r   c                    | d|dfS r   r   r   s      r   r   z_mha_backward.<locals>.<lambda>>  r   r   c                    | d|dfS r   r   r   s      r   r   z_mha_backward.<locals>.<lambda>@  r   r   c                    | d|dfS r   r   r   s      r   r   z_mha_backward.<locals>.<lambda>B  r   r   c                    | d|dfS r   r   r   s      r   r   z_mha_backward.<locals>.<lambda>D  r   r   c                    | |dfS r   r   r   s      r   r   z_mha_backward.<locals>.<lambda>E      q!Qi r   c                    | |dfS r   r   r   s      r   r   z_mha_backward.<locals>.<lambda>F  r  r   r   c                
    | dfS r   r   r   s      r   r   z_mha_backward.<locals>.<lambda>L  s
    q!f r   i @  r   r   )r0   r1   r   r   r   r   r   c                    | ||dfS r   r   r   s      r   r   z_mha_backward.<locals>.<lambda>k      Aq! r   c                    | ||dfS r   r   r   s      r   r   z_mha_backward.<locals>.<lambda>o  r	  r   c                    | ||dfS r   r   r   s      r   r   z_mha_backward.<locals>.<lambda>s  r	  r   mha_backwardr   r   )r   r   r   r   r   r   r   r   z&Invalid backward pass implementation: )r   r   vjpr   r   mha_referencer&   rv   r   r   r   r   r   r   r@   r   r   r   r5   r   insertr{   r   r   rB   r   rT   )$r0   r1   r   r   r   r   r   r   r   r   resr   ro   r[   rj   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   
out_shapesr   r   r   r   r   s$                                       r   _mha_backwardr    s    
NP P#& !Q;S$(5 	377-(6J			 	 
  X%**>??12.J	9hJ+%%y1Gk--y9K{//<L[++Y7Jk--z:K((2OJ*"<<: 
 !b#wyIEQWWagg.QWWagg.QWWagg.J 	dIt_=1	3
dJo>1	3
dJo>1	3
dIt_=1	3
dIt_=1	3
dD),.GH
dD),.GHH ooaooatZ&8&<> ? 	277:|#DEDJ

$y
0+%	1

"#%!#		
 LLz49, LL|T?;, LL|T?;,
 ,, Q
?"D A{CS%E"1JBBH =>P=QR
SS	177	RT	))r   c                   | j                   d   }|j                   d   }t        j                  d| |t        j                        }d }	|@t        j                  t        ||      d      }	t        j                  |	|j                         }	|rlt        j                  t        j                  dd||ft                    }
t        j                  |
|j                         }
|	|
nt        j                  |	|
      }	|	|nt        j                  |	|t        d            }t        j                  j                  ||z        }t        j                  d||t        j                        S )Nru   zbqhc,bkhc->bhqk)preferred_element_typer4   z-infzbhqk,bkhc->bqhc)rv   rK   einsumr   r   rJ   broadcast_totrilonesr(   rM   rN   ry   r   nnsoftmax)ro   r[   rj   r   r0   r1   r   r   logitsr9   ra   weightss               r   r  r    s    ggaj)wwqz*::Ackk& 
$??<[A1EDD&,,/D((388Q9j$ANOK"";=K,;COOD+,ND\6syyvuV}'M&FFNN6H,-'	!CKK
 r   )rq   jax.Array | Noner   r   r   r   r0   ry   r1   r(   r   r   r   r   r   r   )rp   	jax.Arrayr^   r  )r   jnp.ndarray | Noner0   ry   r1   r(   r   r   r   strr   r   r   r   r   ztuple[int, ...] | Noner   r(   r   r(   r   r(   )r   r  r0   ry   r1   r(   r   r   r   r   r   r   r   r   r   r   r   r(   r   r(   r   r(   )r   r   )r   r   r   r(   r   r(   )rq   r  r0   ry   r1   r(   r   r   r   r   r   r   r   r   r   r   )r0   ry   r1   r(   r   r   r   r   r   r   r   r   r   r   r   r(   r   r(   r   r(   )r<   F)r   r  r1   r(   )*r,   
__future__r   r   rG   typingr   r   r   jax.experimentalr   r@   jax.experimental.pallasr   rB   	jax.numpynumpyrK   npdataclassesry   finfor5   rP   rO   	dataclassr   r   rJ   r   
custom_vjpjitr   r   r   r   named_scoper   r   r  defvjpr  r   r   r   <module>r/     s   C "    
  ) 3   E("((8288I+>"?"C"CDD d$/.7 .7 0.7bfQ &	fQ
 fQ fQ fQ fQ fQ fQ fQPDDD NN$F GG( (446& #'"M- $	M-
 M- M- M- M- M- M- !M- M- M- M-$M-` "	
          61 &' $15 (6`
 &` `  !`" #`$ %`& '`( )`* +`Fq*&)q*6@q*!q*),q*9=q* q* 26q*d 

< ' 377Z,BC  $	  Dr   