
    uki                      d dl mZ d dlZd dlZd dlZd dlZd dlmZmZm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlZd	d
lmZ d	dlmZ d	dlmZ d	dlmZ dZdZdZej<                  Zej@                  Z!ejD                  Z#ejH                  Z%ejL                  Z&	 	 	 d9	 	 	 	 	 	 	 	 	 	 	 d:dZ'	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d;dZ(d<dZ)d<dZ*dddddddd	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d=dZ+	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d>dZ,	 	 d?	 	 	 	 	 	 	 d@dZ-dAdZ.dBdCdZ/dDdZ0dBdEdZ1dFd Z2dGd!Z3dHd"Z4dId#Z5 ejl                  d$       G d% d&ejn                               Z8	 	 	 	 	 	 	 	 dJd'Z9dKd(Z:dLdMd)Z;dLdMd*Z<dLdNd+Z=dOd,Z>dPd-Z?dPd.Z@ ejl                  d$       G d/ d             ZA	 	 	 	 	 	 	 	 	 	 	 	 dQd0ZBdRd1ZCdRd2ZDdSd3ZEdSd4ZFdRd5ZGdRd6ZH	 dT	 	 	 	 	 	 	 dUd7ZI	 dT	 	 	 	 	 	 	 dUd8ZJy)V    )annotationsN)AnyCallableIteratorcast)ir)arith)llvm)memref)nvvm   )fragmented_array)	mma_utils)utils)LaunchContext      l          Fc                   t         j                  j                         }t         j                  j                         }t         j                  j                  d      }	d}
|d|cxk  rdk  sJ  J |
|z  }
|
dz  }
||k(  rd}n||k(  rd}n||	k(  rd}nt        d|       |
|dz  z  }
||k(  r|||hv sJ d}n|t         j                  j                         k(  r
||k(  sJ d}n|t         j                  j                         k(  r|||hv sJ d}nf|t         j                  j                         k(  r|||hv sJ d}n:|t         j                  j                  d      k(  r
||	k(  sJ d}nt        d	|       |
|d
z  z  }
|
|dz  z  }
|
|dz  z  }
|
|dz  z  }
|dz  s|dkD  rt        d|       |
|dz	  dz  z  }
| dz  s| dkD  rt        d|        |
| dz	  dz  z  }
t        j                  t         j                  j                  d      |
      S )N    r         r      zUnsupported accumulator dtype:    Unsupported input dtype:    
            +N must be a multiple of 8 and <= 256, got:    ,M must be a multiple of 16 and <= 256, got:    )r   F16TypegetF32TypeIntegerTypeget_signlessNotImplementedErrorBF16TypeFloat8E4M3FNTypeFloat8E5M2Type
ValueErrorr	   constant)mn	acc_dtypeinput_dtypetranspose_atranspose_bsparsity_selectorf16f32i32desc
d_type_valab_type_vals                ^/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/jax/experimental/mosaic/gpu/tcgen05.pycreate_instr_descriptorr=   /   sY    	

#


#
##B'#	
$"!%A%%%%%DFND#JCJCJ
 ?	{K
LL:?$Cc
"""Kbkkoo''Kb))--//c
"""Kb''++--c
"""Kbnn11!44K
 9+G
HH;!$;"$+
$+
$Ua#g
B1#F
GG16b.$Vq3w
CA3G
HH16b.$ 
33B7	>>    c
                L   d}
d|cxk  rdk  sJ  J |
|dz  z  }
|
 | |      dz  z  }
|
 | |      dz  z  }
|
|dz  z  }
|
|dz  z  }
|dz  s|dkD  rt        d	|       |
|d
z	  dz  z  }
|	t        j                  j                         k(  rd}n2|	t        j                  j                         k(  rd}nt        d|	       |
|dz  z  }
|dz  s|dkD  rt        d|       |
|dz	  dz  z  }
|
|dz  z  }
t        j                  t        j                  j                  d      |
      S )Nr   r   r   r   r   r   r   r   r    r   r!   r   zUnsupported scale type:    r   r"         r   )
r-   r   Float8E8M0FNUTyper%   r+   r)   r	   r.   r'   r(   )get_input_encodingr/   r0   a_typeb_typea_scale_idxb_scale_idxr3   r4   
scale_typer9   scale_encodings               r<   _create_scaled_instr_descriptorrK   m   sh    
$ 
k	A			+
$
V
$
))$
V
$
**$+
$+
$Ua#g
B1#F
GG16b.$2''++--NR((,,..N
 8E
FF.B
$WC
CA3G
HH16b.$+
$	33B7	>>r>   c                 &    d }t        |g| i |S )Nc                    | t         j                  j                         k(  ry| t         j                  j                         k(  ryt	        d|        )Nr   r   r   )r   r+   r%   r,   r)   tys    r<   rD   zAcreate_scaled_f8f6f4_instr_descriptor.<locals>.get_input_encoding   sI    	R  $$&&	r  $$&	&";B4 @AAr>   rK   argskwargsrD   s      r<   %create_scaled_f8f6f4_instr_descriptorrT      s     B 
));	Md	Mf	MMr>   c                 &    d }t        |g| i |S )Nc                b    | t         j                  j                         k(  ryt        d|        )Nr   r   )r   Float4E2M1FNTyper%   r)   rN   s    r<   rD   z=create_scaled_f4_instr_descriptor.<locals>.get_input_encoding   s/    	R  $$&&";B4 @AAr>   rP   rQ   s      r<   !create_scaled_f4_instr_descriptorrX      s     B
 
));	Md	Mf	MMr>   T)	a_swizzle	b_swizzlea_scaleb_scalea_sparse_metadata
accumulate
collectiveTMEMRefc               l  H |dk(  s|dk(  rt        d      t        j                  j                  d      }
t        j                  j                  d      }t	        |t
              r3t        j                  t        j                  j                  d      |      }|	rdnd}|d ux}|d uk7  rt        d      |d u}|r|rt        d      t        j                  j	                  |j                        st        d	|j                         t        j                  |      \  \  }}Ht	        |t              r|j                  \  }}|j                  }|rt        d
      |dk7  rt        d|       dt!        j"                  H      z  }t%        |j                  |	|      }|j&                  |k7  rwt        d| d|j&                         t        j                  j	                  |j                        st        d|j                         t        j                  |      \  \  }}}|r|dz  }||k7  rt        d| d|       H|k7  rt        d| dH       | j                  |||z  fk7  r t        d|||z  f d| j                         |dk(  r9| j&                  t)        d      x}k7  rt        d| d| j&                         d}n|dk(  r|rt        d      |rt        d      t%        | j                  |	d      }| j&                  |k7  rt        d| d| j&                         |	rd}n@d}d|z  t!        j"                  H      z  ||z  kD  rt        d| d      t        d|       t        j*                  j-                         }t        j.                  j-                         }t        j                  j                  d      }H|k(  s!Ht        j0                  j-                         k(  rNH|k(  r|rt        d      |rt        dH d      | j                  |k7  rt        dH d| j                         H|k(  rL|rt        dH d      | j                  |k7  r| j                  |k7  rt        dH d | j                         t3        Hfd!t        j4                  t        j6                  hD              re| j                  |k7  r*| j                  |k7  rt        dH d | j                         |r| j                  |k7  rt        d"H d#| j                         t3        Hfd$t        j8                  hD              rH|rt        d%      |st        dH d&      | j                  |k7  rt        d"H d#| j                         Ht        j                  j                  d      k(  r8|rt        dH d      | j                  |k7  r&t        d'| j                         t        d(H       |}dt;        |d|z   z  |      z  t!        j"                  H      z  }|r|dk  rd}d }|r>|j                  t        j<                  j-                         k(  rdnd}t;        |d)|z        }|	rdnd}|	rd*nd+}| j                  |k(  r
|dz  }|d,z  }|dkD  r	|d-| d.z  }||z  |z  d/k7  rt        d0| d1| d2|       |s|r!|j?                         dk7  rt        d3|       |d4kD  r!|j?                         dk7  rt        d5|       |	r|dkD  rt        d6      |d7kD  rt        d8      tA        ||z  d4|z        } ||z  rt        d9| d:|       ||z  rt        d;| d:|       || z  rt        d<|  d:|       ||z  }!||z  }"|| z  }#Ht        j*                  j-                         k(  rt        jB                  j-                         nH}$d }%|rE|dk(  sJ |dz  rt        d=|       ||J |j                  }%|j                  t        j<                  j-                         k7  rC|j                  t        j6                  j-                         k7  rt        d>|j                         |j                  |j                  k7  r%t        d?|j                   d@|j                         |j                  |||z  fk7  r!t        dA| dB||z   dC|j                         |j                  ||z  ||z  fk7  r!t        dD| dB||z   dC|j                         |rtE        t        |      }|dz  rt        dE|       |j                  ||dz  fk7  r t        dF||dz  f d|j                         |j                  t        j                  j                  d      k7  rt        dG|j                         t	        |t              s.t        jF                  ||||d|z   z  fdHddIJ      \  \  }&}'\  }(})}*n"t        jH                  jJ                  }*d }'d x}(x})}&t        jF                  |||| fdI|rdnddIJ      \  \  }+},\  }-}.}/|rht!        j"                  |$      d)k(  rP|*t        jH                  jJ                  k7  rt        dK      |/t        jH                  jJ                  k7  rt        dL      |r-|dk(  r(|/t        jH                  jJ                  k(  rt        dM      t        j                  t        j                  j                  d      d      }0| |z  }1|#|z  }2| j&                  jL                  d/   d)z  d/k(  sJ | j&                  jL                  d/   d)z  }3|r|jN                  nd }4|r|jN                  nd }5|r|jN                  nd }6tQ        jR                  |!|#|"      D ]  \  }7}8}9t	        |t              rU|!dk7  rt        dN      |d|z   z  }:|jU                  tU        d       t!        jV                  |9|:z  |:            jN                  };n+|7|(z  |9|)z  z   }<|&d/   |&d   t        jX                  |<      z   f};|8|-z  |9|.z  z   }=|+d/   |+d   t        jX                  |=      z   f}>|4R|#dk7  s|!dk7  rt        dO      |dz  d/k(  sJ |dz  }?t        jZ                  |4t!        j\                  |9|?z  |
            }@nd }@|5|6|!dk7  rt        dP      |#dk7  rt        dQ      |J ||d)z  z  d/k(  sJ |dz  d/k(  r| dz  d/k(  sJ ||d)z  z  }At        jZ                  |5t!        j\                  |9|Az  |z  dz  |
            }Bt        jZ                  |6t!        j\                  |9|Az  |1z  dz  |
            }Cnd x}B}C|9d/k(  r|n|0}D|8|2z  |8|2z  }F}E|E|3z  dz  |F|1z  z   }G|!dk7  rt        dR      t_        t        jZ                  | jN                  t        j                  |
G            |;|>f| j                  || ||	|*t        jH                  jJ                  k7  |/t        jH                  jJ                  k7  |'|,BC@D|$|%dS  y )TNr   zNo swizzle is not supportedr   @   r   r   z-Either none or both scales should be providedz'Block-scaled sparse matmuls unsupportedzB must be a memref, got: z.A in TMEM unsupported for block-scaled matmulsr   z:Only M=128 is supported for MMA with A in TMEM, but got M=packingzA layout mismatch: expected z, got zA must be a memref, got zFMMA requires A and B to have the same contraction dimension (K), got: z and z9MMA requires A and B to have the same element type, got: z%Accumulator shape mismatch: expected z&Accumulator layout mismatch: expected z0MMA with block scaling is not supported for M=64z!Sparse MMA not supported for M=64r   zSwizzle=z/ is too big for MMA with M=64. Try lowering it.z5Only M=128 and M=64 are supported for MMA, but got M=zSparse MMA unsupported for f32zMMA with element type z does not support block scalingz2 only supports accumulators of type f32, but got: z9 only supports accumulators of type f32 or f16, but got: c              3  @   K   | ]  }|j                          y wN
isinstance.0telement_types     r<   	<genexpr>zmma.<locals>.<genexpr>(  s"      
 ll<    z#Block-scaled MMA with element type z* only supports f32 accumulators, but got: c              3  @   K   | ]  }|j                          y wrf   rg   ri   s     r<   rm   zmma.<locals>.<genexpr>6  s      %&all< rn   z#Sparse MMA unsupported for f4e2m1fnz only supports block scalingzBMMA with element type s8 only supports s32 accumulators, but got: zUnsupported element type: r   z2 CTAz1 CTAz integerz with z lane groupsr   zIn z MMA, N must be a multiple of z, got N=zOOnly N that is power of 2 supported for sparse and block-scaled MMA, but got N=r   z.The only supported N > 256, is 512, but got N=z.Only N <= 128 are supported for collective MMAr   z#Only N <= 512 are supported for MMAzM must be a multiple of z, got: zK must be a multiple of zN must be a multiple of z>MMA with block scaling requires N to be divisible by 32, got: z<A scale dtype mismatch: expected f8e8m0fnu or f8e4m3fn, got z!B scale dtype mismatch: expected z (same as A), got z"A scale shape mismatch: expected (z, z), got z"B scale shape mismatch: expected (z2Sparse MMA requires N to be divisible by 32, got: z+A sparse metadata shape mismatch: expected z3A sparse metadata dtype mismatch: expected i2, got FT)swizzle
group_sizelogical_k_majormma_bytewidth_ksplit_constzK4-bit block scaled MMA only supports K-fastest operands, but A is M-fastestzK4-bit block scaled MMA only supports K-fastest operands, but B is N-fastestz<B tiling too small. Increase swizzle or transpose the input.z*A address calculation for multiple M tilesz8A sparse metadata address calculation for multiple tilesz0A scale address calculation for multiple M tilesz0B scale address calculation for multiple N tilesz*D address calculation for multiple M tiles)d_typer/   r0   kr_   a_transposeb_transposea_k_stridesb_k_stridesa_scale_addrb_scale_addra_sparse_addrr^   rl   scale_element_type)0r)   r   r'   r(   rh   boolr	   r.   r-   
MemRefTypetyper   tiled_memref_shaper`   shapedtyper   bitwidth_infer_tmem_layoutlayouttmem_default_layoutr&   r%   r$   r*   anyr,   r+   rW   maxrC   	bit_countminFloatTF32Typer   create_descriptorDimKbase_tile_shapeaddressnpndindexslicedsencode_addraddic_do_mma)IdabrY   rZ   r[   r\   r]   r^   r_   r8   i64num_cta	is_scaled	is_sparserv   r0   r/   k2element_type2expected_packingexpected_layoutexpected_d_layoutn_lane_groupsr7   r6   s32m_group_elemsk_group_elemsscale_blockrequired_multiple	mode_namen_group_elemsm_groupsk_groupsn_groupsmma_element_typer~   a_desc_basea_k_instr_stridesa_m_group_stridea_k_group_stride	a_fastestb_desc_baseb_k_instr_stridesb_n_group_strideb_k_group_stride	b_fastesttruen_collective_group_elemsn_col_groupslanes_per_n_groupa_sparse_addr_basea_scale_addr_baseb_scale_addr_baseminikia_k_group_elemsa_mka_offsetb_offsetb_nkcols_per_k_groupr}   k_scales_per_groupr{   r|   accni_lane_groupni_cold_offsetrl   sI                                                                           @r<   mmar      s    "_	R
;
<<
##B'#
##B'#
D! ; ;A >
KJA'$&&iG4,?@
D
EEt+)9
G
HH 
	!	!!&&	)
09
::"55a8&1a,7GGEArGGM
:  	Cx"\]^\_ `aaU^^L99(	%5O 	xx?"((9z
J  ==##AFF+1!&&:;;&99!<GQ]!GB"W
	E!	  ]"
	?%~	/  WWAK  

/AK0@/AyQ  #Xxx)<Q)GG%H23D2EVAHH:
V  MBw RSS CDD +177JJxx$$23D2EVAHH:
V  mm 
Y%..6	6m9K	Kyk " 
 	

 LQCP
QQ


#


#
##B'#SLBKKOO,==sy @AA"<.0O
P  	ww#~"<. 1$$%GG9.  s"<.0O
P  	ww#~!''S."<. 1(()y2   !!2#6#67  	ww#~!''S."<. 1(()y2  QWW^/~ >%%&WWI/   +-+>+>*?   EFF"<.0L
M  	ww#~/~ >%%&WWI/  r~~22155"<.0O
P  	ww#~ggY 
  :<.I
JJ -c)q9}5yAAU^^T`Eaa-=2% M +)=)=)A)A)CC"Kq;7M&bA#g)WW^IQ6-55I=--2

i[67H6I J	  9!++-1"4
	S	  W!#
 NqcR
SS AG
E
FF3w
:
;;a=(#.9-
/gaSI
JJ
/gaSI
JJ
/gaSI
JJ-(-(-( !-

0@ @bl 
 8O82vJ1#
N  7#666 --1133MMR004466H
X  }}%-gmm_ =mm_  }}A,--.qcA4D3E Fmm_  }}Wa;&677.qcA4D3E Fmm_  W&782vKA3OPP1a1f+-7AF} E%%&(  ".."="=a"@@%%&(  
Aw	 	##	!=Q]#CD		(',	+ I8<<<'+
 !!/%b2	&{%*) 5>>"23q8IMMOO#
W  IMMOO#
W  B9	7
H 
 
33A6	:$*W4]*,	
	!	!!	$q	(A	--	-hh..q1Q64=(004)2goo)2gooJJx8< Bjb"b!W	Q!"NOO%!i-8oWWU4[%((2+?"QRZZd&&.>)>>h!nk!ny/D/DX/NNOd$$r,<'<<HNKNY-B-B8-LLMD%	Q(a-!"\]]R1$$$&",jj!3UWWRBR=RTW5XYmm$):)F	Q!"TUU	Q!"TUU$$$kAo.!333R1$);q)@@@([1_=ZZ

''"))M9R?
El ZZ

''")),DDJC
Pl
 %)(l\a*TC,.\0A6M
+
+	2
+
+	,  1} LMM

199ennS(;< ww


00%%!!#%-%_Br>   c                
   t         j                  j                  d      }t         j                  j                  d      }t         j                  j                  d      }|xs d\  }}|\  }}t        d t	        j
                  |xs d|      D              sJ |d u |d u k(  sJ |d u}|	d u}t        j                  |      }d|z   dz  dz  |z  }d|z  }d }|r0|rJ t         j                  j                  |      st         j                  j                  |      rO|t         j                  j                         k7  rt        d|       d	}d
}t        j                  t         |      }nt         j"                  j                  |      rm|s|rJ t        j                  t$        |      }|t         j                  j                         k(  rd}d}n4|t         j                  j                         k(  rd}d}nt'        d|       d}d} nt         j(                  j                  |      st         j*                  j                  |      rd}nt         j                  j                  |      rd}nat         j                  j                  |      rd}n?t         j                  j                  d      j                  |      rd}nt'        d|       dx} }d }|rdnd}!|d u }"|"rdnd}#|rdnd}$|rdnd}%| |rdndz  } d}&d}'d7d}(t-        ||z        D ]E  })|r|J |rJ d
|z  }*|)|z  |*z  }+ ||
|!z  ||!z  |||+|+||      },|
dk(  sJ |dz  dk(  sJ t/        j0                  ||)|z  d
z        }-t/        j0                  ||)|z  |z  dz  |!z        }.t/        j2                  ||-      t/        j2                  ||.      f}'nvd }/|rZd|cxk  rdk  sJ  J |}0d|0z  }1||z  |1z  dk(  sJ |)|1z  }/t/        j2                  |	t        j4                  |)|1z  dz  |            f}&t7        |
|!z  ||!z  |||||/       },|"r]||z  d|z   z  }2|)|2z  }3t        |t         j8                        sJ |j:                  t         j                  j                  d      k(  sJ |}4n||J |\  }4}3|3 |(|)||      z  }3|\  }5}6|6 |(|)||      z  }6|3d!z  |3dz	  }8}7|6d!z  |6dz	  }:}9t=        j>                  t         j@                  jC                  d"      | |4|5|,|g|'|&d#|"rd$nd% d&|7 d'|9 d(|: d)|"rd*nd+|8 d, d-|$ d.|! d/ d0|# d1|% d2| d3d4| z   d56       t/        j0                  |d      }H y )8Nr   r   rb   NNc              3  ,   K   | ]  }|d z  dk(    yw)r   r   N )rj   ss     r<   rm   z_do_mma.<locals>.<genexpr>K  s     RQQVq[R   r   r   z5Scale element type mismatch: expected f8e8m0fnu, got z"mxf8f6f4.block_scale.scale_vec::1Xr   )rI   zmxf4.block_scale.scale_vec::2Xr   z"mxf4nvf4.block_scale.scale_vec::4Xz,Unsupported element type for block scaling: z[$5], [$6], z,r,rr6   f8f6f4i8z Unsupported input element type:  c                     t         rf   )r)   )rR   s    r<   create_scaled_instr_descriptorz/_do_mma.<locals>.create_scaled_instr_descriptor~  s    r>   z[a_desc]a_descz.spz[$5], z,rc                    t        |      dz   t        |      k(  sJ g }|D ]  }|j                  | |z         | |z  }  |j                  |        t        d t        ||d      D              }|dz	  S )Nr   c              3  ,   K   | ]  \  }}||z    y wrf   r   )rj   ir   s      r<   rm   z/_do_mma.<locals>._get_offset.<locals>.<genexpr>  s     C41aQCr   T)strictr   )lenappendsumzip)idx
idx_tilingstridesidxsrk   offsets         r<   _get_offsetz_do_mma.<locals>._get_offset  s{    z?Q#g,...D 
kk#(!Gc 	KKC3tWT#BCCFQ;r>   r   r   )r5   l    
!llvm.voidz[{
            .reg .b32 a_desc_low, a_desc_high, b_desc_low, b_desc_high;
            .reg z.b32z.b64zK a_desc;
            .reg .b64 b_desc;
            add.s32 a_desc_low, $1, z&;
            add.s32 b_desc_low, $2, z+;
            mov.b64 b_desc, {b_desc_low, z};
            zmov.b32 a_desc, a_desc_low;zmov.b64 a_desc, {a_desc_low, z};z
            tcgen05.mmaz.cta_group::z.kind::z [$0], z
, b_desc, z$3, z$4;
        }z	r,r,r,r,bThas_side_effects)r   intr   tuple[int, ...]r   r   )"r   r'   r(   all	itertoolschainr   r   r,   rh   r+   rC   r%   r-   	functoolspartialrT   rW   rX   r)   r$   r*   ranger	   r.   r   r   r=   Valuer   r
   
inline_asmTypeparse);d_addra_desc_or_addrb_descrw   rx   ry   rz   r{   r|   r}   r/   r0   rv   rl   r~   ru   r^   r_   i1r8   r   a_k_idx_tilingb_k_idx_tilingr   r   elem_bitwidthinstr_krd   scale_stepskindr   	extra_ptxextra_constraintsr   	a_in_tmema_ptx
sparse_modsparse_meta_ptxsparse_addrscales_addrsr   k_stepscale_vec_widthscale_idi_desca_scale_addr_offsetb_scale_addr_offsetsp_selectorselector_widthk_steps_for_col_incr   r   a_enc_addr_baseb_enc_addr_baser   a_offset_lowa_offset_highb_offset_lowb_offset_highs;                                                              r<   r   r   2  s1   ( 
~~""1%"
##B'#
##B'# + ;|.+ +.+	R)//+2C["QR	RR	R
$
LD$8	99	9$&)4')...-]a"$5']"'+=
$$\2
&&|4	r33779	9CDVCWX
 	
 2dk'0'8'8
/<N($ 
			'	'	5[00'0'8'8
+'($ 
r33779	9/!4!4!8!8!::3"N|n ]^^I	zz\*bkk.D.D\.Rd				%	%l	3d				'	'	5d		$	$Q	'	2	2<	@d"B<. QRR$&&	  A'T!)!*x%!ur* )Hr/ytb0!#+"$, a7l# E'f$$$][(o;&/9h-
g+q7{L,
Hk;f #XoXW\\!NN3+0E0IJ!NN3+0E0IR0ORY0YZ
**\#6
7
**\#6
7l
 k	W""""""  N2W 33q88822 JJuwwv1D'Dq'H#N

 '
g+q7{FL+{fqf  G+I>**h111  BNN$?$?$CCCC&o'K,CCC"0ox+fnkBBh &OXFNK@@H"*Z"7R-L"*Z"7R-LOO
l#	/6:ccWbc%&62 3%%1N 3%%1N 3++8/ :.7*?]^k]llo=pq r"|<yvWUGS]^m]nnrs|r} ~		 	''  A&JKE'r>   c                *   t        | t        j                        r| j                         } nD| j                  t
        j                  j                  d      k7  rt        d| j                         |r|t        d      |j                  dk7  rt        d      t
        j                  j                  d      }t        j                  |d      }t        j                   | t        j"                  j$                  |       y t        j                   |        y )	N!llvm.ptr<3>z9barrier must be a Mosaic barrier or a SMEM pointer, got: z,ctx must be provided for collective barriers)r   r   r   z:Collective arrivals only support (2, 1, 1)-shaped clustersr   r   )groupmulticast_mask)rh   r   
BarrierRefget_ptrr   r   r   r   r-   cluster_sizer)   r'   r(   r	   r.   r   tcgen05_commitCTAGroupKindCTA_2)barrierr_   ctxi16masks        r<   commit_arriver+    s    
 ))*ooG||rww}}^44
	LL>	  
{EFF
9$ \]]
..
%
%b
)C>>#q!Dt((..t 	 r>   c                    |r0| j                         dk7  sd| cxk  rdk  sn t        d|        | S t        dd| dz
  j                         z        } | dkD  rt        d|  d      | S )a!  Returns the exact number of columns to allocate in TMEM.

  The number of columns is rounded up to the nearest power of 2.

  Args:
    ncols: The number of columns to allocate.
    exact: If true, throws an error if the number of columns is not a power of 2
      and within [32, 512].
  r   r   r   z6ncols must be a power of 2 and within [32, 512], got: zAfter rounding up, got z$ columns, exceeding the limit of 512)r   r-   r   
bit_length)ncolsexacts     r<   tmem_alloc_exact_ncolsr0    s     AR5%7C%7OPUwWXX 
, A%!)//112Es{#E7*N
O  
,r>   c                   t         j                  j                  | j                        rt        j                  | j                        }|j                  t         j
                  j                  d      k7  rt        d|       t        j                  |      st        d|       t        j                  |j                        dk7  rt        d|       t        j                  | d      } nD| j                  t         j                  j                  d      k7  rt        d	| j                         t!        ||      }|rt"        j$                  j&                  nt"        j$                  j(                  }t         j
                  j                  d      }t#        j*                  | t        j,                  ||      |
      |fS )Nr   z&tmem_addr must be an i32 memref, got: z)tmem_addr must be in shared memory, got: r   z.tmem_addr must contain a single element, got: r   )memory_spacer  z4tmem_addr must be an SMEM pointer or a memref, got: r  )r   r   rh   r   rl   r'   r(   r-   r   is_smem_refmathprodr   
memref_ptrr   r   r0  r   r%  r&  CTA_1tcgen05_allocr   )	tmem_addrr.  r_   r/  ref_tyr  r8   s          r<   
tmem_allocr<    sL   ]]inn-]]9>>*Fbnn99"==?xHIIV$B6(KLLyy!#GxPQQ  ;I~~~66
KINNK[\
]]
 
.%%/$


!
!T5F5F5L5L%
##B'#			Iuwwuc':%	H%	OOr>   c                    | j                   t        j                  j                  d      k(  sJ t        j                  j                  d      }t        j                  ||       S )Nr   z!llvm.ptr<6>)r   r   r'   r(   r   r   r
   inttoptr)r:  ptr_tys     r<   _tmem_addr_to_ptrr@    sF    	2>>66r:	::	:77==(&	vy	))r>   c                   | j                   t        j                  j                  d      k7  rt	        d| j                          t        ||      }|rt        j                  j                  nt        j                  j                  }t        j                  j                  d      }t        j                  t        |       t        j                  ||      |       y )Nr   ztmem_addr must be an i32, got: r3  )r   r   r'   r(   r-   r0  r   r%  r&  r8  tcgen05_deallocr@  r   r   )r:  r.  r_   r/  r  r8   s         r<   tmem_deallocrC  #  s    ^^r~~22266
6y~~6FG
HH
 
.%%/$


!
!T5F5F5L5L%
##B'#	"EGGE3$7ur>   c                    | rt         j                  j                  nt         j                  j                  }t        j                  |       y )Nr3  )r   r%  r&  r8  tcgen05_relinquish_alloc_permit)r_   r  s     r<   tmem_relinquish_alloc_permitrF  .  s1    %/$


!
!T5F5F5L5L%&&U3r>   c           	     :   |j                         dk7  s|dkD  rt        d|       | xdk(  r d}n xdk(  r d}ndk(  rd}n	 t        d	| d
      ||z  }|dkD  rt        d| d|d| d      dj                  d t	        |      D              }d|z   dz   }||fS )Nr   r   z*num must be a power of 2 and <= 128, got: 32x32b16x128br   16x256br   zshape=z is unsupported   z!TMEM translation too big : shape=z	 and num=z	 involve z5 registers per-thread, which exceeds the limit of 255,c              3  &   K   | ]	  }d |   yw)$Nr   )rj   r   s     r<   rm   z&_tmem_access_helper.<locals>.<genexpr>D  s     :Q1QC:s   {})r   r-   r)   joinr   )r   numnum_regsregs_vectors       r<   _tmem_access_helperrU  2  s    ]]_S3Y
A#G
HH	h	h	h	
65(/ :;;
c/(^

,eXZ3& 9:J	L  :%/::+k!C'+	;	r>   c                   t         j                  j                  d      }t        ||      \  }}|rdnd}t	        j
                  t         j                  j                  ddj                  d t        |      D              z   dz         | gd| d	| | d
| d| d
d|z  dz   d      }t        |      D 	cg c]  }	t	        j                  |||	g       c}	S c c}	w )Nr   z
.pack::16br   z!llvm.struct<(rL  c              3      K   | ]  }d   yw)r8   Nr   )rj   _s     r<   rm   z_tmem_load.<locals>.<genexpr>O  s     %Ie%Is   z)>ztcgen05.ld.sync.aligned..xz.b32 z, [$z];z=r,rTr   )r   r'   r(   rU  r
   r   r   r   rQ  r   extractvalue)
r:  r   rR  packr8   num_out_regsrT  pack_modregsr   s
             r<   
_tmem_loadr`  I  s    
##B'#1%=,!\r(	ggmm
SXX%IU<5H%II
ID
P k r#xjk]$|n\^_lS 
$ 6;<5H	I$

Cs
+	II	Is   4Cc                    t        ||      \  }}|rdnd}t        j                  t        j                  j                  d      g || d| d| | d| d| d
d	|z  d
z   d       y )Nz.unpack::16br   r   ztcgen05.st.sync.aligned.rY  z.b32 [$z], ;zr,rZ  Tr   )rU  r
   r   r   r   r   )r:  r   rR  r_  unpackr]  rT  r^  s           r<   _tmem_storerd  Y  sz    1%=,%^2(//ggmmL!i r#xj~SQ\P]]^_
\Cr>   )frozenc                  <     e Zd ZdZddZddZd fdZd	dZ xZS )

TMEMLayouta  Represents the way a shape is laid out in TMEM.

  The layout describes how the shape is split across the 128 rows (lanes) of
  TMEM. We reinterpret warp_dims as the partitioning of TMEM into 4 banks, each
  accessible from a single warp. The 32 lanes inside each bank are assigned
  consecutive elements from lane_dims. The data within each lane is linearized
  in row-major order, with each vector padded up to 32 bits (wider vectors are
  unsupported).
  c                "   t        |      dk7  rt        d|       t        d t        || j                        D              rt        | d| j                         | j
                  dd|z  x}hvrt        d| d| d	| j
                         y )
Nr   'TMEM can only represent 2D shapes, got c              3  ,   K   | ]  \  }}||z    y wrf   r   )rj   r   rk   s      r<   rm   z(TMEMLayout.check_type.<locals>.<genexpr>t  s     
>TQ1q5
>r   z& is not divisible into tiles of shape r   r   zFor z+-bit types, the vector length must be 1 or z , but got: )r   r-   r   r   r   vector_length)selfr   r   fully_packeds       r<   
check_typezTMEMLayout.check_typeq  s    
5zQ@HII

>S(<(<=
>>G9$:N:N9O
P  !R8^%C\!DD
 ^<(:(:';=  Er>   c                n   | j                  ||       d}| j                  D ],  }t        |t        j                        s||j
                  z  }. | j                  D ],  }t        |t        j                        s||j
                  z  }. t        j                  |      t        z  | j                  z  |z  S )Nr   )rn  	warp_dimsrh   fa
Replicatedtimes	lane_dimsr5  r6  	TMEM_ROWSrk  )rl  r   r   replication_factordims        r<   cols_in_shapezTMEMLayout.cols_in_shape~  s    OOE8$~~ (	C	'cii'( ~~ (	C	'cii'( 99Uy(D,>,>>ASSSr>   c                    t         |          }t        |j                  |j                  |j
                  |j                  d      S )NF)_check_canonical)supercanonicalizerg  tilingrp  rt  
vector_dim)rl  r   	__class__s     r<   r|  zTMEMLayout.canonicalize  sB    W!#F r>   c                    t        j                  | j                  | j                  | j                  | j
                        S rf   )rq  TiledLayoutr}  rp  rt  r~  rl  s    r<   as_tiled_layoutzTMEMLayout.as_tiled_layout  s,    >>T^^T^^T__ r>   )r   r   r   r   returnNone)r   tuple[int, int]r   r   r  r   r  rg  )r  fa.TiledLayout)	__name__
__module____qualname____doc__rn  rx  r|  r  __classcell__)r  s   @r<   rg  rg  e  s    	Tr>   rg  c                    | t        |      k(  rt        S | t        ||      k(  rt        j                  S | t        ||      k(  rt        |      S t        d|  d      )Nrc   TMEM layout  is not supported)r   LAYOUTtmem_half_lane_layoutrq  WGMMA_LAYOUTtmem_m64_collective_layoutfa_m64_collective_layoutr-   )tmem_layoutcolumnsrd   s      r<   !_infer_tmem_load_registers_layoutr    sa     '88M)'7CC??.wHH#G,,\+.?@AAr>   c                   t        |       dk7  rt        d|        |dkD  s|j                         dk7  rt        d|       | d   |z  rt        d|        | d   t        k(  rt	        |      S | d   t        dz  k(  r |rt        | d   |      S t        | d   |      S t        d|  d	t         d
t        dz   d| d    d	      )Nr   ri  r   r   ,Packing must be <= 8 and a power of 2, got: z<Minor dimension of shape must be divisible by packing, got: r   zUnsupported shape: z#. TMEM references must have either z or  rows, but got .)r   r-   r   ru  r   r  r  )r   r_   rd   s      r<   r   r     s    Z1_
>ugF
GGq[G%%'1,
CG9M
NN
1X
STYSZ[
\\
1Xw''Qx9>!'a'::"58W55

eW %;d9>*/%(1	F r>   c                    | j                         dk7  rt        d|        t        t        j                  t
        | ft        j                  | ff      ddd      S )zCA TMEM layout used for 1CTA MMA with M=128 and 2CTA MMA with M=256.r   z#Packing must be a power of 2, got: )rp  rt  r~  r   r-   rg  rq  Tilingru  	WARP_SIZErc   s    r<   r   r     s[    A
:7)D
EE	ii)W%g'>?@	
 r>   c                    || kD  s|j                         dk7  rt        d|       | dz  rt        d|        t        t        j                  t
        dz  | ft        j                  dz  | dz  f|ff      ddd	      S )
z*A TMEM layout used for 1CTA MMA with M=64.r   r  r   'Columns must be a multiple of 16, got: r   ))r  r  r  r  r  rd   s     r<   r  r    s    w'++-2
CG9M
NNr\
>wiH
II	ii>7
#<<1gl
+* 	
 	
 	r>   c                    |dkD  s|j                         dk7  rt        d|       | dz  rt        d|        t        t        j                  t
        dz  | ft        j                  | dz  f|ff      ddd	
      S )z+A TMEM layout used for 2CTA MMA with M=128.r   r   r  r   r  r   )r  r  )r  r  r  r  r  s     r<   r  r    s    q[G%%'1,
CG9M
NNr\
>wiH
II	ii>7
#<<A
&* 	
 	
 	r>   c                    | dz  rt        d|        t        j                  t        j                  t        dz  | ft        j
                  | dz  fddf      ddd	      S )
zEThe register layout for transfers to/from tmem_m64_collective_layout.r   r  r   )r   r   r   ))r  r  r  r  )r-   rq  r  r  ru  r  )r  s    r<   r  r    sh    r\
>wiH
II	ii>7
#bllGqL%A64 	 
 r>   c                     t        t        j                  t        dft        dz  dff      t        j                  d      fdd      S )zA TMEM layout for A and B scales in .scale_vec::1X configuration.

  See https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-scale-factor-a-layout-1x
  r   r   )rs  r  r  r  )rg  rq  r  ru  rr  r   r>   r<   scales_layoutr    sD    
 
ii)Q)q.!!456Q')	
 r>   c                 l    t        t        j                  t        dft        dz  dfddf      ddd	      S )
zA TMEM layout for A sparsity metadata.

  See https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-sparse-matrices-sparsity-selector-kind-tf32-m128-256
  r   r   r   )r   r   )r   r   )i)r  r  r  r  r  )rg  rq  r  ru  r   r>   r<   sparse_meta_layoutr    s<     
ii)R9>1"5wGH	
 r>   c                      e Zd ZU ded<   ded<   ded<   ded<   edd	       Zd
 Ze	 	 d	 	 	 	 	 	 	 	 	 dd       ZddZ	dddZ
ddZddZy)r`   ir.Valuer   r  r   ir.Typer   rg  r   c                .    | j                   j                  S rf   )r   rk  r  s    r<   rd   zTMEMRef.packing  s    ;;$$$r>   c                    t        j                  | j                        | j                  z  }|dk  st	        d|      y )Nr   zFExpected packed packed bitwidth to be <= 32, but got: packed_bitwidth=)r   r   r   rd   r-   )rl  packed_bitwidths     r<   __post_init__zTMEMRef.__post_init__!  sG    nnTZZ04<<?Ob  +)+- . . !r>   Nc                4   t         j                  j                  d      }t         j                  j	                  |j
                        st        d|j
                         t        j                  |j
                        }t        j                  |      st        d|       |j                  |k7  rt        d|       t        j                  |j                        dk7  rt        d|       t        j                  j                  d      }t!        j"                  ||g|j$                  z        }	|d   dk  rt        d|d          ||t        d	      t'        ||d
      }n%|j)                  |t        j*                  |              | |	|||      S )Nr   z2tmem_addr_ref must be a memref or a pointer, got: z-tmem_addr_ref must be in shared memory, got: z*tmem_addr_ref must be an i32 memref, got: r   z2tmem_addr_ref must contain a single element, got: r   z+TMEM refs must have at least 32 rows, got: zAcollective argument must be provided when TMEM layout is inferredrc   )r   r'   r(   r   rh   r   r-   r   r4  rl   r5  r6  r   r	   
ConstantOpcreate_indexr   loadrankr   rn  r   )
clstmem_addr_refr   r   r_   r   r8   addr_ref_tyi0r:  s
             r<   
from_alloczTMEMRef.from_alloc'  sv    ..
%
%b
)C==##M$6$67KML^L^K_`aa-- 2 23K[)F{mTUU3&CK=QRRyy""#q(KK=YZZ				&	&q	)BMB4+2B2B+BCIQx"}DU1XJOPP~		O
 	
 "%Q?fu~~e45 y%//r>   c           	     0   t         j                  j                  d      }t        j                  || j
                        \  }}}t        |      rt        d      |dgt        |      z  k(  r|t        | j
                        k(  r| S | j                  t        | j                        k7  rt        d| j                         |d   dk7  s|d   t        k7  rt        d      |d   dz  rt        d	|d    d
      |d   }t        |t         j                         st#        j$                  ||      }|j&                  t         j(                  j+                         k(  rt#        j,                  ||      }| j                  dk7  r4t#        j.                  |t#        j$                  || j                              }t1        t#        j2                  | j4                  |      t7        t8        t:        t:        f   t9        |            | j                  | j<                        S )Nr   z$TMEM can only be sliced, not indexedr   rc   z=Slicing only implemented for refs with standard layout, got: z TMEM cannot be sliced along rowsr   r   z6TMEM column slice length must be a multiple of 8. Got r  )r   r   r   r   )r   r'   r(   r   parse_indicesr   r   r-   r   listr   r   rd   r)   ru  rh   r   r	   r.   r   	IndexTyper%   
index_castdivuir`   r   r   r   tupler   r   )rl  r   r8   base_idxslice_shapeis_squeezedcol_idxs          r<   r   zTMEMRef.sliceK  s   
..
%
%b
)C).)<)<T4::)N&Hk;
;=>>A3X&&;$tzz:J+Jk{{)$,,??kk]  {a;q>Y6 BCC1~Q #  qkGgrxx(sG,g||r||''))  g.g||qGU^^C%FGg

4<<15c?E+$67{{jj	 r>   c                v   | j                   }|$t        | j                  | j                  d   |      }t	        j
                  | j                        }| j                  t        |      k(  }|j                  | j                        }|d   dk7  rt        d      |t        k(  rc| j                  t        |      k(  rJt        | j                  | j                  d   | j                  |      j                  j                  |      }n|| j                  j                         k(  r||z  dk(  r	t!        |j"                        dk(  sJ t%        d |j&                  D              rJ t%        d |j(                  D              rJ t+        j,                  d	 |j/                  d      j&                  D              }t+        j,                  d
 |j/                  d      j(                  D              }	||	z  }
t1        | j                  | j                  d   |
z  | j                  ||      j                  |      }n|t2        k(  ro|rm|dk(  r|dk(  s|dk(  r^|j4                  dk(  rOt1        | j                  | j                  d   | j                  |t2        j4                        j                  |      }nv|t6        j8                  k(  r| j                  t;        | j                  d   |      k(  rt        | j                  | j                  d   dz  | j                  |      }|j                  d   dk(  sJ t=        j>                  |d d |dd  gd      }|j                  j                  |      }n|tA        | j                  d         k(  r| j                  tC        | j                  d   |      k(  r]|j                  | j                        }t        | j                  | j                  d   dz  | j                  |      j                  |      }ntE        d| j                   d| d      t7        jF                  |||      S )Nr   rc   r   zLoading multiple row tilesr   r   c              3  P   K   | ]  }t        |t        j                           y wrf   rh   rq  rr  rj   r   s     r<   rm   zTMEMRef.load.<locals>.<genexpr>       LaZ2==1L   $&c              3  P   K   | ]  }t        |t        j                           y wrf   r  r  s     r<   rm   zTMEMRef.load.<locals>.<genexpr>  r  r  c              3  l   K   | ],  }t        |t        j                        r|j                  nd  . ywr   Nrh   rq  rr  rs  r  s     r<   rm   zTMEMRef.load.<locals>.<genexpr>  .      $  2==1!''q
8$   24c              3  l   K   | ],  }t        |t        j                        r|j                  nd  . ywr  r  r  s     r<   rm   zTMEMRef.load.<locals>.<genexpr>  r  r  r   r   axiszLoads from TMEM layout z to register layout z are not supported)
_registers_layout
_is_signed)$rd   r  r   r   r   r   r   r   registers_shaper)   r  _load_32xcolsr   Treshaper  r   r   r   rp  rt  r5  r6  remove_dimension_load_32xcols_nativeTMEM_NATIVE_LAYOUTrk  rq  r  r  r   concatenater  r  r-   FragmentedArray)rl  r   	is_signedrd   r   has_default_layout
regs_shape	registerswarp_split_factorlane_split_factorsplit_factorraw_registerss               r<   r  zTMEMRef.loadl  s   llG~0
++tzz!}gf ~~djj)H(;G(LL''

3J!} <==DKK+>w+OO
,,

1tzz7	''*
  
4;;..0	0Wx5G25M''(A--- L6;K;KLLLLL6;K;KLLLL)) $**1-77$  )) $**1-77$  ')::l&
,,

15tzz7G
  
%	%*<	RGqLNv33q8&
,,

1tzz7<N<\<\
  
2??	"t{{6KDJJWXMcj6k'k#
,,

1*DJJm   #q(((..-"3]125F!GaPi++%%j1i	+DJJqM:	:t{{Nhimisistuiv  AH  OI  @I))$**5j
,,

1*DJJ
  #DKK= 1X')  f r>   c                .   t        |t        j                        st        d|       |j                  | j                  k7  r%t        d|j                   d| j                         |j                  | j                  k7  r%t        d|j                   d| j                         t        |j                  t        j                        st        d|j                   d      | j                  }| j                  t        |      k(  }t        j                  | j                        }|j                  t        k(  r=|r;t        | j                   |j"                  j$                  j'                  d	      |       y |j                  | j                  j)                         k(  r9||z  d
k(  r1t+        | j                   |j"                  j'                  d      |       y |j                  t,        k(  r[|rY|dk(  r|dk(  s|d
k(  rJ|j                  j.                  dk(  r1t+        | j                   |j"                  j'                  d      |       y |j                  t        j0                  k(  r| j                  t3        | j                  d   |      k(  rk|j"                  j$                  j'                  dd      }t5        j6                  t5        j8                  |dd      d      }t        | j                   ||       y |j                  t;        | j                  d         k(  rY| j                  t=        | j                  d   |      k(  r2t        | j                   |j"                  j'                  dd      |       y t        d|j                   d| j                   d      )Nz+TMEM stores expect a FragmentedArray, got: zStored array has shape z, but TMEM has shape zStored array has dtype z, but TMEM has dtype zStored array has layout z&, but TMEM stores expect a TiledLayoutrc   )r   r  r   r  r   r   r   r  r   r   zStoring from register layout z to TMEM layout r  )rh   rq  r  	TypeErrorr   r-   
mlir_dtyper   r   r  rd   r   r   r   r  _store_32xcolsr   r  r  r  r  _store_32xcols_nativer  rk  r  r  r   r  splitr  r  )rl  valuerd   r  r   r  s         r<   storezTMEMRef.store  s   eR//0CE7KLL{{djj #EKK= 1jj\  4::%#E$4$4#5 6jj\  ellBNN30>deffllG(;G(LL~~djj)H||v"4
,,))11':G 
446	67X;MQS;SDLL%//*A*A"*EwO
 
+	+0B	RGqLNu||99Q>DLL%//*A*A"*EwO'KK0APP//##++Ar2i..)QQ!?aHiT\\9g6	1

1 

++3

1w  T\\5??#:#:1b#A7K)%,, 8kk]+- r>   c           
     x   t         j                  j                  d      }| j                  j	                  | j
                  t        j                  | j                              }t        j                  t        j                         t        j                  |t        j                              }t        |      D ]  }t        t        j                   | j"                  t        j                  ||                  }t%        j&                  |t$        j(                  j*                  |      }t        j                  | j                        }d|z  }| j,                  dk(  rY|dk  r3t        j.                  t         j                  j                  |      |      }t        j0                  || j                        }ng| j,                  |k(  r@t        j0                  |t         j2                  j5                  |f| j                              }nt7        d| j,                         t        j8                  d| d||d        y )Nr   r   Unsupported packing: z[{}, z]: {}F)uniform)r   r'   r(   r   rx  r   r   r   r   r	   remui
thread_idxr.   WARPGROUP_SIZEr   r@  r   r   r   
tcgen05_ldTcgen05LdStShapeSHAPE_32X32Brd   truncibitcast
VectorTyper%   r)   debug_print)	rl  r8   num_colslaner   ptrvaldtype_bitwidthfull_packings	            r<   _debug_printzTMEMRef._debug_print  s   
..
%
%b
)C{{((U^^DJJ5OPH;;u'')5>>#u?S?S+TUD8_ Hejju~~c17MNOcOOC!6!6!C!CSIc~~djj1n>)l		BR^^88H#N#mmC,<<<'mmC!2!2L?DJJ!OP!$9$,,"HII'!G,dCGHr>   )r  r   r   )
r  r  r   r  r_   bool | Noner   zTMEMLayout | Noner  r`   )r  r`   )r   zfa.TiledLayout | Noner  r  r  fa.FragmentedArray)r  r  r  r  )r  r  r  __annotations__propertyrd   r  classmethodr  r   r  r  r  r   r>   r<   r`   r`     s    		.
% %.  !%"&!0!0 !0
 !0  !0 !0 !0FB@D2hHr>   c              #    K   t         j                  j                  d      }|\  }}||z  dk(  sJ ||z  }|d   |d   z  t        j                  |z  z  }	d|z  dk(  sJ d|z  }
d|j                         dz
  z  }t        |d|	z        }t        |
      D ]  }t        j                  | t        j                  ||z  dz  |            }d}|}||z
  x}dkD  rg||kD  r|dz  }||kD  rt        |||z         }t        j                  |t        j                  ||z  |z  |            }||||f ||z  }||z
  x}dkD  rg||k(  rJ  yw)a$  Generates a sequence of parameters for a given TMEM read or write.

  Arguments:
    base_addr: The base address of the TMEM region.
    cols: The number of logical columns to transfer.
    atom_shape: The logical shape of the tile written by the warp in a single
      TMEM transfer.
    tmem_packing: Packing degree in TMEM. When packing is 1, but the data is
      16-bit, we expect that each transfer actually involves double the number
      of physical columns.
    reg_packing: The number of elements that fit in a single 32-bit register.
  r   r   r   r   r   N)r   r'   r(   r   r  r-  r   r   r	   r   r   r   )	base_addrcols
atom_shapetmem_packingreg_packingr8   	atom_rows	atom_cols	total_numregs_per_instrnum_row_stepsmax_num	lane_stepaddr_rownum_processed	instr_num	remaining	num_sliceaddr_row_cols                      r<   _transfer_32xcolsr(    s    & 	##B'##)Y			Q		i)a=:a=0U__{5RS.	i1			/- )&&(1,-'~-.'' &izz)UWWi).C-JC%PQHMI!M119Q
6	!a	 	!}y'@AiZZ
EGGMI5EsKl )Y	99y m "M119Q
6 I%%%&s   C"E%AE<EEc                >   t         j                  j                  d      }|j                  dk(  r|j                  d   dk(  sJ |j                  d   dz  }dt        j                  |j                  d   j                        z  }|dk(  rd}t        j                  d|j                  d   dft        	      }t        j                  |d      }t        j                  |d      }	t        j                  |      D ]A  \  }
}t        j                   ||      |g |
d<   t        j                   ||	      |g |
d<   C |j#                  dd|j                  d   d      j%                  dd      }|j                  d
d  dk(  sJ |dk(  sJ d}nX|dk(  rHd}|j#                  dd|j                  d         j%                  dd      }d|cxk  rdk  sJ  J |dk(  }nt'        |      t)        | |d||      }|D ]'  \  }}}}|||f   j                  }t+        |||||       ) y )Nr   r   r   r   r   r   rb   rJ  r   r  )r   r   FrI  r   r   )r   r'   r(   ndimr   r   r   flatr   r   emptyobjectr	   r.   ndenumerater
   extractelementr  swapaxesr)   r(  rd  )r  vector_regsr  r8   r  r  store_shaper_  c0c1r   vregrc  itr'  r$  r!  r&  
regs_slices                      r<   r  r  %  s1   
##B'#			Q	;#4#4Q#71#<<	<			1		!$enn[%5%5a%8%=%=>>+AK88Q))!,a0?D	Q	B	Q	B^^K0 6	T++D"5d9S9!9o++D"5d9S9!9o6 <<1k//2A6??1ED ::bc?f$$$1FaK q![%6%6q%9:CCAqID!!!!!!QF
k
**D'<M"79 J3lIy)i*+00Jk9j&IJr>   c                   t         j                  j                  d      }|j                  dk(  sJ t        j                  |j
                  d   j                        }|j                  \  }t        j                  |j                        }d|z  }d|f}|dk(  r|dk(  rd gt        |      dz  z  }	t        j                  |d      }
t        j                  |d      }t        |      D ]@  \  }}t        j                   ||
      |	d|z  <   t        j                   ||      |	d|z  dz   <   B n#|D cg c]  }t        j"                  ||       }	}|dk(  sJ d}n|dk(  rm|dk(  sJ |D cg c]  }t        j"                  ||       }	}|dk(  rd|cxk  rdk  sJ  J |dk(  }n|dk(  r|dk7  rt%        d| d|       |d|z  k(  sJ d}nX||k7  rt%        d	| d
| d|       t        j                  |      dk(  sJ |D cg c]  }t        j"                  ||       }	}d}t        |	      |z  }t'        | ||||      }|D ]"  \  }}}}|dk(  sJ |	|   }t)        |d|||       $ y c c}w c c}w c c}w )Nr   r   r   r   Fr   r  z for element type Only z  packing supported for bitwidth , but got TMEM packing of rH  )r   r'   r(   r,  r  r-  r   r   r   r   rl   r   r	   r.   	enumerater
   r1  r  r)   r(  rd  )r  r3  r  r8   vec_tyrk  elt_bitwidthr  store_atom_shaper_  r5  r6  r   r7  rZ  rc  r  r8  r'  r$  r!  r&  r9  s                          r<   r  r  J  s   
##B'#			Q		==))!,112&LL/= 3 34,l"++&AVs;'!+,d>>#q!b>>#q!b - :)#t++D"5QW //b9QWq[: .99emmAs#9d91FaA+67aEMM!S!7D7r,#!#####q f		|r1!#L>1CL>R
 	
 R<////f{"+>|n M&&2^5  >>&!R'''+67aEMM!S!7D7F	T[	 $D*:L+V"79 G3lIy)>>iJh	:vFG9 : 8$ 8s   .I&'I+ I0c                   t         j                  j                  d      }t         j                  j	                  d|      }dt        j                  |      z  }|dk(  rd}|dk(  sJ d}n(|dk(  rd}d|cxk  rdk  sJ  J |dk(  }nt        |      t        j                  d|d	z  ft        
      }	t        | |d||      }
t        j                  |d      }t        j                  |d      }|
D ]  \  }}}}t        ||||      }t        |dz  |dz   dz        }|	||f   }|j                   d|fk(  sJ |j                   |f       |dk(  r|D cg c]  }t#        j$                  ||       }}t        j&                  |t        
      j)                  |dd      j+                  dd      }t#        j,                  |      }|j                   g |j                   dk(  sJ t        j.                  |j                         D ]E  }t#        j0                  ||g |d   |      }t#        j0                  ||g |d   |      }|||<   G Y|dk(  sJ |D cg c]  }t#        j$                  ||       }}t        j&                  |t        
      j)                  |d      j+                  dd      }||d<    |	S c c}w c c}w )Nr   r  r   rJ  Fr   rI  r   r   r*  r+  r   .)r   r'   r(   r  r%   r   r   r)   r   ndarrayr/  r(  r	   r.   r`  r   r   r
   r  asarrayr  r2  
mlir_undefr   insertelement)r  r  r   r  r8   r>  r  
load_shaper\  r3  r8  r5  r6  r'  r$  r!  r&  r_  	row_slicevector_regs_updaterZ  undefr   
high_undefr7  s                            r<   r  r    s   
##B'#==T5)&ennU+++AJ1DaJ!!!!!!1D
k
**

Atqy>8+D'<M"~~c1"~~c1"79 %3lIy)lJ	4@Di!mi!mq%89I$Y	%9:##9~5\8J8P8PR[7\\5a.23dll5!$3d3 ZZF+33Iq!DMMaQRSdoof%eZZ9/559q9999.445 '#''tIcI1IC
!!*d9S9!9orB"&3'
 A/34!dll61%4d4 ZZF+33IqAJJ1aPd $1%4 
' 4 5s   J6J;c                X   t         j                  j                  d      }t         j                  j	                  |f|      }dt        j                  |      z  }||z  dk(  sJ d}d|f}	|dk(  rd|cxk  rdk  sJ  J |dk(  }
n||k7  rt        d| d| d|       d	}
t        | ||	||      }t        j                  |d      }t        j                  |d      }d g||z  z  }|D ]z  \  }}}}|dk(  sJ |       t        ||||
      }|dk(  r-|dk(  r(|D cg c]  }t        j                  ||       c}||<   T|D cg c]  }t        j                  ||       c}||<   | |dk(  r|dk(  rt        j                  |dz  ft         
      }t        j"                  |      }t%        |j&                        D ]D  }t        j(                  ||d|z     |      }t        j(                  ||d|z  dz      |      }|||<   F |S ||k(  sJ t        j*                  |t         
      }|S c c}w c c}w )Nr   r   rH  r   r   r;  z supported for element type r<  Fr*  )r   r'   r(   r  r%   r   r   r)   r(  r	   r.   r`  r
   r  r   rB  r/  rD  r   sizerE  rC  )r  r  r   r  rk  r8   r>  r  rF  load_atom_shaper\  r8  r5  r6  r_  r'  r$  r!  r&  
instr_regsrZ  r3  rI  r   rJ  r7  s                             r<   r  r    sa   
##B'#==m-u5&ennU+++		$	))	)*%/A!!!!!!1D{"+:5' B*^-  DD/<U"~~c1"~~c1"
4;&	'$79 G3lIy)>$9$>L*iFJaMQ.9CDAeQ/Dd9o;EFaq&1Fd9oG A-1,**dai\8KOOF#E[%%& %%eT!c']B?j
DS1,=rBdk# 
 K'''**T0K	 EFs   H"=H'c                     t        j                  t         j                  j                         t	        j
                          y rf   )r   tcgen05_waitTcgen05WaitKindSTOREr   warpgroup_barrierr   r>   r<   commit_tmemrT    s(    D((../r>   c                     t        j                  t         j                  j                         t	        j
                          y rf   )r   rP  rQ  LOADr   rS  r   r>   r<   wait_load_tmemrW    s(    D((--.r>   c           
        t         j                  j                  d      }t        j                  | j                        }|j
                  x}|j                  k7  rt        d| d|j                         |t         j                  j                         t         j                  j                         hvrt        d| d      |j                  d   t        z  r"t        dt         d|j                  d          |j                  d	   d
z  rt        d|j                  d	          |j                  t               k7  rt        d|j                   d      t!        |j                        }|j                  d   t        z  |j                  d	   d
z  ddf}||k7  rt        d| d| d|j                         |j#                         \  }}	|t%        j&                  |      k7  rt        d      |dd \  }
}|
dz  s|dz  rt        d      |
d
z  }|d
z  }t%        j(                  | d      }|d   dkD  rt        d      t+        j,                  |dd       D ]
  \  }}t%        j.                  |||z  ||z  z   g|      }t1        j2                  |j4                  t1        j6                  |d
|d   z  |z  d
|z  z               }t9        j:                  |ddd      }t=        j>                  t<        j@                  jB                  tE        |      |t<        jF                  jH                  |rt<        jJ                  jL                  nt<        jJ                  jN                          y)a  Asynchronously copies the scale data from SMEM to TMEM.

  The result of the copy can be awaited by calling ``commit_arrive`` and waiting
  on the chosen ``Barrier``. However, if TMEM reference is to be consumed by a
  MMA issued in the same thread, no additional synchronization is needed.

  At the moment the function requires ``smem_ref`` to be contiguous and have a
  shape of ``(MN // 128, K // 128, 32, 16)`` for 8-bit scales (here MN stands
  for the size of the non-contracting dimension which is M or N), matching the
  scale layout for .scale_vec::1X. See https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-scale-factor-a-layout-1x
  for more details. Note that we always put the non-contracting dimension first.
  If you have a (MN, K // 32) array of scales in JAX (where MN and K are
  divisible by 128), you can prepare it for use in the kernel this way::

      scales.reshape(mn // 128, 4, 32, k // 4, 4)
            .transpose(0, 3, 2, 1, 4)
            .reshape(mn // 128, k // 4, 32, 16)

  The TMEM ref is expected to have the logical shape of the scales
  ``(MN, K // 32)``, and the layout created by ``scales_layout()``.
  r   Incompatible dtypes: SMEM has , TMEM has Unsupported dtype: z+, only f8e8m0fnu and f8e4m3fn are supportedr   z'TMEM reference must have a multiple of r  r   r   z:TMEM reference must have a multiple of 4 columns, but got r  r  r   	SMEM has , but expected  for TMEM ref shape 9Only copies from contiguous SMEM references are supportedNr   r   z,Scale tile strides must be a multiple of 128r   zOnly M/N up to 256 supportedrp   )	multicastr  )(r   r'   r(   r   r   rl   r   r-   rC   r%   r+   r)   r   ru  r   r  r  get_strides_and_offsetr   get_contiguous_stridesr7  r   r   getelementptrr	   r   r   r.   r   encode_descriptorr   
tcgen05_cpTcgen05CpShapeSHAPE_32x128br@  Tcgen05CpMulticastWARPX4r%  r&  r8  )smem_reftmem_refr_   r8   smem_tyr   
smem_shapeexpected_smem_shaper   rX  mn_tile_stridek_tile_stridemn_tile_stride_i32k_tile_stride_i32smem_base_ptrmn_tilek_tileload_ptr
store_addrr9   s                       r<   async_copy_scales_smem_to_tmemry    s@   0 	##B'#MM(--('###e6
5eWKGWX
YY
2''++-r/B/B/F/F/HII
 3E7:ef
gg^^A"
>ykYaYgYghiYjXkl
mm^^A
QRZR`R`abRcQde
ff__'
|HOO#44EF
GGW]]#*!*i79Ja9OQSUWX&&

J</B.C Dnn%	'  --/*'1,,Z88
P
QQ")"1+.-c]S0
C
DD%*#q(""8Q/- ]Q
<
==JrN3 ogv""	%	%1B(B	BCH sA
1-6WDEJ
 &&xFDIDOO))*%))00)3d%%9J9J9P9Pr>   c           	     *   t         j                  j                  d      }t         j                  j                  d      }t        j                  | j                        }|j
                  x}|j                  k7  rt        d| d|j                         |t         j                  j                  d      k7  rt        d| d      |j                  d   d	z  rt        d
|j                  d          |j                  d   dz  rt        d|j                  d          |j                  t               k7  rt        d|j                   d      t        |j                        }|j                  d   d	z  |j                  d   dz  d	df}||k7  rt        d| d| d|j                         |j                         \  }	}
|	t        j                  |      k7  rt        d      |d   dk7  rt        d      |	d   }|dz  rt        d      |dz  }t        j                   | d      }t#        |d         D ]  }t        j$                  |||z  g|      }t'        j(                  |j*                  t'        j,                  |d|z              }t/        j0                  |dd	d       }t3        |      }t5        j6                  t4        j8                  j:                  |||rt4        j<                  j>                  nt4        j<                  j@                          y )Nr   r   rY  rZ  r   r[  z, only i2 supportedr   r   z9TMEM reference must have a multiple of 128 rows, but got r   rb   z:TMEM reference must have a multiple of 64 colums, but got r  r  r\  r]  r^  r_  zOnly M=128 supportedr   z&K tile stride must be a multiple of 16r   r   r`  r3  )!r   r'   r(   r   r   rl   r   r-   r)   r   r   r  r  rb  r   rc  r7  r   rd  r	   r   r   r.   r   re  r@  r   rf  rg  SHAPE_128x128br%  r&  r8  )rk  rl  r_   r   r8   rm  r   rn  ro  r   rX  rq  k_tile_byte_stridert  rv  rw  	store_ptrr9   r
  s                      r<   'async_copy_sparse_metadata_smem_to_tmemr~  6  s    
~~""1%"
##B'#MM(--('###e6
5eWKGWX
YY
bnn))!,,
 3E7:MN
OO^^A
PQYQ_Q_`aQbPcd
ee^^A
QRZR`R`abRcQde
ff__*,,
|HOO#44EF
GGW]]#*!*c18>>!3D3JCQST&&

J</B.C Dnn%	'  --/*'1,,Z88
P
QQq 
4
55!*-R
=
>>$)""8Q/-)!,- f""!334bH 

8++U^^CV-LMI&&xFDID
I
&COO**C)3d%%9J9J9P9Pr>   )FFN)r/   r   r0   r   r3   r   r4   r   r5   z
int | Noner  r  )rD   zCallable[[ir.Type], int]r/   r   r0   r   rE   r  rF   r  rG   r   rH   r   r3   r   r4   r   rI   r  r  r  )r  r  )r   r`   r   zir.Value | TMEMRefr   r  rY   r   rZ   r   r[   TMEMRef | Noner\   r  r]   r  r^   zir.Value | boolr_   r   r  r  )&r   r  r   ztuple[ir.Value, int] | ir.Valuer   tuple[ir.Value, int]rw   r   rx   r   ry   z.tuple[tuple[int, ...], tuple[int, ...]] | Nonerz   z'tuple[tuple[int, ...], tuple[int, ...]]r{   ir.Value | Noner|   r  r}   r  r/   r   r0   r   rv   r   rl   r  r~   zir.Type | Noneru   r  r^   r  r_   r   r  r  )FN)r'  zutils.BarrierRef | ir.Valuer_   r   r(  zLaunchContext | Noner  r  )r.  r   r/  r   r  r   )FT)
r:  r  r.  r   r_   r   r/  r   r  r  )r:  r  r  r  )
r:  r  r.  r   r_   r   r/  r   r  r  )r_   r   r  r  )r  ztuple[int, str])r\  r   )rc  r   r  r  )r  rg  r  r   rd   r   r  r  )r   r  r_   r   rd   r   r  rg  )r   )rd   r   r  rg  )r  r   rd   r   r  rg  )r  r   r  r  r  )r  r  r  r   r  r  r  r   r  r   r  z*Iterator[tuple[ir.Value, int, int, slice]]r  )r  z
np.ndarray)F)rk  r  rl  r`   r_   r   r  r  )K
__future__r   dataclassesr   r   r5  typingr   r   r   r   jaxlib.mlirr   jaxlib.mlir.dialectsr	   r
   r   r   numpyr   r   r   rq  r   r   launch_contextr   ru  TMEM_MAX_COLSTCGEN05_SMEM_DESCRIPTOR_BITTCGEN05_LAYOUTr  TCGEN05_TRANSPOSED_LAYOUTTRANSPOSED_LAYOUTTCGEN05_ROW_LAYOUT
ROW_LAYOUTTCGEN05_COL_LAYOUT
COL_LAYOUTr  r=   rK   rT   rX   r   r   r+  r0  r<  r@  rC  rF  rU  r`  rd  	dataclassr  rg  r  r   r   r  r  r  r  r  r`   r(  r  r  r  r  rT  rW  ry  r~  r   r>   r<   <module>r     sr    #     0 0  & % ' %  $   ) 	% 			00 ""
""
**  $(;?
;?
;?
 ;? ;? ";? ;?|(?0(?
(? (? 	(?
 (? (? (? (? (? (? (?VNN ""(,"&CCC C
 C C C C &C  C C 
CLe'e'3e' !e' 	e'
 e' @e' 9e' "e' "e' #e' e' e' e' e' 'e'  !e'" #e'$ %e'& 
'e'T  $!(!! 
! 
	!6,P$*4.J 	 d#0 0 $0f	B	B&)	B47	B	B*	$$
" d#]H ]H $]H@+&+&
+&  +& 	+&
 +& 0+&\"JJ2Gj.b)X
 ?DMM")M7;M	Mb ?D,,"),7;,	,r>   