
    ukiI                     P   d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d dlm	Z	 d dlm
Z
 d dlZddlmZ dd	lmZ dd
lmZ ej$                  Zej&                  Zej(                  j*                  e j,                   G d d                    ZdefdZdej4                  dej6                  dedz  dededz  dedededej:                  fdZdddedej>                  ej6                  z  dej6                  defdZ dej>                  dej>                  fdZ!d  Z"d! Z#d" Z$d# Z%y)$    N)ir)arith)llvm)nvvm)vector   )fragmented_array)	mma_utils)utilsc                      e Zd ZU dZej
                  ed<   ej                  ed<   dddej                  dej
                  defdZ	e
dej                  fd	       Zedd
dded
z  fd       Zed        Zd Zed        Zy
)WGMMAAccumulatora  A FragmentedArray that has is synchronized with the async proxy.

  This implies that it requires no additional synchronization when passed in
  as a WGMMA accumulator. In particular, when created from a
  FragmentedArray, the necessary synchronization is inserted at construction.
  _original_layout_valueT)_syncr   c                F    || _         || _        |rt        |      | _        y y N)r   r   wgmma_fence)selfr   r   r   s       \/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/jax/experimental/mosaic/gpu/wgmma.py__init__zWGMMAAccumulator.__init__1   s'     -DDK'dk     returnc                 L    | j                   j                  | j                        S r   )r   	to_layoutr   r   s    r   valuezWGMMAAccumulator.value=   s    ;;  !6!677r   N	is_signedr   c                .   |dz  s|dz  rt        d| d|       |du rt        d      t        j                  j	                         }||}t        j
                  j                  |      r5t        j                  |t        j                  j	                  |d            }n4t        j                  |t        j                  j	                  |d            }| j                  t        j                  j                  |||ft        j                  |	            S )
N@      z8WGMMA requires m and n to be multiples of 64 and 8, got  and Fz0PTX does not support unsigned WGMMA accumulatorsr   g        r   )
ValueError	TypeErrorr   F32TypegetIntegerType
isinstancer   constantIntegerAttr	FloatAttrfrom_registersfaFragmentedArraysplatWGMMA_LAYOUT)clsmndtyper   f32zeros          r   r6   zWGMMAAccumulator.zeroA   s    2vQ SaS* + +EHII
**..
C}e	~~  '^^E2>>#5#5eQ#?@d^^E2<<#3#3E3#?@d
  1a&"//Y 	! 	
 r   c                 :   |j                   }|j                   t        j                  k7  r(|j                   t        j                  k7  rt	        d      t        j                  |j                        dk(  r|j                  t        j                        } | ||      S )Nz0Only WGMMA layouts supported in WGMMAAccumulator    r   r   )	layoutr-   r0   WGMMA_LAYOUT_ACC_32BITr#   r   bitwidth
mlir_dtyper   )r1   	registersoriginal_layouts      r   r,   zWGMMAAccumulator.from_registersU   sx    &&O2??*y/?/?2C\C\/\IJJ~~i**+r1%%b&?&?@ii/BBr   c                 6    | j                   f| j                  ffS r   r9   r   s    r   tree_flattenzWGMMAAccumulator.tree_flatten^   s    KK>D11333r   c                 $     | |d   |d   d      S )Nr   Fr   r   r    )r1   auxr   s      r   tree_unflattenzWGMMAAccumulator.tree_unflattena   s    eAhQuEEr   r   )__name__
__module____qualname____doc__r-   FragmentedLayout__annotations__r.   boolr   propertyr   classmethodr6   r,   rA   rF   rD   r   r   r   r   %   s     '''


 
(   
( ++	
(
 
( 8R'' 8 8 d t  & C C4 F Fr   r   r   c                 $   fdt         j                  t         j                  t         j                  f}t         j                  j                  |       r5t        fdt         j                  t         j                  g|D              S t         j                  j                  |       rt        fd|D              S t         j                  j                  d      j                  |       r% t         j                  j                  d            S y)Nc                 &    | j                        S r   r(   )tyabtypes    r   <lambda>z(_supported_wgmma_types.<locals>.<lambda>g   s    r}}V4 r   c              3   .   K   | ]  } |        y wr   rD   .0rS   input_types_ares     r   	<genexpr>z)_supported_wgmma_types.<locals>.<genexpr>j   s     ]rr"]   c              3   .   K   | ]  } |        y wr   rD   rW   s     r   rZ   z)_supported_wgmma_types.<locals>.<genexpr>l   s     ;rr";r[   r8   r!   F)r   F16TypeFloat8E5M2TypeFloat8E4M3FNTyper%   r(   anyFloatTF32TypeBF16Typer'   get_signless)r4   rT   f16_acc_typesrY   s    ` @r   _supported_wgmma_typesre   f   s    4/::r00"2E2EF-ZZ5!]b.>.>-\m-\]]]	zzU#;];;;	~~""2&11%82>>66q9::r   accb_descriptora_transposeb_transpose
a_k_stride
b_k_strider3   swizzleelement_typec
                 n   789: t        j                   j                  d   j                        j                  }
t        |
|	      st        d|
|	f      |dz  rt        t         j                  j                         }t         j                  j                         }t         j                  j                  d      }t         j                  j                  d      7t         j                  j                  d      }t         j                  j                         }t         j                  j                         }|dz  rt        t        |	      dk(  }|s|s|rt        d      t        |t         j"                        x}r|j$                  |||||hvrt        d	|j$                         t'        j                  |	      }||z  }|j(                  d|fk7  rt        d
      |j*                  t         j,                  t         j.                  hvrt        d      || t        d      ||dz  rt        |t        t         j0                  j                  |
      s|
7k(  r|dz  }t         j                  j                  d|
      8t3         j                        }|d   j                  t         j                  j                  d|
      k(  sJ  fd}t         j                  j                  |
      rdnd}nt         j                  j                  |
      r^|dz  }78 j                  D cg c]  }t5        |       }}t        j                   j                  d   j                        : :fd}d}nt        d|
 d      |rd}n
|
7k(  rd}nd}|rdgdz  }|r	|dz  }ndg}d| g|z  t7        |      D cg c]  }t9        |       c}z   |z   dgdz  z   dgd|z   z  z   }dj;                  |      }t=        j>                         99fd}ddj;                   ||            z   dz   }  ||      D ]  }! |r'ddj;                   |tA        |                  z   dz   }"n
 |d      \  }" |d      \  }#}$dj;                  d  ||      D              }%tC        9      tA        |      k(  sJ dt        |	      z  }&t9        |	      }'t         j                  j                  |	      rd}'nRt         j                  j                  |	      rd }'n0t         j                  j                  d      j                  |	      rd!}'t9        |
      }(|
7k(  rd"}(d#| d$|& d%|( d%|' d%|' d&|  d'|" d'|# d(|% d)})d*|$ d+|) d,}*7fd-}+ |+d      x},x}-}.|
7k(  r|,g}/n|,|-|.g}/|r(|&|/ |+tE        |             |+tE        |            gz  }/n|r|/ |+tE        |            gz  }/tA        |/      |dz   k(  sJ t'        jF                  |
      dk(  rd.nd/}0t'        jF                  |
      dk(  rdnd}1 jH                  |0k7  s7 j(                  d   dk7  s%tK        jL                   j(                  dd        |1k7  rt         j(                        t         jN                  jQ                  d0dj;                  8fd1|D               d2      }2t7        |t        |	      z  |&z        D ]L  }|rA|d d ||&z  |dz   |&z  f   }3|3jR                  j                  D 4cg c]  }4t5        |4       }5}4nM|dkD  rE|J tU        |tW        jX                  |t         jZ                  j                  ||dz	                    }|g}5|dkD  rAtU        |tW        jX                  |t         jZ                  j                  ||dz	                    }tA        |5      tA        |      k(  sJ tW        j\                  |2g ||5||/|*|dd34      }6t7        tA        |            D cg c]  }tW        j^                  8|6|g       }}O  ||      S c c}w c c}w c c}4w c c}w )5Nr   z/Unsupported wgmma types (out_ty, element_type)=r!   r8   r          z"Only f16 WGMMA supports transposesz$Unsupported A register array dtype: z"Unsupported A register array shapez#Unsupported A register array layoutz.Unsupported WGMMA features with A in registersr   c                 `    t        j                  |       j                  j                        S r   )nparrayreshapeshape)regsrf   s    r   rU   zwgmma_m64.<locals>.<lambda>   s    288D>#9#9#))#D r   rf   c           	          t        j                  | D cg c]  }t        |       c}      j                  j                        S c c}w r   )rs   rt   _unpack_i32ru   rv   )rw   regrf   vec_tys     r   rU   zwgmma_m64.<locals>.<lambda>   s8    288QU,V#[-E,V#W#_#_`c`i`i#j ,Vs   Az:WGMMA instruction only supports f32, f16 and s32 out (got )r   l=r3   ,c                 >    d t        j                  |       D        S )Nc              3   &   K   | ]	  }d |   yw)$NrD   )rX   is     r   rZ   z/wgmma_m64.<locals>.take_regs.<locals>.<genexpr>   s     <asG<   )	itertoolsislice)r3   	reg_counts    r   	take_regszwgmma_m64.<locals>.take_regs   s    <Y--i;<<r   {} c              3   &   K   | ]	  }d |   yw), NrD   )rX   rx   s     r   rZ   zwgmma_m64.<locals>.<genexpr>   s     ?!r!X?r   e5m2e4m3s8s32z!wgmma.mma_async.sync.aligned.m64nk. r   z, p;z{ .reg .pred p; setp.ne.b32 p, z, 0; z }
c                     t        j                  t        j                  j	                  |             j
                  S r   )r   
ConstantOpr   r*   r&   resultxi32s    r   lczwgmma_m64.<locals>.lc   s+    ??3 2 23 :;BBBr   
   	   z!llvm.struct<(c              3   4   K   | ]  }t                y wr   )str)rX   _out_ty_fields     r   rZ   zwgmma_m64.<locals>.<genexpr>  s     DaL 1Ds   z)>T)asm_dialecthas_side_effects)0r   
VectorTypeflattyperm   re   r#   rb   r&   r]   r'   rc   r^   r_   	bytewidthr(   r-   r.   r=   r   rv   r:   r0   WGMMA_LAYOUT_8BITr%   list_as_i32_regranger   joinr   countlennextintr<   ndimmathprodTypeparser>   	_llvm_addr   r   r*   
inline_asmextractvalue);rf   arg   rh   ri   rj   rk   r3   rl   rm   out_tybf16f16i8i64f8e5m2f8e4m3fnsupports_transpose	a_in_regselt_bytewidthswizzle_elemsnum_acc_regsacc_regsto_acc_vec_regsacc_constraintr}   num_imm_regsa_reg_constraintsr   reg_constraints_listreg_constraintsr   acc_reg_vectorr   a_regs
b_desc_reguse_out_regimm_regsk_instrel_ty
out_ty_strwgmma_instrptxr   use_outscale_ascale_bimmsexpected_dimexpected_regs_per_tileacc_struct_typea_sliceva_args
acc_structr   r   r   r~   s;   `                                                      @@@@r   	wgmma_m64r   s   s    ==!))*77&		5
G0F/HI
JJU
		$


#	~~""1%"
##B'#
##B'#  "&  $$&("_
 .!3	
9
::Q 2 233Y3||D#r68<<=all^LMMOOL1M},Mww2}%%;<<xx)=)=>><==!8GHHZ"_ZZ6"fm6L==$$T62LCHH~HA;r}}00v>>>>DONN55f=S3N	zzV$6LL,/HH5SC 5H5]]388A;++,FjON

DVHANP P L}LL	al 
>
|+|,-AQ-. 
	 
\!"	#  HH12/oo)= )L"9::S@.\" 	a	388Ic*;&<=>>DFlGF%aL*kWW?y'>??(	iC 45	55	5)L))'
l
%!!,/E	%%l3E	~~""1%00> E6{*s]J 	*!AgYa
|1UG1UGST	6("ZLH:Q@  +;-u[MO#C !#1%'%Ggs]9DWg&DK3RK !2c+&6#788DRK !""D	TlQ&	&&	&~~f-3, %v 6" <1!XX1!2dii		!"6NRh6h
SYY
GGMMsxxD8DDERH/ 'Y|44@A a!a'kq1u&7889g(/(9(9(>(>?1A?f?	
Q%%%OOC!3!3Cq!IJ
 sf1u

//#r~~11#zQG
Hl v;#/00001(1V1\1D1J CHHBV=>,
QC8H 9> 
	""w 62 	.N @0s   '`#6`(=`-7`2   )rl   r   bc                  ) |dk(  rt        d      t        j                  j                  |j                        st        d|j                         t        j                  j                         }t        j                  j                         }t        j                  j                         }t        j                  j                  d      }t        j                  j                  d      }t        j                  j                         }	t        j                  j                         }
t        j                  |      \  \  }})t        |t         j"                        x}rG|j$                  \  }}|j&                  }|||||	|
hvrt        d|       ||k(  rl|dk(  rgt        d      t        j                  j                  |j                        rt        j                  |      \  \  }}}nt        dt	        |             ||k7  rt        d	| d
|       )|k7  rt        d| d
)       | j(                  j$                  ||fk7  r't        d||f d| j(                  j$                         )|k(  s!)t        j                  j                         k(  r?| j(                  j&                  |k7  r=t        d) d| j(                  j&                         t+        )fdt        j                  t        j                  t        j                  hD              rW| j(                  j&                  |k7  r| j(                  j&                  |k7  rt        d) d| j(                  j&                         )|k(  rm|r|j,                  st        d      | j(                  j&                  |k7  s| j(                  j,                  s3t        d) d| j(                  j&                         t        d)       d}|t/        j0                  )      z  }|dkD  s|dz  rt        d|       |}||z  rt        d| d|       ||z  rt        d| d|       ||z  }||z  })t        j                  j                         k(  rt        j2                  j                         n)}|rdx}x}}t5        dd      }n_t        j6                  ||||f||fd      \  \  }}\  }}}|d   rJ |d    d   }t5        |t        j8                  j:                  k7  |      }t        j6                  |||fd!z  ||fd"      \  \  }}\  }} }!|d   rJ |d    d   }~|rt=        |      }t        j                  j                  d      }"| j(                  j>                  jA                         }#tC        |      D ]  }$tC        |      D ]  }%|r||$|z  |$d z   |z  |%|z  |%d z   |z  f   }&n:||J |$|z  |%|z  z   }'tE        |tG        t        jH                  |'      |"            }&tE        |tG        t        jH                  |%| z        |"            }(tK        |#|$|$d z    |&|(f||||!t        j8                  j:                  k7  |d#||#|$|$d z      tM        t!        j"                  |#| j(                  jN                  | j(                  j,                  $      | jP                  d%      S )&a  Perform acc += a @ b using the WGMMA instruction.

  `a` may be passed in registers, or as a memref. `b` must be a memref.

  The expected (logical) memref shapes are:
    a: (m // tile_m, k // tile_k, tile_m, tile_k)
    b: (k // tile_k, n // tile_n, tile_k, tile_n).

  While the shapes may be physically transposed, when considering the row-major
  physical shape, the tile dimensions must be the two minor dimensions and must
  have the shape (8, S) where S = swizzle // bytewidth(element_type).
  ro   zNo swizzle is not supportedzB must be a memref, got: r8   r!   zKOnly f16, bf16, i8, f8e5m2, f8e4m3fn are supported for A in registers, got z0swizzle=32 not supported for s8 lhs in registerszUnsupported A type: zHWGMMA requires A and B to have the same contraction dimension (K), got: r"   z;WGMMA requires A and B to have the same element type, got: z%Accumulator shape mismatch: expected z, got zWGMMA with element type z2 only supports accumulators of type f32, but got: c              3   @   K   | ]  }|j                          y wr   rR   )rX   trm   s     r   rZ   zwgmma.<locals>.<genexpr>q  s"      
 ll< s   z9 only supports accumulators of type f32 or f16, but got: zWGMMA with lhs of type u8z2 only supports accumulators of type s32, but got: zUnsupported element type: r       z+N must be a multiple of 8 and <= 256, got: zM must be a multiple of z, got: zK must be a multiple of N)rh   rj   F)rl   
large_tile
group_sizelogical_k_majorr   r   rp   T)rl   r3   rm   ri   rk   )
_registers_layout
_is_signedrC   ))NotImplementedErrorr   
MemRefTyper(   r   r#   rb   r&   r%   r]   r'   rc   r^   r_   r
   tiled_memref_shaper-   r.   rv   r=   r   r`   r   r   r   ra   dictcreate_descriptorDimKr   r>   copyr   r   cencode_addrr   r   r:   r   )*rf   r   r   rl   r   r5   r   r   r   r   r   r   r3   r   r2   k2element_type2m_group_elemsk_group_elemsn_group_elemsm_groupsk_groupswgmma_element_typea_desc_basea_m_group_stridea_k_group_stridea_instr_paramsa_k_instr_stride	a_fastestb_desc_baseb_k_instr_strideb_n_group_strideb_k_group_stride	b_fastestr   new_acc_regsmikia_mka_group_offsetb_krm   s*                                            @r   wgmmar  ,  s   & ]
;
<<		!	!!&&	)
09
::		$


#


#
##B'#	~~""1%"  "&  $$&("55a8&1a,Q 2 233Y3GGEArLLMS$FH==,o/  w"}   RSS	}}'&99!<GQ]
+DG95
66"W
	E!	  ]"
	?%~	/  	ZZ!Q

/Axvcjj>N>N=OP  SLBKKOO,==
zz#$\N 3$$'JJ$9$9#:<   

B--r/B/BC  zz#

(=(=(D$\N 3**-***?*?)@B  r ;<<
zz#3::+?+?$\N 3##&::#8#8"9; 
  :<.I
JJ -U__\::-WA
B1#F
GG-
/gaSI
JJ
/gaSI
JJ-(-( !-

0@ @bl 
 8<<K<"%5dt<N 	##	!=1!=1		'&,	+  """'*1-i9==??&B%57N !!!A%/	%{$*) a  	 %a(+ AA
##B'#%%**,,(O bHo 	"q&M!99"q&M!99;

  +0@0LLL..6F1FF900@#F
 
q..r4D/DEsKc #,
rBF
#


# )9==??2%
# 
#l2Q6 
!**##ZZ))
 ++
 r   rt   c                 X    t        j                  |       } t        j                          | S )a  Fences the array construction from WGMMA instructions.

  LLVM treats in-register computation as pure and can move it after the fence,
  which is explicitly disallowed by the PTX programming model. For that reason,
  we insert an LLVM optimization barrier before the fence.
  )r-   optimization_barrierr   wgmma_fence_aligned)rt   s    r   r   r     s%     
!
!%
(%	,r   c                     t         j                  j                  d      }t        j                  t        j                  t         j                  j                  d|      |       t        d            S )Nr8   rq   r   )
r   r'   rc   r   extractelementr   bitcastr   r&   _lc)r   r   s     r   r   r     sN    
##B'#			nnR]]&&tS115s1v
 r   c                     t         j                  j                  d      }t        j                  |t         j
                  j                  ||             j                  S )Nr8   )r   r'   rc   r   r   r*   r&   r   r   s     r   r  r    s>    
##B'#	bnn00a8	9	@	@@r   c                 b    t        j                  | |t         j                  j                        S )N)overflow_flags)r   addIntegerOverflowFlagsnone)r   ys     r   r   r     s!    	!Qt'@'@'E'E	FFr   c           	          t         j                  j                  d      }t        j                  | t        j
                  t         j                  j                  d|      |            S )Nr8   rq   )r   r'   rc   r   r  	broadcastr   r&   )r~   rx   r   s      r   r|   r|     sJ    
##B'#	fr}}00s;Q?
 r   )&dataclassesr   r   jaxjaxlib.mlirr   jaxlib.mlir.dialectsr   r   r   r   numpyrs   r   r	   r-   r
   r   r   r   	tree_utilregister_pytree_node_class	dataclassr   rM   re   ndarrayValuer   r   r   r.   r  r   r   r  r   r|   rD   r   r   <module>r1     s{       
  & % % '  $   
GGOO	 ))<F <F  *<F~
T 
v#	v# ((v# 	v#
 v# d
v# v# v# v# ''v#| w	w	BHH$w 
xxw
 wt	r)) 	b.@.@ 	A
Gr   