
    uki~                     4   d dl Z d dlmZ d dlmZ d dlmZ d dlmZ d dl	Z
ddlmZ  G d d	      Zd
ej                  dej                  dej                  dej                  fdZd
ej                  dej                  dej                  dej                  fdZy)    N)fragmented_array)ir)llvm)vector   )utilsc                      e Zd ZdZ ej
                   ej                  d      ddd      Z ej
                   ej                  d       ej                  d      fdd      Z	 ej
                   ej                  d	      ddd      Z
y
)
MMALayoutszzContainer for MMA layouts, providing a convenient way to create
  layouts for MMA operands based on warp configuration.
  )@      r      r   r      )i))	warp_dims	lane_dims
vector_dim)r   r   r   r      )r   r   r   r   r   N)__name__
__module____qualname____doc__faTiledLayoutTilinglhs
Replicatedrhsacc     Z/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/jax/experimental/mosaic/gpu/mma.pyr
   r
      s     	bii12		# 	bii'(q!#		# 	bii01		#r(   r
   r&   abreturnc                 :
    |j                   dk(  sJ |j                   dk(  sJ  j                   dk(  sJ |j                  |j                  k(  sJ |j                  t        j                  j	                         t        j
                  j	                         fv sJ  j                  t        j                  j	                         k(  sJ t         j                  t        j                        rHt        |j                  t        j                        r$t        |j                  t        j                        sJ d\  }}} j                  j                         D cg c]\  }t         j                  j                        D ]8  }t        j                   |g t        j"                  j	                  |g            : ^ }}}t        j$                  j'                  d      }	|j                  j                         D 
cg c]  }
t)        j*                  |
|	       }}
|j                  j                         D 
cg c]  }
t)        j*                  |
|	       }}
t-        |      dk(  sJ t-        |      dk(  sJ t-        |      dk(  sJ d	|j                   d
|j                   d}t/        j0                         fd} ||      } ||      } ||      } ||      }| d| d| d| d| d
}dj3                  dg|z         ddj3                  dg|z         ddj3                  dg|z         ddj3                  dg|z         }g |||}t        j4                  j7                  ddj3                   fd|D               d      }t9        j:                  ||||d      }t        t-        |            D cg c]$  }t9        j<                   j                  ||g      & }}g }t9        j>                  t        j@                  j	                  d j                              }tC        |ddd   |ddd         D ]n  \  }}t9        jD                  ||t)        jF                  d|	            }t9        jD                  ||t)        jF                  d|	            }|jI                  |       p tK        jL                  |tN              jQ                   j                  j                         }t        jR                  | j                  d      S c c}}w c c}
w c c}
w c c}w ) z;Performs `acc + a @ b.T` using warp level MMA instructions.r   r   r   )r   r   r   )dynamic_positionstatic_position    r   r   z&mma.sync.aligned.m16n8k16.row.col.f32..z.f32c           
          ddj                  t        |       D cg c]  }dt                c}      z   dz   S c c}w )N{,$})joinrangenext)n_counters     r)   <lambda>z"_mma_single_tile.<locals>.<lambda>Y   s;    	CHH58<a$w-)<==C <s   <
 , ;r4   z=frfz!llvm.struct<(c              3   H   K   | ]  }t        j                          y w)N)str
mlir_dtype).0r;   r&   s     r)   	<genexpr>z#_mma_single_tile.<locals>.<genexpr>k   s     FCNN 3Fs   "z)>F)has_side_effectsr   Nr   r   )position)dtype)
_registers_layout
_is_signed)*shaperE   r   F16TypegetBF16TypeF32Type
isinstancelayoutr    r!   	registersflattenr8   vector_lengthr   extractDenseI64ArrayAttrIntegerTypeget_signlessr   bitcastlen	itertoolscountr7   Typeparser   
inline_asmextractvalue
mlir_undef
VectorTypezipinsertelementcappendnpasarrayobjectreshapeFragmentedArray) r&   r*   r+   num_acc_regs
num_a_regs
num_b_regsregposacc_regsi32rA   a_regsb_regsinstr
n_regs_strout_regs_str
a_regs_str
b_regs_str
c_regs_strptxconstraintsin_operandsacc_struct_typeout_regs_structiout_regsvec_regs	vec_undeffirstsecondvecr<   s    `                              @r)   _mma_single_tiler   2   s    
H			
G			g			
	%%	%	
"**..*BKKOO,=>	>>	>	2::>>+	++	+R^^,
QXXr~~
.
QXXr~~
./ *1&,
J &&( szz//0  nn
..22C59(  	##B'#+,;;+>+>+@AaEMM!S!A&A+,;;+>+>+@AaEMM!S!A&A 
V			X!			V		21<<.!,,t
T%OO'* L),*%**%*,'*'<.:,bBzl!L# 
4&%	&'q	3%
"	#$A	3%
"	#$A	3%$	%&(  .&-6-H-+GGMMsxxFXFFGrJ/ OO	/ S]#
 !=(  (oobmm//cnnEF)8CaC=(14a4.9 meV


Y3
HC


S&5771c?
CCOOC ZZ/778K8KL(			3::$
 u BAHs   A!T8T3T5)Tc           
         |j                   \  }}|j                   \  }}| j                   \  }}||k7  rt        d| d|       ||k7  rt        d| d|       ||k7  rt        d| d|       t        j                  j	                         }	t        j
                  j	                         }
|j                  |j                  k7  r%t        d|j                   d|j                         |j                  |	|
fvrt        d      | j                  t        j                  j	                         k7  rt        d      t        j                  |j                  k7  rt        d      t        j                  |j                  k7  rt        d	      t        j                  | j                  k7  rt        d
      t        |j                  t        j                         sJ t        |j                  t        j                         sJ t        | j                  t        j                         sJ |j                  j"                  \  }}|j                  j"                  \  }}| j                  j"                  \  }}||k(  sJ ||k(  sJ ||k(  sJ ||z  ||z  ||z  }}}||k7  rt        d| d|       ||k7  rt        d| d|       ||k7  rt        d| d|       |dk(  r
|dk(  r|dk(  sJ d| d| d| d       | j%                         } d }t'        |      D ]^  }t'        |      D ]N  }t'        |      D ]>  } |||      } |||      } |||      }t)        | ||f   |||f   |||f         | ||f<   @ P ` | S )a`  Computes `acc + a @ b.T` using synchronouse MMA instructions.

  All operands must have `TiledLayout`s. The layouts must be generated
  by the `MMALayouts` class, which ensures that the tiles are mapped
  to the warps correctly.

  Args:
    acc: A `FragmentedArray` with a `TiledLayout` generated from
      `MMALayouts.acc`.
    a: A `FragmentedArray` with a `TiledLayout`  generated from
      `MMALayouts.lhs`.
    b: A `FragmentedArray` with a `TiledLayout` generated from `MMALayouts.rhs`.

  Returns:
    A new `FragmentedArray` with the result of the computation with
      the same type as `acc`.
  zM mismatch: z != zN mismatch: zK mismatch: zDtype mismatch: z-Only bf16 and f16 supported for the operands.zOnly f32 accumulator supported.z$Expected MMALayouts.lhs layout for Az$Expected MMALayouts.rhs layout for Bz&Expected MMALayouts.acc layout for accr   r   r   zTile shape r?   z not supported.c                 ,    t        | |z  | dz   |z        S )Nr   )slice)idxlengths     r)   r=   zmma.<locals>.<lambda>   s    %fsQw&.@A r(   )rN   
ValueErrorr   rQ   rP   rO   rE   NotImplementedErrorrR   r
   r#   rT   r%   r&   rS   r    r!   base_tile_shapecopyr8   r   )r&   r*   r+   mkr:   k2m2n2bf16f16m_tilek_tilen_tilek_tile2m_tile2n_tile2num_m_tilesnum_n_tilesnum_k_tilessk_idxm_idxn_idxmsnskss                              r)   mmar      su   . 77&1aGG'1bYY(2r"W
|A3d2$/
00"W
|A3d2$/
00"W
|A3d2$/
00
 
	$


#\\Q\\!
'~T!,,H
II\\$$
M
NN^^rzz~~''
?
@@^^qxx
;
<<^^qxx
;
<<^^szz!
=
>>	AHHbnn	--	-	AHHbnn	--	-	CJJ	//	/88++.&&HH,,/&'ZZ//'7	7			F			F		*+v+qF{AKK{+"W
|A3d2$/
00"W
|A3d2$/
00"W
|A3d2$/
00	2&A+&B, F82fXRx?	6
 	
#A![! Je{# J% J%ufufuf&s2r6{Ab"fIqRyIBF	JJJ 
*r(   )r^   jax.experimental.mosaic.gpur   r    jaxlib.mlirr   jaxlib.mlir.dialectsr   r   numpyrj    r   r
   rn   r   r   r'   r(   r)   <module>r      s      >  % '   2O			O " 2 2O797I7IOOfW			W	W 
W 	Wr(   