
    uki?9                    ,.   U d Z ddlmZmZmZ ddlZddlZddlZddlZddl	Z	ddl
mZmZmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z% ddl#m&Z' ddl(Z)ddl*m+Z, ddl*m-Z- ddl*m.Z. ddl*m$Z$ ddl*m/Z/ ddl*m&Z& ddl*m0Z0  ejb                          G d d             Z2 G d d      Z3 e3       Z4eejj                     e3z  Z6ee2ejn                  ejp                  z  ge6f   Z9i Z:e;e<e9f   e=d<   d ejj                  d!eej|                     d"e?ej                  eejj                     f   fd#ZAd$e,j                  d%ej|                  d"ejj                  fd&ZCd'ej|                  d"eDdz  fd(ZE	 dd)ejj                  d*ej                  d+eDdz  d"e,j                  fd,ZGd-ejj                  d.ej|                  d/ej                  d"ejj                  fd0ZId1ejj                  d2ej                  d"ejj                  fd3ZJd4e<eKejp                     z  dz  d"ee9ge9f   fd5ZLd"ej|                  fd6ZM eLej                        d7e2d4ej                  d"eejj                     fd8       ZO eLej                        d9e2d4ej                  d"eejj                     fd:       ZQ eLej                        d9e2d4ej                  d"eejj                     fd;       ZSej                  j                  fd<ej                  d/ee.j                     d=ej                  d>ej                  fd?ZX G d@ dAe      ZYdBeYdCeDdz  d"efdDZZ eLej                        d9e2d4ej                  d"eejj                     fdE       Z\ eLej                        d7e2d4ej                  d"eejj                     fdF       Z^ eLej                        d7e2d4ej                  d"eejj                     fdG       Z`dHe,j                  e/j                  z  d"e<fdIZb eLej                        d7e2d4ej                  d"eejj                     fdJ       Zd eLej                        d7e2d4ej                  d"eejj                     fdK       Zf eLe j                        d9e2d4e j                  d"eejj                     fdL       Zh eLe j                        d9e2d4e j                  d"eejj                     fdM       Zj eLe j                        d7e2d4e j                  d"eejj                     fdN       Zl eLe j                        d7e2d4e j                  d"eejj                     fdO       Zn eLe j                        d7e2d4e j                  d"eejj                     fdP       Zp eLej                        d9e2d4ej                  d"eejj                     fdQ       Zr eLej                        d9e2d4ej                  d"eejj                     fdR       Ztd/ej                  d"e?ej                  e?e.j                  dSf   f   fdTZud<ej                  d/e?e.j                  dSf   d"ej                  fdUZvd1ejj                  d/e?e.j                  dSf   d"ejj                  fdVZw eLej                        d7e2dWej                  d"eejj                     fdX       Zy eLej                        d7e2dWej                  d"eejj                     fdY       Z{ eLej                        d7e2dZej                  d"eejj                     fd[       Z} eLej                        d7e2d4ej                  d"eejj                     fd\       Z eLej                         d7e2d4ej                   d"eejj                     fd]       Zd9e2d4ejp                  d^eDdz  d_eDdz  d"eejj                     f
d`Zej                  ddfej                  dadafej
                  dbdbfej                  ddafej                  ddbfej                  dadfej                  ddfej                  dbdbfej                  dbdff	D ]*  \  ZZZ ej                  eeec      e:ej                   <   , 	 dd9e2d4eddedSe,j                  f   d+eDdz  d"eejj                     f
deZej$                  e,j                  j&                  dfej(                  e,j                  j*                  dfej,                  e,j                  j.                  dfej0                  e,j                  j2                  dfej4                  e,j                  j6                  dfej8                  e,j                  j:                  dfej<                  e,j                  j>                  dffD ]*  \  ZZZ ej                  eeef      e:ej                   <   , d9e2d4ed+eDdz  ddee,j                  e,j                  ge,j                  f   d"eejj                     f
dgZejF                  e	jH                  dbfejJ                  e	jH                  dfejL                  e	jN                  dbfejP                  e	jN                  dfejR                  e	jT                  dbfejV                  e	jT                  dfejX                  e	jZ                  dafej\                  e	jZ                  dbfej^                  e	j`                  dfejb                  e	jd                  dafejf                  e	jd                  dbfejh                  e	jd                  dfejj                  e	jl                  dbfejn                  e	jp                  dbfejr                  e	jt                  dbfejv                  e,j                  jx                  dafejz                  e,j                  jx                  dbfej|                  e,j                  jx                  dfej~                  e,j                  j                  dafej                  e,j                  j                  dbfej                  e,j                  j                  dffD ]*  \  ZZZ ej                  eeef      e:ej                   <   , ej                  j                  e	j                  dbfej                  j                  e	j                  dbfej                  j                  e	j                  dafej                  j                  e	j                  dafej                  j                  e	j                  dafej                  j                  e	j                  dafej                  j                  e	j                  dbfej                  j                  e	j                  dbfej                  j                  e	j                  dbfej                  j                  e	j                  dbfi
Z eLej                        d9e2d4ej                  d"eejj                     fdh       Zej                  j                  e	j                  ej                  j                  e	j                  ej                  j                  e	j                  ej                  j                  e	j                  ej                  j                  e	j                  ej                  j                  e	j                  iZ eLej                        d9e2d4ej                  d"eejj                     fdi       Z eLej                        d9e2d4ej                  d"eejj                     fdj       Z eLej                        d9e2dkej                  d"eejj                     fdl       Z eLej                        d7e2dmej                  d"eejj                     fdn       Z eLej                        d9e2doej                  d"eejj                     fdp       Z eLej                        d9e2dqej                  d"eejj                     fdr       Z eLej                        d7e2d4ej                  d"eejj                     fds       Zdtej|                  duejj                  fdvZ eLej                        d7e2d4ej                  d"eejj                     fdw       Zdxee   dyee   dzeejj                     d"e?ee   eejj                     f   fd{Z eLej                        d7e2d4ej                  d"eejj                     fd|       Z eLej                        d7e2d4ej                  d"eejj                     fd}       Zd~ee   d"ej                  fdZ eLej                        d7e2d4ej                  d"eejj                     fd       Z eLej                        d7e2d4ej                  d"eejj                     fd       Z eLej                        d7e2d4ej                  d"eejj                     fd       Z eLej                        d7e2d4ej                  d"eejj                     fd       Z eLej                        d7e2d4ej                  d"eejj                     fd       Z  eLej                        d7e2d4ej                  d"eejj                     fd       Z eLej                        d7e2d4ej                  d"eejj                     fd       Zdeej                     d"ej                  fdZd1ejj                  dej                  d"e/j                  fdZd1e/j                  d"ejj                  fdZ eLej                        d7e2d4ej                  d"eejj                     fd       Z	 eLej                        d7e2d4ej                  d"eejj                     fd       Z eLej                        d7e2d4ej                  d"eejj                     fd       Z eLej                        d7e2d4ej                  d"eejj                     fd       Ze?ee   e,j                   ej"                  f   Zdeejj                     deej                     d"e?eejj                     eedz     f   fdZdeejj                     deedz     d"eejj                     fdZd7e2dej*                  dej*                  deKejp                     deedz     deejj                     d"eedz     fdZ eLej.                        d7e2dej.                  d"e6fd       Z eLej2                        d7e2dej2                  d"e6fd       Zd4ejp                  deej                     d"eej|                     fdZ eLej8                        d7e2dej8                  d"e6fd       Z eLej<                        d7e2dej<                  d"e6fd       Z eLej@                         eLejB                        d7e2d4ejp                  d"e6fd              Z"d4ejp                  d"eDfdZ#dejH                  d"ejn                  fdZ%dejH                  de.jL                  dz  deDd"e2fdZ'	 ddejH                  de.jL                  dz  deDfdZ(y)z8Lowering rules and pass for the MLIR Mosaic GPU dialect.    )CallableIterableSequenceN)AnyProtocolcast)mlir)mosaic_gpu_dialect)ir)_gpu_ops_gen)arith)builtin)func)gpu)math)memref)nvvm)scf)vector)safe_zip)layouts)utils   )fragmented_array)inference_utils)launch_context)tcgen05)wgmmac                      e Zd ZU ej                  dz  ed<   ej                  dz  ed<   ej                  dz  ed<   ej                  dz  ed<   eed<    e	j                  e      Zeej                  ej                  z     ed<    e	j                  d	d
      Zedz  ed<   dej                  ddfdZdej                  fdZy)LoweringContextNr   !single_thread_per_block_predicate%single_thread_per_warpgroup_predicatesingle_warp_per_block_predicateauto_barriers)default_factorylowered_operationsF)initdefaultis_collective_kernelopreturnc                     d|j                   vry| j                  |j                   d   | _        y| j                  |j                   d   k7  rt        d      y)zChecks that the collective attribute is consistent across operations.

    It is an error to mix collective and non-collective operations in the same
    kernel.
    
collectiveNzGCollective attributes are inconsistent across operations in the kernel.)
attributesr)   
ValueError)selfr*   s     g/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/jax/experimental/mosaic/gpu/dialect_lowering.pycheck_collectivez LoweringContext.check_collectiveB   s_     2==(  ("$--"=d		"	"bmmL&A	A  
B    c                    t        |      sy |j                  x}t        vrt        d|       t        |   }t	        j
                  |      r#t	        j                  |      st        | d       || |      }t        |t              sKt        |j                  |      D ]  \  }}|j                  |        | j                  j                  |       y y )NzMissing lowering rule for z, is missing a layout and can not be lowered.)_should_lowerOPERATION_NAME
_loweringsNotImplementedErrorr   should_have_layouthas_any_layout_setr/   
isinstanceRecursedzipresultsreplace_all_uses_withr&   add)r0   r*   namelowering_rulenew_resultsoldnews          r1   lower_opzLoweringContext.lower_opR   s    !!!*4"<RD ABBt$M ))
004"IJKKb)Kk8,"**k2 '(#s!!#&'
!!"% -r3   )__name__
__module____qualname__r   LaunchContext__annotations__r   Valuebooldataclassesfieldsetr&   	OperationOpViewr)   r2   rF    r3   r1   r    r    4   s     ..55%'XX_4)+D8#%88d?26Gk6G6G7c",,23  '8k&7&7$'t  t  & &r3   r    c                       e Zd Zy)r<   N)rG   rH   rI   rS   r3   r1   r<   r<   h   s    r3   r<   r7   ir_valueexpected_typesr+   c                 "   t        t        j                  | j                  j                        }t        |t        j                        st        | d      t        j                  |j                  D cg c]  }|j                   c}|j                        }t        |t        j                        rt        |      }nt        |t              s|g}t        ||d      D ]%  \  }}|j                  |k7  st        d| d|        ||fS c c}w )a   Undoes the provided unrealized conversion cast.

  The `ir_value` must be an unrealized conversion cast. This function will
  create a new conversion cast that undoes the original one. The returned tuple
  contains:
  - The original unrealzied conversion cast (useful for extract attributes).
  - The list of operands of the original conversion cast (which are the result
    values of the undone conversion cast).

  The function will verify that the returned values have types that match
  `expected_types`.
  z is not a conversion_castTstrictzExpected type z for value )r   r   UnrealizedConversionCastOpowneropviewr;   r/   unrealized_conversion_castoperandstyper>   r   OpResultListlistr=   )rU   rV   conversion_castoperandconverted_outputsvts          r1   _undo_conversion_castrg   u   s      (((..*?*?/ 
OW%G%G	H
((AB
CC88#2#;#;<w||< !2??3./'.*+#^DA ;davv{s+aS9::; 
+	++ =s   4Dr   tyc           
         t        j                  |g| j                  j                         j	                               }t
        j                  j                  | j                  j                  D cg c]?  }t
        j                  j                  t
        j                  j                  d      |      A c}      |j                  d<   t        j                  | j                        |j                  d<   |j                   S c c}w )zyConverts a FragmentedArray to an IR value.

  The fragmented array's signedness is omitted from the IR representation.
  @   registers_shapelayout)r   rZ   	registersflattentolistr   	ArrayAttrgetshapeIntegerAttrIntegerTypeget_signlessr.   r   to_layout_attrrl   result)r   rh   rb   ss       r1   fragmented_array_to_irry      s     66	d&&..0779/ 35,,2B2B))//D
 	nn44R8!<D 3/./
 *1)?)?*/X& 
		Ds   /AC?dtypec                 H    t         j                  j                  |       rdS dS )zReturns `False` for Integer types, `None` otherwise.

  When converting from Pallas dtype to IR type, we lose the `is_signed`
  information. We can default to `False` for most use cases.
  FN)r   rt   r;   rz   s    r1   _default_is_signedr}      s      ..++E2<<r3   fragmented_array_as_irrl   	is_signedc                    | j                   j                  d   }t        j                  |      }t	        j
                  | j                        }|j                  t        |j                              }|j                  |j                        }t        | |gt        j                  |      z        \  }}	|	d   j                   j                  }
|j                  D ]  }|j                  |   |
j                  |<     t!        j"                  t%        |	            j'                  |j                  d   D cg c]  }|j(                   c}      }t        j*                  j-                  |j.                  d   j                  j                        r|dn|}t1        j2                  |||      j5                  t        j                  |            S c c}w )Nrl   r   rk   F
_registers_layout
_is_signed)r[   r.   r   from_layout_attrr   
VectorTyper_   rk   tuplerr   registers_element_typeelement_typerg   r   prodr\   nparrayra   reshapevaluert   r;   outputsfaFragmentedArray	to_layout)r~   rl   r   producer_layout_attrproducer_layout	vector_ty	reg_shapereg_tyrb   rd   reverse_conversion_cast	attributeattrrm   s                 r1   _fragmented_array_from_irr      s   
 055@@J,,-AB/mm2778)--eIOO.DE)11)2H2HI&'<vh9)==($/$ .a066=="-- Zi4C4N4Ny4Y&&y1Z hht-./77+667HIJDTZZJ) ^^66q9>>KKL"*	I			O	
Ig&&v./0 Ks   1Gtransformed_memreflogical_type
transformsc                 h    t        j                  |g| g      }||j                  d<   |j                  S )zWraps a transformed memref to an unrealized cast with transforms.

  The return type of the cast is the untransformed logical type.
  r   )r   rZ   r.   rw   )r   r   r   rb   s       r1   wrap_transformed_memrefr      s=     66n)*/ .8/\*			r3   refexpected_transformsc                     t        |      \  }}t        | j                  |      }t        | |g      \  }\  }||j                  d   k7  rt        d| d|j                  d          |S )zDUwraps a memref from an unrealized cast and verifies its transforms.r   zExpected transforms z  do not match actual transforms )+swizzle_and_transforms_from_transforms_attrtransformed_smem_ref_typer_   rg   r.   r/   )r   r   _r   transformed_typerb   rw   s          r1   unwrap_transformed_memrefr      s    
 >>QR-!Z.sxxD3C:J9KL/8F O66|DD

23 4&11,?@	B 
 
-r3   r*   c                       fd}|S )Nc                 Z    't        t              rnj                  }| t        |<   | S N)r;   strr6   r7   )fop_namer*   s     r1   wrapperz#_register_lowering.<locals>.wrapper  s-    	~ S)r/@/@gjHr3   rS   )r*   r   s   ` r1   _register_loweringr      s     
.r3   c                  @    t         j                  j                  d      S )Nrj   )r   rt   ru   rS   r3   r1   _lowered_barrier_typer   
  s    		$	$R	((r3   ctxc                    t         j                  j                  d      }t               }t	        |j
                  j                        D ]}  }t        j                  t        j                  |j                  |g|      t        j                  |j                  j                  t        j                  z  |      | j                          t!        j"                          g S )N    )	predicate)r   rt   ru   r   rangenum_barriersr   r   mbarrier_initr   getelementptrbase_pointercarrival_countWARPGROUP_SIZEr!   r   barrier)r   r*   i32lowered_barrier_typeis        r1   $_initialize_barrier_op_lowering_ruler     s    
 	##B'#.0&&' aBOOaS2FG""U%9%99	
 77 ++-	)r3   r   c                    t        d |j                  D              st        d| d      g }t        |j                  t	        j
                  |            D ]   \  }}|j                  t        ||             " t        j                  | }t        |t        j                        r|g}t        ||j                        D cg c]  \  }}t        ||j                         c}}S c c}}w )Nc              3   n   K   | ]-  }t         j                  j                  |j                         / y wr   )r   r   r;   r_   ).0rc   s     r1   	<genexpr>z9_optimization_barrier_op_lowering_rule.<locals>.<genexpr>)  s#     OR]]%%gll3Os   35zOptimization barrier op z has non-vector operands.)allr^   r8   r   r   
in_layoutsappendr   r   optimization_barrierr;   r   r>   ry   r_   )r   r*   fragmented_arraysrc   rl   lowered_fragmented_arraysarrrw   s           r1   &_optimization_barrier_op_lowering_ruler   $  s    
 
O2;;O	O

"2$&?@  !"++/I/I"/MN Iogv6wGHI !557HI)2+=+=>!: ; "";RZZH

#v S&++.
  
s    C&c           
         t         j                  j                  |j                        st	        d|       t        j                  |j                        }|j
                  st	        d|       t        j                  |j                  j                        }t        |j                        }t        t        j                  j                  t        j                   |j                  |j#                               t%        |j&                        t)        j*                  |j,                  d   d         |      |j                  j                        gS )NzUnsupported constant op: out_layoutsr   r   )r   DenseElementsAttrr;   r   r8   is_splatr   rw   r_   r}   r   ry   r   r   splatr   constantget_splat_valuer   rr   r   r   r.   )r   r*   r   rh   r   s        r1    _arith_constant_op_lowering_ruler   <  s     
			(	(	2
 9">
??


rxx
(%	
 9">
??	}}RYY^^$" 1) 



"
"nnR__e.C.C.EFBHHo&&r}}]'CA'FG!	 #  ))..

 
r3   ref_tyswizzleminimum_swizzlec                    ||k  rt        d| d| d      t        j                  |d       D ci c]  \  }}|t        |       }}}t	        t        t
        j                     |j                  dg             }|j                  dg       }t        |      dkD  rt        | d      t        |      dk(  rCt        |d	   j                        t        | j                        k7  rt        d
|d	    d|  d      t        |      dkD  rt        | d      t        |      dk(  r/t        |d	   t
        j                        st        |d	    d      yyc c}}w )aQ  Checks that the list of provided transforms and swizzle are supported.

  Currently, we allow the following:
    - any swizzle that is larger than or equal to `minimum_swizzle`;
    - optionally, a single tile transform (with rank equal to the rank of the
      memref being annotated);
    - optionally, a single transpose transform.
  zUnsupported swizzle z smaller than .c                 6    t        | t        j                        S r   )r;   r   TileTransform)rf   s    r1   <lambda>z=_check_transforms_and_swizzle_are_supported.<locals>.<lambda>m  s    
1n.J.J K r3   TFr   z' contains more than one tile transform.r   zeOnly tile transforms with rank equal to the rank of the memref being annotated are supported but got z for z" contains more than one transform.z is not a transpose transform.N)r8   	itertoolsgroupbyra   r   r   r   rq   lentilingrr   r;   TransposeTransform)	r   r   r   r   kre   partitioned_transformstile_transformsother_transformss	            r1   +_check_transforms_and_swizzle_are_supportedr   W  s    

wi~o5FaH  ##
K
!Q aj  
>''(  r*/ ,//r:A

BC  	Q
?1$$%V\\)::33B13E2FeHA  		Q

>?  		a&q)>+L+LMa !!?
@  N  ?s   Ec                       e Zd ZdedefdZy)	_Transfer	optimizedr+   c                      y r   rS   )r0   r   s     r1   __call__z_Transfer.__call__  s    r3   N)rG   rH   rI   rM   r   r   rS   r3   r1   r   r     s      r3   r   transferr   c                 Z    | | |      S 	  | d      S # t         $ r  | d      cY S w xY w)zMIf `optimized` is `None`, retry `transfer` with `optimized=False` on failure.Tr   F)r/   )r   r   s     r1   _retry_on_failurer     s=    I
%d##	 %e$$%s    **c                    t        j                        \  }t        j                  j                  j
                        j                  }t        |      dt        j                  dt        j                  ffd}t        j                  |      rTt        j                  |      }t        j                  j                  j                  |j                         } ||      gS t        j"                  |      st%         d|       j&                  j&                  j(                  nd }t        j*                  |      t        j,                  j                  j
                        }|j.                  ?t        j                  j1                  j                  t3        |            } ||      gS |j.                  t5        j6                         k7  rt%        d|j.                         t        j8                        d   }	t;        |	      \  }
t<        j>                  j@                  k7  xs |
}|rOtC        ||
       tE        j                  |	      d	t2        dt        j                  ffd
}tG        ||      }n*d	t2        dt        j                  ffd}tG        ||      } ||      gS )Nr   r+   c                 D    t        | j                  j                        S r   )ry   rw   r_   )r   r*   s    r1   _fragmented_array_to_irz>_vector_load_op_lowering_rule.<locals>._fragmented_array_to_ir  s     ""2BIINNCCr3   )r   vec_sizez has an unsupported layout: rl   r   r   Unsupported memory space: r   r   c                 L    t         j                  j                  |       S )N)r   rl   r   )r   r   
load_tiled)r   r   rl   r   transformed_refs    r1   r   z1_vector_load_op_lowering_rule.<locals>.load_tiled  s0    **

 +  r3   c                 ^    t         j                  j                  j                  |       S )Nr   )r   r   load_untiledsource)r   r   rl   r*   s    r1   r   z3_vector_load_op_lowering_rule.<locals>.load_untiled  s1    ,,
))	 -  r3   )$r   r   r   r   rw   r_   r   r}   r   r   rL   r   is_strided_fragmented_layout#from_strided_fragmented_layout_attrload_stridedr   r   is_tiled_layoutr/   r   r   from_tiled_layout_attr
MemRefTypememory_spacer   rM   r   smemin_transformsr   mgpuSwizzlingMode
kNoSwizzler   r   r   )r   r*   out_layout_attrr   r   strided_layoutr   r   r   transforms_attrr   has_transformsr   r   r   rl   r   r   s    `            @@@@r1   _vector_load_op_lowering_ruler    sm    '2226?ryy~~.;;, .)D**D	xxD
 ))/:@@N ))66
		(( 7 
 $$4566		 	 	1
t77HI
JJ$&LL$<bll  $)))/:&==(& ))66
		y/	 7  $$4566EJJL(
1&2E2E1FG
HH#11"5a8/C': d00;;;Iz./
GL/		?KOd r'9'9   )Y? ););  )yA
!"2
3	44r3   c                 x  
 t        j                  |      \  }t        |j                  |      
| j                  rt        j                          |j                  t        j                  j                        }|j                  |j                  j                  nd }|j                  
j                  t        |             n|j                  t!        j"                         k(  rt        j$                  |      d   }t'        |      \  }t(        j*                  j,                  k7  xs |}|r4t/        ||       t1        |      dt        f
fd}t3        ||       n2dt        f
fd}	t3        |	|       nt5        d|j                         | j                  rt        j                          g S )Nr   r   r   c                 ,    j                  |        y r   )store_tiled)r   r   r   unwrapped_refs    r1   r  z3_vector_store_op_lowering_rule.<locals>.store_tiled
  s    $$]GYGr3   c                 ,    j                  |        y )Nr   )store_untiled)r   r   r   s    r1   r  z5_vector_store_op_lowering_rule.<locals>.store_untiled  s    &&si&@r3   r   )r   r   r   valueToStorer$   
mgpu_utilswarpgroup_barrierdestinationr   r  r_   r   r   r  r  rM   r   r  r  r   r	  r
  r  r   r   r   r/   )r   r*   to_store_layoutref_typer   r  r   r  r  r  r   r   r   r  s             @@@@r1   _vector_store_op_lowering_ruler    sq    &004?.rP  "
#]]388$($&LL$<bll  $)"""3$y/"B

,%33B7:OEGZ  2 2 = ==KN1(JP/_EmH H Y/A4 A y1
1(2G2G1HI
JJ  "	)r3   c                     ~ t        j                  |      \  }t        |j                  |      }|j	                  |j
                  j                         g S r   )r   r   r   r   debug_printformat)r   r*   rl   as       r1   _debug_print_op_lowering_ruler"    sB     
''+(6&1!--		 	)r3   re   c                 V   t        | t        j                        r| j                  xt        j                  k(  r yxt        j
                  k(  r yxt        j                  k(  r yxt        j                  k(  r yxt        j                  k(  r yt        j                  k(  ry	 t        | j                        S t        | t        j                        sJ |        | j                  t        j                  | j                        k(  rd| j                   d	S t        | j                        S )
NWGMMA	WGMMA_ROWWGMMA_TRANSPOSEDTCGEN05TCGEN05_TRANSPOSEDTCGEN05_TMEM_NATIVE)packingzTMEM_DEFAULT(packing=))r;   r   r   rl   WGMMA_LAYOUTWGMMA_ROW_LAYOUTWGMMA_TRANSPOSED_LAYOUTTCGEN05_LAYOUTTCGEN05_TRANSPOSED_LAYOUTTMEM_NATIVE_LAYOUTr   r   TMEMReftmem_default_layoutr*  )re   s    r1   pprint_layoutr4  (  s    2%%&
((2??2%2%%!2'2''#  $188}a),1,)xx7..qyyAA$QYYKq11qxx=r3   c                    ~ t         j                  j                  |j                  j                        rft        j                  |      \  }t        |j                  |      }t        |j                  j                  j                  t        |                   g S t        j                  |      \  }t        |j                  |      }t        |j                  j                  j                  t        |                   g S r   )r   r   r;   r   r_   r   r   r   printr   r4  in_tmem_layouts_tmem_ref_from_ir)r   r*   rl   r!  r   s        r1   _print_layout_op_lowering_ruler9  @  s     
]]bhhmm,**2.IV!"((F3A	"))//
 
 q!1
23
 
)  //3IV
BHHf
-C	"))//
 
 s!3
45	)r3   c           	         ~ t        j                  |      \  }t        j                  |j                  j
                        }t        j                  j                  |j                  t        |j                        |j                  j                  t        j                  |      t!        |j                              }t#        ||      gS )Nr   )r   r   r   r   rw   r_   r   r   broadcasted_iotar   r   rr   	dimensionr   r   r   r}   ry   )r   r*   rl   result_typer!  s        r1   "_broadcasted_iota_op_lowering_ruler>  P  s     
((,(6biinn-+))Kllv&";#;#;< * ! !K
0	11r3   c                 Z   t        j                  |j                  j                        }t        j
                  j                  |j                  t        |j                        t        j                  |j                  d   d         t        |j                              }t        ||      gS )Nr   r   r   )r   r   r   r_   r   r   r   r   r   rr   r   r   r.   r}   r   ry   )r   r*   
out_vec_tyr   s       r1   "_vector_broadcast_op_lowering_rulerA  a  s     }}RYY^^,*''--iiJ
--
&q
) #:#:#:; .  !!1:
>	??r3   c                 (   t        j                  |      \  }t        j                  |j                  j
                        }|j                  sJ t        |j                  |      }t        |j                  t        |j                              |      gS r   )r   r   r   r   rw   r_   has_static_shaper   r   ry   r   r   rr   )r   r*   rl   r@  r!  s        r1   #_vector_shape_cast_op_lowering_rulerD  q  ss     ''+(6}}RYY^^,*		$	$$	$		62!QYYuZ-=-='>?L
 r3   c                 &   ~ t        d |j                  D              rt        d      t        j                  |      \  }t        j
                  |      \  }||k(  sJ t        j                  |j                  j                        }|j                  sJ t        |j                  |      }t        d t        |j                  |j                   d      D              }||   }|j"                  t%        j&                  |      k(  sJ t)        ||      gS )Nc              3   `   K   | ]&  }t        j                  |      j                  d k7   ( ywr   N)r   rs   r   r   rx   s     r1   r   zA_vector_extract_strided_slice_op_lowering_rule.<locals>.<genexpr>  s$     :!		 	 A	%:s   ,.z`strides` must contain only 1s.c              3      K   | ]W  \  }}t        j                  t        j                  |      j                  t        j                  |      j                         Y y wr   )r   DynamicSlicer   rs   r   )r   offsetlengths      r1   r   zA_vector_extract_strided_slice_op_lowering_rule.<locals>.<genexpr>  sL       && 
..
 
&
&v(>(D(Ds   AATrX   )anystridesr8   r   r   r   r   r   rw   r_   rC  r   r   r   r=   offsetssizesrl   r   r   ry   )r   r*   	in_layout
out_layoutr@  r!  indicesrw   s           r1   ._vector_extract_strided_slice_op_lowering_rulerT  ~  s     
:rzz::
?
@@**2.+9 ,,R0,:	j	  	 }}RYY^^,*		$	$$	$		95!   

BHHTB	 ' W:&	'22:>	>>	>
 
4	55r3   c                    ~ t        j                  |      \  }t        j                  |j                  j
                        j                  }d }t        |j                  ||      }t        |j                        xdk(  r t        t        j                  j                  dg|t        j                               t        j                   d |j"                  d               }|j%                  dt'        t)        |j*                              |      }nFxdk(  rnxdk(  rnxdk(  rn n t-        d	|j                         	 t-        d	|j                         t/        ||j0                  j
                        gS )
Nz#vector.kind<add>   r  rK  r@   z#vector.kind<maxsi>z#vector.kind<maxui>z#vector.kind<maximumf>Unsupported reduction kind: )r   r   r   r   r   r_   r   r   r   kind_slice_smemr  rq   r   r  r   r   r.   reducer   r   rr   r8   ry   rw   )r   r*   rl   r   r   r!  scratchrw   s           r1   "_vector_reduction_op_lowering_ruler]    s    
''+(6ryy~~.;;,)		69=!BGG	
--

QCEJJL

I
..r}}X6
7g xxuS\2G<f 58PP  ">rwwi HII	
">rwwi HII
 
8	99r3   c                    ~ t        j                  |      \  }}t        j                  |      \  }t        j                  |      t
        j                  k7  rt        d|       t        j                  |      t
        j                  t
        j                  hvrt        d|       ||k7  rt        d| d|       d }t        |j                  ||      }t        |j                  ||      }t        j                  t!        |j"                        j%                  d      j'                  d      j)                            xt        j                  j*                  k(  r& |j-                  d|j.                  d         }||z  }nxt        j                  j0                  k(  rn>xt        j                  j2                  k(  rn xt        j                  j4                  k(  rn n2 |j-                  d	|j.                  d         }|j7                  |      }n	 t        d
|j"                         t9        ||j:                  j<                        gS )NzUnsupported input layout: zUnsupported output layout: zOutput layout z# must match the accumulator layout z#vector.kind<>r@   r   maxrX  )r   r   r   r   r   r   r,  r8   r-  WGMMA_COL_LAYOUTr/   r   r   accr   CombiningKindr   rY  removeprefixremovesuffixupperADDr[  reduction_dimsMAXIMUMFMAXSIMAXUIr`  ry   rw   r_   )	r   r*   rQ  
acc_layoutrR  r   	source_faacc_farw   s	            r1   ,_vector_multi_dim_reduction_op_lowering_rulero    s    
+66r:9j ,,R0,:i(BOO;
 :9+F
GGj)2   ;J<H
II:

 %<	  )'		9iH)$RVVZC&	"''l0==cBHHJ	 
"			!	!r'8'8';<ff%%%
$&


$
$
$&


$
$	% r'8'8';<fzz&!f	
">rwwi HII
 
8	99r3   c                    t        j                  |      \  }t        j                  |      \  }t        |j                  |      }|j                  t        j                  |            }t        ||j                  j                        gS r   )r   r   r   r   xr   r   r   ry   rw   r_   )r   r*   rQ  rR  in_array	out_arrays         r1   "_mgpu_layout_cast_op_lowering_rulert    sk      **2.+9 ,,R0,:&rttY7(  !9!9*!EF)
 BIINN
;	<<r3   c                 R   t        j                  |j                  j                        }t        j                  |j                  j                        }t        |j                        dk7  st        |j                        dk7  rt        d|       t        |j                        }t        j                  |      d   }t        |j                  |      }t        j                  t        j                  |      d         }|j!                  |j                  ||      }t#        ||      gS )Nr      zIBroadcast in dim with non-trivial broadcast dimensions is not supported: r   )r   r   rc   r_   rw   r   rr   r8   r   broadcast_dimensionsr   r   r   r   r   r   broadcast_in_dimry   )	r   r*   in_tyout_tybroadcast_dimsin_layout_attr
operand_farR  outs	            r1   '_mgpu_broadcast_in_dim_op_lowering_ruler    s     --


(%==(&c&,,/14
	d	 
 001."--b1!4.(^D*''(C(CB(G(JK*##FLL.*M#
 f
-	..r3   .c                    d}g }| D ]8  }|t        |  d      t        j                  j                  |      r t        j                  |      j                  }St        j
                  j                  |      rOt        j
                  |      j                  }t        j                  t        |            }|j                  |       t        j                  j                  |      rPt        j                  |      j                  }t        j                  t        |            }|j                  |       0t        d       |xs t        j                  j                  t        |      fS )a  Returns the swizzle and MemrefTransforms for the given transforms.

  Args:
    transforms: a list of transform attributes.

  Returns:
    A tuple containing the swizzle mode and MemRefTransforms corresponding to
    the parameter transforms. If `transforms` is empty, or does not contain
    any swizzling transform, the swizzle mode is assumed to be kNoSwizzle.
  Raises:
    ValueError: if a swizzling transform is followed by any transform.
  Nz' contain more transforms after swizzle.zUnknown transform: {transform})r/   r	  SwizzleTransformAttrr;   r   TileTransformAttrr   r   r   r   r   TransposeTransformAttrpermutationr   r
  r  )r   r   gmem_transforms	transformr   tiling_transformr  transpose_transforms           r1   r   r     s3    ':</ 9i*%LMNN  ++I6 )))4<<g				*	*9	5%%i077f'55eFmD-.		$	$	/	/		://	:FFk*==

 01788%9( 
	1D&&1153I	IIr3   c           	      d   t        j                  |       }|s|s| S t        j                  |       st        d|  d      | j                  }| j                         \  }}|r=t        |      dk7  rt        d|       |d   dk7  s|d   |d   k7  rt        d|        |D ]  }t        |j                  |            } |r0t        |      dk(  rd}nEt        |      d	k(  rd
}n4t        d|       t        t        t        t        |                        }dgt        |      z  }t        dt        |            D ]  }	||	   }
||	dz
     }||   ||   z  ||
<    t        j                  j                  || j                   | j"                  t        j$                  j                  ||            }|S )zMReturns the transformed ref type for the given logical ref and transforms.
  z+Only workgroup memory is supported but got r   rv  z*Only 2D shapes can be transposed, but got r   r   z6Only contiguous 2D memrefs can be transposed, but got )r   r   rV  )rv     r   r   z4Expected a 2D or 4D shape after transforms, but got r  rl   )r   is_memref_transposedis_smem_refr/   rr   get_strides_and_offsetr   r8   ra   transform_shaper   reversedr   r   r  rq   r   r  StridedLayoutAttr)r   r   
transposedrr   rN  rK  rf   minor_to_major_stride_ordernew_stridesr   dimprev_dim
new_ref_tys                r1   r   r   #  s    ))&1*	JM			6	"
B6(!L
MM
,,%113/'6
5zQ6ug
>  qzQ'!*a0B6(
K   +a""5)*E+  5zQ$*!	Uq$0!@
H  #(s5z1B(C"Dc%j +CJ ?a
%a
(C*1Q3/H"8,uX>K?
 }}  &&!!%%fk:	 ! * 
r3   c                     t        j                  | j                        }t        ||      }||k(  r| S t        j
                  }t	        j                  | |      }t	        j                  |||      }|S )a8  Applies transforms on the ref, and makes sure that their effect is
  propagated appropriately on the strides.

  This function is used any time we lower from a dialect SMEM ref (2D for wgmma)
  with given transforms to a "physical" SMEM ref (4D for wgmma) that is fully
  transformed and transposed as needed.
  rW  )ptr_memory_space)r   r  r_   r   r   WORKGROUP_NVPTX_ADDRESS_SPACE
memref_ptrptr_as_memref)r   r   r   r  msptrnew_refs          r1   reinterpret_smem_refr  ]  se     =="&(<*zJ**"2.#Z"E'	.r3   load_opc                 |   | j                   J t        j                  j                  |j                        }t        j                  |      \  }t        |      \  }}t        |j                  |      }g }t        |j                  |j                  d      D ]e  \  }}	t        j                  t        j                   j#                         |      }
|	dk  r|
nt        j$                  |
|	      }|j'                  |       g |j(                  xs g D cg c]4  }t+        j,                  t        j.                  |      j0                        6 }}| j2                  rt5        j6                          | j                   j9                  |j:                  |t=        |      |j>                  |d||| j@                  	       g S c c}w )NTrX   r   F)	src_refdst_ref
gmem_slicer   r-   arriver   gmem_transformr   )!r   r   DialectBarrierReffrom_barrier_memrefr   r   r  r   r   r  r=   rS  slice_lengthsr   
index_castr   	IndexTyperq   rJ  r   r-   r   	Dimensionrs   r   r$   r  r  
async_copyr   r   barrier_refr"   )r   r  r   r  r   r   unwrapped_destinationr  idx_i32sizeidxre   axisr-   s                 r1   !_mgpu_async_load_op_lowering_ruler  r  s    
			''	'##77H'%33G<?C': 4? *7??G,A,A$O mgt


2<<++-w
7CaxU//T:Aa $$*
 
mmBNN4(../*  	  "nn#z"!!99   
 
)3s   9F9c                    | j                   J g }t        |j                  |j                  d      D ]e  \  }}t	        j
                  t        j                  j                         |      }|dk  r|nt        j                  ||      }|j                  |       g |j                  rt        d      | j                   j                  |j                  t!        |      d d| j"                         g S )NTrX   r   z,Collective prefetches are not supported yet.rS   )gmem_refr  r   r  r   )r   r=   rS  r  r   r  r   r  rq   r   rJ  r   r-   r8   async_prefetchr   r   r"   )r   r  r  r  r  r  re   s          r1   %_mgpu_async_prefetch_op_lowering_ruler    s     
			''	'*7??G,A,A$O mgt


2<<++-w
7CaxU//T:Aa
 
L
MM##~~z"99 $  
)r3   store_opc           
          | j                   J t        j                  |      \  }t        |      \  }}t	        |j
                  |      }g }t        |j                  |j                        D ]e  \  }}t        j                  t        j                  j                         |      }	|dk  r|	nt        j                  |	|      }
|j!                  |
       g t#        t$        d      rN|j&                  Bt%        j(                  |j&                  j*                        j,                  j/                         }nd }| j                   j1                  ||j2                  t5        |      ||| j6                  |j8                  |       g S )Nr   TMAReduction)r  r  r  r   r  r   r  reduction_op)r   r   r  r   r   r   r=   rS  r  r   r  r   r  rq   r   rJ  r   hasattrr	  r  r  r   rA   lowerr  r  r   r"   commit_group)r   r  r  r   r   unwrapped_sourcer  r  r  r  re   r  s               r1   "_mgpu_async_store_op_lowering_ruler    sX    
			''	'%33H=?C': /xP*8++X-C-CD mgt


2<<++-w
7CaxU//T:Aa T>"x'<'<'H$$X%:%:%@%@AFFLLNLL ""z"99""   	 
)r3   c                     ~ t        j                  |      d   }t        |j                  |      }t	        j
                  |j                        |j                  k(  sJ |j                  gS )Nr   )r   r7  r8  r   r   rv   rl   
new_layout)r   r*   rQ  tmem_refs       r1   _tmem_layout_cast_lowering_ruler    sX    
 
--b1!4)rvvy1(				0BMM	AA	A
&&/r3   c                    ~ t        j                  |      d   }t        j                  |      d   }t        |j                  |      }t
        j                  j                  d      }t        j                  ||j                        }t        j                  |j                  |      }t        j                  |j                  j                   g|g      }||j"                  d<   |j                  gS )Nr   r   rl   )r   r7  out_tmem_layoutsr8  r   r   rt   ru   r   r   rK  addiaddressr   rZ   rw   r_   r.   )	r   r*   r|  r  r   r   rK  	dest_addrr   s	            r1   _slice_tmem_lowering_ruler    s     
"2226q9.#44R8;/RYY7&
##B'#>>#ryy)&jj0)		+	+RYY^^,<yk	J$-$//(
++r3   source_is_signedtarget_is_signedc                 Z   t        j                  |      \  }t        j                  |      \  }||k7  rt        d      |j                  j
                  j                  }t        |j                  d   ||      }|j                  ||      }t        ||j                  j
                        gS )NLayout mismatchr   r   )r   r   r   r/   rw   r_   r   r   r^   astypery   )	r   r*   r  r  rQ  rl   	target_tyrc   	converteds	            r1   _conversion_op_lowering_ruler    s      **2.+9((,(6&
&
''iinn)))%bkk!nf>NO'nnY2BnC)
 BIINN
;	<<r3   TF)r  r  implc                    t        j                  |      }t        j                  |      \  t        fd|D              rt	        d      t        |j                  |      }t        |d      r7|j                  t        j                  j                  d      k(  } |||      }n ||      }t        ||j                  j                        gS )Nc              3   (   K   | ]	  }|k7    y wr   rS   r   rQ  rl   s     r1   r   z*_unary_op_lowering_rule.<locals>.<genexpr>0       9f	9   r  fastmathz#arith.fastmath<afn>)approx)r   r   r   rM  r/   r   rc   r  r  r   	Attributeparsery   rw   r_   )	r   r*   r  r   r   r!  r  	result_farl   s	           @r1   _unary_op_lowering_ruler  (  s     ))"-*((,(69j99
&
''

FI>!R[[BLL../EFFFQv&IQI
 BIINN
;	<<r3   )r  r   c                 F   t        j                  |      }t        j                  |      \  t        fd|D              rt	        d      t        |j                  |      }t        |j                  |      }t         |||      |j                  j                        gS )Nc              3   (   K   | ]	  }|k7    y wr   rS   r  s     r1   r   z+_binary_op_lowering_rule.<locals>.<genexpr>T  r  r  r  )r   r   r   rM  r/   r   lhsrhsry   rw   r_   )r   r*   r   r  r   r  r  rl   s          @r1   _binary_op_lowering_ruler  J  s     ))"-*((,(69j99
&
''!"&&&)<#!"&&&)<#
 c3
@	AAr3   c                    t        j                  |      }t        j                  |      \  t        fd|D              rt	        d      t
        |j                  j                     \  }}t        |j                  |      }t        |j                  |      }t         |||      |j                  j                        gS )Nc              3   (   K   | ]	  }|k7    y wr   rS   r  s     r1   r   z)_cmpi_op_lowering_rule.<locals>.<genexpr>  r  r  r  )r   r   r   rM  r/   
CMPI_IMPLSr   r   r   r  r  ry   rw   r_   )r   r*   r   r  r   r  r  rl   s          @r1   _cmpi_op_lowering_ruler    s     ))"-*((,(69j99
&
''r||112/$	!"&&&)<#!"&&&)<#
 c3
@	AAr3   c                 |   t        j                  |      }t        j                  |      \  t        fd|D              rt	        d      t
        |j                  j                     }t        |j                        }t        |j                        }t         |||      |j                  j                        gS )Nc              3   (   K   | ]	  }|k7    y wr   rS   r  s     r1   r   z)_cmpf_op_lowering_rule.<locals>.<genexpr>  r  r  r  )r   r   r   rM  r/   
CMPF_IMPLSr   r   r   r  r  ry   rw   r_   )r   r*   r   r  r  r  rl   s         @r1   _cmpf_op_lowering_ruler    s     ))"-*((,(69j99
&
''	BLL&&	'$!"&&&1#!"&&&1#
 c3
@	AAr3   c                    t        j                  |      }t        j                  |      \  t        fd|D              rt	        d      t        |j                        }t        j                  |j                  j                        j                  }|j                  |t        |            }t        ||j                  j                        gS )Nc              3   (   K   | ]	  }|k7    y wr   rS   r  s     r1   r   z,_bitcast_op_lowering_rule.<locals>.<genexpr>  r  r  r  )output_is_signed)r   r   r   rM  r/   r   in_r   r   rw   r_   r   bitcastr}   ry   )r   r*   r   r  out_element_typer~  rl   s         @r1   _bitcast_op_lowering_ruler    s     ))"-*((,(69j99
&
''!"&&&1#]]299>>2??)*:; 	 	# !biinn
5	66r3   wgmma_opc                    t        j                  |      }|d   t        j                  t        j
                        k(  sJ t        j                  |      \  }|t        j                  t        j
                        k(  sJ |j                  j                  j                  }t        j                  j                  |      rdnd }t        |j                  |d   |      }t        j                   j#                  |      }t        j$                  j                  |j                  j                        r3d }t        j&                  |      d   }	d }
t)        |j*                  |	      }nDt        j&                  |      \  }}	t)        |j                  |      }
t)        |j*                  |	      }t-        |	      \  }}	t.        j0                  j2                  }t5        t        j6                  |j*                  j                        |	||       t        j$                  j                  |j                  j                        rz|t        j                  j9                  d      k(  rt        j:                  nt        j
                  }|d   t        j                  |      k(  sJ t        |j                  |d   |      }n_t-        |      \  }}t5        t        j6                  |j                  j                        |||       ||k7  rt=        d| d|       |
J |
}t        j                  ||||      }t?        |j@                  jC                  t        j
                        |j                  j                        gS )Nr   T   r   z4Non-matching swizzles of operands a and b in WGMMA: z != )r   )"r   r   r   rv   r   r,  r   r!  r_   r   r   rt   r;   r   accumulatorr   WGMMAAccumulatorfrom_registersr   r  r   br   r	  r
  k32ByteSwizzler   r  ru   WGMMA_LAYOUT_8BITr/   ry   r   r   )r   r  r   rR  r   r   regsrb  a_transformsb_transformsunwrapped_a_refunwrapped_b_ref	b_swizzler   expected_a_layout	a_operand	a_swizzlenew_accs                     r1   _mgpu_wgmma_op_lowering_ruler    s    ))(3*	A'00A	AA	A ,,X6,:	w--boo>	>>	> --,nn//=d4) 
#JqM9
$ 	--d3#]]hjjoo.L"00:1=LO/

LIO!0!>!>x!HL,/

LIO/

LIOG)\ &&55/-mmHJJOO$lI ]]hjjoo. 2>>66q99 	__ 
 a=G223DEEEE)(**jmYOIII| 0
hjjoo&i I@ L[  &&&IKKYK'
--
!
!"//
2



#
#
 r3   	arrive_opc                    t         j                  j                  |j                        }|j                  j
                  }|r| j                  }t         j                  }nd}d }|j                  j                  |||       g S )Nr   )r   orders_tensor_corer   )
r   r  r  r   r  r   r"   r   r  r  )r   r	  r   	orders_tcr   r   s         r1   _mgpu_arrive_op_lowering_ruler    s     ##77	8I8IJ'**00) 99I((M MI	!"  
 
)r3   arrive_expect_tx_opc                    |j                   j                  }|t        j                  z  rt	        d      |t        j                  z  }t        j
                  |t        j                  j                  d            }t        j                  j                  |j                        }t        j                  |j                         |       g S )Nz4Only copies of a multiple of 128 bytes are supportedr   )	expect_txr   r   r   r8   r   r   rt   ru   r  r  r   nvvm_mbarrier_arrive_expect_txget_ptr)r   r  bytesr   s       r1   '_mgpu_arrive_expect_tx_op_lowering_ruler    s     
'
'
-
-%
U!!!
>  E   %
''%44R8
9%##77!!' &&w'8%@	)r3   wait_opc                     t         j                  j                  |j                        }|j	                  |j
                         g S r   )r   r  r  r   wait_parityparity)r   r  r   s      r1   _mgpu_wait_op_lowering_ruler  7  s5    
 ##77H'	gnn%	)r3   c                    ~ t        |j                  j                  |j                        }t	        j
                  |j                        }|j                  t        j                  j                  d      k(  rt        j                  |      rJ |gS t        j                  |      d   }t        |      \  }}t        ||      }t        ||j                  j                  |      }|gS )N!mosaic_gpu.barrierr   )rZ  rw   r_   rK  r   r  r   Typer  r   has_out_transforms_setout_transformsr   r  r   )	r   r*   
sliced_ref	memref_tyr  r   r   r   wrapped_refs	            r1   !_mgpu_slice_smem_op_lowering_ruler"  B  s     
299>>2995*mmJOO,).C DD 55b999<"11"5a8.=nM-!Z(Z@/'X+
r3   rw   rK  c                    t         j                  j                  d      }t        j                  t         j
                  j                  t        j                  f|t        j                                     }t        j                  t         j                  j                         |      }| }t         j
                  j                  |       rt        j
                  |       }|j                  t         j                  j!                  d      k(  rFt         j
                  j                  |j"                  t%               t        j                               }t'        j(                  |||g       }| |k(  r|S t+        j,                  | g|g      S )Nr  rW  r  )r   rt   ru   r   dynamic_shared_memoryr  rq   r   DYNAMICr  r   r  r  r;   r   r  r  rr   r   r   viewr   r]   )rw   rK  i8	smem_baselowered_result_typer   r&  s          r1   rZ  rZ  X  s   	~~""1%"''mm("5::<H) BLL,,.7&]]f%f%I/D!EEMM--
//02 .  
()VR	@$""K		+	+VHtf	==r3   c                     ~ t        j                  |      \  }t        |j                  |      }t        j                  |      d   }t        ||j                  j                  |      }|gS )z[Lowering rule for mgpu.WithTransformsOp.
  This is a noop that simply returns its input.
  r   )r   r  r   r   r  r   rw   r_   )r   r*   r  unwrapped_source_refr  r!  s         r1   &_mgpu_with_transforms_op_lowering_ruler,  k  sb     
#11"5/=2266=I"11"5a8.'BIINNN+ r3   r   static_offsetsdynamic_offsetsc           	         d}g }g }|dt        |         D ]N  }|j                  |       |t        j                  j	                         k(  s6|j                  ||          |dz  }P t        | |t        |        d d      D ]  \  }}|t        j                  j	                         k(  rt        j                  ||   t        j                  |t        j                  j                                     }|j                  |       |j                  t        j                  j	                                |dz  }||z  dk(  sJ |j                  ||z          |dgt        |       z  z  }||fS )zComputes the static and dynamic offsets after the given tiling is applied.

  Conceptually, this function is analogous to
  tile.transform_shape(static_offsets), except that it also handles dynamic offsets.
  r   Nr   TrX   )r   r   r   
ShapedTypeget_dynamic_stride_or_offsetr=   r   divuir   r   r  rq   )	r   r-  r.  dynamic_offset_indexnew_static_offsetsnew_dynamic_offsetsrK  	tile_size
dyn_offsets	            r1   _tile_transform_offsetsr8  }  sp     ~#f+.  ff%;;==  1E!FGa	  nc&k\^,T 5i ;;== ;;
.
/
'')R\\--/
0j   , J J LMai1$$$) 34!5& c&k))	0	00r3   c                 ^	   ~ t        d |j                  D              rt        d      |j                  rt        d      t	        j
                  |j                  j                        }t        j                  |      rt        d      t        j                  |      rt        j                  |      \  }t        j                  |      \  }||k(  sJ t        |j                  |      }g }d}t        |j                   |j"                  d      D ]]  \  }}	t        j$                  j'                  |      r|j(                  |   }|dz  }|j+                  t        j,                  ||	             _ t/         |j0                  |       gS t        j2                  |      d   }
t        j4                  |      d   }|
|k7  rt        d	      t7        |j                  |
      }t9        |      \  }}|t:        j<                  j>                  k7  r|d
z  t        j@                  |jB                        z  }|jE                         \  }}t        ||j                   |j"                  d      D ]j  \  }}}	|dk7  r|	|z  dk7  rtG        d|	d|d      t        j$                  j'                  |      rt        d      ||z  dk7  sZtG        d|d|d       |x  rb dk(  r]  tI        jJ                  |jL                  j                  ||j(                  d d |j                   |j"                  |j                        }nJ   r9 dk(  r3\  }tO        |tP        jR                        rt	        j
                  |j                        }|jT                  }t        d tW        |j"                        tY        |       d  D              rt        d      |j[                  tW        |j"                              }t]        |tW        |j                         tW        |j(                              \  }}tI        jJ                  t_        |jL                  j                  |      ||d d ||dgtY        |j`                        z        }n 	 t        d      tc        |jL                  |jL                  j                  |      }|gS )Nc              3   &   K   | ]	  }|d k7    ywrG  rS   rH  s     r1   r   z3_memref_subview_op_lowering_rule.<locals>.<genexpr>  s     +Aa+s   z,SubViewOp only supports static strides of 1.z%SubViewOp only supports static sizes.z.SubViewOp does not support transposed memrefs.r   TrX   r   zESubViewOp transforms for the input and output refs must be identical.r  zSwizzled dimension of size=z$ is not a multiple of swizzle_elems=r   z6Slicing a swizzled dynamic dimension is not supported.zsubview offset=)r-  static_sizesstatic_stridesc              3   Z   K   | ]#  }t         j                  j                  |       % y wr   )r   r0  is_dynamic_sizerH  s     r1   r   z3_memref_subview_op_lowering_rule.<locals>.<genexpr>  s'       --
'
'
*s   )+z>SubViewOp only supports static sizes for the tiled dimensions.z0SubViewOp only supports a single tile transform.)2rM  r<  r8   rP  r   r  r   r_   r   r  is_tmem_refr   r7  r  r8  r=   r-  r;  r0  r>  rO  r   rJ  _tmem_ref_to_irslicer  r  r   r   r	  r
  r  bitwidthr   r  r/   r   	SubViewOprw   r;   r   r   r   ra   r   r  r8  r   rr   r   )r   r*   src_tyin_tmem_layoutout_tmem_layoutr   rS  r3  rK  r  r  r  r+  r   r   swizzle_elemssource_stridesr   stridenew_subview_optile_transformin_transformed_tyr   	new_sizesr4  r5  r!  s                              r1    _memref_subview_op_lowering_rulerN    sV    
+**++
L
MMXX
E
FF==(&
'
N
OO
v&66r:^'88<__,,,
BII~
6CGB--rtL 7		&	&v	.01!nnU''56	7
 ICIIw/011!//3A6-"11"5a8.n$
O  3299mLCNS':""---aK5>>&2E2E#FFM557NA #))2??4! 
 
1			"*TG ,q"
 	
 
	&	&v	.!D
 	
 
-	1	$viDm5EQG
 	
%
, 		''
))..

**

****	n 
 
	z..:V:VW--(<(A(AB$$f	 (#f+8 
 "L
 	
 !00boo1FGi0G
$r(()4

+;1-- ''
#BIINNJ
?



+ s#4#:#:;;	n 
2 

<  (RYY^^^+ r3   c                    ~ t        j                  |      d   }t        j                  |      d   }||k7  rt        d      t	        j
                  |j                  j                        }t	        j
                  |j                  j                        }|j                  |j                  k7  rt        d      |j                  |j                  k7  rt        d      |j                         \  }}|j                         \  }}	||k7  rt        d      t        |j                  |      }
t	        j
                  |
j                        }|j                         \  }}t        j                  j                  |	|      }t        j
                  j                  |j                  |j                  |j                  |      }t!        j"                  ||
      }t%        |j                  |j                  j                  |      }|gS )zYLowering rule for memref.CastOp.
  Only casts that add a dynamic offset are supported.
  r   zBCastOp transforms for the input and output refs must be identical.zFCastOp only supports casts between memrefs with the same element type.z?CastOp only supports casts between memrefs with the same shape.zACastOp only supports casts between memrefs with the same strides.r  )r   r  r  r8   r   r  r   r_   rw   r   rr   r  r   r  rq   r  r   CastOpr   )r   r*   r  r  ry  rz  
in_stridesr   out_strides
out_offsetr+  rL  transformed_stridesrR  out_transformed_tynew_cast_opr!  s                    r1   _memref_cast_op_lowering_rulerW    s    
!//3A6-"11"5a8.n$
L  --		
'%==(&
6...
P  [[FLL 
I  ..0-*a"99;+z;
K  3299mLmm$8$=$=>,CCEq##''
4GH*}}(($$$11	 )  02FG+'"))...+ r3   r  c                 z    t         j                  j                  t         j                  j	                  |             S r   )r   AffineMapAttrrq   	AffineMapget_permutation)r  s    r1   _permutation_to_affine_map_attrr\  M  s*     
				bll::;G	HHr3   c                    ~ t        j                  |      d   }t        |j                  |      }t	        j
                  |j                        }t        |j                        dk(  r|j                  }n~t        |j                        dk(  r[|j                  t        ddg      k(  rt        g d      }n>|j                  t        ddg      k(  rt        g d      }nt        d      t        d      t        j                  |      d   }t        |      \  }}t        j                  t!        |j"                  j                  |      ||      }	t%        |	j"                  |j"                  j                  |      }
|
gS )	Nr   rv  rV  r   )r   r   rv  r  )r   r   r  rv  zUnsupported permutation.z8TransposeOp only supports transposing 2D and 4D memrefs.)r   r  r   r  r   r  r_   r   rr   r  r\  r8   r  r   r   TransposeOpr   rw   r   )r   r*   r  unwrapped_in_refrL  new_permutationr  r   r   new_transpose_opr!  s              r1   "_memref_transpose_op_lowering_rulerb  S  sI    
!//3A6-.rvv}Emm$4$9$9:		 	 !Q&nnO
""#q(	~~8!Q@@7Eo	:Aq6B	B7Eo :;;
B  #11"5a8.=nM-!Z''		
; (ryy~~~+ r3   c                 L   ~ t        j                  |      d   }t        |j                  |      }t	        j
                  |j                        }t        j                  |      d   }t        |      \  }}t        |j                  j                  |      }t        |j                        }	t        |j                        t        |j                  j                  j                        z
  }
|
dkD  r!t        d |	|
 d  D              rt!        d      t        |j"                        }t%        |||
z         D ]  }|	j'                  |g        t)        j*                  |||	|j,                  |j                        }t/        |j                  |j                  j                  |      }|gS )Nr   c              3   8   K   | ]  }t        |      d kD    ywrG  )r   )r   rq  s     r1   r   z8_memref_expand_shape_op_lowering_rule.<locals>.<genexpr>  s      !c!fqj!s   z,Expanding tiled dimensions is not supported.)output_shapestatic_output_shape)r   r  r   srcr   r  r_   r  r   r   rw   ra   reassociationr   rr   rM  r8   rf  r   r   r   ExpandShapeOpre  r   )r   r*   r  r_  rL  r  r   r   rU  rh  num_tiling_dimsstart_indexr   new_expand_shape_opr!  s                  r1   %_memref_expand_shape_op_lowering_rulerm  x  s}    
!//3A6-.rvv}Emm$4$9$9:"11"5a8.=nM-!Z0Lr''(-)//03rvv{{7H7H3II/
 qS !'(8(9:!  L
MMB**++kO;< a! ,,??,22 (  "))...+ r3   c                     ~ t        j                  |      d   }|rt        d|       t        j                  t        |j                  |      |j                  |j                        }|j                  gS )znLowering rule for memref.LoadOp.

  Loads are never transformed so this rule is mostly just a pass-through.
  r   z+memref.LoadOp does not support transforms: )r   rS  nontemporal)	r   r  r8   r   LoadOpr   rS  ro  rw   )r   r*   r  new_load_ops       r1   _memref_load_op_lowering_rulerr    sm     
!//3A6-
 KB4P
QQ&ryy-@jj..+
 

	r3   c                     ~ t        j                  |      d   }|rt        d|       t        j                  |j
                  t        |j                  |      |j                  |j                         g S )zpLowering rule for memref.StoreOp.

  Stores are never transformed so this rule is mostly just a pass-through.
  r   z,memref.StoreOp does not support transforms: )r   r   rS  ro  )	r   r  r8   r   StoreOpr   r   rS  ro  )r   r*   r  s      r1   _memref_store_op_lowering_ruleru    sh     
!//3A6-
 LRDQ
RR..HH&ryy-@jj..	 
)r3   c                    | j                  |       t        j                  |j                  j                        j
                  }|d   |j                  j                  z  }t        j                  | j                        5  t        j                  |j                  ||j                  d       ddd       t        j                           t#        j$                  |j                  g       }t'        j(                  |j                  j                  g|g      }|j                  |j*                  d<   |j                  |j*                  d<   t-        j.                  |      d   |j*                  d<   |j                  gS # 1 sw Y   xY w)	z#Lowering rule for mgpu.TmemAllocOp.r   FexactNr-   r*  r   rl   )r2   r   r  rw   r_   rr   r*  r   r  whenr#   r   
tmem_allocsmem_ptrr-   r   r   r   loadr   rZ   r.   r   r  )r   r*   re  ncols	tmem_addrcast_ops         r1   _tmem_alloc_op_lowering_ruler    s#   
 rryy~~.44,
q/RZZ--
-%s::; Gr{{E2==FG++-kk"++r*)..	yy~~' &(]]'\""$**'Y!0!A!A"!Ea!H'X
..	G Gs    .E::Fc                     | j                  |       t        j                  | j                        5  t	        j
                  |j                         ddd       g S # 1 sw Y   g S xY w)z3Lowering rule for mgpu.TmemRelinquishAllocPermitOp.N)r2   r  ry  r#   r   tmem_relinquish_alloc_permitr-   )r   r*   s     r1   ._tmem_relinquish_alloc_permit_op_lowering_ruler    sP    
 rs::; 8((78	)8	)s    AA%c                 >   t         j                  j                  d      }t        |j                  |g      \  }\  }t        j
                  |j                  d         j                  }t        j                  |j                  d         j                  }t        j                  |j                  j                        j                  }|d   |z  }t        j                  | j                        5  t        j                   |||d       ddd       g S # 1 sw Y   g S xY w)z%Lowering rule for mgpu.TmemDeallocOp.r   r-   r*  r   Frw  N)r   rt   ru   rg   r  BoolAttrr.   r   rs   r  r_   rr   r  ry  r#   r   tmem_dealloc)	r   r*   r   rb   r~  r-   r*  re  r}  s	            r1   _tmem_dealloc_op_lowering_ruler    s    
 	##B'#!6r{{SE!J/;I{{?55lCDJJ*NN?55i@AGG'r{{//066,
q/W
$%s::; DE:UCD 
)D 
)s   .DDattrsc                     d}| D ]N  }t         j                  j                  |      s#|t        d      t        j                  |      j                  }P ||S t         j
                  j                  S )z8Returns the swizzle transform from the given attributes.Nz.Multiple swizzle transforms are not supported.)r	  r  r;   r/   r   r
  r  )r  r   r   s      r1   _swizzler    sp    ' 8d  ++D1		IJJ))$/77g	8
 'JT-?-?-J-JJr3   expected_layoutc                 ,   t         j                  j                  | j                        st	        |  d      t        j                  | j                        }|j
                  t        j                         k7  rt	        |  d|j
                   d      t         j                  j                  d      }t        | |g      \  }\  }t        |j                        }|j                  }|j                  d   }||k7  rt	        |  d| d| d      t        j                   |      }	t        |	t"        j$                        sJ t'        j(                  |	j*                  |	j,                  |	j.                  |	j0                        }
t'        j2                  ||||
      S )	zuReturns a TMEMRef from an IR value.

  Throws an error if the annotated layout does not match the expected layout.
  z is not a memref.z has a memory space z that is not TMEM.r   rl   z has a layout z) that does not match the expected layout r   )r   r  r;   r_   r/   r  r  tmemrt   ru   rg   r   rr   r   r.   layouts_libr   r   TiledLayoutr   
TMEMLayoutr   	warp_dims	lane_dims
vector_dimr2  )r   r  
mem_ref_tyr   r   r~  rr   el_tylayout_attrrl   tmem_layouts              r1   r8  r8    sl    
	!	!#((	+
u-.
//}}SXX&*
 11
%#J$;$;#<<NO  	##B'#+C#7$

  
!%

!
!%)+O#
%~k] +"#1	&  ''4&	FBNN	++	+""mmV%%v'7'79J9J+ 
E5+	>>r3   c                 F   t         j                  j                  | j                  | j                  t        j                               }t        j                  |g| j                  g      }t        j                  | j                        |j                  d<   |j                  S )z#Returns an IR value from a TMEMRef.rW  rl   )r   r  rq   rr   rz   r  r  r   rZ   r  r  rv   rl   r.   rw   )r   r_   r   s      r1   r@  r@  5  sm    			399ciijoo>O		P$		+	+TFS[[M	B$)88D$//(	r3   c                 ,   | j                  |       t        j                  |      }|d   }t        |j                  |      }t        j                  |j                        r[t        j                  |      \  }}t        |      }t        |      }t        |j                  |      }	t        |j                  |      }
nSt        |j                  |d         }	t        j                  |      \  }t        |      }|}t        |j                  |      }
t        j                  | j                        5  t        j                   ||	|
|||j"                  |j$                  |j&                  |j(                  j*                  	       d d d        g S # 1 sw Y   g S xY w)Nr   r   )r  r  a_scaleb_scale
accumulater-   )r2   r   r7  r8  r  r   r  r!  r  r  r   r  r  ry  r!   r   mmar  r  r  r-   r   )r   r*   r7  rl  acc_refr  r   r  r  a_refb_refs              r1   _tcgen05_mma_op_lowering_ruler  =  sJ    r#33B7/q!*bnnj9'
rtt!0!>!>r!BL,&I&I%bddL9E%bddL9EbddOA$67E$2226N\&II%bddL9Es<<= KK



====&&
 
) 
)s   .AF		Fc                    ~ t        j                  |      d   }t        |j                  |      }t        j                  |      d   }t        j                  |      }t        t        j                  |j                  j                        j                        }|j                  ||      }t        ||j                  j                        gS )z'Lowering rule for mgpu.AsyncLoadTmemOp.r   )r   r7  r8  r   r   r  r  r}   r   r  r_   r   r|  ry   rw   )r   r*   r|  r  r  rR  r   r   s           r1   !_async_load_tmem_op_lowering_ruler  d  s    
 
"2226q9.ryy.9(#//3A6/11/B* ryy~~!>!K!KL)}}Z+"
 RYY^^
4	55r3   c                     ~ t        j                  |      d   }t        |j                  |      }t        j                  |      d   }t        |j                  |      }|j                  |       g S )z(Lowering rule for mgpu.AsyncStoreTmemOp.r   )r   r7  r8  r  r   r   r   store)r   r*   r|  r  r   s        r1   "_async_store_tmem_op_lowering_ruler  s  sb    
 
"2226q9.r~~~>("--b1!4. N;"
..	)r3   c                    ~ |j                   j                  d   }t        |j                  |j                  d      D ]  \  }}|j                  |        d}t        j                  j                  }|j                  D ]X  }t        |j                  t        j                        r|J |j                  }8|j                          |j                  |       Z |t!        d      |j                  S )z)Lowering rule for mgpu.CustomPrimitiveOp.r   TrX   Nz,A custom return op must terminate the block.)bodyblocksr=   	argumentsr^   r?   r   InsertionPointcurrent
operationsr;   r\   r	  ReturnOpdetach_from_parentinsertr/   )r   r*   blockarg	return_opips         r1   '_mgpu_custom_primitive_op_lowering_ruler    s    
 

''..
%U__bkk$? "gc2b!" )	  " b"))T]]+))iIIbM 
C
DD			r3   values
fa_layoutsc                    t        |      }g }g }| D ]  }t        j                  j                  |j                        rt        |t        |            }|j                  |j                  j                         |j                  |j                  j                  |j                  t        j                  |j                        f       |j                  |       |j                  d        ||fS )a  Flattens a sequence of values.

  Non-vector values are preserved as is. Vectors are mapped to fragmented
  arrays and then flattened into per-register values.

  Args:
    values: The sequence of values to flatten.
    fa_layouts: The layouts of vectors in ``values``.

  Returns:
    A tuple of (flattened values, templates). The templates are used to
    reconstruct the vectors from the per-register  values.
  N)iterr   r   r;   r_   r   nextextendrm   flatr   rr   rl   )r  r  fa_layouts_itrw   	templatesre   r   s          r1   _flatten_ir_valuesr    s      z"-&,.) a	}}'$Q](;<bmmBLL%%&**BIIr}}QVV7LMNmmAt 
	r3   flat_valuesr  c           	         g }t        |       }|D ]  }||j                  t        |              |\  }}}t        j                  t        t        j                  |            D cg c]  }t        |       c}t              }	t        j                  |	j                  |      |t        |j                              }
|j                  t        |
|              |S c c}w )z&The inverse of ``_flatten_ir_values``.r|   r   )r  r   r  r   asarrayr   r   r   objectr   r   r   r}   r   ry   )r  r  rw   flat_values_ittemplaterk   rl   vec_typer   value_registersr   s              r1   _unflatten_ir_valuesr    s     &$. ;hmmD()(0%OVXjj',TYY-G'HI!n	IO "**?;%h&;&;<E
 MM(9:; 
- 	Js   %C
	old_block	new_blocklast_op_typeargs_templatenew_leading_argsc                 t   d}t        j                  |      5  t        |j                  t	        |      d |      }|t        |      z   }t        |j                  |d      D ]  \  }	}
|	j                  |
        g |D ]  }t        ||      s#|j                  |       | j                  |       3|J t        j                  |      rt        j                  |      ng }t        |t        j                        r/t!        |j"                  |      \  }}t        j$                  |       nbt        |t        j&                        r:t!        |j(                  |      \  }}t        j*                  |j*                  |       nt-        d|       |j/                            	 ddd       |J |S # 1 sw Y   xY w)a  Moves the operations from `old_block` to `new_block`.

  The input arguments to the block, if any, are flattened using the provided
  `args_template`, except for any new_leading_args which are simply prepended
  to the flattened arguments and must be part of the template.

  The last operation of the old block must be of type `last_op_type` which
  is expected to be either a `scf.YieldOp` or a `scf.ConditionOp`. This
  operation is recreated with flattened output arguments.
  NTrX   zUnsupported op type: )r   r  r  r  r   r   r=   r?   r;   r   rF   r   has_in_layouts_setr   r   YieldOpr  r^   yield_ConditionOpargs	conditionr8   erase)r   r  r  r  r  r  out_template	new_carrynew_argsold_argnew_argr*   r   flat_operands
flat_carrys                  r1   1_move_scf_block_to_block_with_flattened_argumentsr    s   $ ,	# $Y%8%8=M9N9O%PR_`I%	"22H	 3 3XdK -##G,-	l L)R### 11"5 &&r* 	
 b#++&(:2;;(P
%-
**]
#COO,%7%I
"*l
--j
1#&;B4$@A
A

)4 
	!!	!	7 s   FF..F7for_opc                    t        j                  |      st        | |      S t        j                  |      }t        j                  |      }|j
                  j                  t        |j
                  j                        dz
     }t        j                  |      }||k7  s||k7  rt        d      t        |j                  |      \  }}t        j                  |j                  |j                  |j                  |      }t!        | |j
                  |j
                  t        j"                  ||j$                         t'        |j(                  |      S )Nr   r  )r   r9   _traverse_op_lowering_ruler   r   r  r  r   r/   r  initArgsr   ForOp
lowerBound
upperBoundstepr  r  induction_variabler  r>   )	r   r  r   r   yield_opyield_layoutsflat_init_argsr  
new_for_ops	            r1   _for_op_lowering_ruler    s    
	+	+F	3%c622))&1*++F3+[[##C(>(>$?!$CD(!,,X6-;*"=
&
''"4ooz#.- yykk	* 4	kkoo	kk## 
j00-	@@r3   while_opc                 ~   t        j                  |      st        | |      S |j                  j                  d   }|j
                  j                  d   }|j                  t        |j                        dz
     }|j                  t        |j                        dz
     }t        j                  |      rt        j                  |      ng }t        j                  |      rt        j                  |      ng }|r+t        j                  |      }||k7  rt        d| d|       |r+t        j                  |      }	||	k7  rt        d| d|	       t        |j                  |      \  }
}t        ||      }t!        j"                  ||
      }|
D cg c]  }|j$                   }} |j                  j                  j&                  | }t)        | ||t         j*                  |      } |j
                  j                  j&                  | }t)        | ||t         j,                  |       t/        |j0                  |      S c c}w )Nr   r   zInput layouts z do not match yield layouts zOutput layouts z  do not match condition layouts )r   r9   r  beforer  afterr  r   should_have_in_layoutr   should_have_out_layoutr   r/   r  inits_infer_flat_result_typesr   WhileOpr_   r   r  r  r  r  r>   )r   r  before_blockafter_blockcondition_opr  r   r   r  condition_layouts
flat_initsinits_templateresult_typesnew_while_opre   
init_typesnew_before_blockresults_templatenew_after_blocks                      r1   _while_op_lowering_ruler  )  s9    
	+	+H	5%c844''*,%%a(+((\-D-D)E)IJ,##C(>(>$?!$CD( 
	.	.x	8   *  
	/	/	9 !!(+  #..x8M]":, '_ 
 '22<@''K= ) " 
  2(..*M*n)(K@,\:6, !++1+*+6\((//66
CF		oo 5L&&--44lC/3		kk 
l224D	EE) ,s   H:r   c                    g }t        |      }| j                  D ]  }t        j                  j	                  |j
                        s|j                  |j
                         Ht        j                  |j
                        }t        j                  t        |            }|j                  |j                  |j                        gt        j                  |j                  t!        |j"                                    z          |S r   )r  r>   r   r   r;   r_   r   r  r   r  r  r   r   r   r   rk   r   rr   )r*   r   r  out_layouts_itrr  rl   s          r1   r  r  l  s     !#,$.:: 	a==##AFF+!&&!}}QVV$H))$~*>?F		&	&x'<'<	=>
))F**5+@A
B	C	 
r3   if_opc                 P    t        j                  |      st        | |      S t        r   )r   r9   r  r8   )r   r  s     r1   _if_op_lowering_ruler  ~  s%     
	+	+E	2%c511r3   	switch_opc                    t        j                  |      st        | |      S t        j                  |      }t	        j
                  t        ||      |j                  |j                        }g }t        |j                  |j                  d      D ]?  \  }}|j                  \  }|j                  d   }t        | ||t        j                  g       }A t        |j                  |      S )NTrX   r   )r   r9   r  r   r   IndexSwitchOpr  r  casesr=   regionsr  r  r  r  r>   )	r   r  r   new_switch_opr  region
new_regionr  r  s	            r1   _index_switch_op_lowering_ruler    s     
	+	+I	6%c955++I6+##y+6mmoo- 8:..t fj mmGU!!!$IHUIs{{B 
m335E	FFr3   c           	      .   t        j                  |      rt        d|       |j                  j                  D ]H  }|D ]A  }t        |      D ]1  }t        j                  |      5  | j                  |       d d d        3 C J t        S # 1 sw Y   GxY w)Nz:Rule cannot handle an op with vector operands or results: )
r   r9   r/   	operationr  ra   r   r  rF   RECURSED)r   r*   r  r  block_ops        r1   r  r    s    
 ''+

DRDI  $$ !f !5k !(x( 	!
,,x
 	! 	!!!!
 
/	! 	!s   %B	Bc                     | j                   j                  d      xsa t        j                  |       xsJ t        j                  |       xs3 t        j
                  |       xs t        d | j                  D              S )z2Returns 'true' if the operation should be lowered.zmosaic_gpu.c              3   @   K   | ]  }|D ]  }t        |         y wr   )rM   )r   r  r  s      r1   r   z _should_lower.<locals>.<genexpr>  s      4!4QT!W4W4s   )r6   
startswithr   r9   should_have_transformsshould_have_tmem_layoutrM  r  )r*   s    r1   r5   r5     sq     	""=1 5		+	+B	/5		/	/	35 
	0	0	45 
4bjj4	4r3   modulec                 "   | j                   j                  D ]l  }|j                  j                  D ]Q  }|j                  D ]@  }|j                  D ]/  }|j                  j
                  dk(  s|j                  c c c c S  B S n t        d      )Nz
gpu.launchzgpu.launch op not found.)r  r  r  r  r  rA   r/   )r  r*   r  r  sub_ops        r1   _gpu_launch_opr    s    KK"" $b,,&& $== $%&& 	$F""l2####	$$$$ 	-..r3   r   r$   c           	         |t        dddd|      S t        |       }t        j                  j	                  |j
                  d   j                  d         5  t        j                  t        j                  j                        }t        j                  t        j                  j                        }t        j                  j                  }t        j                  j!                  d      }t        j"                  |t        j$                  d      t        j&                  d|            }t        |||||      cddd       S # 1 sw Y   yxY w)z:Returns a `LoweringContext` for the given `LaunchContext`.Nr   )scoper   F)sync)r    r  r   r  at_block_beginr  r  r   single_thread_predicateThreadSubsetBLOCK	WARPGROUPr   CmpIPredicateeqrt   ru   cmpiwarp_idxr   )	r  r   r$   gpu_launch_opblock_predicatewarpgroup_predicater'  r   warp_predicates	            r1   _lowering_contextr.    s
    4tT=AA (-	''(=(=a(@(G(G(JK 33  &&O  77  ** 
				B
..
%
%b
)CZZENN$>3PN  s   C#EEc                    | j                   j                  t        j                         | j                   j	                          t        | ||      }t        j                  | j                        5  t        | j                        D ]  }|j                  |        	 d d d        |j                  D ]  }|j                           y # 1 sw Y   +xY wr   )contextappend_dialect_registrymlir_interpreterupstream_dialectsload_all_available_dialectsr.  r   r  r  ra   rF   r&   r  )r  r   r$   r   r*   
lowered_ops         r1   lower_mgpu_dialectr6    s     	..(()9)K)KL..,,.&.-@#	% 6;; 	ll2 ** j	 s   0,CCr   )T()  __doc__collections.abcr   r   r   rN   	functoolsr   r   operatortypingr   r   r   jax._src.interpretersr	   r2  jax._src.libr
   r	  jax._src.lib.mlirr   jax._src.lib.mlir.dialectsr   r   r   r   r   	mlir_mathr   r   r   r   jax._src.utilr   jax.experimental.mosaic.gpur   r  r   r  numpyr    r   r   r   r   r   r   	dataclassr    r<   r  rL   MlirLoweringRuleResultrQ   rR   MlirLoweringRuler7   dictr   rK   r  r   rZ   rg   r   ry   rM   r}   r  r   rp   r   r   r_   r   r   InitializeBarrierOpr   OptimizationBarrierOpr   
ConstantOpr   r
  r  r  MemRefTransformr   r   r   VectorLoadOpr  VectorStoreOpr  DebugPrintOpr"  r2  r4  PrintLayoutOpr9  BroadcastedIotaOpr>  BroadcastOprA  ShapeCastOprD  ExtractStridedSliceOprT  ReductionOpr]  MultiDimReductionOpro  LayoutCastOprt  BroadcastInDimOpr  r   r   r  AsyncLoadOpr  AsyncPrefetchOpr  AsyncStoreOpr  TmemLayoutCastOpr  SliceTmemOpr  r  ExtFOpExtSIOpExtUIOpFPToSIOpFPToUIOpSIToFPOpTruncFOpTruncIOpUIToFPOpr*   r  r  partialr6   r  RsqrtOprsqrtExpOpexpExp2Opexp2SinOpsinCosOpcosLogOplogTanhOptanh
unary_implr   r  AddIOpr@   AddFOpSubIOpsubSubFOpMulIOpmulMulFOpFloorDivSIOpfloordivDivUIOpDivFOptruedivRemSIOpmodRemUIOpRemFOpAndIOpand_OrIOpor_XOrIOpxorMaxSIOpr`  MaxUIOp
MaximumFOpMinSIOpminMinUIOp
MinimumFOpbinary_implr&  r'  nesltltslelesgtgtsgegeultuleugtuger  CmpIOpr  CmpFPredicateOEQUNEOLTOLEOGTOGEr  CmpFOpr  	BitcastOpr  WGMMAOpr  ArriveOpr  ArriveExpectTxOpr  WaitOpr  SliceSMEMOpr"  rZ  WithTransformsOpr,  intr8  rC  rN  rP  rW  rY  r\  r^  rb  ri  rm  rp  rr  rt  ru  TmemAllocOpr  TmemRelinquishAllocPermitOpr  TmemDeallocOpr  r  r8  r@  TcGen05MMAOpr  AsyncLoadTmemOpr  AsyncStoreTmemOpr  CustomPrimitiveOpr  FragmentedLayoutr   _VectorTemplater  r  Blockr  r  r  r  r  r  IfOpr  r
  r  FuncOpLaunchOpr  r5   Moduler  rJ   r.  r6  rS   r3   r1   <module>r     s   ? 8 8      & & : 3   3 , . + * 8 - + * - " > ;  $       0& 0& 0&f :!"((+h6 bllRYY./1GG 
 +-
D&&' ,$,hh$,RWW%$, 7--x/AAB$,N (( .0gg XX .=bgg =$+ = "0HH0LL0 d{0 	0>  ''    XX	  	(*XX&	d299o$	 "223	)rww ) D,,-	   bhh .* D../"" bhh 0. E$$%!,,bhh &< +/*<*<*G*G	6MM67786 6 ''	6r %	 %dTk %c % D%%&J5J5 --J5bhhJ5 'J5Z D&&')	)"00)bhh) ()X D%%&	"//bhh 'R'''//9 c 0 D&&'	"00bhh ( D**+2	2"442bhh2 ,2  F&&'@@"..@bhh@ (@ F&&'		"..	bhh	 (	 F0016	6$::6bhh6 26. F&&':	:$00:bhh: (:2 F../':	':$88':bhh': 0':T D%%&== --=bhh= '= D))*// 11/bhh/ +/&&J&J
4u^%C%CS%HIIJ&JR7MM7n44c9:7 ]]7t	n44c9: XX* D$$%-	-#'#3#3-bhh- &-` D(()	#'#7#7bhh *0 D%%&(	($($5$5(bhh( '(V D))*				 bhh	 +	 D$$%	"..bhh &==
		= Tk= Tk	=
 bhh=$ \\4
]]D$
]]E5!
^^T4 
^^T5!
^^T4 
^^T4 
^^UE"
^^UD!
/ *B* #4)"3"3"''#*R, "	=== 3***
+= d{	=
 bhh=* **00$7__b((,,d3r))..5__b((,,d3__b((,,d3__b((,,d3r))..5" B
I #4)"3"3J)#*RBBB d{B 			R//0"2D2DD	B bhhB$ \\8<<'
\\8<<&
\\8<<'
\\8<<&
\\8<<'
\\8<<&
**D1
]]H%%u-
\\8##T*
]]HLL$'
]]HLL%(
\\8<<&
\\8==%(
[[(,,&
\\8<<'
]]B&&**D1
]]B&&**E2
r))--t4
]]B&&**D1
]]B&&**E2
r))--t4+# BY. #4)"3"3[I#*R/: 
X[[%0	X[[%0	hkk40	hkk40	hkk40	hkk40	hkk51	hkk51	hkk51	hkk51
 ELL!
B
B!LL
Bbhh
B "
B 
X[[	X[[	X[[	X[[	X[[	X[[
 ELL!
B
B!LL
Bbhh
B "
B EOO$77!OO7bhh7 %7  DLL!BB"&,,BbhhB "BJ DMM"	%)]]bhh #6 D))*-1-B-Bbhh +0 DKK !%bhh ! D$$%	"..bhh &*> > >& D))*	"33bhh +"+1SM+1SM+1 bhh'+1 8C=(288,,-	+1\ F$$%l	l$..lbhhl &l^ FMM".	.$mm.bhh. #.bI#II F&&'!	!$00!bhh! (!H F(()'	'$22'bhh' *'T FMM"	$mmbhh #* FNN#	$nnbhh $, D$$%	"..bhh &0 D445	">>bhh 6 D&&'	"00bhh ($KHR\\* Kt/A/A K ?	 ?$&LL ?__ ?F RXX  D%%&#	#"//#bhh# '#L D(()6	6"226bhh6 *6 D))*	"33bhh + D**+	"44bhh ,4 r':':BMMIJRXX,4R\\,B
8BHHx$(>??@<"((#0849O0Pbhh0.	.xx. xx. ryy/	.
 Od23.  ). o$%.` CIIA	A"%))AA AD CKK ?F	?F$'KK?F?F !?FD
		 ( 6bgg$ CHH	!$  C%%&G	G%(%6%6GG 'G2 DKK L))*	 ii + !bii D /299 / /II"0047  	B II"0047 r3   