
    uki                     
   d dl mZmZ d dlZd dlZd dlZd dlZd dlZd dlm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlZddlmZ ddlmZ ddlmZ dZ dZ!e
d   Z"de"de#fdZ$ejJ                  Z% G d d      Z& e&       Z' ejP                  d       G d d             Z) G d dejT                        Z+ ejP                  d       G d  d!e)             Z, ejP                  d       G d" d#e)             Z- ejP                  d       G d$ d%e)             Z.ej^                  Z/d&Z0 G d' d(      Z1 G d) d*      Z2d+ejf                  dejh                  fd,Z5de"dz  d-ejl                  de7fd.Z8d/ejl                  de"dz  de9fd0Z: G d1 d2ejT                        Z; ejP                          G d3 d4             Z< G d5 d6e=      Z>d9d7ejf                  dejf                  fd8Z?y):    )CallableSequenceN)AnyLiteral)mosaic_gpu_dialect)ir)_gpu_ops_gen)arith)builtin)func)gpu)llvm)memref)nvvm   )fragmented_array)profiler)utils   @   )addminmaxincdecandorxoruminumaxsminsmaxreduction_opreturnc                     | dd  S )N r#   s    e/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/jax/experimental/mosaic/gpu/launch_context.py_reduction_op_to_ptxr*   9   s    	bc	    c                       e Zd Zy)GlobalBroadcastN__name__
__module____qualname__r'   r+   r)   r-   r-   ?       r+   r-   T)frozenc                       e Zd Zdej                  dej                  fdZdeej                     deej                  df   fdZdee	   dee	df   fdZ
dee	   dee	df   fd	Zd
e	dd fdZy)MemRefTransformrefr$   c                     t        d      N&Subclasses should override this methodNotImplementedErrorselfr6   s     r)   applyzMemRefTransform.applyG       
F
GGr+   idx.c                     t        d      r8   r:   r=   r@   s     r)   transform_indexzMemRefTransform.transform_indexJ   r?   r+   shapec                     t        d      r8   r:   r=   rD   s     r)   transform_shapezMemRefTransform.transform_shapeM   r?   r+   c                     t        d      r8   r:   rF   s     r)   transform_stridesz!MemRefTransform.transform_stridesP   r?   r+   leading_rankc                     t        d      )zReturns a transform that accepts a ref with the extra `leading_rank` dims.

    The returned transform should leave the leading dimensions unchanged and
    only apply to the suffix of the shape.
    r9   r:   r=   rJ   s     r)   batchzMemRefTransform.batchS   s     F
GGr+   N)r/   r0   r1   r   Valuer>   r   tuplerC   intrG   rI   rM   r'   r+   r)   r5   r5   E   s    Hrxx HBHH HH"((!3 Hbhhm8L HH8C= HU38_ HHXc] HuS#X HH H(9 Hr+   r5   c                   P    e Zd Z ej                         Z ej                         Zy)RoundingN)r/   r0   r1   enumautoUPDOWNr'   r+   r)   rR   rR   \   s    tyy{"	$r+   rR   c                   
   e Zd ZU dZeedf   ed<   dZedz  ed<   de	j                  de	j                  fdZd	ee	j                     dee	j                  df   fd
Zdee   deedf   fdZdee   deedf   fdZdedefdZy)TileTransformav  Tiles a suffix of memref dimensions.

  For example, given a memref of shape (5, 128, 128) and a tiling of (64, 32),
  the shape of the result will be (5, 2, 4, 64, 32). The shape always ends with
  the tile shape, and the size of tiled dimensions is divided by the tile size.
  This is especially useful for swizzled WGMMA, which expect tiled layouts in
  shared memory.
  .tilingNroundingr6   r$   c           	         t        j                  |j                        j                  }t	        | j
                        }||z   }t        | j
                  d d d   t        |      d d d         D ]  \  }}t        j                  |j                        j                  }||   }||kD  r||z  r| j                  x t        d| d| d|       xt        j                  k(  r t        t        j                  k(  rPt        d       g|z  }	|	j!                  t        d||z  |z               t#        j$                  |t'        |	            }n	 t        d| j                         |}t#        j(                  ||d |f      } g t        ||z
        t        ||z
  |d      t        ||z
  dz   |d      }
t#        j*                  ||
      S )	Nz.When no rounding mode is specified, dimension z4 must have size smaller or a multiple of its tiling z
, but got r   zUnknown rounding mode:    r   )r   
MemRefTypetyperanklenrY   ziprangerD   rZ   
ValueErrorrR   rU   r;   rV   sliceappendr   memref_slicerO   memref_unfoldmemref_transpose)r=   r6   untiled_ranktiling_rank
tiled_ranktd	ref_shapesslicespermutations              r)   r>   zTileTransform.applyn   s   ==*//Ldkk"K+JDKK"%u\':4R4'@A 31--)//i
A,a	
Qq5B1# F??@cA3P  ''d}q(fmmE!Q!VaZ01&&sE&M:c!8HIIQq	2c+3,	|k)	*	|k):q	9 
|k)A-z1	=K
 !!#{33r+   r@   c                    t         j                  j                         t        | j                        }g |d |  fdt        || d  | j                        D        fdt        || d  | j                        D        S )Nc              3   d   K   | ]'  \  }}t        j                  |t        |             ) y wN)r
   divuic.0irm   indexs      r)   	<genexpr>z0TileTransform.transform_index.<locals>.<genexpr>   .      

1 KK1Q;'

   -0c              3   d   K   | ]'  \  }}t        j                  |t        |             ) y wru   )r
   remuirw   rx   s      r)   r|   z0TileTransform.transform_index.<locals>.<genexpr>   r}   r~   )r   	IndexTypegetra   rY   rb   )r=   r@   rk   r{   s      @r)   rC   zTileTransform.transform_index   s    LLEdkk"K
	]{l	


C.<




C.<


 
r+   rD   c                    t        | j                        }| j                  Dt        || d  | j                        D ]&  \  }}||z  st	        d| d| j                   d       n2| j                  t
        j                  k7  rt        | j                        g |d |  d t        || d  | j                        D        | j                  S )NzExpected GMEM slice shape z# suffix to be a multiple of tiling z.
If you're using padded async copies, your slice might need to extend out of bounds of the GMEM buffer (OOB accesses will be skipped).c              3   ,   K   | ]  \  }}||z    y wru   r'   ry   rp   rm   s      r)   r|   z0TileTransform.transform_shape.<locals>.<genexpr>   s     	DTQ!q&	D   )ra   rY   rZ   rb   rd   rR   rV   r;   )r=   rD   rk   size	tile_sizes        r)   rG   zTileTransform.transform_shape   s     dkk"K}} |}!5t{{C /$	)*5' 2 &88  
(--	'..	}		DS|}!5t{{C	D 
 r+   stridesc                     t        | j                        }g |d |  d t        || d  | j                        D        || d  S )Nc              3   ,   K   | ]  \  }}||z    y wru   r'   r   s      r)   r|   z2TileTransform.transform_strides.<locals>.<genexpr>   s     	EDAq!a%	Er   )ra   rY   rb   )r=   r   rk   s      r)   rI   zTileTransform.transform_strides   s_    dkk"K	;,		EC 6D	E 
+	 r+   rJ   c                     | S ru   r'   rL   s     r)   rM   zTileTransform.batch   s    Kr+   )r/   r0   r1   __doc__rO   rP   __annotations__rZ   rR   r   rN   r>   r   rC   rG   rI   r5   rM   r'   r+   r)   rX   rX   a   s     S/"(HtO"4rxx 4BHH 4B"((!3 bhhm8L 8C= U38_ *x} sCx   r+   rX   c                       e Zd ZU dZeedf   ed<   d Zdej                  dej                  fdZ
deej                     deej                  df   fd	Zd
ee   deedf   fdZdee   deedf   fdZdedefdZy)TransposeTransformzTransposes memref dimensions..rr   c                     t        | j                        t        t        | j                              k7  rt        d      y )Nz,All elements of `permutation` must be unique)ra   rr   setrd   r=   s    r)   __post_init__z TransposeTransform.__post_init__   s6    
4C(8(8$9 ::EFF ;r+   r6   r$   c                 B    t        j                  || j                        S ru   )r   ri   rr   r<   s     r)   r>   zTransposeTransform.apply   s    !!#t'7'788r+   r@   c                 @    t        fd| j                  D              S )Nc              3   (   K   | ]	  }|     y wru   r'   )ry   pr@   s     r)   r|   z5TransposeTransform.transform_index.<locals>.<genexpr>   s     2AQ2   rO   rr   rB   s    `r)   rC   z"TransposeTransform.transform_index   s    2!1!1222r+   rD   c                 @    t        fd| j                  D              S )Nc              3   (   K   | ]	  }|     y wru   r'   )ry   r   rD   s     r)   r|   z5TransposeTransform.transform_shape.<locals>.<genexpr>   s     4aq4r   r   rF   s    `r)   rG   z"TransposeTransform.transform_shape   s    44#3#3444r+   r   c                 @    t        fd| j                  D              S )Nc              3   (   K   | ]	  }|     y wru   r'   )ry   r   r   s     r)   r|   z7TransposeTransform.transform_strides.<locals>.<genexpr>   s     66r   r   )r=   r   s    `r)   rI   z$TransposeTransform.transform_strides   s    6T%5%5666r+   rJ   c                 \    t        g t              fd| j                  D              S )Nc              3   (   K   | ]	  }|z     y wru   r'   )ry   rn   rJ   s     r)   r|   z+TransposeTransform.batch.<locals>.<genexpr>   s      La\!1 Lr   )r   rc   rr   rL   s    `r)   rM   zTransposeTransform.batch   s.    M%
M L4;K;K LM r+   N)r/   r0   r1   r   rO   rP   r   r   r   rN   r>   r   rC   rG   rI   r5   rM   r'   r+   r)   r   r      s    %S#XG9rxx 9BHH 93"((!3 3bhhm8L 358C= 5U38_ 57x} 7sCx 7  r+   r   c                       e Zd ZU dZeedf   ed<   ej                  defd       Z	de
j                  de
j                  fdZdee
j                     dee
j                  df   fd	Zd
ee   deedf   fdZdedefdZy)CollapseLeadingIndicesTransformz#Collapses leading indices into one..r   r$   c                 :    t        j                  | j                   S ru   )mathgcdr   r   s    r)   common_stridez-CollapseLeadingIndicesTransform.common_stride   s    88T\\""r+   r6   c                     t        j                  |j                        }|j                         \  }}|t         j                  j                         k(  rt        d      t         fdt        |j                  d t         j                         |d t         j                               D              dz   }|g|j                  t         j                        d  } j                  g|t         j                        d  }t         j                  j                  ||      }t         j                  j                  ||j                  ||j                         }	t#        j$                  |	|g g g |g||      S )Nz!Dynamic offsets are not supportedc              3   N   K   | ]  \  }}|d z
  |z  j                   z    ywr   N)r   )ry   rn   rp   r=   s      r)   r|   z8CollapseLeadingIndicesTransform.apply.<locals>.<genexpr>   s0      Aq 
Q!t)))s   "%r   )static_offsetsstatic_sizesstatic_strides)r   r^   r_   get_strides_and_offset
ShapedTypeget_dynamic_stride_or_offsetr;   sumrb   rD   ra   r   r   StridedLayoutAttrr   element_typememory_spacer   reinterpret_cast)
r=   r6   ref_tyr   offset	max_bound	new_shapenew_strides
new_layout
new_ref_tys
   `         r)   r>   z%CollapseLeadingIndicesTransform.apply   sM   ]]388$F335OGV;;== CDD LL,3t||,-w7JT\\9J/K
 
 	

I >V\\#dll*;*<=>I%%DDLL0A0B(CDK%%))&+>J""6&&
F4G4GJ ""CRx"	 r+   r@   c                 x   t         j                  j                         }t        d|      }t	        |d t        | j                         | j                        D ]F  \  }}t        j                  |t        j                  |t        || j                  z  |                  }H |g|t        | j                        d  S )Nr   )r   r   r   rw   rb   ra   r   r
   addimulir   )r=   r@   r{   flat_idxrz   rp   s         r)   rC   z/CollapseLeadingIndicesTransform.transform_index   s    LLEE{HC*T\\*+T\\: 1
EJJq!A););$;U"CDh /s3t||,-.//r+   rD   c                     t        d |d t        | j                         D              rt        d      dg|t        | j                        d  S )Nc              3   &   K   | ]	  }|d k7    ywr   r'   )ry   rp   s     r)   r|   zBCollapseLeadingIndicesTransform.transform_shape.<locals>.<genexpr>  s     
5a16
5   z'Expected leading indices to be squeezedr   )anyra   r   rd   rF   s     r)   rG   z/CollapseLeadingIndicesTransform.transform_shape  sL    

55!3#dll"34
55@AA*c$,,'()**r+   rJ   c                     t         ru   r:   rL   s     r)   rM   z%CollapseLeadingIndicesTransform.batch	  s    
r+   N)r/   r0   r1   r   rO   rP   r   	functoolscached_propertyr   r   rN   r>   r   rC   rG   r5   rM   r'   r+   r)   r   r      s    +c?#S # #rxx BHH 00"((!3 0bhhm8L 0+8C= +U38_ +
  r+   r   mosaic_gpu_smem_allocc            
           e Zd ZdZdej
                  fdZ	 ddedej                  dedz  dej                  dz  fd	Zd
 Zdeej                  ej                   ej"                  f   fdZdej"                  fdZd Zy)Scratcha  Manages ops handling the GMEM scratch that contains the TMA descriptors.

  TMA descriptors are created on the host and then copied to GMEM. So there
  needs to be some code on the host to allocate and initialize the TMA
  descriptors. However, we only know what descriptors we need after we have
  lowered the entire kernel. This class helps manage everything needed to
  correctly allocate and initialize the scratch.

  To help reconcile the needs of kernels that use the dialect lowering with
  those that use MGPU APIs directly, this class only creates the relevant ops
  lazily. Eager creation would make them appear dead before dialect lowering
  and MLIR's DCE would remove them.

  During the lowering, we collect information about how many bytes are needed
  and also how each descriptor should be initialized on the host. At the end
  of the lowering, the finalize_size() method should be called to add the
  necessary code on the host to allocate and initialize all descriptors.

  Here's how the IR looks after the initial ops are created for the first time:


  %1 = llvm.alloc_op {elem_type = !llvm.array<0 x i8>} -> !llvm.ptr
  %2 = llvm.load_op (%1) : (!llvm.ptr) -> !llvm.array<0 x i8>
  ...
  %3 = gpu.launch async
    ^bb0:
      %4 = builtin.unrealized_conversion_cast_op(%2)
             : (!llvm.array<256 x i8>) -> !llvm.ptr


  And here is an example of how the IR could look like after finalize_size() is
  called:


  %11 = llvm.alloc_op {elem_type = !llvm.array<256 x i8>} -> !llvm.ptr
  %22 = llvm.load_op (%11) : (!llvm.ptr) -> !llvm.array<256 x i8>
  ...
  # Ops inserted to initialize the tma descriptors on the host:
  ...
  %33 = llvm.getelementptr %11[0] : (!llvm.ptr) -> !llvm.ptr, i8
  call @mosaic_gpu_init_tma_desc (%33, ...)
  ...
  %44 = llvm.getelementptr %11[128] : (!llvm.ptr) -> !llvm.ptr, i8
  call @mosaic_gpu_init_tma_desc (%44, ...)
  ...
  %55 = gpu.launch async
    ^bb0:
      %66 = builtin.unrealized_conversion_cast_op(%22)
             : (!llvm.array<256 x i8>) -> !llvm.ptr

  gpu_launch_opc                     d| _         g | _        d| _        |}|j                  dk7  r&|j                  j
                  }|j                  dk7  r&|J || _        y )Nr   Fzbuiltin.module)next_offset	host_init_ops_creatednameparentopview
_module_op)r=   r   ops      r)   __init__zScratch.__init__E  s\    D79DND
 
B
''%
%99b ''%
%>>DOr+   Nop_nameblocktag_attribute_namer$   c                     |D ]Z  }|j                   |k(  r|||j                  v r|c S |j                  D ]&  }|D ]  }| j                  |||      }||c c c S  ( \ y ru   )r   
attributesregions_find_first_op)r=   r   r   r   r   regionchild_ops          r)   r   zScratch._find_first_opS  s}      		G	

$(:bmm(K	JJ & 	E((%9KL(!O		 r+   c                 ,   | j                   ry d| _         | j                  d| j                  j                        }|J t        j
                  j                  d      }t        j
                  j                  d      }t        j                  j                  d      }t	        j                  |      5  t        j                  |t        d|      |t              }t        j                  j                         |j                   t"        <   t        j$                  ||      }d d d        t        j                  j'                  |j                  j(                  d         5  t+        j,                  |gg       d d d        y # 1 sw Y   axY w# 1 sw Y   y xY w)	NTz
gpu.launch	!llvm.ptrz!llvm.array<0 x i8>r   r   )	alignmentr   )r   r   r   bodyr   TypeparseIntegerTypeget_signlessInsertionPointr   AllocaOprw   TMA_DESCRIPTOR_ALIGNMENTUnitAttrr   r   MOSAIC_GPU_SMEM_ALLOC_ATTRLoadOpat_block_beginblocksr   unrealized_conversion_cast)r=   r   ptr_tyempty_arr_tyi64alloc_opload_ops          r)   _create_opszScratch._create_opsb  s;   D''doo6J6JKM$$$WW]];'F77==!67L
..
%
%b
)C			=	) 4
!As)\,h
 9;8Ih45L(3g4 
			)	)-*<*<*C*CA*F	G >((&G9=> >4 4> >s   1A-E>F
>F
Fc                    | j                   s| j                          | j                  d| j                  j                  t
              }|J |j                  j                  \  }|j                  }|j                  j                  dk(  sJ |j                  j                  \  }|j                  }|j                  j                  dk(  sJ |||j                  fS )Nzllvm.allocaz	llvm.loadz"builtin.unrealized_conversion_cast)r   r   r   r   r   r   resultusesowner	operationr   )r=   r   
alloc_userr   load_op_user
device_ptrs         r)   _find_alloc_load_and_device_ptrz'Scratch._find_alloc_load_and_device_ptrz  s     
""t++-GH ??''LZG!![000^^((N\##J$$(LLLLWj////r+   c                 .    | j                         \  }}}|S ru   )r   )r=   _r   s      r)   r   zScratch.device_ptr  s    ;;=Aq*r+   c                    | j                   dk(  ry| j                         \  }}}t        j                  |      5  | j                   }t        j                  j                  d| d      }t        j                  j                  |      |_        |j                  j                  |       | j                  D ]  } ||j                          	 ddd       y# 1 sw Y   yxY w)z
    Allocates and initializes the host buffer. This needs to be done after
    lowering, i.e. after all TMA descriptors have been recorded. Only then we
    know what the scratch contains.
    r   Nz!llvm.array<z x i8>)r   r   r   r   r   r   TypeAttrr   	elem_typer   set_typer   )r=   r   r   r   gmem_scratch_bytesscratch_arr_tyinit_callbacks          r)   finalize_sizezScratch.finalize_size  s     1??AHgq			7	# '++ww}}|4F3Gv%NOn;;??>:hnnn->> '-hoo&'' ' 's   BCCru   )r/   r0   r1   r   r	   LaunchOpr   strr   BlockOpViewr   r   rO   r   r   r   rN   r   r   r  r'   r+   r)   r   r     s    2fL$9$9  MQ!#?BTz	yy4>00T]]DKK120$"(( 'r+   r   c                       e Zd Zy)_DefaultPredicateNr.   r'   r+   r)   r  r    r2   r+   r  gmem_refc                    t        | j                  t        j                        st	        d|  d      t        | t        j
                        rH| j                  j                  j                  | j                     } t        | t        j
                        rHt        | j                  j                  t        j                        st        d| j                   d      | S )zReturns the kernel argument value for a given gmem_ref.

  The kernel argument is expected to be an unrealized conversion cast. This
  function will recursively go up block arguments in case of nested blocks.
  z	Expected z to have a memref type.zM to be an unrealized conversion cast corresponding to a GMEM kernel argument.)
isinstancer_   r   r^   rd   BlockArgumentr   operands
arg_numberr   r   UnrealizedConversionCastOpr;   )r  s    r)   "_find_kernel_argument_for_gmem_refr    s     
HMM2==	1
y
*AB
CC8R--.~~##,,X-@-@AH 	8R--. 
HNN))7+M+M	N

HNN# $4 	4  
/r+   dtypec                 0   t         j                  j                  d      }t         j                  j                  d      }t         j                  j	                         }t         j
                  j	                         }t         j                  j	                         }| x yxdk(  r
 ||||||fv S xxdk(  rnxdk(  rn n  |||fv S xxdk(  rnxd	k(  rnxd
k(  rnxdk(  rn n  |||fv S xxdk(  rnxdk(  rn n  ||k(  S xdk(  rnxdk(  rnxdk(  rn y |||fv S )a  Returns whether the given TMA reduction op supports the given dtype.

  This function essentially implements the table at:
  https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor
  with the following differences:
  - For `add` reductions, we also support int64, treating it as uint64.
  - For `and`, `or`, and `xor` reductions, we support signed integer types.
  - For `inc` and `dec` reductions, we support both signed and unsigned i32
    treating both as unsigned.
      r   NTr   r   r   r    r   r"   r!   r   r   r   r   r   )r   r   r   F16Typer   F32TypeBF16Type)r#   r  i32r   f16f32bf16s          r)   _is_tma_reduction_op_supportedr     s     	##B'#
##B'#


#


#		$		sCsC000	sDk!!	*&6F	*sCj  	c\		sCj  r+   r   c                    t         j                  j                  |       ret        j                  |       }|dk(  rd}|S |dk(  rd}|S |dk(  rd}|S |dk(  rd}|S |dk(  r
|dv rd	nd
}|S |dk(  r
|dv rdnd}|S t        d|       t         j                  j                  |       rd}|S t         j                  j                  |       rd}|S t         j                  j                  |       rd}|S t         j                  j                  |       rd}|S t         j                  j                  |       rd}|S t         j                  j                  |       rd}|S t         j                  j                  |       rd}|S t        d|        )zCReturns the TMA DMA type for the given element type and signedness.r]         r   r      r  )r!   r"   	      r   
   zUnsupported integer bitwidth:          zunsupported TMA dtype )r   r   r  r   bitwidth_implrd   r  r  r  Float8E5M2TypeFloat8E4M3FNTypeFloat8E8M0FNUTypeFloat4E2M1FNType)r   r#   bitwidth	tma_dtypes       r)   _tma_dma_typer2    s   
 ^^|,""<0H1}i< 
; 
Qi8 
7 
Qi4 
3 
Ri0 
/ 
R#'77!Qi, 
+ 
R$(88"ai( 
% 7zBCC	zz\*I  
 
zz\*I 
 
{{l+I 
 
##L1I 
 
%%l3I 
 
&&|4I
 
	 
%%l3I 
 -l^<
==r+   c                   P    e Zd Z ej                         Z ej                         Zy)AsyncCopyImplementationN)r/   r0   r1   rS   rT   TMACP_ASYNCr'   r+   r)   r4  r4  	  s    		#TYY[(r+   r4  c                      e Zd ZU ej                  ed<   eed<   eeeef   ed<   dZ	e
dz  ed<    ej                  ed      Zeeej                  eedf   edz  eedf   eef   ej                  f   ed	<   dZeed
<   ej*                  d        Z	 d<dej0                  eej0                     z  dz  dej                  fdZdd d fdededz  deej                  gdf   deej                  gef   dej                  f
dZdej                  deedf   deej                  z  ez  dz  deedf   dedz  dedz  fdZdej                  dedeedf   deej0                     dz  dedz  d e fd!Z!dej                  d"ej                  dz  dedz  d#e"e   d$eej                  df   d%eedf   deedf   deej0                     dedz  fd&Z#d'd'dddddd e$       de jJ                  d(d)ej                  d*ej                  dedeeedf   z  deej                  z  ez  dz  d+e&jN                  dz  dedz  d,edz  deej0                     ej0                  z  dz  dedz  d-ej                  dz  e$z  dedz  d e fd.Z(d'd'dddd e$       d/dej                  dedeeedf   z  deej                  z  dz  dedz  deej0                     ej0                  z  dz  dedz  d-ej                  dz  e$z  fd0Z)de&jT                  jV                  fd1ed2ed3e&jT                  fd4Z,d1efd5Z-d6 Z.d7ej                  d8ej                  fd9Z/d7ej                  fd:Z0dej                  fd;Z1y)=LaunchContextmodulescratchcluster_sizeNr   F)default_factoryinit.tma_descriptorsis_device_collectivec              /      K   | j                   * | j                   j                  |i |5  d  d d d        y d  y # 1 sw Y   y xY wwru   )r   record)r=   argskwargss      r)   named_regionzLaunchContext.named_region  sJ     }} 4==00    s   )A
>A
AA
dimr$   c                    |t         j                  }nt        |t         j                        r|f}t        j                  j                         }d}t        d|      }t        |      D ]n  }| j                  |   dk(  rt        j                  |t        j                  t        j                  |      t        ||                  }|| j                  |   z  }p |S )z\Returns the index of a block within a subset of the cluster spanned by the given dimensions.r   r   )r   	Dimensionr  r   r   r   rw   sortedr;  r
   r   r   cluster_block_id)r=   rE  r{   strider@   rn   s         r)   cluster_idxzLaunchContext.cluster_idx"  s     {MMc	C	'FcLLEF
Au+CC[ %			1		"JJsEJJs';';A'>&%@PQRc!!!$$f	%
 Jr+   c                      y ru   r'   )r   s    r)   <lambda>zLaunchContext.<lambda>8  s    r+   c                     | S ru   r'   )xs    r)   rM  zLaunchContext.<lambda>9  s     r+   r   r   r   device_initc                   	 t         j                  j                  d      t         j                  j	                  d      	||}| j
                  j                  |z  rt        | j
                  j                  | j
                  xj                  |z  c_        	fd}| j
                  j                  j                  |       t        j                  	| j
                  j                         g gt        j                  j                        }|j                  | j
                  j                         j                           ||j"                        S )zAllocates a GMEM scratch buffer.

    The buffer is initialized on the host and then copied to GMEM before the
    kernel launch.
    r"  r   c           
      x     t        j                  | g gt         j                  j                               y ru   )r   getelementptrGEPNoWrapFlagsnone)host_ptr
alloc_baser   i8r   s    r)   host_init_wrappedz7LaunchContext._alloc_scratch.<locals>.host_init_wrappedH  s1    


VXrJ<TEXEXE]E]
^r+   )r   r   r   r   r   r:  r   r;   r   rf   r   GEPOpr   rT  rU  
move_afterr   r   )
r=   r   r   r   rP  rY  geprW  rX  r   s
      `   @@@r)   _alloc_scratchzLaunchContext._alloc_scratch4  s     
	$	$Q	'BWW]];'Fi||)+))JLL$ 	LL!!"34 **'')2
|RATATAYAYC NN4<<**,223szz""r+   r  gmem_transformgmem_peer_idtransformed_slice_shapeswizzler#   c           
          t              t        t        j                  j                        j
                  |      f} j                  j                  |d       x}t        j                  j                  d      t        j                  j                  d      t        j                  j                  d       f
d}	d }
 j                  t        t        |	|
      }| j                  |<   |S )Nr  r   r   c                   
 }D ]  }|j                  |      } t        j                  |j                        }|j	                         \  }}|d   dk7  rt        d|d    d      t        j                  |      ^}}}t        j                  |      }fd}	t        j                   |	|            }
d}t        j                  |
 |	|      g|g|j                  t        j                  j                        }t        t               rMj#                          t%        j&                  d      }t        j(                  |j                  ||gg g d	      }nmkt        t        j*                        st-              }n	 t/        d
      }j#                          t        j(                  |j                  ||gg g d	      }|j2                  }|dz  t5        |      k(  sJ t6        j8                  j:                  n}t-              }| ||t-        |      t=        j>                  |d | D cg c]
  } |	|       c}      t=        j>                  ||d  D cg c]
  } |	|       c}      t-        |      t=        j>                  D cg c]  }t-        |       c}      g}tA        j(                  g d|       y # t0        $ r}t        d      |d }~ww xY wc c}w c c}w c c}w )Nr\   r   zgTMA requires the stride of the last dimension after transforming the GMEM reference to be 1, but it is .c                 0    t        j                  |       S ru   )r
   
index_cast)rz   r   s    r)   rM  zDLaunchContext._get_tma_desc.<locals>.init_tma_desc.<locals>.<lambda>x  s    5++C3 r+   i   r   nvshmemx_mc_ptrcalleer$  )fuelz6Failed to recompute the async_copy peer id on the hostnvshmem_ptrr]   mosaic_gpu_init_tma_desc)!r>   r   r^   r_   r   rd   r   extract_strided_metadata extract_aligned_pointer_as_indexr   inttoptrrS  r   rT  rU  r  r-   _ensure_nvshmem_declsr
   constantcallrN   rw   _recompute_peer_idReplicationErrorr`   ra   mgpu_dialectSwizzlingMode
kNoSwizzler   
pack_arrayr   ) rV  r6   rm   r   r   r   r   sizes_and_stridesaligned_ptr_idxas_i64	alloc_ptrllvm_dynbase_ptr
world_teampeer_ider`   swizzle_argdtype_or_bitwidthrz   vrB  r_  r  r^  r  r   r   r=   ra  r1  r`  s                          r)   init_tma_descz2LaunchContext._get_tma_desc.<locals>.init_tma_desch  s    	A#	sxx(224
2;!"+a!  )/(G(G(L%6% AA#F3MM&&*AB	%%Iv/(V=P=PRVReReRjRj
 lO4

$
$
&~~c1-*YYmm8$&( %L"((3c*G*<bAg
 
$
$
&YYmm!"( {{ax301111  &&11 	 i-dCL1B5D1IJAfQiJK1B451IJAfQiJKk31HIAa3iIJ	
 			"0$7A $ J6 KJIs*   K 3K!K&K+	KKKc                     | S ru   r'   )r   s    r)   cast_tma_descz2LaunchContext._get_tma_desc.<locals>.cast_tma_desc  s
     r+   )r   r   rP  )r  r2  r   r^   r_   r   r>  r   r   r   r   r   r]  TMA_DESCRIPTOR_BYTESr   )r=   r  r^  r_  r`  ra  r#   tma_desc_keytma_descr  r  r  r   r   r1  s   ``````     @@@@r)   _get_tma_desczLaunchContext._get_tma_descU  s     2(;HbmmHMM:GGVI 5wP\^ghL((,,\4@@INN''+cNN''+cww}}[)fG8 G8P $$
,!#	 % h ,4d<(Or+   
gmem_slice
collectivepartitionedimplementationc           
      V    t         j                  j                         t        j                  |j                        }|j                         \  }}	|t        j                  |j                        k7  rt        d      |D 
cg c]  }
t        |
t        j                         }}
d}t        |      r|ddgk7  rt        d      |d   }t        |t        j                        st        d      t        |j                        dk7  rt        d	      |j                   }t         j"                  j                  |      rt        j$                  |      d
kD  rt        d      |j&                  rt        d      t)        d      g|dd }t        j*                  |t        j                  |j                        j                  |t,        j.                  k7        \  }}}||j                  d   g|dd }~t1        fd|D              }~|r+|t,        j.                  k7  rt        d      |t        d      |3t3        j4                  t3        j6                  t3        j8                  |             |dz   k(        d   d   }|st        d      t;        j<                   fd|D              }|dkD  rt;        j<                   j>                        dk7  rt        d      ||   |z  dk7  rt        d| d||          ||xx   |z  cc<   tA        |      }tC        jD                  ||   tC        jF                   jI                  |      tK        ||                     ||<   t1        |      }t1        d tM        |      D              rP|t,        jN                  k7  r=tM        |      D cg c]
  \  }}|r	| }}}tQ        g |      gfd|D        }t1        |      }|D ]$  }|jS                  |      }|jU                  |      }& tA        |      |||fS c c}
w c c}}w )z:Performs setup common to TMA and CP_ASYNC implementations.z3async_copy assumes the GMEM reference is contiguousNTFzFGathers/scatters only supported along the first dimension of 2D arraysr   z0Gather/scatter indices must be a FragmentedArrayr   z!Gather/scatter indices must be 1Dr  zDGather/scatter indices must be integers that are at most 32-bit widez'Gather/scatter indices must be unsigned)	check_oobc              3   n   K   | ],  }t        |t        j                        st        |      n| . y wru   )r  r   rN   rw   )ry   rz   r{   s     r)   r|   z4LaunchContext._prepare_async_copy.<locals>.<genexpr>  s.      >?:a2!U9s   25z6Only the TMA implementation supports collective copiesz1Collective copies with gather/scatter unsupportedz(Only collective loads can be partitionedc              3   <   K   | ]  }j                   |     y wru   r;  ry   rn   r=   s     r)   r|   z4LaunchContext._prepare_async_copy.<locals>.<genexpr>  s     !K1$"3"3A"6!K   r]   z7Partitioned loads only supported for clusters of size 2zThe collective size (zO) must divide the slice shape along the partitioned dimension, but it has size c              3   ,   K   | ]  \  }}|s	|  y wru   r'   )ry   rz   squeezeds      r)   r|   z4LaunchContext._prepare_async_copy.<locals>.<genexpr>  s      axs   
c              3   R   K   | ]  }|j                  t                       y wru   )rM   ra   )ry   rm   squeezed_dimss     r)   r|   z4LaunchContext._prepare_async_copy.<locals>.<genexpr>$  s     N!!''#m"45Ns   $')+r   r   r   r^   r_   r   r   get_contiguous_stridesrD   r;   r  faFragmentedArrayr   rd   ra   
mlir_dtyper   r0  	is_signedre   parse_indicesr4  r5  rO   npwherecumsumarrayr   prodr;  listr
   r   r   rK  rw   	enumerater6  r   rC   rG   )r=   r  r  r^  r  r  r  gmem_ref_tygmem_stridesr   rp   is_gathered_dimgather_indices	idx_dtypebase_indicesslice_shapeis_squeezeddyn_base_indicescollective_sizerz   r  sliced_dimsrm   r{   r  s   `                      @@r)   _prepare_async_copyz!LaunchContext._prepare_async_copy  sB    LLE--.K!88:OL!u33K4E4EFF
? 
 CMMQz!R%7%78MOM04N
?	T5M	)!
 	
 "!}n(:(:;KLL	^!!	"a	'<== ++i^^&&y1U^^I5NQS5S_``		!	!BCC$K1*QR.1j .3-@-@
hmm$** $;$?$??	.*L+{ !#))!,?{12?k CO  	 	266	6QRR		#!"UVVHH
))RXXk**
+{1}
<>>?AABDk CDD		!K
!KKo	1	99T&&'1,#G  {#o5:%o%6 7k*+- 
 	K _4  01(-

[)JJ  ,aK0H%.P)
% !!12 &{3 M +B+K+KK*3K*@Q;1hQQkQ*+I]+I[+IJ PN~NPn $K 3**+;<%%k2k3
 	[ u Nb Rs   !P -
P%8P%smem_refr  r  r  c                 X    t         j                  j                         t        |       t        j                  |j
                        }t        j                  |j                        |j                         \  }t        fddd D              rt        d      t        |      t              dkD  r}|{t              z
  dz   dkD  rt        d      t        fd|D              }t        |      }g ||}|j                        t!        |j#                  t                          dt!              t!              t%        d	 d D              sJ t'        j(                   fd
|	D              }|dkD  r	|
|J dt*        dt         j,                  dt*        ffd} j/                  |	      }|}|duxr |t0        j2                  j4                  k7  }t7        |rdd n      D ]z  \  }}||z  dk(  r ||||       d} na||z  dk(  rV|dkD  s*t9        j:                  |t=        |            } ||||       t9        j>                  |t=        |            }||z  }z n ~|dkD  rt        d d|       tA              dkD  rt        dt                     d   z  x}dz  dk7  rt        d|       |L|t0        j2                  j4                  k7  r/d   |dz  z  k7  r!t        d|d| d|dz  z   dd    d	      |fS )zAFinalizes setup specific to the TMA implementation of async_copy.c              3   4   K   | ]  }|z  d z  dk7    yw)r   r   Nr'   )ry   rp   element_bitwidths     r)   r|   z-LaunchContext._prepare_tma.<locals>.<genexpr>H  s!     
Fq1#%*
Fs   Nr\   zUasync_copy requires all GMEM strides except the last one to be a multiple of 16 bytesr(  r   z5Async copies only support striding up to 5 dimensionsc              3   (   K   | ]	  }|     y wru   r'   )ry   rn   r  s     r)   r|   z-LaunchContext._prepare_tma.<locals>.<genexpr>V  s     "Jq<?"Jr   c              3   &   K   | ]	  }|d k(    ywr   r'   ry   rn   s     r)   r|   z-LaunchContext._prepare_tma.<locals>.<genexpr>_  s     ?!qAv?r   c              3   <   K   | ]  }j                   |     y wru   r  r  s     r)   r|   z-LaunchContext._prepare_tma.<locals>.<genexpr>g       I 1 1! 4Ir  rE  r@   
num_chunksc           	      4   | k\  sJ | xx   |z  cc<   t        j                  |t        |                }t        j                  |    |      | <   Bt	        j
                  t        d       f| z
  z  t	        j                  ||          fz         y y ru   )r
   r   rw   r   r   rg   re   ds)	rE  r@   r  block_offsetr  r{   num_squeezed_dimsr  r  s	       r)   partition_dimz1LaunchContext._prepare_tma.<locals>.partition_dimj  s    ''''CZ'zz#qS)95'AB %

+;C+@, O''T{n&7 78,C(89;<(  r+   r   z>None of the leading dimensions in the transformed slice shape z% is divisible by the collective size    zKAsync copies only support copying <=256 elements along each dimension, got r   zhAsync copies require the number of bits copied along the last dimension to be divisible by 128, but got r"  zAsync copies with swizzle=z7 require the last dimension of the slice to be exactly z bytes i.e.  z elements, but got z
 elements.)!r   r   r   r  r^   r_   r   r0  r   r   r   rd   ra   rO   r   rC   r  rG   allr   r  rP   rN   rK  ru  rv  rw  r  r
   r   rw   rv   r   )r=   r  r  ra  r  r  r  r  r^  r  r  r  r   squeezed_dim_stridescollapser  r  r@   rem_collective_sizehas_swizzlerE  
slice_sizedim_idx	zeroth_bwr  r  r{   r  s   ` ` ``                  @@@@r)   _prepare_tmazLaunchContext._prepare_tma3  s    LLE 'x0--.K~~k&>&>?!88:OL!

FL"4E
FF"  M*
;! 6	[	-	-	1A	5C
 	
 #"JM"JJ01EFh222n!112BC11%2DEFk,-{#K?{+=,=>???? iiIjIIO{2###S rxx S   Z(c+

 ?33>>> 
 ')+cr
{ /#z ++q0
S"5
6 !

 :-2!^kk#qU';<G#w
3++c1Z#78CJ.
  	q	 }  "
 	
 ;#";/02  !_'777	3>!C88A{D 
 	|11<<<O!0@@@'wj )"") +{//0 1"oj*  k#3^DDr+   r'   )r  r^  r_  barrierra  arriver  r  	predicater#   r  src_refdst_refr  r  r  c                D!   XY t         j                  j                         Yt         j                  j	                  d      }t         j                  j	                  d      }t         j                  j	                  d      }t        j
                  |j                        }t        j
                  |j                        }|j                  }t        j                  |      }||j                  k7  rt        d| d|j                         t        |	t        j                        r|	f}	n|	d}	t        |t              s|f}t        |t              s|f}|;|t        j                   k7  rt        d      t#        ||      st        d	| d
|       |j$                  `t        j&                  |      rK||}}|t        j                   k(  r|-t        d      |t        j(                  k(  sJ |t+        d      |Ed}nBt        j&                  |      r"|j$                  ||}}|t        d      |d}nt        d      |	r||u rt        d       j-                  ||||	|
|      \  }}}}}~t        j
                  |j                        }t        j
                  |j                        }|t        j(                  k(  r|rt+        d      t/        d |dt1        |       D              sJ |t1        |      d |j2                  k7  r9t        dt        |j2                         dt        |t1        |      d              |t        j(                  k(  r:|	rJ |
J t        |t4              st+        d      |t+        d      ||u rt        d      |J d|z  |z  }|t7        d|f      fk7  rt+        |      t9        j:                  g |j2                  dd || }|j=                         d   }|j>                  jA                  |      |jB                  d D  cg c]  } tE        jF                  ||        }!} t        jH                  |jK                         |!      }"t        jH                  |jM                         |!      }#tE        jN                  |"|#      }$|dk\  rdnd|z  }%|dk  r|}&nFt         jP                  j                  |      r%t        jP                  |      jR                  dk(  r|}&n|}&tE        jT                  |$tW        |%|            }$|jB                  dk7  rt+        d      t8        jX                  j[                  |||t        |j2                        d      }'t        j\                  t        j^                  |      |$g|&      }(ta        jb                  t         jd                  jg                  d       |(      }(|jh                  |z  dz  })|)dk(  rtj        jl                  jn                  ntj        jl                  jp                  }*|'D ]]  \  }+},}-}.ts        d! tu         |-       |d"      D              }/t        j\                  |(|/|%z  g|&      }0tk        jv                  |.|0|)|*       _ |tk        jx                          yt*        |t        j                   k(  sJ  j{                  |||||||||	|

      \  }}}}|J t        j
                  |j                        j=                         \  }1}2t}        d# tu        |1t        j~                  |j2                        |j2                        D              rt        d$|1 d%|j2                         t        j                   fd&|	D              }3t        j                  |      |z  |3z  dz  dk(  sJ tW        t        j                  |      |z  |3z  dz  |      }4|ddlB}5t        j                  |5jt                  d"      }6||u rt+        d'      |J |j                         }7|rt+        d(      |t        d)      t        |t4              st        d*      |j                  t8        j                  k7  rt        d+|j                         d,}8|8|d-   z  |z  }9|9d.z  rt        d/|9dz   d0      |rEt        j                  t        j                  j                        }:t        j                  |7|4|:1       |j=                         \  }}2t1        |      dk(  sJ |j2                  \  }2};d2}<|D ]$  }=|=j                  |      }|=j                  |<      }<& |<D  cg c]  } t        |        }>}  j                  |d|d|d-   f||      }?t8        j                  jh                  |8k(  sJ tE        j                  t        j                  d3      tE        jF                  |t        j                              }@tE        j                  |@tW        |8|            }Ats        d4 tu        |>dd- |dd- |dd-       D              }B|B|;kD  rt+        d5      t        j                  tD        jN                  Yfd6 |6|>||      D        tE        jF                  Yd            }CtE        j                  ||C      }Ct        j                  t        j                  j                        }t        d7  |6|dd- |>dd-       D              }Dt        |j                  j                        D ]Y  \  }E}Ft        j                  |j                        dk7  r4tE        j                  t         j                  j                  d8|      F      }FE|8z  t        j                  z  }GtE        jN                  AtE        jF                  ||G            }Htu        |>||<      D IJ cg c]F  \  }I}J} |Ir>tE        j                  tE        jT                  HtW        | |            tW        J|            H }K}J}I} |KD Ecg c]  }EtE        j                  Y|E       }K}Et        |8      D Ecg c]"  }Eta        j                  FtW        |E|            $ }L}Et        j                  D      D ]  }Mt        K      Xt        Xfd9tu        |>dd- |M      D              }Nt        j                  ||N      }Ot        j^                  |Od:;      }.ts        d<  |6|>dd- |M|dd-       D              }PtE        jN                  CtE        jF                  ||P            }Qta        j                  t         jd                  jg                  d=      ||.|?|7|QgLd>d?d@|8dz   z  z   dA        \ y|J  j                  |||t        |      ||      }?t        |      D Rcg c]  }RtE        j                  ||R       }S}Rt        |t4              r-t        j                  t        j                  j                        }|)tW        dt         j                  j	                  d            }t        j^                  |d:;      }.||u r|J |j                         }7|J |3dkD  r|
|3dk(  sJ |rvtE        j                  tD        j                  j                   j                  |	      tW        dY            }TtE        j                  ||T      }:t        j                  |7|4|:1       t1        |      }UdBj                  dC t        d,d,|Uz         D              }Vta        j                  t         jd                  jg                  d=      ||.|?|7gSdD|U dE|V dFd?d@|Uz  z   dA       y|rt        j                  |7|4|1       |3dkD  r5tE        j                  |t        j                   j                  |	            }Wnd}Wtk        j                  |.|?S|7g W|G       y|t1        |      }UdBj                  dH t        d:d:|Uz         D              }Vta        j                  t         jd                  jg                  d=      ||.|?gSdI|U dJt        |       dK|V dLdMd@|Uz  z   dA       |rtk        j                          yytk        j                  |?|.S|1       |rtk        j                          yyc c} w c c} w c c} }J}Iw c c}Ew c c}Ew c c}Rw )Na  Initiates an async copy between GMEM and SMEM.

    Exactly one of `src_ref` and `dst_ref` must be in GMEM and in SMEM, and the
    SMEM reference must be contiguous. The GMEM window that is read or written
    to is specified by the `gmem_slice`. The copy can change the order in which
    the data appears in the window by applying a sequence of transforms to the
    GMEM reference (as specified by `gmem_transform`).

    When `collective` is specified (only allowed for GMEM -> SMEM copies), the
    identical async_copy must be scheduled by all blocks that share the same
    coordinates along collective dimensions within a cluster. The behavior is
    undefined otherwise. The semantics of collective loads depend further on the
    `partitioned` argument:

    - If `partitioned` is not specified, all blocks load the same data into
      their shared memory and all receive the update in their barriers, unless
      `arrive` is False. If `arrive` is False, you should expect the barrier to
      have expect_tx incremented by the same amount of bytes as if `collective`
      was not specified.
    - If `partitioned` is specified, each block only loads a separate slice of
      the data into SMEM, partitioned into equal tiles along the `partitioned`
      dimension. In this case only the barrier of the first block in the
      collective will have its expect_tx incremented by the total size of the
      transfer across all blocks involved in the collective. Barriers supplied
      by other blocks will be ignored (even if `arrive` is True).
    r"  r$  r  z Expected same element type, got z and Nr'   z/Only the TMA implementation supports reductionszReduction op z: not supported by the TMA implementation for element type z1Barriers are required for TMA GMEM -> SMEM copiesz9Barriers are unsupported for CP_ASYNC GMEM -> SMEM copiesTz0Barriers are unsupported for SMEM -> GMEM copiesz#Only SMEM <-> GMEM copies supportedz*Only GMEM -> SMEM copies can be collectivez9Integer indexing in gmem_slice not supported for CP_ASYNCc              3   &   K   | ]	  }|d k(    ywr   r'   r  s     r)   r|   z+LaunchContext.async_copy.<locals>.<genexpr>+  s     @!qAv@r   zMExpected the SMEM reference to have the same shape as the transformed slice: z != zaCP_ASYNC needs to be performed by the whole warpgroup and does not support the predicate argumentz:Gather/scatter unsupported for the CP_ASYNC implementationz9CP_ASYNC implementation only supports GMEM -> SMEM copiesr   r   r]   zOnly 2D copies implementedF)	optimizedz!llvm.ptr<1>c              3   ,   K   | ]  \  }}||z    y wru   r'   )ry   rz   rp   s      r)   r|   z+LaunchContext.async_copy.<locals>.<genexpr>h  s     _1a!e_r   )strictc              3   <   K   | ]  \  }}}||k7  xr |d k7    ywr   r'   )ry   rp   csrn   s       r)   r|   z+LaunchContext.async_copy.<locals>.<genexpr>  s.      Ar1 	
RAFs   zFasync_copy needs the SMEM reference to be contiguous, but got strides z for shape c              3   <   K   | ]  }j                   |     y wru   r  r  s     r)   r|   z+LaunchContext.async_copy.<locals>.<genexpr>  r  r  z.Scatter unsupported for the TMA implementationz6Gather/scatter unsupported when using integer indexingz+Gather/scatter TMA can't perform reductionsz(Gather/scatter TMA can't use a predicatez#Unsupported gather indices layout: r#  r\   i   zBGather/scatter TMA would require breaking it up into transfers of z( bytes, but need a multiple of 128 bytes)r  )r   r   )syncc              3   8   K   | ]  \  }}}|s
|d z
  |z    ywr   r'   )ry   grn   rp   s       r)   r|   z+LaunchContext.async_copy.<locals>.<genexpr>  s)      (aA q5A+(s   z*Non-gather dims don't fit into the columnsc              3   ~   K   | ]4  \  }}}|s,t        j                  |t        j                  |             6 y wru   )r
   r   rq  )ry   r  r@   rJ  r{   s       r)   r|   z+LaunchContext.async_copy.<locals>.<genexpr>  s:       !S& 	 jjennUF;<s   :=c              3   .   K   | ]  \  }}|rd n|  ywr   r'   )ry   rn   r  s      r)   r|   z+LaunchContext.async_copy.<locals>.<genexpr>  s      % Aqq!a-%s   )r#  c              3   B   K   | ]  \  }}|rt              n|  y wru   )next)ry   r  rz   gather_slice_idx_its      r)   r|   z+LaunchContext.async_copy.<locals>.<genexpr>
  s*      !Q ,-d&'!3s   r&  )r   c              3   2   K   | ]  \  }}}|s||z    y wru   r'   )ry   r  r@   rJ  s       r)   r|   z+LaunchContext.async_copy.<locals>.<genexpr>  s'      ! !S& 	 Fl!s   
!llvm.voidz@$0 cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes [$1], [$2, {$4, $5, $6, $7, $8}], [$3];zb,r,l,r,rhas_side_effects,c              3   &   K   | ]	  }d |   yw$Nr'   ry   rz   s     r)   r|   z+LaunchContext.async_copy.<locals>.<genexpr>A       DA!A3Dr   z
            {
            .reg .b32 mapped_addr;
            @$0 mapa.shared::cluster.u32 mapped_addr, $3, 0;
            @$0 cp.async.bulk.tensor.zrd.shared::cta.global.tile.mbarrier::complete_tx::bytes.cta_group::2
                                  [$1], [$2, {z-}], [mapped_addr];
            }
            )multicast_maskr  c              3   &   K   | ]	  }d |   ywr  r'   r  s     r)   r|   z+LaunchContext.async_copy.<locals>.<genexpr>b  r  r   z @$0 cp.reduce.async.bulk.tensor.zd.global.shared::cta.z.tile.bulk_group [$2,{z	}], [$1];zb,r,l)qr   r   r   r   r   r^   r_   r   r   r0  rd   r  r   rG  rO   r4  r5  r   r   is_smem_refr6  r;   r  r  ra   rD   r  rX   r  tiled_copy_smem_gmem_layoutr   rY   tile_stridesr`   r
   rq  dyn_dotlane_indiceswarp_indicesr   	FloatTypewidthrv   rw   r  transfer_tiledrS  
memref_ptrr   addrspacecastr   r   vector_lengthr   LoadCacheModifierKindCGCAr   rb   cp_async_shared_globalcp_async_commit_groupr  r   r  r   r  builtinsr   partialget_ptrlayoutTMA_GATHER_INDICES_LAYOUTsingle_thread_predicateThreadSubset	WARPGROUPnvvm_mbarrier_arrive_expect_txrI   boolr  r   warp_idxWARPS_IN_WARPGROUPr   reducerf  WARPr  	registersflatr  extui
VectorTyperc   extractelementr  ndindexiterrg   
inline_asmreversedcmpiCmpIPredicateeqrK  andijointruncicluster_collective_maskr;  *cp_async_bulk_tensor_shared_cluster_globalr*   cp_async_bulk_commit_group&cp_async_bulk_tensor_global_shared_cta)Zr=   r  r  r  r^  r_  r  ra  r  r  r  r  r#   r  rX  i16r  
src_ref_ty
dst_ref_tyr   r  r  r  r  r  r  r  r  smem_ref_tyswizzle_elemsr  r  rp   dst_tiled_strideslane_offsetwarp_offset
dyn_offsetoffset_scalegep_type	transfersgmem_base_ptrbytes_per_transfercache_modifier_get_updateget_base_idxsmem_ptrconstant_offsetgmem_ptrsmem_stridesr   r  transfer_bytesr  zipsbarrier_ptrROWS_PER_INSTRsingle_tma_bitsarrive_predicate	gmem_colsslice_gather_stridesrm   is_gather_dimr  r  gather_linear_idx_warpmax_non_gather_linear_indexcol_base_offsetnon_gather_slice_shaperz   reggather_linear_idx_reggather_linear_idxr  rn   gather_slice_idxgather_rowsnon_gather_idxssmem_indicestransfer_smem_refcol_slice_offset
col_offsetr@   rev_dyn_base_indicesfirst_blockr`   idx_operandsr  r  r{   sZ   `                                                                                       @@r)   
async_copyzLaunchContext.async_copy  s   X LLE		$	$Q	'B
..
%
%b
)C
..
%
%b
)Cw||,Jw||,J**L~~l3z...,\N ;%%&( 
 *cmm,=j		jne,&(nj%(=j	266	6JKK+L,GL> *00<~?
 	

 &5+<+<Z+H"Gh	266	6?NO
O!8!A!AAAA#I  
			:	&:+B+B+J"Gh		KLL	<==h')CDD 	  	 	--.K--.K0999m
E  @{+>C,>?@@@@3}%&';+<+<<!!&{'8'8!9 : ;KM 2 34568  0999^   	#45!.
 	
 
	#!"^__	W	TUU   'k%55m	M1m*<=?	?!.11-- R#%,.>f !779!<l ==--l;K<L<L<NO ..a
   MM&"5"5"79JKkMM&"5"5"79JKk::k;7j*a/QQ:J5Jl	A	<<""<0R\\,5O5U5UYZ5Z;;z1\3+?@j			Q	!">??$$33
GVU;+<+<%= 4 i ))%*:*:8*DzlT\]m((~)FVm!//2BBaG  2% 
$
$
'
')),, 
 4= \
/$x_LNLY]0^__&&},7V6WYab##Hh8JN[\ 
""$  "! 488888 		
 >X{,n mmHMM2IIKOL!
 (():):;
  "^;{/@/@.AC 
 iiIjIIO99[!$44FJaOOO		+!11OCqH#N !x||D9d 
W	!"RSS   OO%k	!"Z[[		!FGG	#45CDD			"">">	>>~?T?T>UVWWn&R8;KKo	4	1$%%MO
 	

 
 889K9K9U9UV,,&	
 $::<olA!### &&la.4 I!**<8 223GHI )==1tAw=m=##
Bq+b/&:G\h
 ))77>III 
..d
#
..e66
7h  %zz(Anc4JK %( (]3B/Sb1A<PSQSCTU( %! 
%y	0!"NOO!((
**$(!1<% ..
"
o ((o>o//0B0B0G0GHi$ %$(Sb)9=";M$N%   n66;;< ,&!S>>.334:BMM--dC8#># !N 2U5M5M M!JJ"ENN38M$N
 }k;OP
 
1a KK$5qCyA1Q9M
 

 AQQ1E,,UA6QQ9>~9N
45DQq#Y/
 
  "zz*@A 	O $%5 6
 mCR0/B , $00<H
%%&7aH(  !$($o|CR7H%! 
 zz/5>>#GW3XY*
//ggmmL)(Hk:TT R$.1"455#'	',Z !!!!!.,kG\H /77G.H'*c"  ).///0B0B0L0LMiAr~~22156iq9H7   OO%k!!!	1	!8!###

!!$$d&6&6z&BAaK+ #ZZ	;?


.
.>5E ;xxDq!d(1CDDGGMM,'(KO:NO& '+V ,00<~ > t#!	
 

.
.>Y Q <<5001B1BJO.  .77h 4k2)Y	

 
	!;xxDq!d(1CDD
''--
%Xh
=(<
=,TF2GH\]iHjGk  lC  DP  CQ  Q[  \
D4K
	
 

)
)
+  	33h 4		
 

)
)
+ M	z >n

 R
Ls+   AB:AB#AAB
6AB"'ABAB)r  r^  r_  ra  r  r  r  c                   t         j                  j                  d      }	t        |t        j
                        r|f}n|d}t        |t              s|f}t        |t              s|f}t        j                  }
| j                  ||||||
      \  }}}}}~| j                  |d ||||||||
      \  }}}}|t        d      | j                  |||t        |      |d       }t        |      D cg c]  }t        j                  |	|       }}t        |t               r-t#        j$                  t"        j&                  j(                        }|)t+        dt         j                  j                  d            }t-        |      }dj/                  d t1        dd|z         D              }t3        j4                  t         j6                  j9                  d	      ||g|d
| d| ddd|z  z   d       y c c}w )Nr  r'   z+Gather/scatter prefetch not implemented yetr(   r   r  c              3   &   K   | ]	  }d |   ywr  r'   r  s     r)   r|   z/LaunchContext.async_prefetch.<locals>.<genexpr>  s     @asG@r   r]   r  z"@$0 cp.async.bulk.prefetch.tensor.zd.L2.global.tile [$1, {z}];zb,lr  Tr  )r   r   r   r  r   rG  rO   r4  r5  r  r  r;   r  r  r
   rf  r  r   r  r  r	  rw   ra   r  rc   r   r  r   r   )r=   r  r  r^  r_  ra  r  r  r  r  implr  r  r  r  r   r  r@   rP  r`   rR  s                        r)   async_prefetchzLaunchContext.async_prefetchs  s	    ..
%
%b
)C*cmm,=j		jne,&(nj%(=j#''D 	  *nj+t	 	 		
 7Q%~ ! MNN!!.,kG$ " H /77G.H'*c"  ).///0B0B0L0LMiAr~~22156i{D88@U1a$h-?@@LOO
l#	H434
,TF2J<.X\]ts   .G.allow_groupsawait_read_onlyscopec                    t        j                  ||       |t        j                  j                  k(  rt        j
                          y |t        j                  j                  k(  rt        j                          y t        d|       )N)readzUnsupported scope: )	r   cp_async_bulk_wait_groupr   r  r	  warpgroup_barrierr  warp_barrierrd   )r=   rX  rY  rZ  s       r)   await_async_copyzLaunchContext.await_async_copy  sg     	!!,_E"",,,	%$$))	),UG455r+   c                 V    t        j                  |       t        j                          y ru   )r   cp_async_wait_groupr   r^  )r=   rX  s     r)   await_cp_async_copyz!LaunchContext.await_cp_async_copy  s    \*	r+   c                    | j                   ry d| _         t        j                  | j                  j                        5  t        j
                  j                  t        j                  j                  d            }t        j                  d|d       t        j
                  j                  t        j                  j                  d            }t        j                  d|d       t        j
                  j                  t        j                  j                  d            }t        j                  d	|d       d d d        y # 1 sw Y   y xY w)
NTz!llvm.func<i32()>nvshmem_my_peprivate)sym_visibilityz$!llvm.func<!llvm.ptr(!llvm.ptr,i32)>rk  z$!llvm.func<!llvm.ptr(i32,!llvm.ptr)>rg  )r?  r   r   r9  r   r  r   r   r   r   
LLVMFuncOp)r=   nvshmem_my_pe_typenvshmem_ptr_typenvshmemx_mc_ptr_types       r)   rp  z#LaunchContext._ensure_nvshmem_decls  s       $D			4;;++	, ;;??277==9L+MN
oo
-i 
''-->
? oom%5iP[[__
''-->
? oo
1)  s   C=EEr6   peerc                 b   | j                          t        j                  j                  |j                        rt        j                  |j                        }|j                         \  }}t        j                  j                  |j                  |j                  t        j                  j                  d|      |j                        }t        j                  | j                  t        j                  |      |      |      S |j                  t        j                  j!                  d      k7  rt#        d|j                         |j                  t        j$                  j'                  d      k7  rt#        d|j                         t)        j*                  |j                  ||gg g d      S )Nr   r   z Unsupported type for to_remote: r  zpeer index must be an i32, got rk  rh  )rp  r   r^   r  r_   r   r   rD   r   r   r   r   ptr_as_memref	to_remoter  r   r   rd   r   r   r   rr  )r=   r6   rl  r   r   r   result_types          r)   ro  zLaunchContext.to_remote  s@    	}}) }}SXX&f002jgqMM%%
,,






"
"1g
.


	k   
..))#.
5{  xx277==--9#((DEEyyBNN//338DEE99SXXT{B=IIr+   c                    t         j                  j                  d      }| j                          t         j                  j                  |j                        st        d|j                         t        j                  |j                        }|j                         \  }}t         j                  j                  |j                  |j                  t         j                  j                  d|      |j                        }t        j                  |d      }t!        j"                  |      }t%        j&                  |j                  ||gg g d      }	t!        j(                  t!        j*                  |	|            S )Nr  z*Unsupported type for to_remote_multicast: r   rg  rh  )r   r   r   rp  r^   r  r_   rd   r   r   rD   r   r   r   r
   rq  r   r  r   rr  MultimemRefrn  )
r=   r6   r  r   r   r   rp  r  ptrmc_ptrs
             r)   to_remote_multicastz!LaunchContext.to_remote_multicast  s   
..
%
%b
)C ==##CHH-CCHH:NOO ]]388$F..0JGQ--##
  G,	K Q'J


3
CYY:s#R4EF U00EFFr+   c                     | j                          t        j                  j                  d      }t	        j
                  |g g g d      S )Nr  re  rh  )rp  r   r   r   r   rr  )r=   r  s     r)   	device_idzLaunchContext.device_id  s9     
..
%
%b
)C99S"b"_==r+   ru   )2r/   r0   r1   r   Moduler   r   rO   rP   r   OnDeviceProfilerdataclassesfielddictr>  rN   r5   r   r?  r  
contextlibcontextmanagerrD  r   rG  r   rK  r   r]  r-   TMAReductionOpr  r4  r  r  r  r  r5  r   
BarrierRefrS  rW  r  r	  r`  rc  rp  ro  ru  rw  r'   r+   r)   r8  r8    sd   
))c3m$$&*(t#* k59 4BHHeCHosTz5#9M3NPSUXXYhh :  %$  CG#--!884?	xx* #.</:## t# 288*d*+	#
 RXXJO,# 
xx#Bff OS01f "((N_4t;	f
  %S#Xf Tzf #T)fPtt t OS01	t
 3==)D0t :t .tlxExE 4xE Tz	xE
 9xE bhhm,xE 38_xE OS01xE 3==)xE :xE~ FH>B)-  CG $7H7J,00G0K0K!D, xxD, xx	D,
 D, &os.B(CCD, "((N_4t;D, $&D, TzD, TkD, 3==)CMM9D@D, :D, D#44D, #T)D,  .!D,T DF*.AE"5F5HJ hhJ 	J
 $eOS,@&AAJ .4'J 4ZJ '#--7$>J tJ xx$!22JZ 8="'"4"4">">
6
604
6
6c (J288 J288 J,GRXX G,> >r+   r8  c                       e Zd Zy)rt  Nr.   r'   r+   r)   rt  rt    r2   r+   rt  r  c                 x   |dk(  rt        d      t        | t        j                        rt        d      | j                  j
                  }|j                  j                  d      r|j                  D cg c]  }t        ||dz
         }}|j                  D cg c]  }|j                   }}|j                  D ci c]  }||j                  |    }}t        j                  j                  |j                  |||      }	t        |	j                        dkD  r|	j                  S |	j                   S t        |t"        j$                        rR|j&                  j(                  dk(  r9t        j*                  j-                  d      }
t#        j.                  |
g g g d      S t        d	|       c c}w c c}w c c}w )
Nr   zDgmem_peer_id computation is too complicated to recompute on the hostz/Can't recompute a value that's a block argumentzarith.r   re  r  rh  z1Unrecognized op can't be recomputed on the host: )rt  r  r   r  r   r   OPERATION_NAME
startswithr  rs  resultsr_   r   	Operationcreatera   r   r   CallOpri  valuer   r   rr  )r  rj  r   rO  new_operandsrresult_typesnanew_attributesnew_opr  s              r)   rs  rs    sr   	QY
N  ))*
L
MM}}"!!(+=?[[I&q$(3ILI$&JJ/qAFF/L/68mmDb"--++DND\\  
<~F !0146>>G&--G DKK RYY__%G
..
%
%b
)C99S"b"_==9">	  J/Ds   5F-F2>F7)r"  )@collections.abcr   r   r}  rz  rS   r   r   typingr   r   jax._src.libr   ru  jaxlib.mlirr   jaxlib.mlir.dialectsr	   r
   r   r   r   r   r   r   numpyr   r   r  r   r   r  r   r  r	  r*   rw   r-   GLOBAL_BROADCAST	dataclassr5   EnumrR   rX   r   r   ry  r   r   r  rN   r  r  r   r  r   rP   r2  r4  r8  	Exceptionrt  rs  r'   r+   r)   <module>r     sC    /       ;  - & ( % $ % ' %  $    ~ #  
GG  #$  d#H H $H,tyy 
 d#ZO Z $Zz d#  $4 d#/o / $/d ,, 4 O' O'd hh''.! 4'!02!	!D&''& 4'& 	&Rdii 
 C> C> C>L y  RXX r+   