
    uki                       U d Z ddlmZmZ ddlZddlZddlZddlZddlZddl	m
Z
mZ ddlZddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlZ dZ!e"e#d<   dZ$e"e#d<   e$e!z  Z%e"e#d<   dZ&dZ'dZ(dejR                  de"fdZ* e*ejR                  jV                        Z,	 ddejZ                  de"dz  fdZ.d  Z/d! Z0d"e"e1z  fd#Z2d$ Z3d%dd&d'Z4 ejj                  d%(       G d) d*             Z6d+ejn                  d,ejn                  fd-Z8ed.   Z9	 dd/ejt                  d+ejn                  d0e9d1e;dz  fd2Z< ejj                  d%(       G d3 d4             Z=d5 Z>ej~                  d6        Z@d7 ZA ej                  eAej                  ej                        ZE ej                  eAej                  ej                        ZHdd8ZIdd9ZJdd:ZK G d; d<ej                        ZMdaNeMdz  e#d=<   eMj                  fd>eMfd?ZPej~                  eMj                  fd>eMfd@       ZQdA ZRdB ZSddCedD   dz  fdEZTd/ejt                  fdFZUd/ejt                  fdGZVd/ejt                  fdHZW ejj                  d%(       G dI dJ             ZXeXZYdKejn                  dejn                  fdLZZ e[d      fdMejZ                  dNe[dz  fdOZ\dKejn                  dPe]e"   dQe]e"   fdRZ^dKejn                  e6z  dSe_e"dTf   dejn                  e6z  fdUZ`dKejn                  e6z  dejn                  e6z  fdVZadKejn                  dejn                  fdWZbdKejn                  dejn                  fdXZcdKejZ                  de;fdYZddKejn                  dZee"   dejn                  fd[Zed%d\dSe_e"dTf   d]e;de_e]ejn                  e"z     e]e"   e]e;   f   fd^Zfd_ Zgd` Zhda Zi ejj                  d%(       G db dc             Zj ejj                  d%(       G dd de             Zk ejj                  d%(       G df dg             Zl ejj                  d%(       G dh di             Zmdj Zn G dk dl      Zo G dm dn      Zpdo Zqdp ZrddqZsdre_e"e"e"f   dseej                     ej                  z  fdtZuduej                  j                  dejt                  fdvZwduej                  j                  de;dz  fdwZxd+ejn                  dxeejn                  e"z     duejt                  dejn                  fdyZydz Zzd{ejn                  d|e"ejn                  z  fd}Z{d~ejn                  dejn                  dZejn                  fdZ|d{ejn                  dejt                  fdZ}d{e"de"fdZ~dejn                  de[fdZdeejn                     dejn                  fdZdeejn                     dejn                  fdZdde;fdZdej                  fdZdej                  fdZdKejn                  ejt                  z  de;fdZdKejn                  ejt                  z  de;fdZ	 ddejdejn                  dz  fdZde_ejn                  ejn                  ejn                  ejn                  f   fdZdejn                  fdZddejn                  dejn                  dejn                  dz  fdZy)zUtilities for code generator.    )IteratorSequenceN)AnyLiteral)numpy)mosaic_gpu_dialect)mlir)ir)_gpu_ops_gen)arith)builtin)gpu)llvm)memref)nvvm)scf)vector    	WARP_SIZE   WARPGROUP_SIZEWARPS_IN_WARPGROUPl         i      address_spacereturnc                     | xt         j                  j                  k(  r yt         j                  j                  k(  ry	 t	        d|        )N      zaddress_space not supported: )r   AddressSpaceGlobal	WorkgroupNotImplementedError)r   s    \/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/jax/experimental/mosaic/gpu/utils.pygpu_address_space_to_nvptxr$   4   sA    	 			 	 				#	#	
"? OPP    	memref_typtr_memory_spacec                 X   |j                         \  }}|dk7  rt        d      t        j                  j	                  d      }t        |j                        }|dnd| d}|dkD  r-t        j                  j                  d| d| d	| d
| d	      }n&t        j                  j                  d| d| d      }t        j                  |      }	t        j                  |	| dg      }	t        j                  |	| dg      }	t        j                  |	t        j                  |t        j                  j                  |d            dg      }	|dkD  rt        |j                        D ]P  \  }
}t        j                  |	t        j                  |t        j                  j                  ||            d|
g      }	R t        |      D ]P  \  }
}t        j                  |	t        j                  |t        j                  j                  ||            d|
g      }	R t!        j"                  |g|	g      S )Nr   z2Non-zero offset is not supported for ptr_as_memref@   ptrzptr<>!llvm.struct<(, , i64, array< x i64>, array<	 x i64>)>, i64)>r      r      )get_strides_and_offset
ValueErrorr
   IntegerTypeget_signlesslenshapeTypeparser   UndefOpInsertValueOp
ConstantOpIntegerAttrget	enumerater   unrealized_conversion_cast)r*   r&   r'   stridesoffseti64rankptr_tydesc_tydesciss               r#   ptr_as_memrefrL   C   s    446/'6q[
I
JJ
##B'#	Y__	$$,5D9I8J!2L&	AXggmm
6(-v >y	"G
 ggmmnVHBvhgFGG	g	$			D#s	+$			D#s	+$			
DOOC!3!3C!;<qc
$ 
AX)//* 1
R^^%7%7Q%?@1a&d '" 1
R^^%7%7Q%?@1a&d 
	+	+YK$	@@r%   c                 |   | st        d      | d   j                  }t        j                  j	                  d      }t        j
                  j                  d      }t        j                  |t        t        |       |      |      }t        |       D ])  \  }}t        ||g|      }t        j                  ||       + |S )NzEmpty arrayr   r)   	!llvm.ptr)r5   typer
   r6   r7   r:   r;   r   allocacr8   rA   getelementptrstore)valueselem_tyrE   rG   arr_ptrrJ   velem_ptrs           r#   
pack_arrayrY   e   s    	
]
##1INN'
##B'#77==%&KK#f+s 3W=' daWqc73HJJq( 
.r%   c                 `    g }d}| d d d   D ]  }|j                  |       ||z  } |d d d   S )Nr   )append)xsstrides_retstridexs       r#   get_contiguous_stridesra   r   sK    +&dd8 av
aKF 
TrT	r%   valc           	         t         j                  j                  |      st         j                  j                  |      rUt        | t        t
        j                  f      st        t        |             t         j                  j                  ||       }nt         j                  j                  |      r!t         j                  j                  ||       }ngt         j                  j                  |      r=t        j                  |t!        | t        j                  |      j"                              S t%        |      t'        j(                  ||      S N)r
   r6   
isinstance	IndexTypeintnpinteger	TypeErrorrO   r?   r@   	FloatType	FloatAttr
VectorTyper   	broadcastrQ   element_typer"   r   constant)rb   tyattrs      r#   rQ   rQ   {   s    ^^r"bll&=&=b&AcC,-d3i  >>b#&D	||r"<<B$D	}}#B#r}}R'8'E'E FGG
b
!!	D	!!r%   c                    t         j                  j                  | j                        rd| fS t         j                  j                  | j                        rct        j                  | j                        j
                  dk  r3t        j                  t         j                  j                  d      |       } d| fS t         j                  j                  | j                        rd| fS t         j                  j                  | j                        s)t         j                  j                  | j                        r6t        j                  t         j                  j                         |       } d| fS t        d| j                         )Nz%llur)   z%fzCan't print the type )r
   rf   re   rO   r6   widthr   extuir7   F32TypeBF16TypeF16Typeextfr@   r"   )args    r#   _debug_scalar_ty_formatr{      s   \\SXX&3;^^sxx(	~~chh%%*KK33B7=c3;ZZ388$9[[CHH%)>)>sxx)H
**RZZ^^%s
+C93CHH:>??r%   T)uniformscopec                   |s|t        d      |t        j                  }g }g }|D ]c  }t        j                  j                  |j                        rt        j                  j                         }t        j                  |j                        }t        |j                        dkD  rt        d|       t        |j                  d         D 	cg c]8  }	t        j                  |g t        j                  j                  |	g            : }
}	t!        t#        t$        |
       \  }}ddj'                  |       d}||z  }nt%        |      \  }}|j)                  |       |t        |j                        |j)                  |       f |rt+        j,                  t.        |	      nt0        j2                  } |       5  t5        j6                   | j8                  | d
z   |       d d d        y c c}	w # 1 sw Y   y xY w)Nz2Cannot specify scope to a non-uniform debug_print.r   z.2D+ vectors are not supported in debug_print: r   dynamic_positionstatic_position[,]r}   
)r5   ThreadSubset	WARPGROUPr
   rm   re   rO   rf   r@   r8   r9   r"   ranger   extractDenseI64ArrayAttrzipmapr{   joinr\   	functoolspartialsingle_thread
contextlibnullcontextr   printfformat)fmtr|   r}   argstype_formatsnew_argsrz   indexvec_tyrJ   vec_args
ty_formats	ty_formatctxs                 r#   debug_printr      s   	U&
I
JJ
]""E,( #c	}})ll e}}SXX&f	V\\	Q	!x
 	
 a)  ..! 2266s;h  c"98DEj$chhz*+1-i$h.s3niooc))	"5#: 
 U3!! 
 u D


L1D8(CD D/.D Ds   =G3'G88H)frozenc                       e Zd ZU ej                  ed<   edej                  fd       Zdej                  de	ej                     fdZ
y)MultimemRefrefr   c                 T    t        j                  | j                  j                        S rd   )r
   
MemRefTyper   rO   selfs    r#   rO   zMultimemRef.type   s    ==''r%   valueindicesc                 l    t        t        | j                  t        |                  }t	        ||       y rd   )
memref_ptrmemref_slicer   tuplemultimem_store)r   r   r   r*   s       r#   rS   zMultimemRef.store   s%    
\$((E'N;
<C3r%   N)__name__
__module____qualname__r
   Value__annotations__propertyr:   rO   r   rS    r%   r#   r   r      sH    	xx-(BGG ( ( HRXX,> r%   r   r*   r   c           
         t         j                  j                  d      }t        |j                        x}dvrt        d      |dz  }t        |t         j                  j                  |f|            }t        |      D cg c],  }t        j                  |t        j                  ||            . }}|dk(  rd}d}n7ddj                  d	 t        d|dz         D               d
}dt        |      z   }t        j                   t         j"                  j%                  d      | g|d| d| dddt'        |      z  z   d       y c c}w )Nr      r   r)   r   z.Only 32-, 64- and 128-bit stores are supportedr   z$1 {r   c              3   &   K   | ]	  }d |   yw$Nr   .0rJ   s     r#   	<genexpr>z!multimem_store.<locals>.<genexpr>   s     IasGI   }.v
!llvm.voidzmultimem.st.relaxed.sys.globalz.f32 [$0], ;lz,rThas_side_effects)r
   r6   r7   bitwidthrO   r5   bitcastrm   r@   r   r   extractelementr   rp   r   str
inline_asmr:   r;   r8   )	r*   r   i32bwvector_lengthrJ   regsvec_ptxvec_mods	            r#   r   r      s<   
##B'#UZZ  b6
E
FF(-
%**M+;SA
B% ]#

 %Q!78
$ 
 aGG388IU1ma6G-HIIJ"MGS''G //ggmmL!
lTl&wi{7)1E	D3t9
s   1E)addminmaxandorxorrq   	reduction	is_signedc                     t         j                  j                  d      }t        |       dvrt	        d      t         j
                  j                  |       rht        j
                  |       }t        |j                        dkD  rt	        d      |j                  d   }|t        |j                        z  dz  }t         j                  j                  |j                        r|dk7  rt        d      t        |j                        dvrt        d	      |d
v rdt        |j                         }n|dv r,|t	        d      |rdnd t        |j                         }nht	        d|       t         j                  j                  |j                        r|dvrt	        d      t         j                  j                  |j                        r|dk7  rt	        d      d}nt         j                  j                  |j                        rd}nt         j                  j                  |j                        rd}nt         j                  j                  |j                        rd}nat         j                   j                  |j                        rd}n5t        |j                        t        |j                        t        |       |dk(  rd}	d}
n3ddj#                  d t%        |      D               d}	dt'        |      z   }
d}|dk(  r|}n6t         j(                  j+                  d dj#                  d!g|z         d"      }t-        j.                  ||gd#| | |
 d$| d%|	 d&| d'd(|z  d)z   d*+      }|dk(  rt1        ||       S t%        |      D cg c]  }t-        j2                  |||g       }}t         j
                  j5                  d,|      }t1        t7        |D cg c]  }t1        ||       c}      |       S c c}w c c}w )-Nr   r   z-Only 32-, 64- and 128-bit loads are supportedr   Only 1D vectors are supportedr   z4Only single-element integer operations are supported>   r   r)   z7Only 32-bit and 64-bit integer operations are supported>   r   r   r   b>   r   r   r   zDSignedness must be specified for integer min, max and add reductionsrK   uz!Unsupported reduction operation: z.Only add, min and max are supported for floatsr   zOnly add is supported for f32f32bf16x2f16x2e5m2x4e4m3x4z$0r   r   r   c              3   &   K   | ]	  }d |   ywr   r   r   s     r#   r   z'multimem_load_reduce.<locals>.<genexpr>+  s     FasGFr   r   r   r,   r   z)>z&multimem.ld_reduce.relaxed.sys.global.. z, [$z];z=r,r   Tr   r   )r
   r6   r7   r   r5   rm   re   r8   r9   ro   r"   rk   rv   rw   rx   Float8E5M2TypeFloat8E4M3FNTyper   r   r   r:   r;   r   r   r   extractvaluer@   vector_concat)rq   r*   r   r   r   vtyr   vector_i32_lengthptx_tyr   r   acc_prec
asm_out_tyout_reg_structrJ   out_regs
vec_i32_tyout_regs                     r#   multimem_load_reducer      s    	##B'#b\&
D
EE]]b!
--
C
399~677IIaLM%1A1A(BBbH	~~  !1!12	!	!B
 	
 
#""	#8	3!E
 	
 
*	*Xc../01--  %C#.x8H8H/I.JK<YKHII		 	 !1!1	2	/	/IJJ			s//	0:;
;;;!!#"2"23::  !1!12''(8(89))#*:*:;!#"2"233 0 011
b
!!!GG388FU3D-EFFGrJGS*++G (!J
5',="=>?rBJ ??
e.yk(G9AfXUVW^V_ `b"#%. !>2&& () 	#~s3H  ""4-J8Lww
3LM
  Ms   P'Pc                   P    e Zd ZU ej                  ed<   eedf   ed<   ed        Z	y)	ForResultop.resultsc                 \    t        | j                        dk7  rt        | j                  d   S )Nr   r   )r8   r   r5   r   s    r#   resultzForResult.resultQ  s'    
4<<A<<?r%   N)
r   r   r   r   ForOpr   r   r   r   r   r   r%   r#   r   r   L  s+    	ii-c? r%   r   c                      dt        |t        t        f      s|g}dt        j                  j                  |      \   fd}|S )NFTc                    t        j                  
j                  d      }t        j                  
j                  d      }t        j                  |
|      }t        j                  |j                        5  |j                  }t        j                  j                  |j                        }r|\  } | ||      }r|g}t        j                  j                  |      \  }}|k7  rt        |      t        j                  |       d d d        |j                   }	t#        |t        j                  j                  |	            S # 1 sw Y   ?xY w)Nr   r   )r   rp   rO   r   r   r
   InsertionPointbodyinduction_variablejaxtree	unflatteninner_iter_argsflattenr5   YieldOpr   r   )fc0c1for_oprJ   inner_carrys
new_carrysnew_flat_carrysnew_carry_treedeffinal_flat_carrysboundcarry_treedefflat_carrysunwraps             r#   wrapperzfori.<locals>.wrapper_  s   	

A	&B	

A	&BYYr5"k2F			6;;	' #

#
#aXX''v7M7MNl	%Q%j	 \
+.88+;+;J+G(o(	m	+*M::	kk/"# ""=2CD # #s   9BE

E)re   listr   r  r  r  )r  carrysr  r  r  r  s   `  @@@r#   forir  X  sG    &	FT5M	*XFF"xx//7+}* 
.r%   c              #      K   t        j                  t        j                  |       j                        5  d  t        j
                  g        d d d        y # 1 sw Y   y xY wwrd   )r
   r   r   IfOp
then_blockyield_)conds    r#   whenr  w  sD     	$223 	JJrN  s   3A$A	A$A!A$c                 
   t         j                  j                  d      fd} | | t        j                  j
                              } | |t        j                  j
                              }t        j                  j                  t        j                  j                  fD ]Z  }t        j                  |t        j                   | | |            |            }t        j                  | | ||                  }\ |S )Nr   c                 0    t        j                  |       S rd   )r   
index_cast)r`   r   s    r#   <lambda>z_3d_to_1d_idx.<locals>.<lambda>  s    U%%c1- r%   )r
   r6   r7   r   	Dimensionr`   yzr   addimuli)
dim_idx_fndim_size_fnas_i32idxr_   dimr   s         @r#   _3d_to_1d_idxr.  ~  s    
##B'#-&z#--//*+#+cmmoo./&mmoos}}/ :c
**S%**VJsO%<fE
FCZZ{3'7 89F: 
*r%   c           
          t         j                  j                  d      }t        d|      }t	        j
                  | j                  || t        ||      t        d|      t        j                  j                        S )Nr          )	r
   r6   r7   rQ   r   	shfl_syncrO   ShflKindr,  )rb   lane_idxr   masks       r#   _warp_bcastr6    s[    
##B'#	
:s	$		hhc1Xs+QtS\4==;L;L
 r%   c                     t         j                  j                  d      }t        j                  t               t        d|            }| rt        |      S |S )Nr      r
   r6   r7   r   shrui
thread_idxrQ   r6  )syncr   warp_idxs      r#   r=  r=    sB    
##B'#[[qCy1( #'X	4H4r%   c                     t         j                  j                  d      }t        j                  t               t        d|            }| rt        |      S |S )Nr      r9  )r<  r   wg_idxs      r#   warpgroup_idxrA    sB    
##B'#;;z|Qq#Y/& !%V	0&0r%   c                   r    e Zd Z ej                         Z ej                         Z ej                         Zy)r   N)r   r   r   enumautoWARPr   BLOCKr   r%   r#   r   r     s)    	$diik)
$))+%r%   r   	_ONCE_PERr}   c                    t        j                  t        j                  j	                  d            }| t
        j                  k(  r|S t               }| t
        j                  ur*t        j                  |t        d|j                              }t        j                  t        j                  j                  |t        d|j                              }t        j                   ||      S )zReturns a predicate that selects a single thread.

  Args:
    scope: What level of the thread hierarchy to select a thread from. For
      example, if the scope is BLOCK, only one thread per block will be
      selected.
  r   r3   r   )r   
elect_syncr
   r6   r7   r   rE  r=  rF  r   remuirQ   rO   cmpiCmpIPredicateeqandi)r}   electedwarp
first_warps       r#   single_thread_predicaterR    s     OOBNN77:;'
lN	$
,$$$;;tQq$))_-Dzz%--00$!TYYH*	J	((r%   c              #   2  K   t         t         | k\  rd yt         }| a 	 t        j                  t        |             }t	        j
                  |j                        5  d t        j                  g        ddd       |a y# 1 sw Y   |a yxY w# |a w xY ww)zRuns the context only from a single thread.

  Args:
    scope: What level of the thread hierarchy to select a thread from. For
      example, if the scope is BLOCK, only one thread per block will be
      selected.
  N)rG  r   r  rR  r
   r   r  r  )r}   
prev_scopeif_ops      r#   r   r     s      yE1	
*)HH,U34E			5++	, 	kk"o I	 I
Is@   B=B B7B ?BBB 
BB BBc                  v    t         j                  j                  d      } t        j                  | g dddd      S )Nr   zmov.u32  $0,%clock;=rr   Tasm_dialectr   r
   r6   r7   r   r   r   s    r#   clockr\    s5    
##B'#		2$dD
 r%   c                  t    t         j                  j                  d      } t        j                  | g ddd      S )Nr   zmov.u32  $0,%smid;rW  r   )rY  rZ  r[  s    r#   smidr^    s.    
##B'#	b"6!	LLr%   kind)lowhighc                     | :t         j                  j                  d      }t        j                  |g dddd      S t         j                  j                  d      }t        j                  |g d| d d	  d
ddd      S )Nr)   zmov.u64  $0,%globaltimer;z=lr   TrX  r   zmov.u32  $0,%globaltimer_r2   r   rW  rZ  )r_  rE   r   s      r#   globaltimerrc    s    	\
..
%
%b
)C??
#  	##B'#		!$r(1-

 r%   c                 @    t        |       }|dz  dk(  sJ |        |dz  S )Nr   r   )r   )rq   r   s     r#   	bytewidthre    s)    |"	a1b	q.r%   c                    t         j                  j                  |       ryt         j                  j                  |       rt        j                  |       j                  S t         j
                  j                  |       rt        j
                  |       j                  S t        +| t         j                  j                  d      k(  r	t        dz  S t         j                  j                  |       rJt        j                  |       }t        j                  |j                        t        |j                        z  S t!        |       )Nr   !mosaic_gpu.barrierr   )r
   FloatTF32Typere   r6   rt   rk   dialectr:   r;   MBARRIER_BYTESrm   mathprodr9   r   ro   r"   )rq   r   s     r#   bitwidth_implrm    s    
   $^^r">>"###\\R <<!!!R277==1F#GGA]]b!
--
C99SYY(3+;+;"<<<Br%   c                 ^    t        |       }|j                         dk7  rt        d|       |S )Nr   z.Only power of 2 bitwidths are supported, got: )rm  	bit_countr5   )rq   r   s     r#   r   r     s5    &1
EfXN
OO	-r%   c                   B    e Zd ZU ej                  ez  ed<   eed<   d Zy)DynamicSlicebaselengthc                     t        | j                  t              r'| j                  dk  rt        d| j                         | j                  dk  rt        d| j                         y )Nr   zbase must be non-negative, got z!length must be non-negative, got )re   rr  rg   r5   rs  r   s    r#   __post_init__zDynamicSlice.__post_init__"  sX    $))S!dii!m8DEE{{Q:4;;-HII r%   N)r   r   r   r
   r   rg   r   ru  r   r%   r#   rq  rq    s    
3
+Jr%   rq  r   c                    t        j                  | j                        }t        ||j                        \  }}}|j                         \  }}t         j                  j                         }|}	|	|k7  r0t        ||      D ]!  \  }
}t        |
t              r	|	|
|z  z  }	|}	 n t        ||      D cg c]
  \  }}|r	| }}}t        ||      D cg c]
  \  }}|r	| }}}t         j                  j                  |	|      }t        j                  | ||dgt        |j                        z  t         j                  j                  ||j                   ||j"                              }|S c c}}w c c}}w )Nr   )result_type)r
   r   rO   parse_indicesr9   r4   
ShapedTypeget_dynamic_stride_or_offsetr   re   rg   StridedLayoutAttrr@   r   subviewr8   ro   memory_space)r   r   ref_tybase_indicesslice_shapeis_squeezedmemref_stridesrD   dynamic_offset
new_offsetr,  r_   rK   squeezenew_strides	new_shape
new_layout	ref_slices                     r#   r   r   ,  s`   =="&+8+M(,[ "88:.&====?.*>!<8 V	C	cFl"
#
 nk:Aw'a+  $'{K#@PZQqP)P##''
K@*nn	cC--##
V((*f6I6I) 
 Qs   1
E+<E+
E1E1r~  	dim_slicec                 N   t         j                  j                  | j                        syt        j                  | j                        j                  |   }| j
                  |   }t        t        ||      d d      }t        ||dd        D ]  \  \  }}\  }}||z  |k7  s y y)NTc                     | d   S Nr   r   )r`   s    r#   r#  z,_is_contiguous_shape_slice.<locals>.<lambda>Y  s
    1 r%   )keyreverser   F)r
   r{  re   layoutrC   r9   sortedr   )r~  r  rC   r9   ssprev_stride_r_   s           r#   _is_contiguous_shape_slicer  N  s     
			(	(	7  /77	B'
,,y
!% c'5!~tD"+.r2ab6? '{A~$ 
r%   sh0sh1c           	         d\  }}dt         t        t        f   ffd}|t              k  r|t              k  r|   |   kD  r6 |||         \  }}t        | |||       } || ||dz    |||z
  z  }|}nF|   |   k  r1 |||         \  }}|g|| t	        | |||z
        } |dz  }|dz  }n
|dz  }|dz  }|t              k  r|t              k  r|t              k  r/|t              k(  sJ t	        | |dz
  t              |z
  dz         } |t              k  r:|t              k(  sJ t        | |dz
  |dz
     gdgt              |z
  z  z         } | S )zReshapes using only "parallel" folds/unfolds.

  This function uses folds/unfolds that are "parallel" in that they
  only act on original dimensions, i.e. they won't fold into an
  intermediate dimension that they will then unfold.
  )r   r   r   c                     | |   |k  sJ d}t        |t        |             D ]/  }|| |   z  }||k(  r	|dz   |fc S ||kD  st        d d d       t        d d d      )Nr   zCan't reshape  to z( by composing independent folds/unfolds.z;Unreachable: number of elements don't match in each shape (z ans ))r   r8   r"   AssertionError)r9   offtargetr-  tor  r  s        r#   
fold_untilz_reshape.<locals>.fold_untilk  s    :
CCU$ 
	U2Yc	Avs{	v "SEcU + 
 	

 
EcU K5	 r%   r   )r   rg   r8   memref_unfoldmemref_fold)	r   r  r  i0i1r  r,  r  r-  s	    ``      r#   _reshaper  a  s    &"bc3h * 	SX"s3x-
2wR#r3r7+fc1#r3r#;/cRc"rAvC"Hnbb	R3r7	CSW-hc3Ec"SkRr*cAgbAgbAgbAgb# 	SX"s3x-( #c(]S>>
c263s8b=1#4
5C#c(]S>>
R!Vc"q&k]aSCHrM5J%J
KC	*r%   r9   .c                    t        | t              rt        t        | j                  |            S t	        j
                  | j                        }t        j                  |j                        t        j                  |      k7  rRt        d|j                   dt        j                  |j                         d| dt        j                  |       d	      t        d |D              st        d|       t        |j                        }t        |      }||k(  r| S |s |j                         \  }}t        j                  j                  t        j                   j#                  d            }|j$                  |k(  rFt        j                  j                  t        j                   j#                  t'        |                  }n-t        j(                  j                  |dgt'        |      z        }t        j
                  j                  ||j*                  ||j,                        }	t/        j0                  |	| g g |      S |s@|j                         \  }}t        j                  j                  t        j                   j#                  |j2                              }t        j4                  j7                  d	      }
|j$                  |k(  s|j$                  |
k(  r=t        j                  j                  t        j                   j#                  d            }n t        j(                  j                  |g       }t        j
                  j                  d
|j*                  ||j,                        }	t/        j8                  |	| g       S |j                         \  }}t        d t;        |j                  t=        |j                        |d      D              r"t?        tA        | d|j2                        d|      S tC        | ||      S )zReshape by means of folding and unfolding.

  The use of memref fold/unfold may avoid some possible issues with
  strided memrefs.
  z/Cannot reshape to a different size. Ref shape: z (size: z), new shape: r  c              3   &   K   | ]	  }|d kD    yw)r   Nr   )r   r-  s     r#   r   z!memref_reshape.<locals>.<genexpr>  s     &S1W&r   zKShapes must havbe only positive dimensions (no -1 or 0 dimensions allowed) r   r   strided<[1]>r   c              3   <   K   | ]  \  }}}|d k(  xs ||k(    ywr   Nr   )r   ds1s2s       r#   r   z!memref_reshape.<locals>.<genexpr>  s.      	
!R 1fb	s   T)strict)"re   r   memref_reshaper   r
   r   rO   rk  rl  r9   r5   allr  r4   AffineMapAttrr@   	AffineMapget_identityr  r8   r{  ro   r}  r   expand_shaperF   	Attributer;   collapse_shaper   ra   r  r  r  )r   r9   r~  	src_shape	dst_shaper  rD   identityr  	result_tycontig_strided_1drC   s               r#   r  r    s-    [!~cggu566=="&	YYv||		% 00

9&,, HIIfll#$N5' :IIeQ	  
 
&&	&
	G	 
 6<< )5k))J	--/IAv##BLL$=$=a$@AH}} ##''
,,
#
#C	N
3j ''++FQC#i.4HIj!!6&&
F4G4GI y#r2yAA	--/IAv##BLL$=$=fkk$JKH**>:}} FMM5F$F##''(A(A!(DEj''++FB7j!!
FV-@-@I   C44,,.*'1 	
,,
 
.
		  S!V[[91eDD	#y)	,,r%   c           
      
   t        | t              r t        t        | j                  ||            S t	        j
                  | j                        }t        |j                        }|dk  rt        d| d      ||z   t        |      kD  rt        d| d| d|       t        j                  ||||z          g||||z    t        j                  j                  t        j                  j!                  |j"                              }t        j$                  j'                  d      }|j(                  |k(  s|j(                  |k(  rMt        j                  j                  t        j                  j!                  |j"                  |z
  dz               }nt+        |t-        |||z               rF|j/                         \  }}	|||z   dz
     g||||z    t        j0                  j                  |	|      }n2t        d	|j/                         d    d
|j                  d|d|      t        j
                  j                  ||j2                  ||j4                        }
t7        |      D cg c]  }|g }}|j9                  t7        |      D cg c]  }||z   	 c}       |j;                  d t7        ||z   |j"                        D               t        |      |
j"                  k(  sJ t=        j>                  |
| |      S c c}w c c}w )Nr   z
Dimension z is negativezFolding z dimensions starting from  is out of bounds for shape r  r   zstrides=z, ref_ty.shape=z, dim=z, fold_rank=c              3   "   K   | ]  }|g 	 y wrd   r   )r   r  s     r#   r   zmemref_fold.<locals>.<genexpr>
  s     @qs@s   ) re   r   r  r   r
   r   rO   r  r9   r5   r8   rh   rl  r  r@   r  r  rF   r  r;   r  r  slicer4   r{  ro   r}  r   r\   extendr   r  )r   r-  	fold_rankr~  r  r  r  r  r  rD   new_tyr  assocrJ   s                 r#   r  r    s    [!{377C;<<=="&6<< )1W
z#l3
449_s9~%

9+7u =[	"  ')ggicIo.N&O%P)C#	/"!!",,";";FKK"HI(ll((8]]h&--3D"D!!%%
!!&++	"9A"=>J "&%S9_*EF 779K*5cIo6I*J)KKcIo&%%))&+>J

60021566F H6I<	! 
 ==$$j&2E2E& c

#1A3
#%
#,,y!12Aa23,,@E#	/6;;?@@	Uv{{	""	"			vsE	22	 $2s   $
K;L c                    t        j                  | j                        }t        |j                        t        d D              dkD  rt        d      t        j                  D cg c]  }||	 c}         z  rt        d         t        fdD              dz    t         j                  j                  t         j                  j                  |j                              }t         j                  j!                  d      }|j"                  |k(  s|j"                  |k(  rVt         j                  j                  t         j                  j                  |j                  t%              z   dz
              }nq|j'                         \  }}	|   }
g }t)              D ]  }|j+                  |
       |
|z  }
 t)        |      |dz    t         j,                  j                  |	|      }t         j                  j                  |j.                  ||j0                        }|j                  k(  ret3        |j                        D cg c]  }|g }}|d   j5                  t3        |j                  |j                  t%              z   dz
               nzt3              D cg c]  }|g }}|j+                  t        t3        t%              z                      |j5                  fdt3        dz   |j                        D               t%        |      |j                  k(  sJ t7        j8                  || |g |j                        S c c}w c c}w c c}w )	zOUnfolds dim into two dimensions, the size of leading one given be major_factor.c              3   $   K   | ]  }|d u  
 y wrd   r   )r   r	  s     r#   r   z memref_unfold.<locals>.<genexpr>  s     $qd$s   r   zCan only infer one dimensionzNon-divisible unfold:c              3   6   K   | ]  }|   z  n|  y wrd   r   )r   r	  r-  known_factor_prodr  s     r#   r   z memref_unfold.<locals>.<genexpr>  s)      BCQYin))A=s   r  r[   c              3   B   K   | ]  }|t              z   d z
  g  ywr  )r8   )r   r  factorss     r#   r   z memref_unfold.<locals>.<genexpr>4  s"     MA!c'l"Q&'Ms   )r
   r   rO   r  r9   sumr5   rh   rl  r   r  r@   r  r  rF   r  r;   r  r8   r4   reversedr\   r{  ro   r}  r   r  r   r  )r   r-  r  r~  r	  r  r  r  r  rD   r  inserted_stridesr  r  r  r  r  s    ``            @@r#   r  r    s   =="&6<< )$G$$q(
3
44gg'CQQ]qCDs^''
,ing
FF GN ' %)C#'!!",,";";FKK"HI(ll((8]]h&--3D"D!!%%
!!&++G"<q"@AJ !779Kc"Kg k*Qk "**:!;KcAg%%))&+>J==$$j&2E2E& 	FKK,-QaS-E-	"IU6;;c'l(BQ(FGH*%QaS%E%	LLeCs7|!3456	LLMsQw1LMM	Uv{{	""	"			VS%V\\	BBC D4 . &s   ,M 4M 
M%6
M*c                    t        j                  | j                        }||j                  k(  rt	        |j
                        }|j                  d       t         j                  j                  t         j                  j                  |j                              }|j                  |k(  rJt         j                  j                  t         j                  j                  |j                  dz               }nD|j                         \  }}|j                  d       t         j                  j                  ||      }t         j                  j                  ||j                  ||j                        }t!        |j                        D 	cg c]  }	|	g }
}	|
d   j                  |j                         t#        j$                  || |
g |j
                        S t'        | |d      S c c}	w )zInserts a singleton dimension.r   r[   r  )r
   r   rO   rF   r  r9   r\   r  r@   r  r  r  r4   r{  ro   r}  r   r   r  r  )r   r-  r~  r  r  r  r  rD   r  r  r  s              r#   memref_unsqueezer  9  sl   =="&FKKV\\"IQ##BLL$=$=fkk$JKH}} ##''
,,
#
#FKK!O
4j #99;k6''++FK@j]]6&&
F4G4GF  ,-QaS-E-	"IV[[!vsE2v||DDc9--	 .s   
G c                 j    | j                         \  }}t        j                  }|D ]  }||kD  r y|} y)NTF)r4   rk  inf)r   rC   r  r  r_   s        r#   is_memref_transposedr  R  sA    ))+*'1+ fK 
r%   permutationc                    t        j                  | j                        }|j                         \  }}|D cg c]  }||   	 }}|D cg c]  }|j                  |    }}t         j
                  j                  ||      }t         j                  j                  ||j                  ||j                        }	t        j                  |	| t         j                  j                  |            S c c}w c c}w rd   )r
   r   rO   r4   r9   r{  r@   ro   r}  r   	transposer  get_permutation)
r   r  r~  rC   rD   pr  r  r  r  s
             r#   memref_transposer  \  s    =="&113/'6%011+1(341v||A4)4##''<*==$$j&2E2E& 
		c2<<//<
  24s   C+	C0)	check_oobr  c          	         t        | t              s| f} t        |      t        |       z
  x}r| t        d       f|z  z  } g }g }g }t	        t        | |            D ]  \  }\  }}	t        |t        j                  t        j                  f      r|j                  }t        |t              rd|r$||	k\  s|dk  r| |	kD  rt        d| d| d|       |j                  |dk\  r|n|	|z          |j                  d       |j                  d       t        |t              r|j                  |j                  dk7  rt        d      |j                  xs d}
|
dk  r|	|
z   }
|j                   xs |	}|dk  r|	|z   }|r(|
dk  s|
|	k\  s
|dk  s||	kD  rt        d| d| d|       |j                  |
       |j                  ||
z
         |j                  d	       t        |t"              r|rJt        |j$                  t              r0|j$                  |j&                  z   |	kD  rt        d| d| d|       |j                  |j$                         |j                  |j&                         |j                  d	       %t        |t        j(                        rit        j*                  j                  |j,                        st/        d
      |j                  |       |j                  d       |j                  d       t        t-        |             t        |      t        |      cxk(  rt        |      cxk(  rt        |      k(  sJ  J |||fS )Nr   zIndex z along axis r  r   TzStrided slices not implementedzSlice FzExpected an index-typed index)re   r   r8   r  rA   r   r
   	OperationOpViewr   rg   
IndexErrorr\   stepr"   startstoprq  rr  rs  r   rf   rO   r5   )r   r9   r  trailing_dimsr  r  r  axisr,  r  r  r  s               r#   rx  rx  j  s    
E5	!HE%j3u:--]-	eDk^m++E,++%c%&78 .+dLS%#bii01JJc#s	uqcTE\SEdV+GwO
 	
 #us{;	C			#((a-!"BCCiin1e	XXd	t|	
!)u~TE\SEdV+GwO
 	
 % &	C	&	
SXXs
#3::(=(ESEdV+GwO
 	
 #((#$	C	"\\$$SXX.899#S	**].+^ 
\	c+.	P#k2B	Pc%j	PP	PP	P	{K	//r%   c                      t        j                  t         j                  j                  t         j                  j
                         t                y )N)space)r   fence_proxy	ProxyKindasync_sharedSharedSpace
shared_ctawarpgroup_barrierr   r%   r#   commit_sharedr    s3    
nn!!)9)9)D)D r%   c            
         t         j                  j                  d      } t        j                  t         j
                  j                  d      t        j                  t        d      t        d|             gdt         ddd	
       y )Nr   r   F)r<  r   zbar.sync $0, r   rTr   )r
   r6   r7   r   r   r:   r;   r   r'  rA  rQ   r   r[  s    r#   r  r    sc     	##B'#//ggmmL!zz-U+Qq#Y78n%Q'	r%   c                  |    t        j                  t        dt        j                  j                  d                   y )Nr0  r   )r   bar_warp_syncrQ   r
   r6   r7   r   r%   r#   warp_barrierr    s&    Qz2>>#>#>r#BCDr%   c            
          e Zd ZU ej                  ed<   ej                  ed<   ej                  ed<   eed<   e	 ddej                  dedd fd       Zde	d    fd	Z
dej                  ez  dd fd
ZddZddefdZdej                  deej                  ej                  f   fdZ	 	 	 	 ddedededej                  dz  fdZ	 ddeej                  z  dej                  dz  fdZd Zy)
BarrierRefbase_addressrD   phasesnum_barriersbarrier_memrefarrival_countr   c           	         t        j                  | j                        }|j                  \  }|dkD  rt	        d      t         j
                  j                  d      }t         j
                  j                  d      }t        | t              }t        j                  t         j                  j                  d|      g g       }t        j                  t        d|      |g        t        t        j                         5  t#        |      D ].  }t%        j&                  t)        ||g|      t        ||             0 	 d d d        t+        |t        d|      ||      S # 1 sw Y   !xY w)Nr   *Only up to 32 barriers per group supportedr)   r}  r   r   r   )r
   r   rO   r9   r"   r6   r7   r   WORKGROUP_NVPTX_ADDRESS_SPACEr   rP   r@   rS   rQ   r   r   rF  r   r   mbarrier_initrR   r  )	r  r  
barrier_tyr  r   rE   addressr  rJ   s	            r#   
initializezBarrierRef.initialize  s"    ~223J%%N\b LMM
..
%
%b
)C
..
%
%b
)C%BG ]]2==,,R5r2>F
LL1cFB'	\//	0 
\" 
!'A3,mS!	


 gqCy&,??
 
s   ==EE$c              #   v   K   | j                   dk(  r|  y t        | j                         D ]	  }| |     y wNr   )r  r   r   rD   s     r#   __iter__zBarrierRef.__iter__  s=     Aj$++, &6ls   79c                    t         j                  j                  d      }t        |t              r+|| j
                  k\  rt        d| d      t        ||      }n]t         j                  j                  |j                        rt        j                  ||      }n|j                  |k7  rt        d|       t        | j                  t        j                  | j                   |      | j"                  d      S )Nr   zBarrier offset z is out of boundsz,Expected a dynamic index or an integer, got r   )r
   r6   r7   re   rg   r  r  rQ   rf   rO   r   index_castuir5   r  r  r'  rD   r  )r   rD   r   s      r#   __getitem__zBarrierRef.__getitem__  s    
..
%
%b
)C&#	4$$	$?6(2CDEE~f		 	 	-!!#v.f		EfXNOO

4;;'		 r%   c                 X   t         j                  j                  d      }t        j                  |d      }t        j
                  ||      }t        j                  | j                         ||       |r8t        j                  t         j                  j                  d      g ddd       y y )Nr   i r   z!tcgen05.fence::after_thread_sync;r   Tr   )r
   r6   r7   r   rp   ru   r   mbarrier_try_wait_parityget_ptrr   r   r:   r;   )r   parityorders_tensor_corer   tickss        r#   wait_parityzBarrierRef.wait_parity  s~    
..
%
%b
)CNN3)E[[f%F!!$,,.&%@
oo
''--
%

-
 r%   r
  c                     t        j                  | j                  g       }| j                  |      \  }}t        j                  || j                  g        | j                  ||       y rd   )r   loadr  update_paritiesrS   r  )r   r
  paritiesr	  new_paritiess        r#   waitzBarrierRef.wait  sP    {{4;;+H//9FL
LLt{{B/V/0r%   r  c                 ^   t         j                  j                  d      }t        j                  t        d|      | j                        }t        j                  t        j                  j                  t        j                  ||      t        d|            }|t        j                  ||      fS )Nr   r   r   )r
   r6   r7   r   shlirQ   rD   rK  rL  nerN  xori)r   r  r   bitmaskr	  s        r#   r  zBarrierRef.update_parities	  sz    
..
%
%b
)Cjj1cDKK0GZZ

8W =qCyF 5::h000r%   Ncan_complete	predicatec                 2   t         j                  j                  d      }|r7t        j                  t         j
                  j                  d      g ddd       |rbdx}}|d}d}t        j                  t         j                  j                  d      | j                         g||gng z   | d	| d
d|z   d       y |t        d      t        |t         j                  j                  d            }t        j                  || j                         |       y )Nr)   r   "tcgen05.fence::before_thread_sync;r   Tr   @$2,bz7 mbarrier.arrive.release.cta.shared::cta.b64 $0, [$1], r   z=l,rz.Predicate not supported for no-complete arriver   )r
   r6   r7   r   r   r:   r;   r  r"   rQ   r   mbarrier_arrive_nocomplete)	r   r  r  r
  r  rE   pred_ptxpred_constraintcounts	            r#   arrivezBarrierRef.arrive  s    ..
%
%b
)C
oo
''--
%

.
 #%%h		
oo
..
%
%b
)<<>
Y-Byk
KJ _A
?
" 
	!<
 	
 r~~::2>?e
%%c4<<>5Ar%   bytesc                 l   t        |t              r*t        |t        j                  j                  d            }n^t        j                  j                  |j                        r5t        j                  j                  d      }t        j                  ||      }t        | j                         ||       y )Nr   r  )re   rg   rQ   r
   r6   r7   rf   rO   r   r"  nvvm_mbarrier_arrive_expect_txr  )r   r#  r  r   s       r#   arrive_expect_txzBarrierRef.arrive_expect_tx6  sz     %r~~22267e		 	 	,NN''+csE*e"r%   c                     t         j                  j                  d      }t        | j                  | j
                  g|      S )Nr)   )r
   r6   r7   rR   r  rD   )r   rE   s     r#   r  zBarrierRef.get_ptrB  s1    
..
%
%b
)C**T[[M3??r%   r   F)r   TFNrd   )r   r   r   r
   r   r   rg   staticmethodr  r   r  r  r  boolr  r   r  r"  r'  r  r   r%   r#   r  r    s;   
((
((56@hh@/2@@ @,. 3 < "1T 11bhh 152889K3L 1 !&#'#B#B #B 	#B
 D#BL AE
>
.0hho
@r%   r  c                   z   e Zd ZU eed<   e	 ddej                  dedd fd       Z	de
d    fdZdej                  ez  dd fdZdd	Zdd
efdZdej                  deej                  ej                  f   fdZdd
efdZdeej                  z  fdZd Zdej                  fdZedej                  fd       Zy)DialectBarrierRefbarrier_refr  r  r   c           	         t        j                  | j                        }|j                  \  }|dkD  rt	        d      t        | t              }t        j                  |||       t         j                  j                  d      }t        j                  t         j                  j                  d|      g g       }t        j                  t        d|      |g        t!        t#        |t        d|      ||            S )Nr   r  r  r   r   r.  )r
   r   rO   r9   r"   r   r  ri  initialize_barrierr6   r7   r   rP   r@   rS   rQ   r-  r  )r  r  r  r  r  r   r  s          r#   r  zDialectBarrierRef.initializeK  s    
 ~223J%%N\b LMM%BG w|D
..
%
%b
)C]]2==,,R5r2>F
LL1cFB'w!S	6<H r%   c              #      K   | j                   j                  dk(  r|  y t        | j                   j                        D ]	  }| |     y wr   )r.  r  r   r  s     r#   r  zDialectBarrierRef.__iter__`  sI     $$)j$**778 &6ls   AArD   c                 2    t        | j                  |         S rd   )r-  r.  r  s     r#   r  zDialectBarrierRef.__getitem__g  s    T--f566r%   c                 <    | j                   j                  ||       y rd   )r.  r  )r   r	  r
  s      r#   r  zDialectBarrierRef.wait_parityj  s      );<r%   r
  c                 j    | j                   j                  J | j                   j                  |       y rd   )r.  r  r  r   r
  s     r#   r  zDialectBarrierRef.waitm  s/    ""...,-r%   r  c                 8    | j                   j                  |      S rd   )r.  r  )r   r  s     r#   r  z!DialectBarrierRef.update_paritiesq  s    ++H55r%   c                 L    t        j                  | j                         |       y rd   )ri  ArriveOpas_barrier_memrefr6  s     r#   r"  zDialectBarrierRef.arrivet  s    T++-/ABr%   r#  c                 N    t        j                  | j                         |       y )N)barrier	expect_tx)ri  ArriveExpectTxOpr:  )r   r#  s     r#   r'  z"DialectBarrierRef.arrive_expect_txw  s    T%;%;%=Or%   c                 6    | j                   j                         S rd   )r.  r  r   s    r#   r  zDialectBarrierRef.get_ptrz  s    ##%%r%   c                    | j                   j                  }|dk(  rdn|f}t        j                  j	                  |t        j
                  j                  d            }t        j                  |g| j                         g      S )Nr   r   rg  )
r.  r  r
   r   r@   r:   r;   r   rB   r  )r   r  r9   memref_types       r#   r:  z#DialectBarrierRef.as_barrier_memref}  se    ##00L!#B,E--##E277==9N+OPK--{mdlln=MNNr%   r<  c                    t        j                  |j                        }|j                  dkD  s,|j                  t         j
                  j                  d      k7  rt        d|j                         t         j
                  j                  dt         d      }t        j                  |g|g      } | t        |t        dt         j                  j                  d            d|j                  dk(  rdn|j                  d   	      
      S )z?Creates a DialectBarrierRef from a memref of a dialect barrier.r   rg  zQExpected a memref with rank 0 or 1 and element type !mosaic_gpu.barrier, but got z
!llvm.ptr<r+   r   r)   N)r  rD   r  r  r0  )r
   r   rO   rF   ro   r:   r;   r5   r  r   rB   r  rQ   r6   r7   r9   )clsr<  rA  ptr_typeaddrs        r#   from_barrier_memrefz%DialectBarrierRef.from_barrier_memref  s     ---K!{77277==<   **1,,9 
 ww}}z*G)HJKH--xj7)DDQ33B78*//14!+:K:KA:N	
 r%   Nr   r)  )r   r   r   r  r   r*  r
   r   rg   r  r   r  r  r  r+  r  r   r  r"  r'  r  r:  classmethodrF  r   r%   r#   r-  r-  G  s     hh  (!45 73 73F 7=.T .6bhh 652889K3L 6Ct CPC"((N P&O O   r%   r-  c                       e Zd ZU eed<   ej                  dz  ed<   edej                  dede	e
j                  e	e
j                     z     deeeef   dd f
d	       Zd
 Zd ZddefdZd Zd Zy)CollectiveBarrierRefr<  Ncluster_maskr  r  dimscluster_shaper   c                 $   t         j                  j                  d      }|D cg c]?  }t        |t        j
                        r|   nt        j                  fd|D              A }}t        |      t        |      z
  dz   }|dk(  rt        d |D              sJ d }nFt        d|      }t        ||      D ]+  \  }}	|	dk(  rt        j                  |t        |            }- t         j#                  | ||z        }
t%        |
|      S c c}w )Nr   c              3   (   K   | ]	  }|     y wrd   r   )r   ddrL  s     r#   r   z2CollectiveBarrierRef.initialize.<locals>.<genexpr>  s     5R}R(5   r   c              3   &   K   | ]	  }|d k(    ywr  r   )r   rK   s     r#   r   z2CollectiveBarrierRef.initialize.<locals>.<genexpr>  s     ,Aa,r   r   )r  )r
   r6   r7   re   r   r$  rk  rl  r  r8   r  rQ   r   r   oricluster_collective_maskr  r  rI  )r  r  rK  rL  r   r  
dims_shapecluster_arrival_countrJ  sizer<  s      `       r#   r  zCollectiveBarrierRef.initialize  s#    ..
%
%b
)C 	  a' 	aYY5155	6J   
Oc$i7!;!,,,,,lq#Ylz* 
'!T19 yy1-C

 ##m6K&K $ G  66/s   ADc              #   ^   K   | j                   D ]  }t        || j                          y wrd   )r<  rI  rJ  )r   r   s     r#   r  zCollectiveBarrierRef.__iter__  s,     \\ 7 D$5$5667s   +-c                 H    t        | j                  |   | j                        S rd   )rI  r<  rJ  r  s     r#   r  z CollectiveBarrierRef.__getitem__  s    V 4d6G6GHHr%   r
  c                    |r7t        j                  t        j                  j	                  d      g ddd       | j
                  j                  dk7  rt        d      | j                  >t        t        j                  	      5  | j
                  j                          ddd       yt        j                  j                  d
      }t        j                   t#               t%        t&        |            }t        j(                  |t%        t&        dz  |            }t        j*                  t        j,                  j.                  t        j0                  | j                  t        j2                  t%        d|      |            t%        d|            }t        j*                  t        j,                  j4                  t        j                   |t%        t&        dz  |            t%        d|            }t        j0                  ||      }t        j                  t        j                  j	                  d      || j
                  j7                         |gddd       y# 1 sw Y   yxY w)zArrives on a barrier in all blocks that share at least one of the coordinates along the collective dimensions.

    Note that unlike in arrive, each warpgroup arrives once.
    r   r  r   Tr   r   z#Can only arrive on a single barrierNr   r      r   z
    {
        .reg .b32 mapped_addr;
        @$0 mapa.shared::cluster.u32 mapped_addr, $1, $2;
        @$0 mbarrier.arrive.shared::cluster.b64 _, [mapped_addr];
    }zb,r,r)r   r   r
   r:   r;   r<  r  r5   rJ  r   r   r   r"  r6   r7   r   rJ  r;  rQ   r   divuirK  rL  r  rN  r  rM  r  )r   r
  r   thread_in_warpgroupsignaled_blockis_collective_blockis_signaling_threadshould_arrives           r#   r"  zCollectiveBarrierRef.arrive  s   
 
oo
''--
%

.
 ||  A%<== |556 
..
%
%b
)C++jlAnc4JK[[Q~3S9N  **

4$$ejj1cN&KL	!S	
  **'>R+?)EF	!S	
 JJ24GHMOO
l#	,,.?		 	's   IIc                 <     | j                   j                  |i | y rd   )r<  r  r   r   kwargss      r#   r  zCollectiveBarrierRef.wait  s    DLLt&v&r%   c                 <     | j                   j                  |i | y rd   )r<  r  rb  s      r#   r  z CollectiveBarrierRef.wait_parity  s    DLLd-f-r%   r)  )r   r   r   r  r   r
   r   r*  rg   r   r   r$  r   r  r  r  r+  r"  r  r  r   r%   r#   rI  rI    s    4!7hh!7!7 S]]Xcmm%<<=!7 3S=)	!7
 !7 !7F7I.t .`'.r%   rI  c                       e Zd ZU ej                  ed<   	 	 ddej                  ez  dej                  dz  defdZe	ddej                  dz  fd       Z
	 ddej                  d	dej                  ez  d
edefdZy)SemaphoreRefr*   Nr   r  relaxedc                 v   t         j                  j                  d      }t        |t         j                        st        ||      }n'|j                  |k7  rt        d|j                         |t        t        j                        }|rdnd}t        j                  || j                  ||gd| ddd	       y )
Nr   Expected a i32 value, got rg  releasez@$3 atom.add.z.sys.global.u32 $0, [$1], $2;z=r,l,r,bTr   )r
   r6   r7   re   r   rQ   rO   r5   rR  r   r   r   r   r*   )r   r   r  rg  r   	semanticss         r#   signalzSemaphoreRef.signal  s     ..
%
%b
)CeRXX&sme	s	3EJJ<@AA),*@*@Ai$	)IOO	5)$
	{"?@r%   c                    t         j                  j                  d      }t        |t         j                        st        ||      }n'|j                  |k7  rt        d|j                         |t        t        j                        }t        j                  t         j                  j                  d      | ||gddd       y )Nr   ri  r   zu{
            @$2 multimem.red.release.sys.global.add.u32 [$0], $1;
            fence.proxy.alias;
        }
        zl,r,bTr   )r
   r6   r7   re   r   rQ   rO   r5   rR  r   r   r   r   r:   r;   )r*   r   r  r   s       r#   signal_multimemzSemaphoreRef.signal_multimem  s    
..
%
%b
)CeRXX&sme	s	3EJJ<@AA),*@*@AiOO
l#	eY	
 	
r%   T)	decrementr}   ro  r}   c          	         t         j                  j                  d      }t        |t         j                        st        ||      }n'|j                  |k7  rt        d|j                         t        |      5  t        j                  |g|g      }|j                  j                  j                  |      }t         j                  j                  |      5  |j                   \  }|rt#        j$                  ||      }t'        j(                  || j*                  ||gddd      }	t"        j,                  j.                  }
t#        j0                  |
|	|      }t#        j2                  |	|      }nXt'        j(                  || j*                  gdd	d      }	t"        j,                  j4                  }t#        j0                  ||	|      }|}t        j6                  ||g       d d d        |j8                  j                  j                  |      }t         j                  j                  |      5  t        j:                  |j                          d d d        t'        j(                  t         j<                  j?                  d
      g ddd       d d d        |t@        jB                  k(  rtE                y |t@        jF                  k(  rtI                y t        d|       # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   kxY w)Nr   ri  r   z1atom.acquire.sys.global.cas.b32 $0, [$1], $2, $3;z=r,l,r,rTr   z#ld.relaxed.sys.global.b32 $0, [$1];z=r,lr   zfence.acquire.sys;r   zUnsupported scope: )%r
   r6   r7   re   r   rQ   rO   r5   r   r   WhileOpbeforeblocksr\   r   at_block_begin	argumentsr   subir   r   r*   rL  r  rK  maxuiult	conditionafterr  r:   r;   r   r   r  rE  r  )r   r   ro  r}   r   while_opbefore_blockexpected_in_memorynew_val	in_memoryne_pred
comparisonnew_expected_in_memorylt_predafter_blocks                  r#   r  zSemaphoreRef.wait1  si    ..
%
%b
)CeRXX&sme	s	3EJJ<@AA	U	# 'cUUG,h__++2237l++L9 <+55	JJ159'ooxx+W5A#) ''**'zz'96HI*#(;;y%#@
 ooxxj3#) ''++'zz'9e<*#5
 j#9":;3<4 NN))005k++K8 *

;(()*
oo
''--
%


C'P &&&	,##	#n,UG455S< <6* *?' 's?   :AK1DKAK1( K%?K1K"	K1%K.	*K11K:NFrd   r   )r   r   r   r
   r   r   rg   r+  rl  r*  rn  r   r   r  r   r%   r#   rf  rf    s    	xx-
 $(	XX^ D 	, RXX_  ,  :6 (22:6XX^:6 	:6
 :6r%   rf  c                  r    t        j                  t        j                  j	                  d      g ddd       y )Nr   zfence.release.sys;r   Tr   r   r   r
   r:   r;   r   r%   r#   fence_release_sysr  n  s*    //ggmmL!r%   c                   v   e Zd ZU eedf   ed<   eedf   ed<   eedz  df   ed<   eej                  df   dz  ed<   dddddeedf   deedz  df   deej                  df   dz  d	eedf   dz  d
eedf   dz  f
dZe	deedf   fd       Z
e	d        Zdej                  ez  deej                     fdZy)	Partition.source_boundstarget_boundsN	partitionbase_offsetr  
num_chunks
chunk_sizeelementsr  r  c                   || _         || _        || _        t        | j                         t        | j                        k7  rt        |d cxu r|cxk(  rt	        d       ||| _        nw|J t        |      t        | j                         k7  rt        g }t        ||      D ]+  \  }}||z  rt	        d||      |j                  ||z         - t        |      | _        t               }	| j                  D ]K  }
|
d|
cxk  rt        | j
                        k  st         t        |
|	v rt        |	j                  |
       M t        | j                   | j                        D ]%  \  }}
|
	|| j
                  |
   z  st	        d       y )N:Exactly one of num_chunks and chunk_size must be specifiedzNon-divisible partitionr   zNon-divisible partitioning)r  r  r  r8   r5   r  r   r\   r   setr   )r   r  r  r  r  r  r  elschunk	seen_dimsr  tbs               r#   __init__zPartition.__init__~  s    "DDN"D
4#dnn"55T/Z/
F  0 %d###	ZC 2 23	3mHj1 +*#u;4h
K
KSE\*+ !/dI^^ 	
1.s4--.. /	
immA T''8 7A	
2 2 21 555667r%   r   c                     | j                   S rd   r  r   s    r#   r  zPartition.num_chunks  s    r%   c                 h     t         fdt         j                   j                        D              S )Nc              3   P   K   | ]  \  }}||n|j                   |   z    y wrd   r  )r   r  r  r   s      r#   r   z/Partition.target_block_shape.<locals>.<genexpr>  s6      B iR4#5#5a#888s   #&)r   r   r  r  r   s   `r#   target_block_shapezPartition.target_block_shape  s.     ++T^^<  r%   source_coordsc                    g }t         j                  j                         }t        t	        | j
                  | j                              D ]z  \  }\  }}|t        d|      }n#t        j                  t        ||      ||         }| j                  #t        j                  | j                  |   |      }|j                  |       | |S r  )r
   rf   r@   rA   r   r  r  rQ   r   r(  r  r'  r\   )r   r  coordsr   rJ   tbsr  dim_bases           r#   get_basezPartition.get_base  s    FLLE T%<%<dnn!MN 8C	
Q;::aUm]1-=>				%::d..q18<mmH Mr%   )r   r   r   r   rg   r   r
   r   r  r   r  r  r  r  r   r%   r#   r  r  x  s%   sCx sCx 3:s?##RXXs]#d** 26+/+/*7c3h*7 sTz3'	*7
 3'$.*7 S/D(*7 S/D(*7X %S/    RXX^ RXX r%   r  c            
           e Zd ZU eed<   dddddedej                  dz  dedz  dedz  fdZe	d	efd
       Z
dej                  d	ej                  fdZdddddej                  dz  dedz  dedz  fdZy)Partition1Dr  Nr  r  r  r  r  c                    || _         |d cxu r|cxk(  rt        d       t        |fd      }||f|d<   |t        dd|fi|| _        y |J t        dd|fi|| _        y )Nr  r   )r  r  r  r  r  r   )r  r5   dictr  r  )r   r  r  r  r  common_kwargss         r#   r  zPartition1D.__init__  s     #DT/Z/
F  0 8+>M&1^mM" KZMK]Kdn### KZMK]Kdnr%   r   c                 4    | j                   j                  d   S r  )r  r  r   s    r#   r  zPartition1D.num_chunks  s    >>''**r%   r  c                 >    | j                   j                  |      d   S r  )r  r  )r   r  s     r#   r  zPartition1D.get_base  s    >>""=1!44r%   )r  r  r  r  c                ~    t        | j                  j                  d   |||| j                  |            S d       S )Nr   )r  r  r  )r  r  r  r  )r   r  r  r  s       r#   refinezPartition1D.refine  sK     ))!,,1,=DMM%(	  DH	 r%   )r   r   r   r  r   rg   r
   r   r  r   r  r  r  r   r%   r#   r  r    s     &*##LL 88d?	L
 *L *L, +# + +5BHH 5 5  $## XX_ *	
 *r%   r  c                 :   t        |      t        |       kD  r*t        d| dt        |       d|  dt        |        d	      |s| S t        |      }t        | | d  |      D ]  \  }}||z  st        d| |       g | d |  d t        | | d  |      D        |S )Nz9Expected tiling to be at most rank of shape. Got tiling: z (rank: z) and shape z).zNon-divisible tiling:c              3   ,   K   | ]  \  }}||z    y wrd   r   )r   rK   ts      r#   r   ztile_shape.<locals>.<genexpr>  s     =41aQ=   )r8   r5   r   )r9   tilingtiling_rankrK   r  s        r#   
tile_shaper    s    [3u:
	88CK=UG <J<r	 
 
LF+%&/ ?da1u.v>>?
]{l
=3uk\]3V<=
 
 r%   c                    t        | j                        dk(  sJ d|z  dk(  r|dk  sJ t        j                  j	                  d      }| }t        j                  |      }|j                         st        d| d      t        |      }t        |      D ]h  }t        j                  |j                  t        d|      |t        d|z  |      t        d|      t        j                  j                        } |||      }j |S )	z$Reduce a value across the warpgroup.r3   r   r   z6Warp reduction group size should be a power of 2 (got r  r0  r   r1  )re  rO   r
   r6   r7   rh   log2
is_integerr5   rg   r   r   r2  rQ   r3  bfly)r   r   
group_sizer   r   itersrJ   other_results           r#   warp_tree_reducer    s    	5::	!	##	#	jA	*"22	2
##B'#&
''*
%				

@AN  e*%< 	&a>>	*c	!q&#	$L %F	& 
-r%   c                 r   t         j                  j                  d      }t        j                  | j                        }t        |j                        }|dndt        |      z   dz   }t         j                  j                  d|z         }|dk(  r't         j                  j                  d| d| d	      }n,t         j                  j                  d| d| d
| d| d	      }t        j                  |g| g      }t        j                  ||dg      }	t        j                  ||dg      }
t        |j                        }|dk  r|j!                         ^ }}|t         j"                  j%                         k7  rF|j'                         dk(  sJ d|z  }||z  dk7  rt)        | d|d|d      t+        ||z  |      }nt        j,                  |
t+        ||      t        j.                  j0                        }t        j2                  |t+        d|            }nG|dz  dk(  sJ t        j,                  |
t+        |dz  |      t        j.                  j0                        }t        j4                  |t        j6                  t        j8                  ||	      |t        j.                  j0                              S )Nr)   r   <r+   rN   r   r,   r-   r1   r.   r/   r0   r   r2   r   z static_offset=z is not divisible by packing=`)overflow_flags)r
   r6   r7   r   rO   r8   r9   r   r:   r;   r   UnrealizedConversionCastOpr   r   r   ro   r4   ry  rz  ro  r5   rQ   mulIntegerOverflowFlagsnoneudivinttoptrr   ptrtoint)
memref_argr}  rE   r&   rF   r  rG   rH   rI   aligned_ptroffset_elemselem_bitwidthr  static_offsetpackingoffset_bytesoffset_bitss                    r#   r   r      s|   
##B'#mmJOO,)	Y__	$$"#L0A*AC*G%77==u,-&	QYggmmnVHBvhgFGGggmm
6(-v >y	"G 
	+	+WI
|	D$!!&$4+""3qc2,9112-Q!88:QBBDD$$&!+++]"g		 A	%k)=* +z
 	
 }/5lHH

M3
2277k
 YY{AaI6l1!!!88	-1
c"0055L
 

hh
--[
)
2277
 r%   rL  
collectivec                     t        |t        j                        r|f}t        j                  j                  d      }t        d|      }t         d d d         d d d   }t        |t        j                        D ]p  \  }}||v r |   dk7  st        j                  |t        j                  |            }t        j                  |t        j                  |t        ||                  }r d}|D 	cg c]  }	||	   	 }
}	t         fd|D              }t        j                   |      D ]$  }|dt#        d t        ||
      D              z  z  }& t        j$                  t        ||      |      S c c}	w )Nr   r   r[   r   c              3   (   K   | ]	  }|     y wrd   r   )r   r  rL  s     r#   r   z*cluster_collective_mask.<locals>.<genexpr>q  s     @=+@rP  c              3   ,   K   | ]  \  }}||z    y wrd   r   )r   rJ   rK   s      r#   r   z*cluster_collective_mask.<locals>.<genexpr>s  s     NAq1uNr  )re   r   r$  r
   r6   r7   rQ   ra   r   r   r  cluster_block_idr'  r(  r   rh   ndindexr  r  )rL  r  r   
mask_shiftcluster_stridesr_   cluster_dimdim_idxmask_unshiftedr  collective_stridescollective_shaper,  s   `            r#   rS  rS  V  s\    
CMM*J 	##B'#Cy**=2+>?"E/ #--@ fkj [!Q&""3(<(<[(IJg::

**Wan
-j .4>?q*??@Z@@ZZ() Oca3NS:L1MNNNNNO	Anc*J	77	 @s   4E2dtypec                    t        j                  |       } t        j                  | t         j                        r<t        j
                  j                  t        j                  |       j                        S t        j                  |       S rd   )jnpr  
issubdtyperi   r
   r6   r7   iinfobitsr	   dtype_to_ir_typer  s    r#   r  r  w  sW    
))E
%^^E3;;'>>&&syy'7'<'<==			u	%%r%   c                     t        j                  | t         j                        ryt        j                  | t         j                        r$t        j                  | t         j                        S y r  )r  r  bool_ri   signedintegerr  s    r#   r   r     sC    ^^E399%
~~eS[[)>>%!2!233	r%   r   c                    |D cg c]  }t        |t              r|nt         }}|D cg c]  }t        |t              r| }}t        j                  | j
                  | |||t        j                  j                        S c c}w c c}w rd   )re   rg   	DYNAMIC32r   rR   rO   GEPNoWrapFlagsr  )r*   r   r  rJ   static_indicesdyn_indicess         r#   rR   rR     s     ELLqAs+A:L.L#>q:a+=>+>				hh	

  M>s   BB Bc                     t        |       t        |      k(  sJ t        j                  t        j                  d t        | |      D              S )Nc              3   N   K   | ]  \  }}t        j                  ||        y wrd   )r   r(  )r   ar   s      r#   r   zdyn_dot.<locals>.<genexpr>  s     &NDAquzz!Q'7&Ns   #%)r8   r   reducer   r'  r   r`   r%  s     r#   dyn_dotr    s;    	Q3q6					%**&NC1I&N	OOr%   r`   distancec           
      ,   t         j                  j                  d      }t         j                  j	                         }t        |t              rt        ||      }| j                  x}|k7  rt        | j                        x}dk  rd|z  dk(  sJ t        | t         j                  j                  |            } t        j                  t         j                  j	                  d|z  f| j                              }t        j                  | |g t         j                   j	                  dg            } n|dkD  r|dz  dk(  sJ |dz  }t        | t         j                  j	                  |f|            }t        j                  |j                        }	t#        |      D ]{  }
t        j$                  |g t         j                   j	                  |
g            }t'        ||      }t        j                  ||	g t         j                   j	                  |
g            }	} t        |	|      S t        | |      } t)        j*                  |t        d|      | |t        d|      t(        j,                  j.                        }	t        |      x}dk  rt         j                  j                  |      }t        |	t         j                  j	                  d|z  f|            }t        j$                  |g t         j                   j	                  dg            }	t        |	|      S )Nr   r   r   r0  r1  )r
   r6   r7   rf   r@   re   rg   rQ   rO   r   r   r   
mlir_undefrm   r   insertr   r   r   	shfl_bflyr   r2  r3  r  )r`   r  r   r   rw  
x_bitwidthempty32	num_wordsxs_vecr%  rJ   x_elemy_elembits_tyy_vecs                  r#   r  r    s   
##B'#
,,


%#3HVVk#qvv&&
",*_!!!
!R^^00<
=a 1 123C2Eqvv NOg
--

..22A37	a 
b"_!!!"iq"--++YL#>?f
//&++
&aY 
!0044aS9

 68,MM0044aS9	

 Q$$3A
nn	
Ccl
mm! [))jR/nn))*5GAr}}(("
*:)<gFGE,,00!5	A
 
K	  r%   ra  r`  c                    t         j                  j                  d      }| j                  x}|j                  k7  r%t	        d| j                   d|j                         | j                  |k7  rt        | |      } |j                  |k7  rt        ||      }|j                  |k7  rt        ||      }t        j                  || ||gdd      }t        ||      S )Nr   zTypes must match, got z and zprmt.b32 $0, $1, $2, $3;z=r,r,r,r)r
   r6   r7   rO   r5   r   r   r   )ra  r`  r  r   rw  r   s         r#   prmtr    s    
##B'#YYk388+
-dii[chhZH
II	YY#4DXX_
#s
C+s+K??	D#{#%?& 
	%%r%   new_typec                 	   | j                   |k(  r| S t        | j                         x}t        |      x}k7  r"t        d| j                    d| d| d| d	      t        j                  j                  | j                         rt        j                  j                  |      rt        j                  |      }t        j                  | j                         }|j                  t        |j                        t        j                  |j                        z  k(  sJ t        j                  t        j                  t        j                  j                  d|      |       g t        j                   j                  dg            S t        j                  j                  | j                         rt        j                  j                  |      rt        j                  |      }t        j                  | j                         }|j                  t        |j                        t        j                  |j                        z  k(  sJ t        j                  |t        j"                  t        j                  j                  d|      |             S t        j                  j                  | j                         rt        j                  j                  |      r|t        j                  | j                         }t        j                  |      }t        |      t        |      k7  rt        d| j                    d|       t        j                  ||       S t        j                  j                  | j                         r5t        j$                  j                  |      rt'        j                  ||       S t        j$                  j                  | j                         r5t        j                  j                  |      rt'        j                  ||       S t        j$                  j                  | j                         r5t        j$                  j                  |      rt'        j                  ||       S t        d| j                    d|       )	NzCan't bitcast z (of bitwidth z) to r  r   r   r   r  )rO   r   r5   r
   rm   re   r6   rt   ro   rk  rl  r9   r   r   r   r@   r   rn   rk   r   )r`   r  x_bwnew_bwx_tyr  s         r#   r   r     s.   VVxHqvvdhx.@$@FA

tfE( DHA	  ]]aff%"..*C*CH*M~~h'H== D>>Xd&7&78499TZZ;PPPPP>>r}}((x8!<,,00!5 
 ^^qvv&2==+C+CH+M}}X&H>>!&&!D::("7"78499<     >>&""2==#4#4T4#@!D  ]]aff%"--*B*B8*L== D]]8$F~&))xtH:>??>>(A&&^^qvv&2<<+B+B8+L==1%%\\QVV$)B)B8)L==1%%\\QVV$)@)@)J==1%%^AFF84z:;;r%   r%  c                     | |z   dz
  |z  S r   r   r  s     r#   ceil_divr
    s    
a%!)	r%   rW   rK   c                 |   t        j                  | j                        }t        |j                        dk7  rt        d|       |j                  \  }t        t        |      |         }t        j                  t         j                  j                  |f|j                        | |j                  xs dg|gdg      S )Nr   zOnly 1D vectors are supported r   )r
   rm   rO   r8   r9   r"   r   r   extract_strided_slicer@   ro   r  )rW   rK   v_tyv_lenslice_lengths        r#   vector_slicer    s    	qvv	$_
 >tfE
FFJJ'5U5\!_%,		%	%mm):):;ww|!nnc
 r%   vectorsc                 D   | st        d      | d   j                  }t        j                  j	                  |      st        d      t        j                  |      }|j
                  dk7  rt        d      | D ]  }|j                  |k7  st        d       t        |       S )Nz+Cannot concatenate an empty list of vectorsr   z$Cannot concatenate non-vector valuesr   r   z-Cannot concatenate vectors of different types)r5   rO   r
   rm   re   rF   r"   _vector_concat_rec)r  r   rW   s      r#   r   r     s    	
B
CC
#		!	!#	&
;
<<
c#XX]
=
>> Havv}FGGH 
G	$$r%   c                    | x  r dk(  r\  } |S    r dk(  r\  }}t        j                  |j                        j                  \  }t        j                  |j                        j                  \  }t         j                  j                  t        t        ||z                     }t        j                  | d|iS  	 | sJ t        | d t        |       dz         }t        | t        |       dz  d        }t        ||g      S )Nr   r2   r5  )r
   rm   rO   r9   r   r@   r  r   r   shuffler  r8   )r  rW   wr  w_lenr5  r   r  s           r#   r  r  -  s    	h 
	aff%++guaff%++gu!!%%d5+?&@Ad^^W0400	 

 
nW
W%8s7|q'89
:a
WS\Q%6%89
:aA''r%   c                 v   |dk(  ry|dk  s$t        | j                  t        j                        sy|dz
  }| j                  j                  }|xt
        j                  d x*\    t        | j                  j                  d   ||dz
        S  xt
        j                  d x2\    t        j                  |j                        j                  |z  dk(  S  xt
        j                  d xM\    t        | j                  j                  d   ||      xs$ t        | j                  j                  d   ||      S  xt
        j                  d xM\    t        | j                  j                  d   ||      xr$ t        | j                  j                  d   ||      S  xxt
        j                  d x\   nK xt
        j                  d x\   n3 xt
        j                   d x\   n xt
        j"                  d x\   n  nL  t        | j                  j                  d   ||      xr$ t        | j                  j                  d   ||      S xxt
        j$                  d x\   n xt
        j&                  d x\   n  nL  t        | j                  j                  d   ||      xr$ t        | j                  j                  d   ||      S t
        j(                  d xa\   |j+                         dk(  xrJ t        | j                  j                  d   ||      xs$ t        | j                  j                  d   ||      S  y)zMReturns True if the value is statically known to be divisible by the divisor.r   Tr   Fr   r2   )re   ownerr
   r  opviewr   IndexCastOpis_known_divisibleoperandsr>   r?   r   MulIOpSelectOpMaxSIOpMinSIOpMaxUIOpMinUIOpAddIOpSubIOpAndIOpro  )r   divisor	max_depth	new_depthdef_ops        r#   r  r  =  s   \]*U[[",,?!m);;&				 4 4Q 7)a-PP 
				^^FLL)//'9Q>> 
	  
++

q
!7I Kekk2215w	JK 
 
	
++

q
!7I Lu{{33A6KL 
 
O?5==?_U]]_u}}	N
++

q
!7I Lu{{33A6KL 
).%,,.	(
++

q
!7I Lu{{33A6KL 
 A% 
U[[11!4gy
I M 4 4Q 7)L 
 
r%   c                  @    t         j                  j                  d      S )z0Returns the attribute for the SMEM memory space.z#gpu.address_space<workgroup>r
   r  r;   r   r%   r#   smemr-  i  s    			;	<<r%   c                  @    t         j                  j                  d      S )z0Returns the attribute for the TMEM memory space.z#mosaic_gpu.tmemr,  r   r%   r#   tmemr/  n  s    			.	//r%   c                     t        | t        j                        r| j                  } t        j                  j                  |       st        d|        t        j                  |       } | j                  duxr | j                  t               k(  S )zReturns true if the input mem ref or memref type points to SMEM.

  If the input is not at all of a memref type, raises a ValueError.
  Expected a memref type but got N)re   r
   r   rO   r   r5   r}  r-  r   s    r#   is_smem_refr3  s  q    
 RXX
((C		!	!#	&
6se<
==
c#				%	D#*:*:df*DDr%   c                     t        | t        j                        r| j                  } t        j                  j                  |       st        d|        t        j                  |       } | j                  duxr | j                  t               k(  S )zReturns true if the input mem ref or memref type points to TMEM.

  If the input is not at all of a memref type, raises a ValueError.
  r1  N)re   r
   r   rO   r   r5   r}  r/  r2  s    r#   is_tmem_refr6    r4  r%   r<  r  c                    |t        t        j                        }d}d}t        | d      }t	        j
                  t        j                  j                  d      ||j                         g||gng z   | dd|z   d	
       y)zAtomically cancels a pending cluster launch.

  The response is stored in a opaque 128-bit value containing the CTA id of the
  first CTA in the canceled cluster.
  Nr  r  r   r  r   zx clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 [$0], [$1];zr,rTr   )
rR  r   rF  r   r   r   r
   r:   r;   r  )
result_refr<  r  r  r   rE  s         r#   try_cluster_cancelr9    s     '(:(:;I(/	JQ	/$//ggmmL!W__!-)27	
  or%   c                    t         j                  j                  d      }t         j                  j                  d      }t        j                  j                  ||||g      }t        | d      }t        j                  ||gdd      }dD cg c]  }t        j                  |||g       }}t        j                  ||dg      }g ||S c c}w )zDecodes the response of `try_cluster_cancel`.

  It checks if the cancellation was successful, and if yes, it also extracts
  the CTA ID of the first CTA in the canceled cluster.
  r   r   r   r  z
    {
        .reg .b128 handle;
        ld.shared.b128 handle, [$4];
        clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 $3, handle;
        @$3 clusterlaunchcontrol.query_cancel.get_first_ctaid.v4.b32.b128 {$0, $1, $2, _},  handle;
    }z=r,=r,=r,=b,r)r   r   r2   )	r
   r6   r7   r   
StructTypeget_literalr   r   r   )	r8  r   r  	struct_tyrE  rI   r,  cta_idscancelled_launchs	            r#   query_cluster_cancelr@    s     	##B'#	~~""1%"oo))3S"*=>)	JQ	/$	f	 
$ =FFSTsD3%0F'F&&r4!5	%7	%$	%% Gs   Cnanosc                 t    t        j                  t        j                  j	                  d      | gddd       y)z>Sleeps the current thread for the given number of nanoseconds.r   znanosleep.u32 $0;r  Tr   Nr  )rA  s    r#   	nanosleeprC    s,    //ggmmL!g	r%   r=  c                     	 t        j                  d | ||      S # t        $ r t        j                  | ||      cY S w xY w)Nr%  )r   mbarrier_arrive_expect_txrj   )r<  r=  r  s      r#   r&  r&    sE    S))$iXX	 S))'9	RRSs    !??rd   r  )T)
   )__doc__collections.abcr   r   r   dataclassesrC  r   rk  typingr   r   r  r   r  jax._src.libr   ri  jax.interpretersr	   jaxlib.mlirr
   jaxlib.mlir.dialectsr   r   r   r   r   r   r   r   r   rh   r   rg   r   r   r   DYNAMICr  rj  r   r$   r!   r  r   rL   rY   ra   floatrQ   r{   r   	dataclassr   r   r   MultimemReductionOpr:   r+  r   r   r  contextmanagerr  r.  r   	thread_id	block_dimr;  block_idgrid_dim	block_idxr6  r=  rA  IntEnumr   rG  rF  rR  r   r\  r^  rc  re  rm  r   rq  dsr   r  r  r  r  r   r  r  r  r  r  r  rx  r  r  r  r  r-  rI  rf  r  r  r  r  r  r   r$  rS  	DTypeLiker  r   rR   r  r  r  r   r
  r  r   r  r  r  r-  r/  r3  r6  r9  r@  rC  r&  r   r%   r#   <module>r\     s   $ .       
  6 !  - & ( $ % ' % $ '  	3  (I5 C 5
	
Qc.>.> Q3 Q !;!  CGAMMA584ZAD
"3; "@ %) (DV d#	 	 $	  6 EF  "	[
[	[ #[ d{	[| d#  $>   Y}cmmS]]K
ImS\\3<<H	514<<  "&	<$ % 3?2D2D )< )$ (4(:(:   2M
gm,t3 ,"''  bgg  &  d#J J $J bhh "(( F 6;4[MM&+dl&;"(( ;c ;c ;|B-	K	B-(-c3hB-XXB-J(3	K	(3XX(3V'Crxx 'C"(( 'CT."(( .BHH .2bmm  "(( # 288  9=:0S/:015:0
43cDJ67:0z
E d#D@ D@ $D@N d#P P $Pf d#c. c. $c.L d#h6 h6 $h6VH HV, ,^(43l8c3'8'#--78B&CJJ00 &RWW &SZZ)) dTk 	$RXX^4=?WWXXP
4! 4!S288^ 4!n&rxx &bhh &RXX & &<rxx &<277 &<R  BHH  %8BHH- %"(( %( 2 (rxx ( ) )X=bll =
0bll 0

ERXX' 
ED 
E
ERXX' 
ED 
E  "& xx$8&
288RXXrxx12&BRXX SBHH S SVXV^V^aeVe Sr%   