
    uki;                      U d Z ddlmZ ddlmZmZmZ ddlZddlZddl	Z	ddl
Z
ddlmZmZmZmZmZmZ ddlZddlmc mc mZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lm
Z ddlmZ ddlmZ ddl Z!ddl"m#Z#  ed      Z$e#jJ                  Z%dZ&e%e&z  Z'dZ(dZ)e#jT                  Z* ejV                  d       G d d             Z,dSdZ- ejV                  d       G d d             Z. ejV                  d       G d d             Z/dTdZ0 ejV                  d       G d d             Z1 ejV                  d       G d d             Z2e1e2z  e/z  Z3 e/ e,d        e.d      f e.d!      d"fd#$      Z4 e/ e,d%      d&d" e.d      fd#$      Z5 e/ e,d'      d(d)d#$      Z6 e/ e,d*      d+d,d#$      Z7 e/ e,d-      d(d)d#$      Z8 e/ e,d.      d+d/d#$      Z9 e/ e,d0      d(d)d#$      Z: e/ e,d1      d2d3d"$      Z; e/ e,d4      d(d)d#$      Z< e/ e,d5      d2d3d"$      Z= e/ e,d67      d&d" e.d8      fd#$      Z> e/ e,d 7       e.d8      f e.d!8      d"fd#$      Z?dUd9Z@ e@d:      ZA e/ e,d;      d< e.d      fd#$      ZBej                  j                   ejV                  d=dd>       G d? d@                    ZEeeFeGdAf   geFeGdAf   f   ZHdBeIdC<    G dD dEe      ZJ ejV                  d       G dF dGeJ             ZK ejV                  d       G dH dIeJ             ZL	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dVdJZMdWdKZNdWdLZOdWdMZPe	 	 	 	 	 	 	 	 dXdN       ZQedYdO       ZQdP ZQ	 	 	 	 	 	 	 	 	 	 dZdQZRd[d\dRZSy)]zUtilities for code generator.    )annotations)CallableIterableSequenceN)AnyProtocol	TypeAliasTypeVarcastoverload)ir)arith)gpu)llvm)math)memref)vector   )utilsT       T)frozenc                  |    e Zd ZU dZded<   d Zd ZddZddZddZ	dd	Z
dd
ZddZ	 	 	 	 	 	 ddZddZddZy)TilingaP  A tiling expression describing a permutation of elements of an nd-array.

  To apply one level of tiling to an array, each of the trailing dimensions (up
  to the rank of the tile) is unfolded into two dimensions: first equal to the
  ratio of the dimension size and the tile size, and second equal to the tile
  size. Then, all newly unfolded minor dimensions are transposed to appear at
  the end.

  This expression describes multi-level tiling, by applying each element of
  `tiles` in sequence to the array.

  See https://openxla.org/xla/tiled_layout for a more detailed explanation.
  tuple[tuple[int, ...], ...]tilesc                &   | j                   sy t        | j                   d         }| j                   D ]]  }t        |      |kD  rt        d      |st        d      t        d |D              rt        d| j                          t        |      }_ y )Nr   z!Tiles must have a decreasing rankzTiles must not be emptyc              3  &   K   | ]	  }|d k    ywr   N .0ds     g/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/jax/experimental/mosaic/gpu/fragmented_array.py	<genexpr>z'Tiling.__post_init__.<locals>.<genexpr>K   s     "Q!V"   z/Tile shape must only have positive sizes, got: )r   len
ValueErrorany)selflast_tile_ranktiles      r%   __post_init__zTiling.__post_init__B   s    ::A'N

 !	T^	#<==233	"T"	"J4::,WXX4yn!    c                \    ddj                  t        t        | j                               dS )NzTiling( ))joinmapstrr   r+   s    r%   __str__zTiling.__str__O   s&    RWWSdjj123155r/   c                .    | fd} j                   D ]{  }t        |      t        |      kD  r |        |dt        |        |t        |       d }}t        d t        ||      D              r |        g |d t        ||      D        |}} |S )z,Computes the shape of an array after tiling.c                 :    t        dj                   d        NzTiling z does not apply to shape r)   r   
orig_shaper+   s   r%   failzTiling.tile_shape.<locals>.failU   s     ,Ej\RSSr/   Nc              3  2   K   | ]  \  }}||z  d k7    ywr    r!   r#   sts      r%   r&   z$Tiling.tile_shape.<locals>.<genexpr>[   s     :DAqQUaZ:s   c              3  ,   K   | ]  \  }}||z    y wNr!   r#   r$   rB   s      r%   r&   z$Tiling.tile_shape.<locals>.<genexpr>]   s     I41aQI   )r   r(   r*   zip)r+   shaper>   r-   untiled_dims
tiled_dimsr=   s   `     @r%   
tile_shapezTiling.tile_shapeR   s    JT

 R	TSZ	!&{T
!3UCI:;5GJl	:C
D$9:	:QQI3z43HIQDQeR Lr/   c                H    | fd}t         j                        D ]  }t        |      t        |      kD  r |        |ddt        |      z   }|dt        |      z  t        |        }|t        |       d }||k7  r |        g |d t        ||      D        } |S )zBComputes the shape of an array before tiling from its tiled shape.c                 (    t        d  d d      )Nzshape z* is not a valid result of applying tiling .)r)   r<   s   r%   r>   z!Tiling.untile_shape.<locals>.failc   s#    :,Ha
P r/   Nc              3  ,   K   | ]  \  }}||z    y wrD   r!   rE   s      r%   r&   z&Tiling.untile_shape.<locals>.<genexpr>o   s     H$!QAHrF   reversedr   r(   rG   )r+   rH   r>   r-   rI   rJ   tiling_dimsr=   s   `      @r%   untile_shapezTiling.untile_shape`   s    J $ J	TSZ	?BTN+lc$iT
3j3t9*+&k		IIH#j$2GHIeJ Lr/   c                V   t        | j                        dk  r| S | j                  d   }| j                  d   g}| j                  dd D ]K  }t        |      D ]  \  }}|dk7  s||d } n d}|t        |       d }||k(  r9|}|j                  |       M t	        t        |            S )a  Returns a canonicalized version of the tiling.

    We define a tiling to be canonical if, at each step (except the first one,
    which defines the base tile shape):

    1. The tiling partitions at least one dimension in more than 1 tile. For
       example, the tiling `(8, 8)(8, 8)` is not canonical, as applying it
       yields a shape `(1, 1, 8, 8)`. We canonicalize it to `(8, 8)`, which
       allows getting rid of the unnecessary `1` dimensions.
    2. The leading dimensions of each tile are not `1`. If canonicalizing a
       tile in this way leads to an empty tile, then the tile is given shape
       `(1,)`---which is still a meaningful (final) tile. For example, the
       tiling `(8, 8)(1, 4)` is not canonical, as applying it yields a shape
       `(8, 2, 1, 4)`. We canonicalize it to `(8, 8)(4,)`, which allows
       getting rid of the unnecessary `1` dimension, and yields a shape
       `(8, 2, 4)`.
    r   r   Nr   )r(   r   	enumerateappendr   tuple)r+   rH   
new_tilingr-   ir$   canonical_tilerJ   s           r%   canonicalizezTiling.canonicalizer   s    $ 4::!kJJqME**Q-J

12 (D/ $!Q68.

 #n--./j	~	%e'( %
#$$r/   c                    | j                   D ]:  }|dt        |        |t        |       d }}g |d t        ||      D        |}< |S )z.Computes the strides of an array after tiling.Nc              3  ,   K   | ]  \  }}||z    y wrD   r!   r@   s      r%   r&   z&Tiling.tile_strides.<locals>.<genexpr>   s     @tq!QU@rF   r   r(   rG   )r+   stridesr-   untiledtileds        r%   tile_strideszTiling.tile_strides   sb    

 J{T
+Wc$iZ[-AugI'I@s5$/?@I5IgJ Nr/   c                    t        | j                  d         }|dk  s||k\  rt        d| d|        dg|z  }d||<   t        d | j	                  t        |            D              S )zJResult is True whenever the tiled dim originated from the given input dim.r   Invalid dimension  for tiling r   c              3  &   K   | ]	  }|d k(    ywr    r!   r#   rA   s     r%   r&   z(Tiling.tile_dimension.<locals>.<genexpr>   s     CAaCr'   )r(   r   r)   rY   rd   )r+   dimtiling_rankra   s       r%   tile_dimensionzTiling.tile_dimension   sp    djjm$K
Qw#$+C5TFCDDcKGGCLC!2!25>!BCCCr/   c                   t        | j                  d         }|dk  s||k\  rt        d| d|        |}g }t        | j                  d         }| j                  D ]V  }|t        |      k\  sJ ||t        |      z
  z  }t        |      }|dk\  r|d| ||dz   d z   }|s n|j                  |       X t	        t        |            S )z2Returns a tiling with the given dimension removed.r   rf   rg   Nr   )r(   r   r)   rX   r   rY   )r+   rj   rk   dim_in_tiler   r,   rB   s          r%   remove_dimensionzTiling.remove_dimension   s    djjm$K
Qw#$+C5TFCDDKEA'NZZ s1v%%%^c!f,,k1vn		l{Oaa 011ll1o %,r/   c                    t        |      t        |      k7  rt        d| d| d      |f fd	} j                  D ]  } |t        |      t        |      kD         |dt        |        |t        |       d }}|dt        |        |t        |       d }}g g }
}	g g }}t        |||      D ]`  \  }}}g g }}g g }}t        t	        |      t	        |      d      D ]  \  }}||k  r6 |||z  dk7         ||z  }|j                  |       |j                  |       A|d	k7  r` |||z  dk7         |j                  |       |j                  |       ||k7  r(|j                  ||z         |j                  ||z         d	}|j                  |       |j                  |         ||d	k7         |	j                  |ddd
          |j                  |ddd
          |
j                  |ddd
          |j                  |ddd
          c g ||	|}g ||
|} t        d |D              t        d |D              fS )a  A fused version of `tile_shape` and `tile_strides` for nested shapes.

    By nested shape we mean that each logical dimension (i.e. each element of
    shape/strides) is actually composed out of multiple physical dimensions.
    For example, a row-major array of logical shape (128, 128) that is tiled
    into (64, 64) tiles would have a nested shape ((2, 64), (2, 64)) (i.e. each
    dim is split into two sub-dims) and nested strides of
    ((2 * 64 * 64, 64), (64 * 64, 1)).
    Shape z and strides z must have the same lengthc                @    | rt        dj                   d|       y r:   r;   )condrH   r+   s     r%   fail_ifz1Tiling.tile_nested_shape_strides.<locals>.fail_if   s(    	74::,.GwOPP 
r/   NTstrictr   r   c              3  :   K   | ]  }|rt        |      nd   ywrV   NrY   r"   s     r%   r&   z3Tiling.tile_nested_shape_strides.<locals>.<genexpr>   s     5!!eAh%5   c              3  :   K   | ]  }|rt        |      nd   ywry   rz   r"   s     r%   r&   z3Tiling.tile_nested_shape_strides.<locals>.<genexpr>   s     7!!eAh%7r{   )r(   r)   r   rG   rR   rX   rY   )r+   rH   ra   rt   r-   untiled_shapetiled_shapeuntiled_stridestiled_stridesmajor_dim_shapesmajor_dim_stridesminor_dim_shapesminor_dim_stridesrB   	dim_shapedim_stridesmajor_dim_shape_revmajor_dim_stride_revminor_dim_shape_revminor_dim_stride_revr$   rA   s   `                     r%   tile_nested_shape_stridesz Tiling.tile_nested_shape_strides   s    5zS\!5'wi/I
J  " Q 

  Kc$i#e*$%#(3t9*#5uc$iZ[7I[m'.{T
';Wc$iZ[=Q}o,.),.)'*4m'L =
#!Y461461+Xk-B4P 	+DAqUAEQJ!GA&&q) ''*AvAEQJ&&q) ''*Av!((a0"))!a%0A&&q) ''*!	+" 	Q 3DbD 9: 3DbD 9:  !5dd!;<  !5dd!;<1=2 ED 0D3CDeJ/J$5J8IJgA KD 	5u557w77 r/   c                    | j                   D ]K  }|d t        |        |t        |       d  }}g |d t        ||      D        d t        ||      D        }M |S )Nc              3  ,   K   | ]  \  }}||z    y wrD   r!   r#   r[   rB   s      r%   r&   z&Tiling.tile_indices.<locals>.<genexpr>   s     0tq!AF0rF   c              3  ,   K   | ]  \  }}||z    y wrD   r!   r   s      r%   r&   z&Tiling.tile_indices.<locals>.<genexpr>   s     /daAE/rF   r`   )r+   indicesr-   rb   rc   s        r%   tile_indiceszTiling.tile_indices   sx    

 {T
+Wc$iZ[-Aug0s5$/0 0c%./g Nr/   c                    t        | j                        D ]V  }|d dt        |      z   }|dt        |      z  t        |        }|t        |       d  }g |d t        |||      D        }X |S )NrO   c              3  4   K   | ]  \  }}}||z  |z     y wrD   r!   )r#   or[   rB   s       r%   r&   z(Tiling.untile_indices.<locals>.<genexpr>  s     N71aQUQYNs   rQ   )r+   r   r-   rb   outerinners         r%   untile_indiceszTiling.untile_indices   s    $ Pc$i(gb3t9nc$iZ0es4yjk"eO'ONc%6MNOg	P
 Nr/   NrH   tuple[int, ...]returnr   )r   r   )ra   r   r   r   )rj   intr   ztuple[bool, ...])rj   r   r   r   )rH   r   ra   r   r   z?tuple[tuple[tuple[int, ...], ...], tuple[tuple[int, ...], ...]])r   r   r   r   )__name__
__module____qualname____doc____annotations__r.   r7   rK   rT   r]   rd   rl   ro   r   r   r   r!   r/   r%   r   r   1   sf     
%$!6$#%JD &9(9 +9 G	9vr/   r   c              #  Z   K   t        |       }t        |       D ]  \  }}||z
  |f  yw)zHLike built-in enumerate, but returns negative indices into the sequence.N)r(   rW   )elemsoffsetr[   es       r%   enumerate_negativer     s5     u:& da
f*a-s   )+c                      e Zd ZU ded<   y)
Replicatedr   timesN)r   r   r   r   r!   r/   r%   r   r     s    	*r/   r   c                  h   e Zd ZU dZded<   ded<   ded<   ded<   d	Zd
ed<   ddZej                  dd       Z	ej                  dd       Z
d dZedd       Zej                  dd       Zej                  d!d       Zed!d       Zd"dZd#dZd#dZ	 	 	 	 	 	 d$dZd%dZd%dZd&dZd'dZd(dZy))TiledLayouta  A FragmentedArray layout derived from a tiling expression.

  A logical array is transformed according to the tiling expression, and then
  split across warps (within a warpgroup), lanes, and vectorized according to
  the dimension indices. All dimension indices must be negative and should refer
  to the dimensions after tiling is applied.

  To better understand this layout, consider the example of WGMMA-related tiling
  from https://docs.nvidia.com/cuda/parallel-thread-execution/#wgmma-64n16-d as
  applied to a 128x128 array. The corresponding TiledLayout has a tiling of:

      (64, 8)(16, 8)(8, 8)(1, 2)

  and warp_dims=(-8,), lane_dims=(-4, -3), vector_dim=-1.

  We begin by applying the tiling (note that it always applies to a suffix):

          Tiled shape                       Remaining tiling actions
  ===========================================================================
  128 128                                  (64, 8)(16, 8)(8, 8)(1, 2)
    2  16  64  8                           (16, 8)(8, 8)(1, 2)
    2  16   4  1  16  8                    (8, 8)(1, 2)
    2  16   4  1   2  1  8  8              (1, 2)
    2  16   4  1   2  1  8  4  1  2

  The last expression is our final shape. At this stage, we're ready to partition
  the dimensions: warp_dims=(-8,) means that the 8-th dimension from the
  end is partitioned over 4 warps in a warpgroup (and so it must be of size 4).
  lane_dims=(-4, -3) indicate that those two dimensions are partitioned over
  the lanes within a warp (their product must be equal to 32, i.e. warp size).
  Finally, vector_dim=-1 indicates that each (logical) register is a vector
  containing 2 elements (there are no shape restrictions here).

  Given the above, the shape of the (logical) register array used to represent
  the array in each thread is: (2, 16, 1, 1, 2, 1, 1, 1, 1, 1). We have set all
  the dimensions above to 1, since each thread is a member of a single warp,
  a single lane, and the elements along the vectorized dimension are represented
  by a single (logical) register.
  r   tilingtuple[int | Replicated, ...]	warp_dims	lane_dimsr   
vector_dimTzdataclasses.InitVar[bool]_check_canonicalc                <   | j                   j                  st        d      | j                   j                  d   }| j                   j                  |      h | j                  | j
                  | j                  }t        |      t        | j                        t        | j
                        z   dz   k7  rt        d      |D ]8  }|dk\  rt        d      |t              t        |      z
   k  s/t        d       t        j                  fd| j                  D              }|t        k7  rt        d      t        j                  fd	| j                  D              }|t        k7  rt        d
      |r$| j                         }| |k7  rt        |  d      y y )Nz"Tiling must have at least one tiler   r   z!Duplicate partitioning dimensionszAll dimensions must be negativezDimension out of rangec              3  `   K   | ]%  }t        |t              r|j                  n|    ' y wrD   
isinstancer   r   r#   r$   min_tiled_shapes     r%   r&   z,TiledLayout.__post_init__.<locals>.<genexpr>U  0       a,/!2DD   +.zJThe product of warp dims does not equal the number of warps in a warpgroupc              3  `   K   | ]%  }t        |t              r|j                  n|    ' y wrD   r   r   s     r%   r&   z,TiledLayout.__post_init__.<locals>.<genexpr>^  r   r   z5The product of lane dims does not equal the warp sizez is not canonical.)r   r   r)   rK   partitioned_warp_dimspartitioned_lane_dimsr   r(   r   prodr   WARPS_IN_WARPGROUPr   	WARP_SIZEr]   )	r+   r   	min_shapedims_setr$   warp_dims_prodlane_dims_prodcanonical_layoutr   s	           @r%   r.   zTiledLayout.__post_init__F  s   ;;;<<!!!$Ikk,,Y7O		#	#&*&@&@BF//H 8}D667#d>X>X:YY\]]]:;; 3	
a:;;	
s?#c)n45	5122	3
 YY  N ++  YY  N "NOO**,	!	!D6!3455 
" r/   c                :    t        d | j                  D              S )Nc              3  B   K   | ]  }t        |t              r|  y wrD   r   r   r"   s     r%   r&   z4TiledLayout.partitioned_warp_dims.<locals>.<genexpr>k        z!Z'@a   )rY   r   r6   s    r%   r   z!TiledLayout.partitioned_warp_dimsi          r/   c                :    t        d | j                  D              S )Nc              3  B   K   | ]  }t        |t              r|  y wrD   r   r"   s     r%   r&   z4TiledLayout.partitioned_lane_dims.<locals>.<genexpr>q  r   r   )rY   r   r6   s    r%   r   z!TiledLayout.partitioned_lane_dimso  r   r/   c              #    K   t         j                  j                  d      }t         j                  j	                         }t        t        j                  |            }| j                  j                  |      }|| j                   d  D cg c]  }t        ||       }}t        j                  | j                         |      }t        j                  | j                         |      }	t        j                   ||	      }
| j#                  |      }t%        j&                  |      D ]  }t)        d t+        ||      D              }t        j                   |
t        ||            }g }|D ]e  }|j-                  t        j.                  |t        j0                  |t        ||                         t        j2                  |t        ||            }g t        |        y c c}w w)Nr   c              3  ,   K   | ]  \  }}||z    y wrD   r!   r#   r[   rA   s      r%   r&   z*TiledLayout.thread_idxs.<locals>.<genexpr>  s     G41aQGrF   )r   IntegerTypeget_signless	IndexTypegetrY   r   get_contiguous_stridesr   rd   tiled_tiling_rankcdyn_dotwarp_indiceslane_indicesr   addiregisters_shapenpndindexsumrG   rX   index_castuidivuiremui)r+   rH   i32indexcontig_stridesrd   rA   dyn_tile_strideswarp_offsetlane_offset
dyn_offsetregister_shapetile_idxtile_lin_idxdyn_lin_idxidxstrides                    r%   thread_idxszTiledLayout.thread_idxsu  s     ..
%
%b
)CLLE577>?N;;++N;L+79O9O8O8P+QRa!S	RR-- 1 1 35EFK-- 1 1 35EFKK5J))%0NJJ~. G3x+FGGlJJz1\3+?@kc" ?&

5%%eU[[aPSn-UVWkk+q~>? #J Ss   B	G$GEG$c                4    | j                   j                  d   S )zThe shape of the first tile in the tiling expression.

    This tile acts as the divisibility constraint for a suffix of arrays to
    which this layout applies.
    r   )r   r   r6   s    r%   base_tile_shapezTiledLayout.base_tile_shape  s     ;;Qr/   c                h    | j                   }| j                  j                  |      t        |      d S )a2  The shape of the suffix of the array after tiling.

    We only allow our repeated tiling actions to further subdivide the
    dimensions created by previous tiling actions (except for the first one),
    so the tiled shape always ends with this suffix, no matter what array shape
    it's applied to.
    N)r   r   rK   r(   )r+   r   s     r%   tiled_tiling_shapezTiledLayout.tiled_tiling_shape  s1     **O;;!!/233G3HIIr/   c                ,    t        | j                        S rD   )r(   r   r6   s    r%   r   zTiledLayout.tiled_tiling_rank  s    t&&''r/   c                4    | j                   | j                     S rD   )r   r   r6   s    r%   vector_lengthzTiledLayout.vector_length  s    ""4??33r/   c                X    t         j                  j                  | j                  f|      S rD   )r   
VectorTyper   r   r+   rB   s     r%   registers_element_typez"TiledLayout.registers_element_type  s"    ==d002A66r/   c                    t        | j                  j                  |            }| j                  D ]  }d||<   	 | j                  D ]  }d||<   	 d|| j
                  <   t        |      S )`Returns the shape of the register array needed to represent an array of the given logical shape.r   )listr   rK   r   r   r   rY   )r+   rH   r~   r$   s       r%   r   zTiledLayout.registers_shape  sp    t{{--e45K'' k!n'' k!n#$K r/   c                   | j                   }t        |      }| j                  D ]
  }||   ||<    | j                  D ]
  }||   ||<    || j                     || j                  <   | j
                  j                  t        |            S )zmReturns the logical shape of an array given its register array shape.

    Inverse to `registers_shape`.
    )r   r   r   r   r   r   rT   rY   )r+   rH   tiled_tilingr$   s       r%   shape_from_registers_shapez&TiledLayout.shape_from_registers_shape  s    
 **LKE'' !aeAh!'' !aeAh!)$//:E$//;;##E%L11r/   c                  	
 t         j                  j                  d      	| j                  
t	        
fd|D              }t        j                  |      }t	        	fdt        ||      D              }t        j                  	d      gt        
      z  }t        ||      D ]  \  }}t        |t              r|||<    t	        |      S )Nr   c              3  `   K   | ]%  }t        |t              r|j                  n|    ' y wrD   r   )r#   r$   r~   s     r%   r&   z1TiledLayout._delinearize_index.<locals>.<genexpr>  s/       a,+a.@r   c           
   3     K   | ]E  \  }}t        j                  t        j                  t        |            t        |             G y wrD   )r   r   r   r   )r#   r   sizer   r   s      r%   r&   z1TiledLayout._delinearize_index.<locals>.<genexpr>  s?      FD 	EKKQvs^4aclCs   AAr   )r   r   r   r   rY   r   r   rG   r   constantr(   r   r   )r+   r   dims
dims_shapedims_stridesdims_indicesfull_indicesr$   r[   r   r~   s    `       @@r%   _delinearize_indexzTiledLayout._delinearize_index  s     ..
%
%b
)C))K  J //
;L j9 L NN3*+c+.>>LD,' 1	Az	"l1o r/   c                    t         j                  j                  d      }t        j                  t        j                         t        t        |            }| j                  || j                        S Nr   )r   r   r   r   r   r   
thread_idxr   r   r
  r   )r+   r   lane_idxs      r%   r   zTiledLayout.lane_indices  sM    
..
%
%b
)C{{5++-qC/@AH""8T^^<<r/   c           	     (   t         j                  j                  d      }t        j                  t        j
                  t        j                         t        t        |            t        t        |            }| j                  || j                        S r  )r   r   r   r   r   r   r   r  r   r   r   r
  r   )r+   r   warp_idxs      r%   r   zTiledLayout.warp_indices  sf    
..
%
%b
)C{{E$$&)S(9:	
c"H ""8T^^<<r/   c                   |dk  s%|t        | j                  j                  d         k\  rt        d| d| j                         | j                  j	                  |      }| j
                  | j                  j                  |      t        j                  d d d         d d d   j                         | j                     r0t        g |j                  d      }d}D cg c]  }|dz
  	 c}n| j                  | j                     z   }dfdt        |t        fd| j                  D              t        fd	| j                  D              |d
      j!                         S c c}w )Nr   
Dimension z is out of range for rw   rV   r   c                X    t        | t              r| S |    rt        |      S | |    z   S rD   r   )r$   r  dim_offsetsremoved_dims     r%   replace_tiled_dimz7TiledLayout.remove_dimension.<locals>.replace_tiled_dim  s3    	Az	"q>$;q>!!r/   c              3  Z   K   | ]"  }t        |t              r|n ||          $ y wrD   r   r#   r$   r  r~   s     r%   r&   z/TiledLayout.remove_dimension.<locals>.<genexpr>  4      
 Az*A0A![QR^0TT
   (+c              3  Z   K   | ]"  }t        |t              r|n ||          $ y wrD   r   r  s     r%   r&   z/TiledLayout.remove_dimension.<locals>.<genexpr>  r  r  Fr   )r$   int | Replicatedr  r   )r(   r   r   r)   ro   r   rl   r   cumsumtolistr   r   r   rY   r   r   r]   )	r+   rj   rZ   new_vector_dimr   r  r  r  r~   s	        @@@@r%   ro   zTiledLayout.remove_dimension  sV   
Qw#T[[..q122C5(=dkk]KLL--c2J))K++,,S1K))K"-.tt4;;=K4??#3J,,3d34jn$/0qQU0kT__)EEn"  
^^
 	
 	 
^^
 	
 	 ln 1s   *E5c                P    | }t        |d      D ]  }|j                  |      } |S )NTreverse)sortedro   )r+   axesreduced_layoutas       r%   reducezTiledLayout.reduce  s3    ND$' :%66q9n:r/   c                ~  	
 | j                   j                         }| j                  }| j                  
|j	                  |      t        |      d }t        |      dz
  }g }t        
      D ]7  }|dk\  r|||   k(  r|j                  d       |dz  }'|j                  d       9 |dk(  sJ t        j                  |      ddd   j                         dfd	d
fdt        |t        	fd	| j                  D              t        	fd
| j                  D               	| j                        d      S )z;Returns a version of this layout where tiling is canonical.Nr   r   FTrw   c                8    t        | t              r| S | |    z   S rD   r   )r$   r  s    r%   r  z3TiledLayout.canonicalize.<locals>.replace_tiled_dim5  s     Q
+QC[^1CCr/   c                8    t        | t              xs |    dk7  S Nr   r   )r$   r   s    r%   is_nontrivialz/TiledLayout.canonicalize.<locals>.is_nontrivial8  s     :&D*<Q*?1*DDr/   c              3  @   K   | ]  } |      s |        y wrD   r!   r#   r$   r-  r  s     r%   r&   z+TiledLayout.canonicalize.<locals>.<genexpr>=       OqmA>N"O   c              3  @   K   | ]  } |      s |        y wrD   r!   r/  s     r%   r&   z+TiledLayout.canonicalize.<locals>.<genexpr>>  r0  r1  r  )r$   r  )r   r]   r   r   rK   r(   rR   rX   r   r  r  r   rY   r   r   r   )r+   canonical_tilingrA   canonical_tiled_tiling_shaper   rev_removed_dimsr$   r  r-  r  r   s          @@@@r%   r]   zTiledLayout.canonicalize  s/   {{//1A00#3#>#>q#A#a&'#J -.2F, () &	1:6BB&!%& R<<)),-dd3::<KDE ODNNOOODNNOO$//* r/   N)r   boolr   r   )rH   r   r   zIterable[tuple[ir.Value, ...]])r   r   rB   ir.Typer   r9  r   )r   ir.Valuer  r   r   tuple[ir.Value, ...])r   r;  )rj   r   r   r   )r%  Sequence[int]r   r   )r   r   )r   r   r   r   r   r   r.   	functoolscached_propertyr   r   r   propertyr   r   r   r   r   r   r   r
  r   r   ro   r(  r]   r!   r/   r%   r   r     s   &N .))))/
 15-4!6F  
  
*     	J 	J ( ( 4 472!=*=
= D5r/   r   c                    t        |       dk7  rt        d|  d      | d   dz  dk7  s| d   dz  dk7  rt        d|  d      t        S )	zReturns the tiled layout relevant for WGMMA operations.

  The tiled layout is equivalent to one described here in PTX documentation:
  https://docs.nvidia.com/cuda/parallel-thread-execution/#wgmma-64n16-d
     rq   z
 is not 2Dr   @   r      z is not a multiple of 64x8)r(   r)   WGMMA_LAYOUTrH   s    r%   _tiled_wgmma_layoutrF  D  s]     	Z1_
veWJ/
00
1X]a58a<1,
veW$>?
@@	r/   c                  N    e Zd ZU dZdZded<   ddZddZddZ	 	 	 	 ddZ	d	 Z
y
)WGSplatFragLayouta>  A fragmented array where all the values are equal represented as a register per thread.

  FragmentedArrays in this layout can be are always the result of a
  splat, each thread in the warpgroup has a single copy of the value,
  while the FragmentedArray pretends it has whatever shape the user
  wants. This means we can trivially broadcast, reshape and do
  elementwise operations with all other layouts.

  Examples:

  To load a value in
  ```
  FragmentedArray.splat(memref.load(ref_1d, [1]), (10,20,2))
  ```

  A shape is always provided for sanity check reasons.

  r!   r   rH   c           
         t        | j                        t        |      k  xr2 t        d t        | j                  ddd   |ddd         D              S )zCheck that the shape can be broadcast.

    All source dimensions must match the target's trailing dimensions by
    equality or being set to 1 (i.e. we can broadcast 1-sized dimensions or
    create new leading dimensions).
    c              3  :   K   | ]  \  }}||k(  xs |d k(    ywr   Nr!   )r#   dim1dim2s      r%   r&   z5WGSplatFragLayout.can_broadcast_to.<locals>.<genexpr>o  s,      1D$ 	!	!1r{   Nrw   )r(   rH   allrG   r+   rH   s     r%   can_broadcast_toz"WGSplatFragLayout.can_broadcast_toh  sR     tzz?c%j( S 1djj2.dd<1 . r/   c                    |S rD   r!   r   s     r%   r   z(WGSplatFragLayout.registers_element_typet  s    Hr/   c                     ~y)r   r!   r!   rO  s     r%   r   z!WGSplatFragLayout.registers_shapew  s    r/   c                    ~| j                   S rD   rE  rO  s     r%   r   z,WGSplatFragLayout.shape_from_registers_shape|       	::r/   c                0    || j                   k(  sJ t        rD   )rH   NotImplementedErrorrO  s     r%   r   zWGSplatFragLayout.thread_idxs  s    DJJ
r/   N)r   r6  r8  r   )r   r   r   r   rH   r   rP  r   r   r   r   r!   r/   r%   rH  rH  Q  s<    & %

"r/   rH  c                  j    e Zd ZU dZded<   ded<   d Zedd       ZddZdd	Z		 	 	 	 dd
Z
d Zd Zy)WGStridedFragLayoutz6Convert the array to 1D and then shard across threads.r   rH   r   vec_sizec                    t        j                  | j                        | j                  t        z  z  dk7  rt        | t        f      y Nr   )r   r   rH   rY  WARPGROUP_SIZEr)   r6   s    r%   r.   z!WGStridedFragLayout.__post_init__  s;    	wwtzzdmmn<=Bn-.. Cr/   c                   t         j                  j                  |      st        |      t        j                  |      }t	        j
                  |j                        x}dz  ry|dz  }d|z  dk(  rd|z  dk7  sJ |       t        j                  |j                        t        z  dk7  ryt        j                  |j                        t        z  } | t        |j                        t        d|z  |            S )zReturns a WGStridedFragLayout for the given shaped type.

    Return None if the shaped type cannot have a strided layout.
    rC  Nr   rH   rY  )r   
ShapedTyper   	TypeErrormgpubitwidthelement_typer   r   rH   r\  r   rY   min)cls	shaped_tyrb  bwmax_vec_sizes        r%   from_shaped_typez$WGStridedFragLayout.from_shaped_type  s     ==##I.i  i(IMM)"8"899Q>	QBr6Q;17a<++'yy!N2a7779??+~=LIOO$s17L/I r/   c                X    t         j                  j                  | j                  f|      S rD   )r   r   r   rY  r   s     r%   r   z*WGStridedFragLayout.registers_element_type  s     ==dmm-q11r/   c                    || j                   k7  rt        d| d|        t        j                  | j                         t        | j
                  z  z  fS )r   rq   z is not compatible with )rH   r)   r   r   r\  rY  rO  s     r%   r   z#WGStridedFragLayout.registers_shape  sK    

w&>tfEFFIIdjj!nt}}&DEGGr/   c                    ~| j                   S rD   rE  rO  s     r%   r   z.WGStridedFragLayout.shape_from_registers_shape  rT  r/   c              #    K   || j                   k(  sJ t        j                  j                         }| j	                         D ]y  }g }t        | j                         D ]I  }t        ||      }|j                  t        j                  ||             t        j                  ||      }K |j                          | { y wrD   )rH   r   r   r   linear_thread_idxsrR   r   rX   r   r   r   r#  )r+   rH   r   vresrj   s         r%   r   zWGStridedFragLayout.thread_idxs  s     DJJLLE$$& c$**%  #Um

5;;q#&'KK3  
kkmis   B=B?c              #    K   t         j                  j                         }t        j                  | j
                        }|t        | j                  z  z  dk(  sJ |t        | j                  z  z  }t        j                  t        j                  t        j                  j                        t        t        |            }t        j                  |t        | j                  |j                               }t#        |      D ]B  }t        j$                  |t        |t        z  | j                  z  |j                                D yw)zThe indexes to be used for vector load/store WGStridedFragLayout.

    Yields:
      The indices of the vector that correspond to the current thread.
    r   N)r   r   r   r   r   rH   r\  rY  r   r   r   	thread_id	Dimensionxr   mulityperanger   )r+   r   cardinalityreg_numtidxoffr[   s          r%   rn  z&WGStridedFragLayout.linear_thread_idxs  s      LLE''$**%K.4==89Q>>>nt}}<=G;;s}}S]]__5q7OPD
**T1T]]DII6
7C7^ NJJsAa.04==@$))LMMNs   EEN)rf  r9  r   zWGStridedFragLayout | Noner8  r   )r   r   r   r   r   r.   classmethodri  r   r   r   r   rn  r!   r/   r%   rX  rX    sT    >	-/  (2H"
Nr/   rX  )rC  rA  rC  rO   rw   r   r   r   ))rB     r}  rV   )rB  rC  r  rC  rC  rC  r~  ))rO   )r  r  r  r~  rV   ))r  r  )rB  r  r  r  rC  r  r   )r  r  r  r}  r  )r  rO   r  ))rB  r   )r  r   )rC  r   r}  )r  r  r  rA  rA  rA  r   )i)r  )   rC  r   rC  r  r~  )r  r  r  r  r  ))r  )r   r}  rV   )r   )r   c                >    t        t        d| fd| ff      ddd      S )a  A layout resembling the logical organization of TMEM.

  The 128 rows in a tile are assigned to 128 lanes in the warpgroup. Useful when
  the result needs to be processed in registers and then stored back into TMEM.
  Usually shouldn't be used if the result is to be written back to SMEM, as
  there is no good way to store it without bank conflicts, but it still
  sometimes pays off.
  r  r   r  rO   rw   r  )r   r   r   s    r%   tmem_native_layoutr  p  s1     
sM"R$789	
 r/   rA  )r  r  r  F)initr   slotsc                  b   e Zd ZU  ej                  d      Zded<   ded<   ded<   	 	 	 	 	 	 d[d	Zed
d
d	 	 	 	 	 	 	 d\d       Z	e	 d]d
d	 	 	 d^d       Z
e	 d]d
d	 	 	 	 	 	 	 	 	 	 	 d_d       Zed`d       Zedad       ZdbdZd
d	 	 	 dcdZd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd  Zd! Zd" Zd# Z d$ Z!d% Z"d& Z#d' Z$d( Z%d) Z&d* Z'd+ Z(d, Z)d- Z*d. Z+d/ Z,d0 Z-ddd1Z.ddd2Z/dd3ded4Z0dd3ded5Z1dd3ded6Z2dd3ded7Z3dd3ded8Z4dd3ded9Z5dd3ded:Z6dd3ded;Z7e	 	 	 	 dfd<       Z8d
d	 	 	 	 	 dgd=Z9ddd>Z:dhd?Z;ddd@Z<d
d	 	 	 	 	 didAZ=	 d]	 	 	 	 	 	 	 djdBZ>dddCZ?dkdDZ@dddEZA	 	 	 	 dldFZBdG ZCed
d	 	 	 	 	 	 	 	 	 dmdH       ZDdd
dI	 dndJZEdodKZFdLdMdN	 	 	 	 	 	 	 dpdOZGe	 	 dq	 	 	 	 	 	 	 	 	 drdP       ZHedLd
dMdQ	 	 	 	 	 	 	 	 	 	 	 dsdR       ZIdtdSZJdudvdTZKed
eLdMeMj                  dMdU	 	 	 	 	 	 	 	 	 	 	 	 	 dwdV       ZOedxdW       ZPe	 du	 	 	 	 	 	 	 	 	 dydX       ZQdY ZRedZ        ZSy
)zFragmentedArrayF)repr
np.ndarray	registersFragmentedLayoutlayoutbool | None	is_signedc                  t         j                  | d|       t         j                  | d|       t         j                  | d|       |dut        j                  j	                  | j
                        k7  rt        d|d| j
                         | j                  xt        dx\  } t        j                  |j                  d   j                        j                  \  }t        j                  |      t        j                  |j                        t        z  |z  k7  r&t!        d	|j                   d
t         d| d| d	      y xt"        d x+\    |j$                  dk7  rt!        d|j                         y t&        d x)\   	 | j                  j)                  |j                         y 	 t*        # t         $ r t!        d      dw xY w)zInitializes a fragmented array.

    This is a low-level API. Prefer using classmethods to construct fragmented
    arrays instead.
    r  r  r  Nz[is_signed must be non-None if and only if the MLIR type is an integer type, got _is_signed= for r!   r   z(Invalid register array shape: math.prod(z) * z * z, want: math.prod(r2   r   zInvalid register array shape: z4Register array shape does not match the tiled layout)object__setattr__r   r   r   
mlir_dtyper`  r  rX  r   flatrv  rH   r   r   r\  r)   rH  r  r   r   rV  )r+   
_registers_layout
_is_signedrH   reg_sizes         r%   __init__zFragmentedArray.__init__  s    t[*5
tXw/
t[*5$2>>#<#<T__#MM, *}E$//1BD 
 ++%%]]:??1#5#:#:;AA
IIeyy))*^;hFG 89I9I8J K !XJ.@qJ G & ??a;J<L<L;MNO
O    =	
++
0
01A1A
B  !!  	D	s   	%F7 7GN)r  rY  c          	        t         j                  j                  |j                        st	        |j                        t        j                  |j                        }t        |j                        }|Bt        j                  |      }|8t        | dt         dt        j                  |       d      t        ||      }t        j                  |j                  |      t               }t         j"                  j%                  |j&                  f|j(                        }| j+                  ||j&                        D ]%  \  }	}
}} |
|t-        j.                  |||             '  | |||      S )Nz6 must have a number of elements that is a multiple of z (got r2   r^  dtyper  r  r  )r   
MemRefTyper   rv  r`  rY   rH   rX  ri  r)   r\  r   r   r   emptyr   r  r   r   rY  rc  transfer_stridedr   load)re  refr  rY  ref_tyrH   r  r  vec_ty_getupdater   s               r%   load_stridedzFragmentedArray.load_strided  s3    ==##CHH-chh]]388$F&,,E"33F;f	h vdii&6%7q:
 	

 #Bf//6fEI]]163F3FGF"%"6"6sFOO"L 7fc3YFC567)V	JJr/   r  c               ^   |xs t        |      }|xt         d x\    n\ xt        d x\   n xt        d x\   n  n1 t        j                  |j                  |j                        |      }n	 t        |       | t        j                  |j                  |      |t              ||      S )Nr!   r  r  )rH  rX  r   r   	broadcastr   rv  rV  r   fullr   r  )re  valuerH   r  r  s        r%   splatzFragmentedArray.splat  s     /(/F
   =;=0  ))%**5u
 !&))77611%8%vN r/   c                    t        |      k\  rt        d      d fdt        j                  j	                  t        j                         |||      j                  fdd|      S )z?Creates a broadcasted iota array along the specified dimension.z7`dimension` must be smaller than the rank of the array.c                    t         j                  j                        rIt         j                  j	                  d      }t        j                  t        j                  ||             S t        j                  |       S r  )r   	FloatTyper   r   r   r   uitofp
index_cast)r   r   r  s     r%   r   z.FragmentedArray.broadcasted_iota.<locals>.cast  sZ    		 	 	'nn))"-||E5#3#3C#=>>eS))r/   r  c                     |         S rD   r!   )_r   r   	dimensions     r%   <lambda>z2FragmentedArray.broadcasted_iota.<locals>.<lambda>  s    tC	N+ r/   Tcreate_arrayr  )r   r:  r   r:  )r(   r)   ra  r  r  r   
mlir_undefforeach)r  rH   r  r  r  r   s   ` `  @r%   broadcasted_iotaz FragmentedArray.broadcasted_iota  su     CJ
C * %%	 & 
 g+  	r/   c                    | j                   xt        dx\  } |S  xt        d x\  } |S  t        d x1\   | j                   j	                  | j
                  j                        S  	 t        )Nr!   rE  )r  rX  rH  r   r   r  rH   rV  rO  s     r%   rH   zFragmentedArray.shape  sa    
++%% &)) *={{55dnn6J6JKK !!r/   c                   | j                   j                  d   j                  }| j                  xxt        d x\   n xt
        d x\   n  n!  t        j                  |      j                  S t        d x\   |S  	 t        )Nr   r!   )r  r  rv  r  rX  r   r   r   rc  rH  rV  )r+   reg_tys     r%   r  zFragmentedArray.mlir_dtype(  sf    ^^  #((F
++0  =;=0}}V$111 !!r/   c                p  ' t         j                  j                  d      ''fd}| j                  |k(  r| S | j                  }t        j                  | j                        }t        t        ft        t        ff}|dv r| j                  |f|v s|| j                  f|v rt        j                  t        j                  j                  t        j                   t        j"                  t        j$                          |d             |d             |d            }t        j&                  | |d       |d            }g }| j(                  j*                  D ]  }	|	j,                  }
|d	k(  rEt        j.                  |	'      }	t        j0                  |	d      }t        j2                  |	||      }n>|dk(  r*t         j4                  j7                  d
'      }t9        d      D cg c]9  }t        j.                  t        j:                  |	t=        ||dz               '      ; }}t        j&                  ||d   |d         }t        j0                  |d      }t        j&                  ||d   |      }t        j&                  |||d         }t        j>                  t        j.                  ||      t        j.                  ||      g      }t        j.                  ||
      }ntA        d|       |jC                  t        j.                  ||
              tE        jF                  |tH              jK                  |jM                  |            }tO        ||| jP                        S tS        | j                  tT              r!tS        |tT              r| j                  tW        | j                  jX                        k(  r|tW        |jX                        k(  rtE        jZ                  |jM                  |      tH              }| j                  jX                  |jX                  kD  r| j                  jX                  |jX                  z  }|jX                  }tE        j\                  | j(                        D ]W  \  }}	t9        |      D ]D  }t        j:                  |	t=        ||z  |dz   |z              }|||d   |d   |z  |z   g|dd <   F Y n| j                  jX                  |jX                  k  r|jX                  | j                  jX                  z  }tE        j^                  |j                        D ]T  }t        j>                  t9        |      D cg c]%  }| j(                  |d   |d   |z  |z   g|dd    ' c}      }|||<   V tO        ||| jP                        S | j                  t`        k(  r|t        k(  r|jM                  |      }|d   dk(  sJ | j(                  j                  g |dd ddk(  sJ tE        jZ                  |tH              }tE        j^                  |dd       D ]J  }t        j>                  t9        d      D cg c]  }| j(                  g ||d    c}      |g |d<   L tO        ||| jP                        S | j                  t        k(  r|t`        k(  r|jM                  |      }| j(                  j                  d   dk(  sJ |g | j(                  j                  dd ddk(  sJ tE        jZ                  |tH              }tE        j\                  | j(                        D ]E  \  }}	t9        d      D ]2  }t        j:                  |	t=        ||dz               |g |dd |d<   4 G tO        ||| jP                        S | j                  tb        k(  r|t        k(  rwt        j                  | j                        x}d	k  rR|d   d	z  dk(  sJ tE        jZ                  |jM                  |      tH              }t        j                  t        j                  j                  t        j                   t        j$                          |d             |d            }| j(                  }|dk(  r|j                  d   dz  rte        d      tE        jf                  |dd      }ti        |j*                  ddd   |j*                  ddd         D cg c]  \  }}t        j>                  ||f       } }} tE        jF                  | tH              jJ                  g |j                  dd |j                  d   dz   }tE        jf                  |dd      }tE        j\                  |      D ]|  \  }}	|d	k(  r|	j,                  j                  dgk(  sJ t        j:                  |	t=        dd            }!t        j:                  |	t=        dd            }"t        j&                  ||"|!      }#t        j0                  |#d      }$t        j&                  ||!|$      }!t        j&                  ||$|"      }"|!||d   |d   dz  g|dd <   |"||d   |d   dz  dz   g|dd <   |dk(  r|	j,                  j                  dgk(  sJ t        j0                  |	d      }$t        j&                  | |d       |d            }t        j2                  |	|$|      }%t9        d      D ]D  }t        j:                  |%t=        |dz  |dz  dz               }	|	||d   |d   dz  |z   g|dd <   F |dk(  sJ |	j,                  j                  dgk(  sJ t        j0                  |	d      }$t        j&                  | |d       |d            }t        j2                  |	|$|      }%t9        d      D ]D  }t        j:                  |%t=        |dz  |dz  dz               }	|	||d   |d   dz  |z   g|dd <   F  tk        d |D              sJ tO        ||| jP                        S | j                  tl        k(  r|tb        k(  rt        j                  | j                        dk(  r|d   dz  dk(  sJ |d   dz  dk(  sJ tE        jZ                  |jM                  |      tH              }t         j                  j                  d      ''fd}t        j                  t        j                  jn                  t        j                   t        j$                          |d             |d            }&tE        j\                  | j(                        D ]  \  }}	t        j4                  |	j,                        j                  dgk(  sJ t        j0                  |	d      }$t        j&                  |& |d       |d            }t        j2                  |	|$|      }%t9        d      D ]D  }t        j:                  |%t=        |dz  |dz  dz               }	|	||d   |d   dz  |z   g|dd <   F  tk        d |D              sJ tO        ||| jP                        S | j                  tl        k(  r-|t        k(  r$| jq                  tb              jq                  |      S tS        | j                  tr              ste        d| j                   d|       t-        |       ju                  | j(                  jw                         | j                  || jP                        S c c}w c c}w c c}w c c}}w )z2Converts the fragmented array to the given layout.r   c                0    t        j                  |       S rD   r   r  rt  r   s    r%   r  z+FragmentedArray.to_layout.<locals>.<lambda>6  s    %..a( r/   >   r  r   r   rA  r   iT  iv2  r  rV   r   zUnsupported bitwidth: r  r  Nrw   zkThis relayout implementation requires an even number of column tiles (to pack pairs of them for efficiency)rC  i@b  i7  c              3  $   K   | ]  }|d u 
 y wrD   r!   r#   rs     r%   r&   z,FragmentedArray.to_layout.<locals>.<genexpr>       61$6   rB  c                0    t        j                  |       S rD   r  r  s    r%   r  z+FragmentedArray.to_layout.<locals>.<lambda>  s    ENN3* r/   c              3  $   K   | ]  }|d u 
 y wrD   r!   r  s     r%   r&   z,FragmentedArray.to_layout.<locals>.<genexpr>  r  r  zCannot convert from  to r  )<r   r   r   r  rH   r   rb  r  rD  WGMMA_TRANSPOSED_LAYOUTTCGEN05_LAYOUTTCGEN05_TRANSPOSED_LAYOUTr   cmpiCmpIPredicateeqr   r   r  selectr  r  rv  bitcast	shfl_bflyprmtr   r   rw  vector_sliceslicevector_concatr)   rX   r   asarrayr  reshaper   r  r  r   r   r  r   r  ndenumerater   WGMMA_LAYOUT_ACC_32BITWGMMA_LAYOUT_UPCAST_2XrV  moveaxisrG   rN  WGMMA_LAYOUT_UPCAST_4Xult	to_layoutrH  r  item)(r+   
new_layoutr   rH   rb  transpose_pairsis_even_rowpermtmp_new_regsregr  reg_shflnew_regi32_vecr[   regsreg_to_shflnew_reg_lownew_reg_highnew_reg_i32new_regsnew_registersratio
new_lengthr   new_regs_shapedtype_bitwidthis_evenr  col_minor_registerslhflat_registerslowhighto_exchange	exchangedblendis_01r   s(                                          @r%   r  zFragmentedArray.to_layout3  s   
..
%
%b
)C(A{{j kJJE~~doo.H	./	23O 8	j!_4$7JJ



 
 
++ekk%"2"2"4ad;QqT
B
A$k
 \\+qy!F)<dl$$ <#r>c3'#__S!,(JJsHd3'^MM%%dC0' Qx mmE..sE!QUODcJ$  [$q'47C+__[!4([$q'8D+k8T!WE,++mmK1mmL'2- + MM+v6'3H:>?
?EMM'6:;/<0 
f
**512  zdnn  	4;;,z;/KK-dkk.G.GHH,Z-E-EFFhhz99%@Om		"	"Z%=%=	=))Z-E-EE--
t~~6 	LHC< La((U1z>AEZ+?@G ELM3q63q6E>A#5@AB@A	L	L ;;$$z'?'??((DKK,E,EE::m112 	'C''U|) nnSVSVe^a%7A#ab'AB) '  '-
	' "J4>>  {{,,|1K!11%8nB1$$$^^!!%A~cr':%AA%Aq%AAAA.7hN3B/0 ##1105a3
+,DNN:C::A:&3
 3 zdnn  {{l"z5K'K!11%8n^^!!"%***A!5!5cr!:AAAqAAAA.7hnnT^^4 Q(#sq 	QA(-(:(:3aQ(P($S"X$q$!$
%	QQ zdnn  	--,&$~~doo>>^2E1X]ahhz99%@Om





 
 %++e.>.>.@!A$"G1g ..i	1	??1!#>  !kk)Q; #((1-/B/G/G1/M
1 A'
 
 EBJJ~V<DD 
 &&s+
-@-F-Fr-Ja-O
	 KK	2q1	nnY/ 4F(#sRA3&
&& ""3a4###Cq!5$WdC8+ook15)Wc95#gy$7$<?-QQ!8c!Bi8
9@D-QQ!a<#a)<
=q A3&
&& ooc1-) gqy!F)<$**S)T2%8 Fa$$UE!a%Q,CDCBEM3q63q6A:>>C"I>?F  1$
$$A3&
&&ooc1-) gqy!F)<$**S)T2%8 Fa$$UE!a%Q,CDCBEM3q63q6A:>>C"I>?Fe4Fj 66666"J4>>  	--00NN4??+q01X]a1X]ahhz99%@OmNN''+c
*ajj



!
!5;;u/?/?/A1Q4#H!A$e nnT^^4 D(#s}}SXX&,,333 OOC+	 ||E1V9ai8

3	40q 	DA""5%Aq1uqy*AB#@C-QQ!a<#a)<
=	DD$ 66666"J4>>  {{,,|1K^^23==jIIdkk#45 T*
>  :tzz:   UR)3
N
s   #>|#*|(
|-
0!|2output_is_signedc               R  
 t        | j                  t              r| j                  }t	        |      D ]x  \  
}t        |t
              st        |j                  t              s( |j                  
fd| g|d 
 |
dz   d  d|ic S t        j                  ||j                        }z | j                  |k7  r% | j                  |      j                  g|d|iS g }|D ]  }t        |t
              st        |t        t        f      r!t        j                  || j                        }n%t        |t        j                         st#        |      t
        j%                  || j                  | j                  | j&                        }t        |j                  t              r|j                  j)                  | j                        s%t+        d| j                   d|j                         t
        j%                  |j,                  j.                  d   | j                  | j                  |j&                        }n\| j                  |j                  k7  rt+        d      | j,                  j                  |j,                  j                  k7  rt+        d	      |j1                  |        t        j2                  | j,                        }t        j4                  | j,                        D ]  \  } |gfd
|D         |<    |j.                  d   j6                  }	t        j8                  j                  |	      rt        j8                  |	      j:                  }	|+t        j<                  j                  |	      r| j&                  }t        || j                  |      S )Nc                *     |g|d  | |d   S rD   r!   )r   thisargsr[   ops      r%   r  z,FragmentedArray._pointwise.<locals>.<lambda>,  s'    R%FtBQx%F%FT!"X%F r/   r   r  )rH   r  r  zCannot broadcast shape z to layout r   z$Incompatible FragmentedArray layoutsz#Incompatible FragmentedArray shapesc              3  <   K   | ]  }|j                        y wrD   )r  )r#   r   r   s     r%   r&   z-FragmentedArray._pointwise.<locals>.<genexpr>Z  s     EQC 0E   r  )r   r  rH  rH   rW   r  
_pointwiser   broadcast_shapesr  floatr   r   r   r  r   ValuerV  r  r  rP  r)   r  r  rX   
empty_liker  rv  r   rc  r   )r+   r  r  otheroutput_shaper   
other_arrsr  r  r  r[   r   s    `        @@r%   r  zFragmentedArray._pointwise   s   
 $++01ZZlE" D$!Q!_-
AHH&78F Ray QUW~	
  0  ,,\177C,D 
|	#6t~~l+66

)9
 	
 J ?+a%&gga)!Arxx(#A&
&!!TZZt~~ " 
 
AHH/	0xx((4'

|;qxxjIK K!!KKQ**;;kk	 " 
 ;;!(("AB
B>>1;;#4#44@A
A78 }}T^^,HNN4>>2 GSFE*EFhsmG]]1""F	}}'}}V$11fBNN$=$=f$ET[[=M r/   c                    | S rD   r!   r6   s    r%   __pos__zFragmentedArray.__pos__d  s    Kr/   c                    t         j                  j                  | j                        r| j	                  t
        j                        S t         j                  j                  | j                        rd| z
  S t        S r[  )	r   r  r   r  r  r   negfr   NotImplementedr6   s    r%   __neg__zFragmentedArray.__neg__g  sP    	||t/__UZZ((		"	"4??	3Xor/   c                   t         j                  j                  | j                        r| j	                  t
        |      S t         j                  j                  | j                        r | j	                  t        j                  |      S t        S rD   )
r   r  r   r  r  addfr   r   r   r+  r+   r$  s     r%   __add__zFragmentedArray.__add__o  Z    	||t/__T5))		"	"4??	3__UZZ//r/   c                    | |z   S rD   r!   r/  s     r%   __radd__zFragmentedArray.__radd__w      %<r/   c                   t         j                  j                  | j                        r| j	                  t
        |      S t         j                  j                  | j                        r | j	                  t        j                  |      S t        S rD   )
r   r  r   r  r  mulfr   r   ru  r+  r/  s     r%   __mul__zFragmentedArray.__mul__z  r1  r/   c                    | |z  S rD   r!   r/  s     r%   __rmul__zFragmentedArray.__rmul__  r4  r/   c                   t         j                  j                  | j                        r| j	                  t
        |      S t         j                  j                  | j                        r | j	                  t        j                  |      S t        S rD   )
r   r  r   r  r  subfr   r   subir+  r/  s     r%   __sub__zFragmentedArray.__sub__  r1  r/   c                    t         j                  j                  | j                        r| j	                  d |      S t         j
                  j                  | j                        r| j	                  d |      S t        S )Nc                    t        ||       S rD   )r;  rA   r   s     r%   r  z*FragmentedArray.__rsub__.<locals>.<lambda>  s    $q!* r/   c                .    t        j                  ||       S rD   )r   r<  r@  s     r%   r  z*FragmentedArray.__rsub__.<locals>.<lambda>  s    %**Q*: r/   )r   r  r   r  r  r   r+  r/  s     r%   __rsub__zFragmentedArray.__rsub__  sX    	||t/__4e<<		"	"4??	3__:EBBr/   c                    t         j                  j                  | j                        st        S | j                  t        j                  |      S rD   )r   r  r   r  r+  r  r   divfr/  s     r%   __truediv__zFragmentedArray.__truediv__  s3    <<""4??3??5::u--r/   c                    t         j                  j                  | j                        st        S | j                  d |      S )Nc                .    t        j                  ||       S rD   )r   rD  r@  s     r%   r  z.FragmentedArray.__rtruediv__.<locals>.<lambda>  s    

1a(8 r/   )r   r  r   r  r+  r  r/  s     r%   __rtruediv__zFragmentedArray.__rtruediv__  s0    <<""4??3??8%@@r/   c                p   t         j                  j                  | j                        r| j	                  d |      S t         j
                  j                  | j                        rL| j                  r | j	                  t        j                  |      S | j	                  t        j                  |      S t        S )Nc                T    t        j                  t        j                  | |            S rD   	mlir_mathfloorr   rD  r@  s     r%   r  z.FragmentedArray.__floordiv__.<locals>.<lambda>      yuzz!Q'78 r/   )r   r  r   r  r  r   r  r   
floordivsir   r+  r/  s     r%   __floordiv__zFragmentedArray.__floordiv__  s    	||t/__
8%  
	"	"4??	3	u//77u{{E22r/   c                <   t         j                  j                  | j                        r| j	                  d |      S t         j
                  j                  | j                        r2| j                  r| j	                  d |      S | j	                  d |      S t        S )Nc                T    t        j                  t        j                  ||             S rD   rK  r@  s     r%   r  z/FragmentedArray.__rfloordiv__.<locals>.<lambda>  rN  r/   c                .    t        j                  ||       S rD   )r   rO  r@  s     r%   r  z/FragmentedArray.__rfloordiv__.<locals>.<lambda>  s    E,<,<Q,B r/   c                .    t        j                  ||       S rD   )r   r   r@  s     r%   r  z/FragmentedArray.__rfloordiv__.<locals>.<lambda>  s    EKK1,= r/   )r   r  r   r  r  r   r  r+  r/  s     r%   __rfloordiv__zFragmentedArray.__rfloordiv__  sw    	||t/__
8%  
	"	"4??	3	BEJJ=uEEr/   c                    t         j                  j                  | j                        st        S | j
                  r | j                  t        j                  |      S | j                  t        j                  |      S rD   )
r   r   r   r  r+  r  r  r   remsir   r/  s     r%   __mod__zFragmentedArray.__mod__  sO    >>$$T__5~~__U[[%00__U[[%00r/   c                    t         j                  j                  | j                        st        S | j
                  r| j                  d |      S | j                  d |      S )Nc                .    t        j                  ||       S rD   )r   rW  r@  s     r%   r  z*FragmentedArray.__rmod__.<locals>.<lambda>      %++a*; r/   c                .    t        j                  ||       S rD   )r   r   r@  s     r%   r  z*FragmentedArray.__rmod__.<locals>.<lambda>  r[  r/   )r   r   r   r  r+  r  r  r/  s     r%   __rmod__zFragmentedArray.__rmod__  sI    >>$$T__5~~__;UCC__;UCCr/   c                j    t         j                  j                  | j                        st        S | dz  S )Nrw   )r   r   r   r  r+  r6   s    r%   
__invert__zFragmentedArray.__invert__  s(    >>$$T__5"9r/   c                    t         j                  j                  | j                        st        S | j                  t        j                  |      S rD   )r   r   r   r  r+  r  r   orir/  s     r%   __or__zFragmentedArray.__or__  s3    >>$$T__5??599e,,r/   c                    | |z  S rD   r!   r/  s     r%   __ror__zFragmentedArray.__ror__  r4  r/   c                    t         j                  j                  | j                        st        S | j                  t        j                  |      S rD   )r   r   r   r  r+  r  r   andir/  s     r%   __and__zFragmentedArray.__and__  3    >>$$T__5??5::u--r/   c                    | |z  S rD   r!   r/  s     r%   __rand__zFragmentedArray.__rand__  r4  r/   c                    t         j                  j                  | j                        st        S | j                  t        j                  |      S rD   )r   r   r   r  r+  r  r   xorir/  s     r%   __xor__zFragmentedArray.__xor__  rh  r/   c                    | |z  S rD   r!   r/  s     r%   __rxor__zFragmentedArray.__rxor__  r4  r/   c                    | j                  |t        j                  j                  t        j                  j
                  t        j                  j
                        S N)f_predsi_predui_pred)_comparer   CmpFPredicateOEQr  r  r/  s     r%   __eq__zFragmentedArray.__eq__  H    ==""&&##&&##&&	   r/   c                    | j                  |t        j                  j                  t        j                  j
                  t        j                  j
                        S rq  )ru  r   rv  UNEr  ner/  s     r%   __ne__zFragmentedArray.__ne__  ry  r/   c                    | j                  |t        j                  j                  t        j                  j
                  t        j                  j                        S rq  )ru  r   rv  OLTr  sltr  r/  s     r%   __lt__zFragmentedArray.__lt__  H    ==""&&##''##''	   r/   c                    | j                  |t        j                  j                  t        j                  j
                  t        j                  j                        S rq  )ru  r   rv  OLEr  sleuler/  s     r%   __le__zFragmentedArray.__le__  r  r/   c                    | j                  |t        j                  j                  t        j                  j
                  t        j                  j                        S rq  )ru  r   rv  OGTr  sgtugtr/  s     r%   __gt__zFragmentedArray.__gt__  r  r/   c                    | j                  |t        j                  j                  t        j                  j
                  t        j                  j                        S rq  )ru  r   rv  OGEr  sgeuger/  s     r%   __ge__zFragmentedArray.__ge__  r  r/   c                  t         j                  j                  | j                        r%t	        j
                  t        j                  |      }nt         j                  j                  | j                        rV| j                  r%t	        j
                  t        j                  |      }n+t	        j
                  t        j                  |      }nt        S | j                  ||d      S )NFr  )r   r  r   r  r=  partialr   cmpfr   r  r  r+  r  )r+   r$  rr  rs  rt  preds         r%   ru  zFragmentedArray._compare  s    	||t/uzz62d		"	"4??	3	  W5  W5??4???r/   c                   t         j                  j                  | j                        r\t        j
                  }t         j                  j                  | j                        r| j                  d      }| j                  ||      S t         j                  j                  | j                        rA| j                  | j                  rt        j                  |      S t        j                  |      S t        )Nmax.NaN.f32)r   r  r   r  r   maximumfF32Type_lift_fast_instrr  r   r  maxsimaxuirV  )r+   r$  r  s      r%   maxzFragmentedArray.max"  s    	||t/h			t	/((7__Xu--		"	"4??	3__%++% -2[[%   r/   c                t   t         j                  j                  | j                        r | j	                  t
        j                  |      S t         j                  j                  | j                        rA| j	                  | j                  rt
        j                  |      S t
        j                  |      S t        rD   )r   r  r   r  r  r   minimumfr   r  minsiminuirV  r/  s     r%   rd  zFragmentedArray.min/  s~    	||t/__U^^U33		"	"4??	3__%++% -2[[%   r/   approxc               d   t         j                  j                  | j                        st        |ra| j                  }t        j                  |t         j                  j                  |d            }t        t        | |z        j                         S | j                  t        j                        S )Ng+eG?)r   r  r   r  rV  r   r  	FloatAttrr   r   r  exp2r  rL  exp)r+   r  r  log2es       r%   r  zFragmentedArray.exp9  sy    <<""4??3ooennUBLL$4$4U<N$OPe/4%<05577??9==))r/   c               ^   t         j                  j                  | j                        st        |r^t         j
                  j                  | j                        st	        | j                        | j                  | j                  d            S | j                  t        j                        S )Nzex2.approx.ftz.f32)
r   r  r   r  rV  r  r  r  rL  r  r+   r  s     r%   r  zFragmentedArray.exp2B  st    <<""4??3ZZ""4??3!$//22__T223GHII??9>>**r/   c               L   t         j                  j                  | j                        st        |rU| j                  }t        j                  |t         j                  j                  |d            }| j                  d      |z  S | j                  t        j                        S )Ng9B.?Tr  )r   r  r   r  rV  r   r  r  r   log2r  rL  log)r+   r  r  ln2s       r%   r  zFragmentedArray.logK  su    <<""4??3ooeNN5",,"2"25:L"MNcYYdY#c))??9==))r/   c               |   t         j                  j                  | j                        st	        | j                        |r^t         j
                  j                  | j                        st	        | j                        | j                  | j                  d            S | j                  t        j                        S )Nzlg2.approx.ftz.f32)
r   r  r   r  rV  r  r  r  rL  r  r  s     r%   r  zFragmentedArray.log2T  s}    <<""4??300ZZ""4??3!$//22__T223GHII??9>>**r/   c               2   t         j                  j                  | j                        st        |r1| j                  t         j
                  j                         k7  rt        | j                  |r| j                  d            S t        j                        S )Nzsin.approx.f32)r   r  r   r  rV  r  r   r  r  rL  sinr  s     r%   r  zFragmentedArray.sin]  p    <<""4??3$//RZZ^^%55??39./ ?H}} r/   c               2   t         j                  j                  | j                        st        |r1| j                  t         j
                  j                         k7  rt        | j                  |r| j                  d            S t        j                        S )Nzcos.approx.f32)r   r  r   r  rV  r  r   r  r  rL  cosr  s     r%   r  zFragmentedArray.cosf  r  r/   c               2   t         j                  j                  | j                        st        |r1| j                  t         j
                  j                         k7  rt        | j                  |r| j                  d            S t        j                        S )Nztanh.approx.f32)r   r  r   r  rV  r  r   r  r  rL  tanhr  s     r%   r  zFragmentedArray.tanho  sp    <<""4??3$//RZZ^^%55??4:/0 @I r/   c               2   t         j                  j                  | j                        st        |r1| j                  t         j
                  j                         k7  rt        | j                  |r| j                  d            S t        j                        S )Nzrsqrt.approx.f32)r   r  r   r  rV  r  r   r  r  rL  rsqrtr  s     r%   r  zFragmentedArray.rsqrtx  sp    <<""4??3$//RZZ^^%55??5;01 AJ r/   c                      fdS )Nc                 `  	 t         j                  j                         }| d   j                  	t	        	fd| D              sJ 	|k(  rot        t              rZdj                  d t        t        |       dz         D              }t        j                  ||  d| ddd	t        |       z  z         S  |  S t         j                  j                  	      rt        j                  	      }t        j                  	      j                  \  }t        |      D ]  }| D cg c]8  }t        j                   |g t         j"                  j                  |g      
      : }} 
| }t        j$                  ||g t         j"                  j                  |g      
      } |S t'        	      c c}w )Nr   c              3  <   K   | ]  }|j                   k(    y wrD   )rv  )r#   r'  arg_tys     r%   r&   zGFragmentedArray._lift_fast_instr.<locals>.fast_instr.<locals>.<genexpr>  s     0a6!0r  , c              3  &   K   | ]	  }d |   yw)$Nr!   )r#   r[   s     r%   r&   zGFragmentedArray._lift_fast_instr.<locals>.fast_instr.<locals>.<genexpr>  s     E11#wEr'   r    ;z=fz,fdynamic_positionstatic_position)r   r  r   rv  rN  r   r5   r3   rw  r(   r   
inline_asmr   r  rH   r   extractDenseI64ArrayAttrinsertrV  )r  f32args_ptxresultvec_lenr[   r'  vsvrr  
fast_instrinstrs            @r%   r  z4FragmentedArray._lift_fast_instr.<locals>.fast_instr  s   JJNNcAw||f040000	3eS!YYEc$i!m0DEE(4E7!H:Q/s4y8H1H  
==##F+(MM&)//	w 	A   nn#%"$"6"6":":A3"?"  2"==! 2266s;	&	  !&))#s   !=F+r!   )r  r  s   `@r%   r  z FragmentedArray._lift_fast_instr  s    !*D r/   c                  |d ut         j                  j                  |      k7  rt        d|d|       || j                  k(  r| S | j
                  j                  d   j                  }t         j                  j                  |      r@t        j                  |      j                  }t         j                  j                  ||      n|| j                  fd|      S )Nzhoutput_is_signed must be non-None if and only if the MLIR type is an integer type, got output_is_signed=r  r   c                0    t        j                  |       S rD   )r   r  )rt  tys    r%   r  z)FragmentedArray.bitcast.<locals>.<lambda>  s    %--A& r/   r  )r   r   r   r`  r  r  r  rv  r   rH   r   r  )r+   eltr  reg_type	reg_shaper  s        @r%   r  zFragmentedArray.bitcast  s     	$)B)B3)GG2 02%u> 
 dook~~""1%**H	}})--)//i==Y,bb??&9I   r/   c           	        t        | j                  t              st        d      t	        j
                  || j                        \  }}}t        d |D              rt        d      t        |      rt        d      | j                  j                  }t        | j                        t        |      z
  x}rd|z  |z   }t        d t        ||d      D              rt        d	| d
| d      t        d t        ||d      D              rt        d| d| d      t        d t        |||d      D              }| j                  |   }t        || j                  | j                        S )N,Only arrays with tiled layouts can be slicedc              3  P   K   | ]  }t        |t        j                           y wrD   r   r   r"  r#   r   s     r%   r&   z.FragmentedArray.__getitem__.<locals>.<genexpr>       
9:c288$
9   $&(Only slicing with static indices allowed7Integer indexing not implemented (only slicing allowed)rV   c              3  ,   K   | ]  \  }}||z    y wrD   r!   )r#   brB   s      r%   r&   z.FragmentedArray.__getitem__.<locals>.<genexpr>  s     
ITQ1q5
IrF   Tru   zdBase indices of array slices must be aligned to the beginning of a tile. The array uses a tiling of z, but your base indices are z*. Consider using a different array layout.c              3  ,   K   | ]  \  }}||z    y wrD   r!   )r#   r  rB   s      r%   r&   z.FragmentedArray.__getitem__.<locals>.<genexpr>  s     
LTQ1q5
LrF   zQThe slice shape must be a multiple of the tile shape. The array uses a tiling of z, but your slice shape is c              3  N   K   | ]  \  }}}t        ||z  ||z   |z          y wrD   r  r#   r  r  rB   s       r%   r&   z.FragmentedArray.__getitem__.<locals>.<genexpr>  1      Aq! 	a1fq1ul#   #%r  )r   r  r   rV  r   parse_indicesrH   r*   r)   r   r(   rG   rY   r  r  r  )	r+   r   base_idxslice_shapeis_squeezedr   untiled_rankregister_slicesr  s	            r%   __getitem__zFragmentedArray.__getitem__  sy   dkk;/ NOO).)<)<S$**)M&Hk;

9
99ABB
; YZZkk11O4::_)===|=|+o=o

IS?4H
II//>.? @"#MO 
 
LSodK
LL./ 0]DF 
  8[/$O O ~~o.HT[[T^^ r/   c           	     T   t        |t              st        d|       t        | j                  t              st        d      t        j                  || j                        \  }}}t        d |D              rt        d      t        |      rt        d      |j                  t        |      k7  r$t        dt        |       d|j                         |j                  | j                  k7  r%t        d|j                   d	| j                         |j                  | j                  k7  r%t        d
|j                   d| j                         | j                  j                  }t        |      t        | j                        k7  rt        d      t        d t        |||d      D              rt        d      t        d t        |||d      D              }| j                  |   j                  |j                  j                  k(  sJ |j                  | j                  |<   y )Nz!Expected a FragmentedArray, got: r  c              3  P   K   | ]  }t        |t        j                           y wrD   r  r  s     r%   r&   z.FragmentedArray.__setitem__.<locals>.<genexpr>  r  r  r  r  zSlice has shape z, but assigned array has shape zArray has dtype z, but assigned array has dtype zArray has layout z , but assigned array has layout z$Tiling has different rank than arrayc              3  <   K   | ]  \  }}}||z  xs ||z    y wrD   r!   r  s       r%   r&   z.FragmentedArray.__setitem__.<locals>.<genexpr>  s,      Aq! 	
AQs   Tru   z#Only tile aligned slicing supportedc              3  N   K   | ]  \  }}}t        ||z  ||z   |z          y wrD   r  r  s       r%   r&   z.FragmentedArray.__setitem__.<locals>.<genexpr>   r  r  )r   r  r)   r  r   rV  r   r  rH   r*   rY   r  r   r(   rG   r  )r+   r   r  r  r  r  r   r  s           r%   __setitem__zFragmentedArray.__setitem__  s   e_-:5'BCCdkk;/ NOO).)<)<S$**)M&Hk;

9
99ABB
; YZZ{{eK((U;/0 1kk]  4??*U--. /oo   ||t{{"ell^ ,kk]  kk11O
?s4::. FGG
 8[/$O    EFF 8[/$O O >>/*00EOO4I4IIII&+ooDNN?#r/   c                    t        t        j                  | j                        | j                  | j
                        S )Nr  )r  r   copyr  r  r  r6   s    r%   r  zFragmentedArray.copy  s-    774>>*>> r/   c               !   >?@ABCDEFGHI t         j                  j                  d      }t         j                  j                  d      @t         j                  j                  d      }t         j                  j                  d      ?t         j                  j	                         >t         j
                  j	                         }t         j                  j	                         } j                  }||k(  r3 j                  |k(  r S t         j                   j                  |      S  j                  j                  d   }|j                  }	t         j                  j                  |	      }
|
r(t!        t        j                  |	      j"                        nd}|\  It%        j&                  |      Iz  x}dz  rt)        d|       t        t+        |j,                  d	d       x}t.        j0                        xrJ t%        j&                  |j2                  j                        dk(  xr |j4                  d   j6                  d
k(  E	 	 	 	 	 	 d$E Ifd}||k(  r||k(  rd%?fd}t9        j:                   j                        }Idk(  rdnd} ||d      D ]  \  }}t        j                  |j                        j"                  \  }|Iz  dk(  sJ t         j                  j                  |dz        }t%        j<                  ||      }|?k7  rt?        j@                  ?|      }tC        tE        |dz  d
            D cg c]  } |||       }}t%        jF                  |D cg c]6  }t/        jH                  t         j                  j	                  d?      |      8 c}      }tK        |      dz  }t%        j<                  |t         j                  j	                  |f@            }d}|D ]b  }t%        jL                  |tO        ||Iz               }t%        j<                  |t         j                  j	                  If|            ||<   |Iz  }d  t        | j                  d       S ||k(  rl j                  r_|>k(  rYIdz  dk(  rPt9        j:                   j                        }t         j                  j	                  If|      }  |dd      D ]  \  }}d&>?fdGt        j                  |j                        j"                  \  }|Iz  dk(  sJ |dz  dk  sJ t         j                  j                  |dz        }g }!Er|j,                  jP                  }"|"jR                  d   j6                  Ft%        j<                  |"j2                  ?      At?        jT                  AtW        d?            BFdz  dk(  sJ |!jY                  ABFGfdtC        |dz        D               nyt%        j<                  ||      C|?k7  rt?        j@                  ?C      Ct?        jT                  CtW        d?            D|!jY                  CDGfdtC        |dz        D               t%        jF                  |!      }#d}|D ]@  }t%        jL                  |#tO        ||Iz               x||<   }$|Iz  }|$j                  | k(  r@J   t        | j                  d       S ||k(  rǉ j                  r|@k(  r|rt9        j:                   j                        }t         j                  j	                  If|      }  |dd      D ]K  \  }}d'd(?@fd}%t        j                  |j                        j"                  \  }|Iz  dk(  sJ |dz  dk  sJ t         j                  j                  |dz        }ErZ|j,                  jP                  }"|"jR                  d   j6                  Ft%        j<                  |"j2                  ?      A |%AF      }&n9t%        j<                  ||      C|?k7  rt?        j@                  ?C      C |%C      }&d}|D ]@  }t%        jL                  |&tO        ||Iz               x||<   }$|Iz  }|$j                  | k(  r@J  N t        | j                  |      S |@k(  r6 j                  r)|>k(  r#Idv rt9        j:                   j                        }?fd}'t[        j\                  t         j                  j	                  Idz  f?            }(t[        j\                  t         j                  j	                  d|            })t9        j^                   j                        D ]U  \  }}Idk(  rwt/        j<                  t         j                  j	                  d|      |      }*t%        jF                  |*|)g      }+ |'|+d      },t[        j`                  |(|,tW        d?            }-nIdk(  rt/        j<                  t         j                  j	                  d?      |      }+ |'|+d      }. |'|+d      }/t[        j`                  |(|.tW        d?            }-t[        j`                  |-|/tW        d
?            }-ntc        I      t/        j<                  t         j                  j	                  If|      |-      ||<   X t        | j                  |      S |>|hv r||k(  rIdk7  rtc        I      t9        j:                   j                        }t[        j\                  t         j                  j	                  d|            }0t9        j^                   j                        D ]  \  }}t/        jd                  |g t         jf                  j	                  dg            }1t/        jd                  |g t         jf                  j	                  d
g            }2|>k(  r,t?        jh                  ||1      }1t?        jh                  ||2      }2t[        jj                  ||2|1gdd      }3t/        j<                  t         j                  j	                  d|      t[        j`                  |0|3tW        d?                  ||<    t        | j                  |      S t         jl                  j                  |      }4t         jl                  j                  |      }5t         j                  j                  |      }6t         j                  j                  |      }7|4r|5rt        jl                  |      jn                  }8t        jl                  |      jn                  }9|8|9k(  r|8dk7  rtc        d      t         j
                  j	                         }: j                  xxtp        d x\   n xtr        d x\   n  nc  t        j                   j                  j                  d   j                        j"                  };t         j                  j	                  |;|:      Hn(tt        d x\   |:Hn 	 tc        d  j                         Hfd!}<nKt        jl                  |      jn                  t        jl                  |      jn                  kD  rt>        jv                  }<nt>        jh                  }<n|6r|7r}t        j                  |      jn                  t        j                  |      jn                  kD  rt>        jx                  }<n j                  rt>        j@                  nt>        jz                  }<ni|6r/|5r- j                  rt>        j|                  nt>        j~                  }<n8|4r%|7r#|rt>        j                  nt>        j                  }<ntc        d"| d#|       t9        j:                   j                        } j                  xxtp        d x\   n xtr        d x\   n  nc  t        j                   j                  j                  d   j                        j"                  };t         j                  j	                  |;|      }=n(tt        d x\   |}=n 	 tc        d  j                         t9        j^                   j                        D ]  \  }} |<|=|      ||<    t        | j                  |      S c c}w c c}w ))Nr   rC  r  r   r  r   rV   z=Register bitwidth in target type must be divisible by 8, got opviewr   c             3  "  K   r1|r/t        j                  	j                        D ]  \  }}|g|f  yt        j                  	j                        }g }g }	 	 t        t	        | 
z  d            D ]2  }t        |      \  }}|j                  |       |j                  |       4 |t        j                  |      f |j                          |j                          # t        $ r Y nw xY w|r|t        j                  |      f yyw)z8Tries to pack registers up to destination vector length.Nr   )r   r  r  rw  r  nextrX   r   r  clearStopIteration)dst_vector_lenif_not_slicedr   r  	generatorr   r   r  regs_from_32bit_slicer+   
vector_lens           r%   packed_registersz0FragmentedArray.astype.<locals>.packed_registers9  s     
=t~~6 	HCs

	..0igd		^z91=> aIHCNN3KK ,,T22
2
**,
--/   	
		u**4000 
s+   ADBC# "D#	C/,D.C// Dc                   g d}t        j                  t        j                  | t        d            t        d            }t        j                  | t        d            }d|cxk  rdk  sJ  J |dk(  r@t        j                  |t        d            }t        j                  |t        d            }t        j                  |t        d            }t        j                  ||gd	|d    d
|d    d|d    d
|d    d	d      S )N)i 8@DiHJLNl   N l   Dq l    r   iwwwwr   rA  r  i22zU
            {
            .reg .b32 pos_f8s, neg_f8s;
            prmt.b32 pos_f8s, r  z$, $1;
            prmt.b32 neg_f8s,    zO, $1;
            prmt.b32 $0, pos_f8s, neg_f8s, $2;
            }
            =r,r,r)r   shruirf  r   ra  r   r  )r  partlutsignlut_idxprmt_sign_patternr   s         r%   upcast_to_f8e4m3fnz2FragmentedArray.astype.<locals>.upcast_to_f8e4m3fnY  s	   
 {{5::c1Z+=>!S	J**S!J"45D}1}}}19KK2s4'T1R:.$!IIdAj#,>?'(  #1vhbQ 1"1vhbQ 1	 
 	
r/   rA  F)r  r  Tc           	         d|cxk  rdk  sJ  J t        j                  | |gd|dz    d| dd      }t        j                  |t        j
                  j                  d            S )Nr   r   zU
              {
              .reg .b32 s<4>;
              prmt.b32 s1, $1, $2, 0xFFz;
              lop3.b32 s2, s1, 0x000F000F, 0x43084308, (0xf0 & 0xcc) ^ 0xaa;
              mov.b32 s3, 0x43084308;
              sub.bf16x2 $0, s2, s3;
              }
              r  r~  )r   r  r   r  r   r   r   )r  reg_shrr  int_regbf16r   s       r%   upcast_i4_to_bf16z1FragmentedArray.astype.<locals>.upcast_i4_to_bf16  sy    dQ

OOGn' (,axj$ 8 ' w(9(9$(EF
Fr/   c              3  @   K   | ]  } d z  |z           yw)rA  r  Nr!   )r#   r  reg_intreg_int_shrslice_offsetr  s     r%   r&   z)FragmentedArray.astype.<locals>.<genexpr>  s/        <1;Lt;STTs   c              3  4   K   | ]  } |         yw)r  Nr!   )r#   r  reg_slice_intreg_slice_int_shrr  s     r%   r&   z)FragmentedArray.astype.<locals>.<genexpr>  s&        /@tLLs   c                   dz  dk(  sJ dj                  fddD              }dj                  fddD              }t        j                  t        j                  j                  d      | gd	| d
| dd      }t        j                  j                  d      }t        j                  t        d      D cg c].  }t        j                  t        j                  ||f      |      0 c}      S c c}w )NrA  r   r1   c              3  T   K   | ]  }t        t        d z  |z   d             ! ywrA     Nr5   rd  r#   r[   first_valid_nibbles     r%   r&   zBFragmentedArray.astype.<locals>.upcast_i4_to_i8.<locals>.<genexpr>  s(     \!S%71%<q%@!!DE\   %()   r   r   r   c              3  T   K   | ]  }t        t        d z  |z   d             ! ywr  r  r  s     r%   r&   zBFragmentedArray.astype.<locals>.upcast_i4_to_i8.<locals>.<genexpr>  s(     ]1c#&8A&=&A1"EF]r!  )r  r     rA  z!llvm.struct<(i32, i32)>a  
              {
              .reg .b32 high_even;  // $2 is high_odd
              .reg .b32 low_odd;    // $2 is low_even
              .reg .b32 sign_even, sign_odd;
              .reg .b32 i8_odd, i8_even;
              shl.b32 high_even, $2, 4;                              // x6x5x4x3x2x1x000
              prmt.b32 sign_even, high_even, high_even, 0xba98;      // s6s6s4s4s2s2s0s0
              prmt.b32 sign_odd, $2, $2, 0xba98;                     // s7s7s5s5s3s3s1s1
              shr.u32 low_odd, $2, 4;                                // 00x7x6x5x4x3x2x1
              lop3.b32 i8_odd, sign_odd, low_odd, 0xf0f0f0f0, 0xe4;  // s7x7s5x5s3x3s1x1
              lop3.b32 i8_even, sign_even, $2, 0xf0f0f0f0, 0xe4;     // s6x6s4x4s2x2s0x0
              prmt.b32 $0, i8_even, i8_odd, 0xzO;            // s3x3s2x2s1x2s0x0
              prmt.b32 $1, i8_even, i8_odd, 0xz>;           // s7x7s6x5s4x4s3x3
              }
              z=r,=r,rr  )r3   r   r  r   Typeparser   r   r   r  rw  r  extractvalue)	r  r   low_prmt	high_prmt
out_structi8_vecr[   r   i8s	    `     r%   upcast_i4_to_i8z/FragmentedArray.astype.<locals>.upcast_i4_to_i8  s     $a'1,
,,WW\|\\(gg]P\]]) ggmm67e/ 08j 9//8k :  '** ==$$T2.&$$Qx& mmD--c:tDfM&   &s   :3C4)r   >   rA  r   c                F    t        j                  | gd|rdnd dd      S )NzN
            {
            .reg .b32 s<3>;
            prmt.b32 s0, $1, 0x43, iBC  i@A  z;
            and.b32 s1, s0, 0xff7fff7f;
            and.b32 s2, s0, 0xff80ff80;
            sub.bf16x2 $0, s1, s2;
            }
            z=r,r)r   r  )r  r  r   s     r%   upcast_i8_to_bf16z1FragmentedArray.astype.<locals>.upcast_i8_to_bf16/  s?     E$ /3F#? @ 
 	
r/   )r  r  z'cvt.rn.satfinite.e4m3x2.f32 $0, $1, $2;z=h,f,fr~  zCConversion between float types of width other than 16 not supportedr!   zUnsupported layout c                X    t        j                  | t        j                  |            S rD   )r   truncfextf)r  rt  	upcast_tys     r%   r  z(FragmentedArray.astype.<locals>.<lambda>  s    RIq1I J r/   zUnsupported conversion  -> )r  r   r  r6  r   z4Iterable[tuple[Sequence[tuple[int, ...]], ir.Value]])r  r:  r  r   )r  r:  r  r:  r  r   r   )r  r:  r   r   )Br   r   r   BF16Typer   r  Float8E4M3FNTyper  r  r  r  r  r  rv  r   r   rY   rH   r   rb  r)   getattrownerr   ExtractStridedSliceOpsourcera   r  r   r#  r  r   extsirw  r  r  r  r(   r  r  r  offsetsr  r   extendr   r  r  insertelementrV  r  r  r2  r  r  widthrX  r   rH  r1  trunciextuisitofpr  fptosifptoui)Jr+   	new_dtyper  i4i16r  f8e4m3fn	cur_dtypeany_regr  is_vector_regr  new_reg_bitwidth	_slice_opr  r  r  packing_widthr   r  
group_sizeint_ty
reg_as_i32r  out_i32_regsout_i32_regout_vec_intout_vector_lenout_vecr   r   sliced_out_vec
out_vec_tyout_int_regsslice_opout_regr  r-  reg_i8r/  empty_vec_32
pad_vec_16reg_16reg_32
new_reg_32
new_vec_32r  r  empty_vec_16e0e1
new_reg_16
from_floatto_floatfrom_integer
to_integercur_ty_widthnew_ty_width	larger_tyrH   convert
new_reg_tyr  r   r,  r  r  r  r  r  r  r  r3  r   sJ   `                                                             @@@@@@@@@@@@r%   astypezFragmentedArray.astype  s
    
	$	$Q	'B		$	$Q	'B
..
%
%b
)C
..
%
%b
)C;;??D
**..
C""&&(HII	9	$^^T[[Y  nn!!!$G||HMM,,X6M8EbmmH-3344ILZ!NN95
BBaG!  	 $??I((	
 	, NN9++001R7		,
 a &&!+ 11/31	=12 B90
> mmDNN3m &?am*=N ,'3}}SXX.44J&!+++,,Z!^<]]3/
S={{3
3* c*/156
 z5
 
 ))++
 R]]..tS9;G+
  \*Q. --**N+<bA
  	C --uVVj%89.  %}}bmm//xH -
 J
&	-< "DKKD  B4>>i4.?JQRNVWDWmmDNN3m==$$j]I>j
 +1DA B,,'3	G" }}SXX.44J&!+++A~###,,Z!^<
 (* YY%%(!))!,22,MM(//37'GQq#Y7+!Q&
&&


 
a0 
  --V4-s]!KK];M#kk-1cC



 
a0  %%l3 	,C).););uVVj%89* -
w J
&+
++	,{B,F "DKKD  B4>>i2o)mmDNN3m==$$j]I>j*1DA ;,,'3#	H }}SXX.44J&!+++A~###,,Z!^< YY%%(!))!,22,MM(//37'"7|L&--V4-s]!KK];M"=1&  	,C).););eFFZ$78* -
w J
&+
++	,m;,x "DKKI  B4>>i4.?JRXDXmmDNN3m
, __R]]%6%6
a7I3%OPl??2==#4#4T3#?@jnnT^^4 
(#s?>>"--"3"3D#">D&&&
';<&(e<*)),
AaIN*1_>>"--"3"3D#">D&!&u5#"65$)),Qq#YG*))*dAaIF*#J/
/#^^MMzmY7
c
" "DKKI  T3KI$9	q!*--mmDNN3m__R]]%6%6tS%ABlnnT^^4 E(#s^^0044aS9

 ^^0044aS9
 zz#r""zz#r""__H5	

 $^^MMdH-|Z1cCEc+E0 "DKKI  ((3J||&&y1H>>,,Y7L**95Jh\\),22l\\),22l		% 2#  JJNN$	kk4$"$}{}4MM$.."5"5a"8"="=>DDE))%;I "!I #%(;DKK=&IJJJ<<	"((2<<	+B+H+HH,,**	*			"	(	(2>>)+D+J+J	J,,!%%++U[[	( $ELLg	
 )u||g"9)D TUUMM$..1M
++0  =;=0dnn11!499:@@]]&&ui8

 !$7}"EFFNN4>>2 4S":s3mC4 $++) _	
+
s   ?AB?%;AC
c                   /0 t         j                  j                  d      }t        t              rfd }t        |t
              r|xdk(  r t        j                   fdD              0t         j                  j                   j                        rt        }0fd}nLt         j                  j                   j                        rt        j                  }0fd}nt         j                        dk(  rt         j                  j                   j                        r j                  d      }nt         j                  j                   j                        rt        j                   }nkt         j                  j                   j                        r- j"                  rt        j$                  nt        j&                  }nt         j                        d }n	 t)        d	|       t        |t
              rJ  j*                  xt,        d
 x\   } t/              t/        t1        t3         j4                                    k7  rt        d      t7        t9        d|z  fd|z  f|ff      ddd      }t;         j<                  j?                  |jA                  t        j                   j4                        f            | j"                        jC                  |d|      S  xtD        d x\    |t        d       j<                  j4                  rJ t;        tG        jH                   | j<                  jK                               tL              tE        tO        fdtQ         j4                        D                     j"                        S  t6        d x\   n 	 t         j*                        t3         j*                  jR                        t3         j4                        k7  rt        t        t              rf j*                  }|jT                  }|jV                  jY                  d         /dd  D ]9  }	tO        d t[        /|jV                  jY                  |	      d      D              /;  j<                  j4                  }
tO        d t[        /|
d      D              }tO        d t[        /|
      D              }tG        j\                  |tL              }t         j^                  ja                         }tG        jb                  |      D ]  }d }tG        jb                  |      D ]F  }tO        d t[        ||      D              }| j<                  |   }1 || j<                  |         }H |J /|jd                     rt        jf                  |jh                        j4                  \  }d }t1        |      D ]E  }tk        jl                  |g t         jn                  ja                  |g            }||n |||      }G tk        jp                  t         jf                  ja                  d|jh                  jr                        |      }tu        /fd |jv                  D              rd}|jx                  d d d   D ]  }t        |tz              r||j|                  z  }#/|   s	|||   z  }1|j                         dk(  sJ ||   }|dkD  sQt        j                  ||      } |||      }|d!z  }|d!z  }|dkD  r/ |t        k(  sJ |       tu        /fd"|j                  D              r|t)        d#      t        jf                  |jh                        j4                  \  }t        j                  |jh                        }|j                  dk7  rt)        d$|j                         |jr                   j                  k7  r%t)        d% j                   d&|jr                         t/              t/        t1        t3         j4                                    k(  }|rdnd}|j4                  d   t        |z  |z  k  rt)        d'      |j                         d   dgk7  rt)        d(      t        j                         }|rt        d|      }n$t        j                  |t        t        |            }t        j                  t        j                  |t        t        |            t        t        |            } t        j                  |t        t        |            }!t        j                  |t        j                  |!|             }"tk        j                  ||t        j                  |"t        ||            g       t        j                          t        /fd)|j                  D              rg t1        t              d}$}#nbt3        |j                        d!k(  sJ |j                  \  }%}&t        |%t              r/|%   rdd!gd}$}#nt        |&t              r/|&   sJ ddgd!}$}#|jh                  }'d }t        j                  | t        j                  ||$            }(|#D ]  })t        j                  |(t        |)|            }*t        j                  |t        j                  t        j                  |!|*      t        ||                  }+tk        j                  |'||+g      },||,n |||,      } t        j                          |||<     t         j4                        }-t        d*      D ]  }	|-|	=  |-stE        d      }.|j                  dk(  sJ |j                  d   }t        jf                  |jh                        j4                  dgk(  sJ tk        jl                  |g t         jn                  ja                  dg            }tG        jH                  |tL              }n:|jC                        }.|j?                  |.jA                  tO        |-                  }t;        ||. j"                        S )+Nr   addc              3  <   K   | ]  }j                   |     y wrD   rE  )r#   r'  r+   s     r%   r&   z)FragmentedArray.reduce.<locals>.<genexpr>  s     #@aDJJqM#@r  c                X    t        j                  | t        | j                              S rD   )r   r6  r   rv  rt  reduced_elemss    r%   r  z(FragmentedArray.reduce.<locals>.<lambda>      Aq/G!H r/   c                X    t        j                  | t        | j                              S rD   )r   ru  r   rv  rv  s    r%   r  z(FragmentedArray.reduce.<locals>.<lambda>  rx  r/   r  r  c                    | S rD   r!   rt  s    r%   r  z(FragmentedArray.reduce.<locals>.<lambda>  s    q r/   z!Unrecognized reduction operator: r^  z?Warpgroup strided layout only support reductions along all axesr  )r  r  rw   )r   r   r   r   r  r   r!   z=Splat reductions only supported when the operator is a stringr  c              3  2   K   | ]  \  }}|vs|  y wrD   r!   )r#   r'  r$   axiss      r%   r&   z)FragmentedArray.reduce.<locals>.<genexpr>  s     KDAqQd]aKs   r   c              3  .   K   | ]  \  }}|xs |  y wrD   r!   r#   r  r$   s      r%   r&   z)FragmentedArray.reduce.<locals>.<genexpr>		  s      Q!&q&   Tru   c              3  .   K   | ]  \  }}|r|nd   ywrK  r!   r  s      r%   r&   z)FragmentedArray.reduce.<locals>.<genexpr>	  s      !QQAr  c              3  .   K   | ]  \  }}|rd n|  ywrK  r!   r  s      r%   r&   z)FragmentedArray.reduce.<locals>.<genexpr>	  s      !QQAr  c              3  ,   K   | ]  \  }}||z     y wrD   r!   )r#   r   r  s      r%   r&   z)FragmentedArray.reduce.<locals>.<genexpr>	  s     @$!QA@rF   r  rV   c              3  (   K   | ]	  }|     y wrD   r!   r#   r$   reduced_dimss     r%   r&   z)FragmentedArray.reduce.<locals>.<genexpr>/	       C\!_C   rA  c              3  (   K   | ]	  }|     y wrD   r!   r  s     r%   r&   z)FragmentedArray.reduce.<locals>.<genexpr>@	  r  r  z>scratch must be provided when cross-warp reduction is requiredz!Expected rank 1 for scratch, got zExpected element type z for scratch, got z3Insufficient scratch space for cross-warp reductionz!Expected scratch to be contiguousc              3  L   K   | ]  }t        |t              xr |     y wrD   )r   r   r  s     r%   r&   z)FragmentedArray.reduce.<locals>.<genexpr>e	  s$     P!z!S!5l1o5P   !$r"  )Zr   r   r   r   r   r5   r   r   r  r  r.  r   r   rV  r  r  r  r  r  r  r)   r  rX  setrw  r(   rH   r   r   r  r  r  r   r(  rH  r   r  r  r  rY   rW   r   r   r   rl   rG   r  r   r   r   r   r   rv  r   r  r  r  rc  r*   r   r   r   r   	bit_countr   r  r   r   r  rankr   get_strides_and_offsetr  r   r   r   r\  ru  r  storewarpgroup_barrierrN  r   rf  r  r  r   r$  r  r  )1r+   r  r}  scratchr   splat_oprY  r  r   r'  
regs_shapereduced_shaperemaining_shapeout_regsr   out_idxr\  red_idxsrc_idxr  scalar_out_regr[   scalarlane_strider$   reduction_sizeother_out_reg
scratch_tyreduces_all_dimsunique_lanesr  r  r  
spill_base	store_idxwarp_offsetswarp_group_maskwd0wd1r  warp_reduction_groupr   reduced_warpload_idxr  reduced_logical_shaper&  r  rw  s1   ` `                                            @@r%   r(  zFragmentedArray.reduce  sw
    ..
%
%b
)C$WdH"c))#@4#@@-\\$$T__5BHH~~((9BHH%doo66ZZ""4??3&&}5B||&&t7B~~((9 $EKKB%doo66 (>rdCD
D"c"""
++:::t9E#djj/233#O  C(N,rH}.>LM	
 ~~--&&		$**(='?@ ~~
 &Q
 	! ;& #M  >>''''zz,,./v &KIdjj$9KK ~~
 	
  = !$++..
4;;&&'3tzz?:$Wd[[F22==//Q8L!"X   v}}/K/KA/NW[\ l %%J "%lJt"L M  "%lJ"? O xxv6HLLE::o. g"gZZ. 9'@#gw*?@@?NN7+'ww 78'9    	f''	(MM',,/55	w 	A>>! 2266s;& '.fB~v4N 	 ""MMdGLL$=$=>
 
Cf&B&BC	C!!$B$' 	#A:&177"K?-a00K((*a////2N 1$#oog{Cm7M2gQk"n	 !1$	# i'44'	Cf&B&BC	C?N  MM',,/55	]]7<<0
??a>z>OPQ
Q""doo5&t&7 8))*,  t9E#djj/,B(CC,q"A!3l!BW!LLPQ
Q,,.q1aS8>?
?%%'
q#Y([[Qy#->?(;;KK
Anc$:;Qy#=N
 ZZ!,>*DE
$$UEJJz8,LM	Wuzz)Qw5FGH	
 	!Pv?O?OPP*FE2D,E*F, V%%&!+
++%%(#sS!l3&7-.FA/Lc3'L,===-.FA/L$zz(ENN34XY' 	CK$8!K:MN,%%jjJ=q#O( VWxj9$#ODGT1B'	C 	!!hwOg"R !,D$' #

"# ):2)>n]]aa g]]7<<(..1#555
..22A37g
 G62h}}T*n!!

(
(/D)E
Fh ^ r/   c                D   t        | j                  t              st        | j                        | j                  |k(  r| S | j                  j                  |      st        d| j                   d|       t        | j                  t        |      | j                        S )NzCan't broadcast r  r  )
r   r  rH  rV  rH   rP  r)   r  r  r  rO  s     r%   r  zFragmentedArray.broadcast	  s    dkk#45,,zzUk;;''.)$**T%ABB>>!%(>> r/   c                ~   | j                   |k(  r| S t        j                  |      t        j                  | j                         k7  rt        d| j                    d|       | j                  xxt
        d x\   n xt        d x\   n  nE  t        j                  | j                  |      }t        | j                  || j                        S t        d x\   | j                  j                  }|sJ | j                   t        |       d  }|t        |       d  }|dd  |dd  k7  s|d   |d   z  rt        d| j                    d| d|       | j                  j                  |      }t        | j                  j!                  |      | j                  | j                        S  	 t#        | j                        )	NzCan't reshape r  r!   rE  r  r   r   z' with a tiled layout with base tile of )rH   r   r   r)   r  rH  rX  dataclassesreplacer  r  r  r   r   r(   r   r  rV  )r+   rH   r  r   old_shape_suffixnew_shape_suffixnew_registers_shapes          r%   r  zFragmentedArray.reshape	  s   zzUkyy499TZZ00

|4w?@@
++6!6!4!66 ((EB
~~~~
 	

 =++55::s?';&;&<= #o"6!6!78 QR $4QR$88"_Q%77tzzl$ug 6./1  #kk99%@~~--.ABKK~~
 	
! * !$++..r/   c                D   t        | j                        dk7  rt        d      |dz  rt        d|       | j                  t        k(  rt
        }n/| j                  t        k(  rt        }nt        | j                        | j                  | j                  d   |fd|      S )Nr   z/Broadcast minor is only supported for 1D arraysrC  z5The broadcast dimension must be a multiple of 8, got r   r5  )
r(   rH   r)   r  WGMMA_ROW_LAYOUTrD  TCGEN05_ROW_LAYOUTr  rV  broadcast_in_dim)r+   nr  s      r%   broadcast_minorzFragmentedArray.broadcast_minor	  s    
4::!HII1uNqcRSS{{&&j	*	*!j,,  $**Q-!3T:FFr/   c                   t        |      D ]?  \  }}| j                  |   ||   k7  st        d| d| j                  |    d||    d       t        | j                  t
              r@t        |       j                  | j                  j                         ||| j                        S t        | j                  t              rt        |t              rt        t        t        |                  t        |      z
  }| j                  j                  |j                  k(  }|t        t        t        |                  k(  }|j                  |k(  sJ |j                  |f       |rW|rUt!        t#        j$                  | j                  t#        j&                  |d t        |                   || j                        S t        | j                  t(              rt        |t(              st+        | j                  |      t-        d t/        ||dd        D              rt+        d	      t        |j0                        t        |      k7  rt+        d
      t3        t        t        t        |                  t        |      z
        }	|j5                  |	      }
|
| j                  k7  rt        d      |j7                  |      }t9        |      }|	D ]6  }t        |j:                  j=                  |            D ]  \  }}|s	d||<    8 |j>                  | j                  j>                  k7  r| j                  j>                  dk(  sJ t#        j@                  | j                        }t#        jB                  | j                        D ]+  \  }}tE        jF                  |g|j>                  z        ||<   - n| j                  }t#        jH                  |jK                  |      |      }t!        ||| j                        S )Nr  z
 has size z in source shape and z in shape after broadcastr  r  c              3  ,   K   | ]  \  }}||k\    y wrD   r!   )r#   d1d2s      r%   r&   z3FragmentedArray.broadcast_in_dim.<locals>.<genexpr>	  s     
PB28
PrF   r   z-source_dimensions must be strictly increasingz0Tiling rank different than broadcast result rankz@Source and destination layouts aren't compatible for a broadcast)&rW   rH   r)   r   r  rH  rv  r  r  r  r  rX  r  rw  r(   rY  r  r   r-   r   r   rV  r*   rG   r   r$  r(  r   r   r   rl   r   r#  r  r   r  broadcast_tor  )r+   rH   source_dimensionsr  r[   
target_dimnew_dims	vec_matchbroadcast_dim_matchnew_dimensionsexpected_layoutr  pre_broadcast_registers_shapenew_dimis_newr  r   r  r  s                      r%   r  z FragmentedArray.broadcast_in_dim	  sy    ##45 
:	A%
+	+:djjm_ 5j!"";=
 	

 $++01$Zuf    $++23
6K^8_U3u:&'#.?*@@h++&&&//9i$E#h-,@(AA\\U"9V\\5$99"	*wwns8}-. ~~
 	
 dkk;/z&+7VV44

P#&79J129N"O
PP OPP
6!!"c%j0 RSSCc%j 12S9J5KKLNmmN3O$++%
L  !007$()<$=!! / !=!=g!FG /)!V-.
'
*// t{{888[[&&!+++--/innT^^4 K(#s,,cUV5I5I-IJ	#K ..iOO78:MM  &T^^ r/   c                    t         j                  j                  | j                        r,t        j                  | j                        j                  dk7  rt
        |j                  d | |      S )Nr   c                0    t        j                  || |      S rD   )r   r  )rB   pfs      r%   r  z(FragmentedArray.select.<locals>.<lambda>
  s    Q1- r/   )r   r   r   r  r@  rV  r  )r+   on_trueon_falses      r%   r  zFragmentedArray.select
  sU    NN%%doo6>>$//*00A5 -tX r/   c                   t        j                  t        j                  j	                  d            }| j                  |||d      }|j                  fdd|      S )Nr   Fr  c                     | S rD   r!   )r  r   fns     r%   r  z'FragmentedArray.build.<locals>.<lambda>.
  s    r3x r/   Tr  )r   r  r   r   r   r  r  )re  rH   r  r  r  undefdummys      `   r%   buildzFragmentedArray.build"
  sU     OOBNN77;<EIIeUFeI<E==di   r/   r  c               8    t         j                  j                         }d|~ fd}t         j                  j                   j                        t        j                   j                  j                        d      D ]q  \  }} j                  |   }t        |      t         j                        k(  sJ | j                  f       t         j                  j                  |j                        rt        j                  |j                        j                  \  }	t        |	      D ]  }
t        |
|      } |t!        j"                  |g t         j$                  j                  |
g            g |dd t'        j(                  |d   |            }snJ t!        j*                  ||   g t         j$                  j                  |
g            |<    \ |||      }siJ ||<   t rJ t-         j                  |      S y)z)Call a function for each value and index.Nc                     |  }j                   j                  d   j                  }r|J t        j                  j                  |      r5t        j                  j                  |j                  |j                        }n|j                  }t        j                  j                   t        j                  |            |S r[  )r  r  rv  r   r   r   r   rH   r   	full_liker   r  )r  r  old_reg_typenew_reg_typer  r  orig_fnr+   s       r%   
wrapped_fnz+FragmentedArray.foreach.<locals>.wrapped_fn=
  s    ~f^^((+00l	(*!!!==##L1**<+=+=v{{K,,<<0MNmr/   Tru   r  rw   r  )r   r   r   rG   r  r   rH   r   r   r  r(   r   r   rv  rw  r   r   r  r  r   r   r  r  )r+   r  r  r  r   r  mlir_idxreg_idxr  r   r[   c_ivalr  r  s   ` `          @@r%   r  zFragmentedArray.foreach1
  s    LLEHG
 !!8!8!DbjjQUQ_Q_QeQeFfost "'NN7#c]c$**o-E$**/EE-		!	!#((	+--)//u 	A!U#nn#%"$"6"6":":A3"?
 >"=uzz(2,<=# ''' &!!# " 4 4 8 8! =	!HW	& h'%
%%!(7
7": !!!$++R[\\ r/   c                    dj                  dgt        | j                        z        | j                  fd       }y )Nr  z{}c                f    j                  d d      }t        j                  |g|| ddi y )N[z]: {}uniformF)formatr   debug_print)r  r   fmt_strfmtidx_fmts      r%   r  z&FragmentedArray.debug_print.<locals>._m
  s7    

Qwiw/0g:#:s:E:r/   )r3   r(   rH   r  )r+   r  r  r  s    ` @r%   r  zFragmentedArray.debug_printk
  s6    iiTZZ01G	\\; ;r/   r  Tswizzle	optimizedc                  t         j                  j                  |j                        st	        |      | j
                  xt        d x:\    t        |t        j                        rt        d      | j                  |       y  xt        d x\    |dk7  rt	        d      t        | j
                  t              sJ | j                  || j
                  j                        D ]k  \  }}}}t        |t        j                        r#|j                   || j                        |       Et!        j                   || j                        ||       m y  t"        d xb\   t        j                  |j                        j$                  }t        j&                  |g d |D        |      }| j)                  |||       y  	 t        | j
                        )Nr!   z&Splat layout does not support multimemr  #Only TiledLayouts support swizzlingc              3      K   | ]  }d   ywrK  r!   r#   r  s     r%   r&   z0FragmentedArray.store_untiled.<locals>.<genexpr>
  s     *@1*@   r  )r   r  r   rv  r)   r  rH  r   MultimemRefrV  _store_untiled_splatrX  r  rY  r  r  r   r   rH   memref_reshapestore_tiled)r+   r  r  r  r   _updater   	ref_shapes           r%   store_untiledzFragmentedArray.store_untiledr
  sz    ==##CHH-sO
++c5,,-#$LM
M!!#&	 
 ! b=@A
A$++':;;;&*&;&;CAUAU&V 	8"C#sU../IIc$..)3/LLT^^,c37		8	 ! =MM#((+11	""3(M*@i*@(M9(MNgC  !$++..r/   c                P   t        j                  |j                        }t        |j                        }t        |t              r|dk7  rt        d      t        j                  |j                  |      t              }t         j                  j                  |j                  f|j                        }	| j!                  ||j                        D ]b  \  }
}}}t#        j$                  t#        j&                  |j(                  t        |                  } ||t#        j*                  |	|||             d  | |||      S t#        j,                  |g d |D        |      }| j/                  |j(                  |||dt1        j2                  t"        j*                  ||      d      S )	Nr  r  r  r  c              3      K   | ]  }d   ywrK  r!   r  s     r%   r&   z6FragmentedArray.load_reduce_untiled.<locals>.<genexpr>
  s     &8Qq&8r  F)	reductionr  )r  r  r  r  	_load_fun_narrow_float_as_int)r   r  rv  rY   rH   r   rX  r)   r   r  r   r  r   r   rY  rc  r  r   
memref_ptrmemref_slicer  multimem_load_reducer  
load_tiledr=  r  )re  r  r  r  r  r  r  rH   r  r  r  r  r   ptrs                 r%   load_reduce_untiledz#FragmentedArray.load_reduce_untiled
  sh    ]]388$F&,,E&-.	B>??((611%8Gi}}  &//!3V5H5HIf$'$8$8foo$N Y
 $Su11#''5:FGy%44VS)YWXY Iv)LL


s$A&8%&8$A5$A
BC>>##&&)y

 #   r/   )r  r  r  c                   t        j                  |j                        }t        j                  |g d |j
                  D        |j
                        }| j                  |||||      S )Nc              3      K   | ]  }d   ywrK  r!   r  s     r%   r&   z/FragmentedArray.load_untiled.<locals>.<genexpr>
  s     &?Qq&?r  )r  r  r  r  )r   r  rv  r   r  rH   r   )re  r  r  r  r  r  r  s          r%   load_untiledzFragmentedArray.load_untiled
  sb     ]]388$F


s$O&?&,,&?$O&,,$O
PC>>W	&I   r/   c           
     D   t        j                  | j                        dk(  rt        dt        j
                  j                               }t        j                  | j                  j                  d   ||gt        t	        j                  |j                        j                        z         y dt        j                  | j                         z  }t#        j                  | j                        |t$        z  k  rd}t#        j                  | j                        t$        z  |z  rt'        dt$         d      t(        j+                  | j                  j                  d   | j                  t-        | j                  |      | j.                        }|j1                  |       y )Nr   r   rB  zaArrays with the splat layout can only be stored when they have a single element or a multiple of z	 elementsr^  )r  r  )r   r   rH   r   r   r   r   r   r  r  r  r(   r  rv  ra  rb  r  r   r\  rV  r  r  rX  r  r  )r+   r  c0rY  fas        r%   r  z$FragmentedArray._store_untiled_splat
  s;   yy!Q  "#bll
..

a
 #tc"--2I2O2O.P'P T]]4??33H	wwtzzX66h	wwtzz^+h6..<-=YH 
 
		A

"hG..	 
 
B Sr/   c                   t        | j                  t              st        | j                        | j                  | j                  }}t        |t
        j                        rR| j                  |j                  ||||      }|D ]-  \  }}}	}
t        j                  |
 || j                               / y | j                  |||||      }|D ]  \  }}}	}
 || j                        }t        j                  |j                        }t        j                  |j                        }t        j                   j                  |j                        rb|dk  r]t        j"                  j%                  |      }t'        j(                  t        j                  j+                  |j                  |      |      }t-        j.                  ||
        y )NrC  )r   r  r   rV  rH   r   r  transfer_tiledr  multimem_storer  r   r   rv  rb  rc  r  r   r   r   r  r   r   r  )r+   r  r  r  r  rH   storesr   r  _idxr  r  r  element_bitwidth
narrow_ints                  r%   r  zFragmentedArray.store_tiled
  s_   dkk;/,,KKEF #u(()""377GVUINf%+ 7
!#wcS#dnn"567 ""3	Jf%+ 
!#wc$..!sxx( >>&*=*=><<""6#6#67<LPQ<Q~~223CD*r}}00zJCP#

3r/   )r  r  r  r  r  c               6   t        |t              st        |      t        j                  |j
                        }|j                  }	|j                  }
t        |
      dz  rt        d      t        |
      dk  rt        d      t        |
t        |
      dz  d  f      }|j                  |
      }t        j                  j                  |j                  f|	      }t        j                   |t#        d|	            }t%        j&                  |j)                  |      |t*              }t        j,                  j                  |	      xr t/        j0                  |	      dk  }t        j2                  j5                  t/        j0                  |	            }t        j                  j                  |j                  f|r|r|n|	      }| j7                  |||||      }|D ]3  \  }}}} |||      }|r|rt        j8                  ||      } |||       5  | |||      S )NrA  #Tiled reference must have even rankz1Tiled reference must have at least two dimensionsr   r  rC  r  )r   r   rV  r   r  rv  rc  rH   r(   r)   r   rT   r   r   r   r   r  r   r   r  r   r  r  r   rb  r   r   r
  r  )re  r  r  r  r  r  r  r  r  r  r~   r   rH   r  zeror  is_narrow_floatr  transfer_tyloadsr  r  r  r  
loaded_regs                            r%   r   zFragmentedArray.load_tiled
  s    fk*'']]388$FE,,K
;!<==
;!JKK[[!1Q!6!89;<F,E]] 4 46>FFAaK0D..u5t6JIll--e4S9NRS9SO,,U^^E-BCJ --##			%*>
EK sGVUIFE#( $fdC[#.j	1^^FJ7
Y
#	$
 )V	JJr/   c           	   #    K   t        j                  |j                        }t        t	        |j
                        |      }	 t        j                  |dt        |j
                              }t        d |j                               }t#        |      D ]  \  }	}|	fd}|	fd}||||f  y # t        $ r |dkD  r7t        j                  |j                        }|j
                  }|j                         \  }}t        t        t        |      dz
              D ]1  }	||	   ||	dz      ||	dz      z  k(  st        j                  ||	d      }3 t        j                  |j                        }|j
                  }|j                         \  }}d}
t        ||      D ]I  \  }}|dk(  rd}
||z  dk7  st        d| d	| d
      d |dkD  s/||z  dk7  s8t        d| d	| d
      d  |
st        d      t        t	        |j
                        |      }|j!                  t	        |j
                              }Y w xY ww)Nr^  r   c                    | gS rD   r!   r{  s    r%   r  z2FragmentedArray.transfer_strided.<locals>.<lambda>G  s    qc r/   r   rA  FTz^The contiguous dimension of the reference must be a multiple of the layout's vector size (got z and vector size r2   zpNon-contiguous dimension of the reference must have strides that are multiples of the layout's vector size (got z@The reference must have a contiguous dimension when vec_size > 1c                    || |<   y rD   r!   )r  r  _is      r%   r  z0FragmentedArray.transfer_strided.<locals>.updateI  s    	"r/   c                    | |   S rD   r!   )r  r  s     r%   r   z-FragmentedArray.transfer_strided.<locals>.getK  s    }r/   )r   r  rv  rX  rY   rH   ra  memref_foldr(   r4   rn  r)   r  rR   rw  rG   r   rW   )r+   r  rY  r  r  idx_genrH   ra   r  r[   has_contiguous_dimr  r   vec_idxr  r   s                   r%   r  z FragmentedArray.transfer_strided  sY    ]]388$F uV\\':XNF)@ S!S%67cJ M6#<#<#>?g( &
7$%  g%%&K  "8	Asxx(224
%E
Q/0 	.AQZ51:!44""31-C	. sxx(224
"w/ 	LD&q[!%h!#@@Dv F""*1. 	
 ax A%X.xj; 		  "P  #v||)<xPf""5#67gE"8sF   A I *B. -AI .BH=7A:H=2H=H=A%H=9I <H==I c              #  :  89:;<=>?@K   d 8j                   @t        j                  | j                        ==j                  }=j
                  dz  rt        d      =j
                  dz  ;t        =j                  ;d       }t        |f      }=j                         \  <}|j                  t        =j                              x}	|k7  rt        d|	 d|       t        ;=fdt        ;      D              }
t        ;<=fdt        ;      D              }@j                  |
|      \  >?t        fd	t        t!        t#        ?>                  D              rt        d
      ?j$                     dk7  rt        d      t!        >      }t!        ?      }g j&                  j(                  j$                  D ]*  }dt+        ||         z  ||<   dt+        ||         z  ||<   , t!        t,        j.                  j1                  |            }t!        t,        j.                  j1                  |            }t!        t,        j.                  j1                  >fdj2                  D                    }t!        t,        j.                  j1                  ?fdj2                  D                    }t!        t,        j.                  j1                  >fdj4                  D                    }t!        t,        j.                  j1                  ?fdj4                  D                    }j6                  }t9        j:                  |      }||z  dz  dk7  rt        d| d||z   d      ||z  dz  }|dvrt        d      d|z  }d|z  }|dz  }||z  }t        j<                  j?                  |      rF|dk  rAt        j@                  jC                  |      }t        jD                  jG                  |f|      }n!t        jD                  jG                  |f|      }=jH                  d}n0tK        jL                  =      rd}nt        d=jH                         |r$|dk7  rtO        d      tQ        |||||||||	      :n
tS               :|D cg c]  }||z  	 }}t,        j.                  j1                  ?jT                   d       D cg c]  } 8||z         }}d%8>fd} tK        jV                   | jY                               |      }!tK        jV                   | j[                               |      }"t]        j^                  |!|"      }#tK        j`                  | |      }$8fd }%||z
  }&tc        jd                  | D ]v  }'tc        jf                  :jh                  D (cg c]
  }( |(|'       c}(      9tc        jj                  9|      })|)|&z  }*|)|*z
  }+t]        j^                  |#:jm                   |%|*                  },t]        jn                  t]        jp                  |, 8|             8|            }-t]        jr                  |- 8|            }.t]        jt                  |,|.      }/tK        jv                  |$|/g|      }0:jm                   |%|+            }1tK        jv                  |0|1g|      }0>fd!}29jy                         D 3cg c]
  }3 |2|3       }4}3|4f:fd"	}5|4f:fd#	}69:>@fd$}7|5|6|7|0f y yc c}w c c}w c c}(w c c}3w w)&ax  Generate a transfer schedule for a tiled layout.

    Given a ref with one level tiling applied to it (we assume all dimensions
    have been tiled), this function generates an iterable describing a good
    schedule for swizzled SMEM loads/stores.

    At each step, the iterable yields a tuple of three values:
    * a function that takes a register array and returns the register to be
      stored at the current address
    * a function that takes a register array and a register loaded from the
      current address, and updates the register array with that register
    * the current address for load/store instructions
    c                h    t        j                  t        j                  j	                  d      |       S r  )r   r  r   r   r   r{  s    r%   r  z0FragmentedArray.transfer_tiled.<locals>.<lambda>e  s     %..!<!<R!@!D r/   rA  r  Nz#The reference has untiled shape of z$ while the register array has shape c              3     K   | ]J  }j                   |z      d k7  r!j                   |   j                   |z      fnj                   |   f L ywrK  rE  )r#   r[   ref_logical_rankr  s     r%   r&   z1FragmentedArray.transfer_tiled.<locals>.<genexpr>u  sd        <<,,-2 
a&,,q+;';<=9?a8J	Ks   AAc              3  n   K   | ],  }j                   |z      d k7  r|   |z      fn|   f . ywrK  rE  )r#   r[   r#  ref_stridesr  s     r%   r&   z1FragmentedArray.transfer_tiled.<locals>.<genexpr>z  sX        <<,,-2 
QQ)9%9:;9DQ8I	Js   25c              3     K   | ]7  \  }\  }}|j                   k7  r t        fd t        ||      D               9 yw)c              3  P   K   | ]  \  }}|j                   z  xr |d k7    ywrK  r  )r#   rA   r$   r  s      r%   r&   z;FragmentedArray.transfer_tiled.<locals>.<genexpr>.<genexpr>  s,     HDAqA$$$/a/H   #&N)r   r*   rG   )r#   r[   ssdsr  s       r%   r&   z1FragmentedArray.transfer_tiled.<locals>.<genexpr>  s@      AxB!!! 	HCBKHHs   =A zVTiled strides must be a multiple of the vector length, except for the vector dimensionrV   zMVectorized dimension should not require further tiling and have a stride of 1r5  c              3  b   K   | ]&  }t        |t              r|j                  fn|    ( y wrD   r   r#   r$   tiled_nested_shapes     r%   r&   z1FragmentedArray.transfer_tiled.<locals>.<genexpr>  0      4OPjJ/
5G5JJ4   ,/c              3  L   K   | ]  }t        |t              rd n|     ywr5  Nr   r#   r$   tiled_nested_stridess     r%   r&   z1FragmentedArray.transfer_tiled.<locals>.<genexpr>  *      6KL
1j)/CA/FF6r  c              3  b   K   | ]&  }t        |t              r|j                  fn|    ( y wrD   r   r,  s     r%   r&   z1FragmentedArray.transfer_tiled.<locals>.<genexpr>  r.  r/  c              3  L   K   | ]  }t        |t              rd n|     ywr1  r   r2  s     r%   r&   z1FragmentedArray.transfer_tiled.<locals>.<genexpr>  r4  r  rC  r   zVector length (z') must be a multiple of bytes, but has z bits>   r  r   rB  r  z!Only swizzled transfers supportedr  r  r  zUnsupported memory space: z*Only optimized transfers to SMEM supportedc           	     "   t        |       
j                  k(  sJ g }t        | 
j                   d        D ]  \  }}|dk(  r|j                  |       t	        j
                  |      }t        t        ||            D ]V  \  }\  }}t        j                  | 	|            }|dk7  rt        j                  | 	|            }|j                  |       X  t        |      t        t        t         
j                   d              k(  sJ |S )NrV   r   )r(   r   rG   rX   r   r   rW   r   r   r   r   r4   )idxsnew_idxsr   r   r   r[   r  r   new_idxr   r  r-  s            r%   expand_nested_dimsz:FragmentedArray.transfer_tiled.<locals>.expand_nested_dims  s	   Y&22222h&8&:R:R9R9T&UV 	#.#y
//#

229=!*3y++F!G 	#A~fKKQvY/'!Vkk'1T73G
//'
"		#	# ]c#c+=v?W?W>W>Y+Z"[\\\\or/   )memory_spacec                V    | j                         D cg c]
  } |       c}S c c}w rD   )r  )constsconstr   s     r%   r  z0FragmentedArray.transfer_tiled.<locals>.<lambda>  s    v}} Ge5 G  Gs   &c           
         g }d}D ]a  }t        j                  |      }| ||t        |      z    }|t        |      z  }|j                  t	        d t        ||      D                     c t        |      S )Nr   c              3  ,   K   | ]  \  }}||z    y wrD   r!   r   s      r%   r&   zMFragmentedArray.transfer_tiled.<locals>.mem_idx_to_reg_idx.<locals>.<genexpr>  s     "PTQ1q5"PrF   )r   r   r(   rX   r   rG   rY   )r   reg_tiled_idxr  r   r   dim_idxsr-  s         r%   mem_idx_to_reg_idxz:FragmentedArray.transfer_tiled.<locals>.mem_idx_to_reg_idx  s~    + 	RI44Y?+(S^";<(
c)n
$(


s"PS;5O"PP
Q		R
 ]##r/   c                R    j                  |D cg c]  }| |   	 c}      S c c}w rD   )r  )r   reg_idxsr  plans      r%   get_registerz4FragmentedArray.transfer_tiled.<locals>.get_register  s&     {{BgDMBCCBs   $c                ^    t        |      D ]  \  }}j                  || |   |      | |<     y rD   )rW   select_if_group)r   newrF  r[   r  rG  s        r%   update_registersz8FragmentedArray.transfer_tiled.<locals>.update_registers  s;     $H- 	FJAw..q$w-E$w-	Fr/   c                     t        t              st        d      t        d D              rt        d      j	                   j                         d         S )Nz@Base index computation only supported for trivial transfer plansc              3  8   K   | ]  }t        |      d k7    ywrK  r(   )r#   rB   s     r%   r&   zIFragmentedArray.transfer_tiled.<locals>.get_base_index.<locals>.<genexpr>+  s     7qs1v{7s   zTiling too complicatedr   )r   TrivialTransferPlanrV  r*   r   r  )r   rG  r-  r   s   r%   get_base_indexz6FragmentedArray.transfer_tiled.<locals>.get_base_index&  sX    $ 34#P  7$677#$<=
=$$W^^%5a%899r/   )r8  Sequence[ir.Value]r   zlist[ir.Value])=r   r   r  rv  rc  r  r)   rY   rH   r   r  rT   rw  r   r*   r   r   rG   r   r   r   r(   	itertoolschainfrom_iterabler   r   r   ra  rb  r  r   r   r   r   r   r<  r   is_smem_refrV  plan_tiled_transferrP  r   r   r   r   r   r   r  r   r   r  tile_index_transformsdotr  r   r   ru  rl  getelementptrr  )Ar  r  r  rH   r  r  ref_tiling_shape
ref_tilingr  ref_logical_shapenested_ref_shapenested_ref_stridestiles_shapetiles_stridesr$   
warp_shapewarp_strides
lane_shapelane_stridesr   element_bitstransfer_bytesswizzle_tile_transfersswizzle_group_transfersswizzle_groups_per_blockswizzle_block_transfersr  transfer_dtypellvm_memory_spacerA   tiles_strides_transferdyn_tiled_stridesr;  r   r   r   r  
_as_constsswizzle_maskr   r  const_offsetconst_offset_swizzleconst_offset_no_swizzleoffset_pre_swizzleswizzle_groupswizzle_bitsr   reg_ptroffset_no_swizzlerD  r   rF  rH  rL  rQ  r   r   rG  r#  r%  r  r-  r3  r   sA     `                                                     @@@@@@@@@r%   r
  zFragmentedArray.transfer_tiledO  s    , 	EA]]F]]388$FE{{Q<=={{a'V\\*:*;<=)+,J224NK'44U6<<5HIIeS/0A/B C'',g/    '( 
   '( 
 06/O/O,0,,
  -d37KM_3`.ab 
   F--.$6 
 )*K-.M^v++^f.J.J^FL]L]^ 6 c+a.11k!nM!$4 55mA	6
 y44[ABK66}EFMioo33 4TZTdTd4  J 	55 6PVP`P`6  L ioo33 4TZTdTd4  J 	55 6PVP`P`6  L ((M=='L$)Q.M? +#l235:  $l2q8N'':;;  >1!^3&"}69PP	||u%,!*;>>..|<j}}((-)9:Fn}}((-)95An"			6	"3F4G4G3HIJJ	a	!"NOO 
}
l
l
w	d !"d:GHQa=0HH
 .. &":":!:!<=
 	
!}
   -- 263F3F3H IK\]K-- 263F3F3H IK\]KK5J


3->
?CGJ*-CCLJJ, 9D

1K1KLAAhKLMgVVG%;<l *L8 ,/C C ::
dkk*-A"BC kk
++(!,C*D
E
$
%m ZZq1G/HIlzz,l;f##C&>Bg++j1H&IJ##G.?-@.Qg$ 6=^^5EFc$S)FhF&. D 08 	F: *NGCCs9DE I@ MB Gs8   R
\\"1\\%C\-\
<D\\#8\c                    | j                   | j                  j                  | j                  f}t	        | j                  j
                        |fS rD   )r  r  rH   r  r   r  )r+   auxs     r%   tree_flattenzFragmentedArray.tree_flatten0  s:    
++t~~++T^^
;C##$c))r/   c                x    |\  }}}t        j                  |t              j                  |      } | |||      S )Nr  r  )r   r  r  r  )re  r{  r  r  r  r  r  s          r%   tree_unflattenzFragmentedArray.tree_unflatten4  s8    #& FIy

>8@@KI)V	JJr/   )r  r  r  r  r  r  )r  r:  r  r  rY  
int | Noner   r  rD   )r  r  r   r  )r  r9  rH   r   r  r   r  zFragmentedLayout | Noner  r  r   r  r7  )r   r9  )r  r  r   r  )r  r  r   r  )r   r  )r  r6  r   r  )r  z$str | Callable[[ir.Value], ir.Value]r   z(Callable[[ir.Value, ir.Value], ir.Value])r  r9  r  r  r   r  )r   r  r  r  r   None)rF  r9  r  r  r   r  )r  z.str | Callable[[ir.Value, ir.Value], ir.Value]r}  zint | Sequence[int]r  zir.Value | Noner   r  )rH   r   r   r  )r  r  r   r  )
rH   r   r  r  r  zCallable[..., ir.Value]r  r  r   r  )r  z;Callable[[ir.Value, tuple[ir.Value, ...]], ir.Value | None])r  r5   r   r  )r  ir.Value | utils.MultimemRefr  r   r  r6  r   r  )r  N)
r  zutils.MultimemRefr  z!TiledLayout | WGStridedFragLayoutr  zutils.MultimemReductionOpr  r   r  r  )r  r:  r  r   r  r   r  r  r  r6  r   r  )r  r:  )T)r  r  r  r  r  r6  )r  r  r  r  r  r  r  r6  r  z-Callable[[ir.VectorType, ir.Value], ir.Value]r  r6  r   r  )r  r:  rY  r   )
r  r:  r  r  r  r   rH   r   r  r6  )Tr   r   r   r  fieldr  r   r  r|  r  r  staticmethodr  r?  rH   r  r  r  r(  r,  r0  r3  r7  r9  r=  rB  rE  rH  rP  rU  rX  r]  r_  rb  rd  rg  rj  rm  ro  rx  r}  r  r  r  r  ru  r  rd  r  r  r  r  r  r  r  r  r  r  r  r  r  rq  r(  r  r  r  r  r  r  r  r  r  r  r  r  r  rD  r   r  r   r  r
  r|  r~  r!   r/   r%   r  r    sz    ,+++7)Z7
2" 2"  	2"
 2"h 
  $!KK 	K
 K K K6  $BF4? ( 
 )-	  $  &	   > 	" 	" " "k\ 9=B*5BBH.
A
1D
-
.
.

@   #( * $) + #( * $) + #(  #(  $)  %*  %1%/% %P >B/:,@&6P =Aff.9ffX "&	\
8\  \ 	\
 \| $/LG<.><<|
   $  "	   $ 8]
E8]t; BDW[/-/;>/PT//6  #   0  +	 
      D  # 	
     4,   $!-AE#''K 'K
 'K 'K 'K ?'K !'K 'K 'KR 2& 2&h  ^D^D^D ^D 	^D
 ^D ^D@* K Kr/   r  .r	   IndexTransformc                  (    e Zd ZU ded<   ddZddZy)TransferPlanztuple[IndexTransform, ...]rX  c                    t         )zSelects the value corresponding to the group of the current thread.

    The argument must be of the same length as tile_index_transforms.
    rV  r+   group_elemss     r%   r  zTransferPlan.selectA  
    
 r/   c                    t         )zReturns `new` if the current thread belongs to the given group and `old` otherwise.

    group_idx must be between 0 and len(tile_index_transforms) - 1.
    r  r+   	group_idxoldrK  s       r%   rJ  zTransferPlan.select_if_groupH  r  r/   Nr  rR  r   r:  r  r   r  r:  rK  r:  r   r:  )r   r   r   r   r  rJ  r!   r/   r%   r  r  >  s    33r/   r  c                  ,    e Zd Zed        ZddZddZy)rP  c                
    d fS )Nc                    | S rD   r!   r{  s    r%   r  z;TrivialTransferPlan.tile_index_transforms.<locals>.<lambda>T      a r/   r!   r6   s    r%   rX  z)TrivialTransferPlan.tile_index_transformsR  s
    >r/   c                ,    t        |      dk(  sJ |d   S )Nr   r   rO  r  s     r%   r  zTrivialTransferPlan.selectV  s    {q   q>r/   c                    |dk(  sJ |S r[  r!   r  s       r%   rJ  z#TrivialTransferPlan.select_if_groupZ  s    >>Jr/   Nr  r  )r   r   r   r?  rX  r  rJ  r!   r/   r%   rP  rP  P  s     r/   rP  c                  V    e Zd ZU ded<   ded<   ded<   ded<   ed        ZddZdd	Zy
)StaggeredTransferPlanr   staggerrj   r  r:  
group_predc                6      j                   d fd}d |fS )Nc                b    g | d  |    j                   z   j                  z  | dz   d  S r,  )r  r  )r   rj   r+   s    r%   rotatez;StaggeredTransferPlan.tile_index_transforms.<locals>.rotatei  sK    t93x$,,.$));>A#')n r/   c                    | S rD   r!   r{  s    r%   r  z=StaggeredTransferPlan.tile_index_transforms.<locals>.<lambda>m  r  r/   )r   r   r   r   )rj   )r+   r  rj   s   ` @r%   rX  z+StaggeredTransferPlan.tile_index_transformsf  s    
((C   r/   c                p    t        |      dk(  sJ t        j                  | j                  |d   |d         S )NrA  r   r   )r(   r   r  r  r  s     r%   r  zStaggeredTransferPlan.selecto  s4    {q   <<QQHHr/   c                |    d|cxk  rdk  sJ  J |dk(  r||gn||g}t        j                  | j                  g| S )Nr   r   )r   r  r  )r+   r  r  rK  sidess        r%   rJ  z%StaggeredTransferPlan.select_if_groups  sH    	Q#q.S#JsCjE<<0%00r/   Nr  r  )r   r   r   r   r?  rX  r  rJ  r!   r/   r%   r  r  _  s4    ,
()! !I1r/   r  c	                    !"# t         j                  j                  d      fd}	d|z  !d|z   ||z  dz  dk(  sJ ||z  dz  }
t        j                  }t        j                  d t         |||       | ||            D         }!|z  r||k  rt        d|      |
t        d	z  kD  rt        |t        dz  kD  rt        t        t        |
      }t        t        |z  z  |dz  |z  t        |
|z  d
      t        z  "t        j                   |d      }d|t#        d |D              <   |j%                  "      #t        j&                  t)        t        j*                  |       |       ! "#fd} |d       s
t-               S fdt/        t                     D        }|D ]=  }dD ]4  }t        j0                  t              dddf   }||z  dz  t3        d
 |         D ]  }t        j4                  t7               t        j8                        ||<    fd} ||      rEt;        j<                  t?        j@                          |	t                    }t;        j<                  t;        jB                  | |	|             |	d            }t;        jD                  t:        jF                  jH                  | |	d            }tK        || |   |      c c c S  7 @ t        d      )a  Plans the tiled transfer in a way that avoids SMEM bank conflicts.

  Note that while xyz_shape length should always match the length of
  xyz_strides, we do not require the iteration spaces of tiles/warps/lanes to
  have the same rank.

  Arguments:
    tiles_shape: The nd-iteration space over tiles.
    tiles_strides: The memory strides (in elements) for each tile dimension.
    warp_shape: The nd-iteration space over warps in warpgroup.
    warp_strides: The memory strides (in elements) for each warp dimension.
    lane_shape: The nd-iteration space over lanes in a warp.
    lane_strides: The memory strides (in elements) for each lane dimension.
    vector_length: The length of a single transfer.
    element_bits: Element bitwidth.
    swizzle: The swizzle pattern length.
  r   c                0    t        j                  |       S rD   r  r  s    r%   r  z%plan_tiled_transfer.<locals>.<lambda>  s    sA& r/   r  i   rC  r   c              3  0   K   | ]  \  }}|d kD  r|  ywrK  r!   )r#   rA   r$   s      r%   r&   z&plan_tiled_transfer.<locals>.<genexpr>  s%      "
1a 
Q "s   zFailed to prove that vector transfers don't cross swizzle tile boundaries. This check is incomplete, and does not guarantee that this is a user error, but it might be. transfer_alignment=r   r   FTc              3  T   K   | ]   }|d k(  rt        d d      n
t        d       " yw)r   r   Nr  ri   s     r%   r&   z&plan_tiled_transfer.<locals>.<genexpr>  s%     OQa%1+U4[8Os   &(c                   t        j                        }t        j                  t        j                  |            }t        j
                  t        j                  |d      d      } | |      }|j                  d   dt        hv sJ t        j                  |      }|z   }|j                  d   t        k(  sJ |z  dz  z  }|z  }||z  z  z  }|j                  d      }	|	j                  dd      }	t        |	      D ]O  \  }
}|
d d |f   }
t        j                  |
d      }
t        j                  |
ddd f   |
dd df   k(        }|sO y y)	Nr   rw   r  r   )r}  .TF)r   r   r   unravel_indexarangeexpand_dimsstackrH   r   rY  r  swapaxesrG   sortr*   )tile_idx_transform	num_tiles	tile_idxslane_tile_idxlane_tile_offsetsr=  swizzle_groupsrw  
lane_bankswavefront_banksbanksmaskrepeatselems_per_banklane_offsets_in_tile	num_banksnum_wavefrontsr  swizzle_group_elemsswizzle_tile_elemsr`  ra  wavefront_laneswavefront_masks                r%   has_bank_conflictsz/plan_tiled_transfer.<locals>.has_bank_conflicts  se   		+&I  9!5{CIrxx	15q9I&y1Mq!a^333}m<"66G==	)))!44BGN!$66L\)n<	IJ ((^_MO%..q!4O?N; tAtGnegge"%euS!"W~sCRCx89g	 r/   c                    | S rD   r!   )r   s    r%   r  z%plan_tiled_transfer.<locals>.<lambda>  s    X r/   c              3  P   K   | ]  \  }\  }}|d kD  r|t         z  z  r|  ywrK  )
SMEM_BANKS)r#   r[   rA   r$   r  s       r%   r&   z&plan_tiled_transfer.<locals>.<genexpr>  s4      Av1	
Q1
^34 r(  )r   rA  r   rC  r  NrA  c                    | z  z   z  S rD   r!   )r   
lane_groupr   r`  s    r%   r  z%plan_tiled_transfer.<locals>.<lambda>  s    v
':!:k I r/   zBFailed to synthesize a transfer pattern that avoids bank conflicts)&r   r   r   rS  rT  r   gcdrG   r)   SMEM_BANK_BYTESrV  rd  r  r  r   r   r  rY   r  rY  r   r   rP  rW   r  rw  zerosr(   int64r   r   r   r  r   r  r  r|  r  )$r`  ra  rb  rc  rd  re  r   rf  r  r   rg  rT  transfer_alignmentsmem_bank_bytes	lane_maskr  candidate_dimsrj   group_stridelane_idr  	transformr  r  r  r  r   r  r  r  r  r   r  r  r  r  s$   ``      `                @@@@@@@@@@@r%   rW  rW  y  s   8 	##B'#&! </ \1		%	)Q	..	.!L0Q6. //%xx "
|\
:
Z
4"  --
-
-
	G3E2G	I  o))
Oa''
8/O>?)#a'L8.~8!<.//ggj%()SW)EO,OOP$$^_E.RZZ%< =|L 6 
5	6  "3}k#BC.  c( 		)$QW-g|+q0j 1k#./ '#k*BHH5sI	!),[[!1!1!3Qy\B(kk%++h,"H!A$O)zz%"5"5"8"8)QqTJ*&sK,j & 	J	 r/   c                b    t        j                  | |t         j                  j                        S N)fastmath)r   r.  FastMathFlagscontractr'  r  s     r%   r.  r.    !    	Aq5#6#6#?#?	@@r/   c                b    t        j                  | |t         j                  j                        S r  )r   r;  r  r  r  s     r%   r;  r;    r  r/   c                b    t        j                  | |t         j                  j                        S r  )r   r6  r  r  r  s     r%   r6  r6    r  r/   c                    y rD   r!   )r'  r  arrayss      r%   optimization_barrierr    s     r/   c                     y rD   r!   )r'  s    r%   r  r  !  s    r/   c                 
   t         j                  j                  d      fd}g }g }g }| D ]  }|j                  j                  d   j
                  }|j                  }t         j                  j                  |      s|k(  rt         j                  j                  |      rt        j                  |      j                  \  }|j                  j                  D 	
cg c]H  }	t        |      D ]8  }
t        j                  |	g t         j                  j                  |
g            : J }}	}
nt!        |j                  j                        }|k(  rdnd}nQt#        j$                  |      dk  r#dt#        j&                  |      z  }t         j                  j                  |      st)        |j                        t        j                  |      j                  \  }||z  rt)        |      ||z  }t         j                  j                  |f      }t        |      D 	cg c]g  }|j                  j                  D ]L  }	t        j                  t        j*                  ||	      g t         j                  j                  |g            N i }}}	d}nt)        |j                        ||z  }||d   j
                  gt-        |      z  z  }||gt-        |      z  z  } d}d	j/                  g d
 |D        t1        t2        t        t-        |                        }t-        |      dk(  r"t5        j6                  |d   |||dd      }|g}nt         j8                  j;                  dd	j/                  t1        t2        |             d      }t5        j6                  ||||dd      }t=        |      D cg c]  \  }}t5        j>                  |||g       }}}t         j                  j                  d      g }tA        |      }| D ]   }|j                  jB                  }|j                  j                  d   j
                  }t         j                  j                  |      rt        j                  |      }tE        jF                  |ftH              }t        |      D ]2  } |||      }	|	j
                  |k(  sJ |	j
                  |f       |	||<   4 |jK                  tM        |jO                  |j                  j                        |jP                  |jR                               # t-        |       dk(  r|d   S |S c c}
}	w c c}	}w c c}}w )zActs as an optimization barrier for LLVM.

  Passing arrays through this function will make sure that they are computed
  before any side-effecting operations that follow this barrier.
  r   c           	        t         j                  j                  |      st        |       }|j                  |k(  sJ |S t        j                  |      dz  }t         j                  j                  |f      }t        j                  |      }t        |      D ]L  }t        j                  t        |             }t        j                  ||t        j                  |            }N t        j                  ||      S r  )r   r   r   r  rv  r   rb  r   r   r  rw  r  r?  r   r  r   )	regs_itr  
result_regnum_i32_regs
i32_reg_tyr  i_elemr  r   s	           r%   _repackz%optimization_barrier.<locals>._repack.  s    ==##F+=j__&&&>>&)R/L""L?C8J
//*
%C% FLLd7m,csCV)DEcF >>&#&&r/   r   r  r  r  r   r1   ,c              3  &   K   | ]	  }d |z     yw)=Nr!   )r#   r   s     r%   r&   z'optimization_barrier.<locals>.<genexpr>m  s     *Qq*r'   r   T)asm_dialecthas_side_effectsz!llvm.struct<(z)>r  r  )*r   r   r   r  r  rv  r  r  r   r   rH   rw  r   r  r  r   r   r   rb  	bytewidthrV  r  r(   r3   r4   r5   r   r  r%  r&  rW   r'  iterr  r   r  r  rX   r  r  r  r  )r  r  r   
reg_dtypesreg_constraintsarrayr  r  r  r  pos
array_regsreg_constraintreg_packingr  r  r[   ptxall_reg_constraintsresult_elem	struct_tyresult_structresultsr  num_regsr  i_vregr   s                              @r%   r  r  &  s    	##B'#' 
$*/  ):e__!!!$))FE	zzU#u|		!	!&	)MM&)//	 ++
 W~
  NN!# " 4 4 8 8# ?


 
 %//../
#slsn			#//k]]%%f-!%"2"233--'--iw	;	!'**+l==$$l_c:j & __))  ..nnZ-! 2266s;j  n 0 011JD:a=%%&Z88J'#j/99OS):T 	#S*/*SSeC<P6Q-RS 	_ //1tS"5K =D
#c:"678;I OO41M "*-Au 	%4D 
 	##B'#'J' e##H__!!!$))F	}}'}}V$fHHh[7M/ "GV$cXX3#((F!33!mF" NN$,,U__-B-BC %	
$ 6{a'4W4g
*Hs   AU#A,U:"Uc           	        d|z  |z  }| dz  dk(  rd\  }}n3| dz  dk(  r|dz  rt        d      d\  }}n|dz  rt        d      d\  }}| |z  } ||z  }t        d	d|z  t        z        }d
x}}	|d	k  rad	|z  }
|
d
kD  r%|dz  dk(  r|
dz  }
|	dz  }	|dz  }|
d
kD  r	|dz  dk(  r|
d
kD  r%| dz  dk(  r|
dz  }
|dz  }| dz  } |
d
kD  r	| dz  dk(  r|||	z  z  }d|z  |k  rt        d      |dz  |z  }||z  dk(  sJ |t        z  d|z  kD  rd
}nd|z  |t        z  z  }d|z  }t        t	        ||z  dz  ||	z  |z  f|dz  |	|z  fd|f||f|ff      dddd      j                         S )NrC  r   r   )r   r   rA  z&Number of tiles is not a multiple of 4r  )r   r   r  r   z5Element types with bitwidth so large aren't supported)ir  )r  r  r  rO   rw   F)r   r   r   r   )rV  rd  r   r   r   r]   )	row_tiles	col_tilesr  rb  swizzle_elemswarp_row_tileswarp_col_tilesbytes_per_threadlane_row_tileslane_col_tilesmax_scale_upr   steps_per_tiletile_rows_per_steps                 r%   tiled_copy_smem_gmem_layoutr	    s=    g+)-]a%)"NN1}1} HII%)"NN1} HII%)"NN))Q[I56$%%.>))L

y1}1qlnAoi 
y1}1 
y1}1qlnAoi 
y1}1 77	H$
U
VV"Q&(2-		&!	++	+Y]!22N&=9+DENN* 
.2N^4SVc4cd!>M#AB- !=1  
 LNr/   c           
        t        j                  | j                        }t        j                  |j                        }t        j                  |j
                        t        j                  |j
                        k7  r%t        d|j
                   d|j
                         |j                  |j                  k7  r%t        d|j                   d|j                         t        j                  |j                        }t         j                  j                  |j                        rdnd}t        j                  |      t        j                  |      k7  r}t        j                  |      r||}}n||}}|j                  |j                  dz   k7  r&t        d|j                   d	|j                   d
      d|z  |z  }	|j
                  dd d|	gk7  rt        d|d|	 d      t        j                  |j
                  d|	f      }
t!        |j
                        |
k7  r+t        d|
 d|j
                   d|	 d|j
                         t#        g |j
                  dd || }t        j                  |      r-t$        j'                  | |||      }|j)                  |d       yt$        j+                  | ||d      }|j-                  ||       yt        d| j                   d|j                         )a$  Copy the data from the src reference to the dst reference.

  Exactly one of src/dst should be in SMEM, while the other should be in GMEM.
  The SMEM reference is expected to be tiled into (8, swizzle_elems) (as it
  would for MMA), and so should have a rank larger by 2 than the GMEM ref.
  zSSource and destination must have the same number of elements, but got source shape z and destination shape zLSource and destination must have the same element type, but got source type z and destination type FNrA  zqSMEM reference must have a rank larger by 2 than the destination reference (due to 2D tiling), but got SMEM rank z and destination rank rN   rC  rO   zFor swizzle=z!, expected SMEM tiling to be (8, r2   z&Expected SMEM reference to have shape z	 (tiling z by (8, z)), but got r  )r  r  )r  )r  r  r  zUnsupported copy: r4  )r   r  rv  r   r   rH   r)   rc  r   rb  r   r   rV  r  rV  rK   rY   r	  r  r   r  r  r  )srcdstr  src_tydst_tyrb  r  smem_tygmem_tyr   expected_src_shaper  r   s                r%   
copy_tiledr    s    =="&=="&	YYv||		&,, 77
	&=fll^	M  F///
	++, - 	" 
 ^^F//0( ~~001D1DEe4)
v%"3"3F";; wgwg||w||a''>>Ell^ L&||nA/ 
 K8+M}}RSa//
;M?!
L  ))'--!]9KLW]]1123E2F Gmm_H]O<P  ) 	r"	&(0F  ''W	RX'Yd
.  ))#6]b)cd
sG$
0
$sxxjIJJr/   )r   zSequence[T]r   zIterable[tuple[int, T]])rH   r   )r   r   )r`  r<  ra  r<  rb  r<  rc  r<  rd  r<  re  r<  r   r   rf  r   r  r   r   r  )r'  r:  r  r:  )r'  mgpu.FragmentedArrayr  r  r  r  r   zSequence[mgpu.FragmentedArray])r'  r  r   r  )
r  r   r  r   r  r   rb  r   r   r   r  )r  r:  r  r:  r  r   )Tr   
__future__r   collections.abcr   r   r   r  r=  rS  r   typingr   r   r	   r
   r   r   jaxjax.experimental.mosaic.gpuexperimentalmosaicr   ra  jaxlib.mlirr   jaxlib.mlir.dialectsr   r   rL  r   r   numpyr   r1   r   r   r\  r   r   r  r  r   	dataclassr   r   r   r   rF  rH  rX  r  WGMMA_COL_LAYOUTr  rD  r  WGMMA_LAYOUT_8BITr  r  r  r  r  r  TCGEN05_COL_LAYOUTr  TMEM_NATIVE_LAYOUTTMA_GATHER_INDICES_LAYOUT	tree_utilregister_pytree_node_classr  rY   r   r  r   r  rP  r  rW  r.  r;  r6  r  r	  r  r!   r/   r%   <module>r&     s   $ " 8 8     D D 
 * *  & $ % 2 ' '   CL%%	#y0 
	GG d#S S $Sj d#  $ d#m m $m`	
 d#2 2 $2j d#GN GN $GNR %'::[H  
<!}!}b!	  
%&:a=!	 ( 
+,	 %
12	 &  
./	 & %
45	  %
./	 6 &
56	  
,-	 (
67	  !
,-:A&'	  !
"$"B'	 $ (* 
 (
="~	  ))E$d;g"K g"K < *g"KTE %eCHo%6c3h%GH	 H8 $ d#,  $ d#1L 1 $12PP P P  	P
 P  P P P P PhAAA 
 "	
 $ 
 
 
t5n<<"<-0<<?<<~8Kr/   