
    ukiA             
      6   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddl	Z	ddl
mZmZmZ ddlZddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlm Z! ddl"m#Z$ ddl%m&Z' ejP                  jR                  Z)ejP                  jT                  Z*ejP                  jV                  Z+ejP                  jX                  Z,ejZ                  Z.ej^                  Z/ej`                  Z0ejb                  Z1ejd                  Z2ejf                  Z4ee.ejf                  f   Z5e6ejf                  df   Z7eejf                  e8f   Z9eeejt                     ef   Z;eee.   ef   Z<dZ=d Z>dOdZ?	 	 	 	 	 	 dPdZ@	 	 	 	 	 	 dQdZA	 	 	 	 	 	 dRdZB	 	 	 	 	 	 	 	 dSdZC	 	 	 	 	 	 	 	 dTdZD	 	 	 	 	 	 	 	 	 	 dUdZEd ZFd ZG e	j                  eGd       ZId ZJ G d dej                        ZLdVd ZM G d! d"      ZNd# ZOej                   ej                  d$%       G d& d'eN                    ZR	 	 	 dW	 	 	 	 	 dXd(ZS e	j                  ej                  j                  d) *      ZVd+ ZWd, ZX	 	 	 	 	 	 dYd-ZY	 dZ	 	 	 	 	 	 	 d[d/ZZ	 	 	 	 	 	 dYd0Z[ G d1 d2      Z\ e]d3 d4 d5 d6 d7 d8 d9 d: ;      Z^ e]d< d= d> d? d@ dA dB dC ;      Z_d\dDZ` e`e^      Z^ e`e_      Z_d\dEZadFdFd.d$ddGdHZb	 	 	 	 	 	 	 	 d]dIZcd^dJZddFdFd.dddd$d.dK	 	 	 	 	 	 	 	 	 	 	 	 	 d_dLZedFdFd.dMdNZfy)`z>Module for emitting custom TPU pipelines within a Pallas call.    )annotations)Sequence)contextmanagerN)AnyLiteralUnion)core)lax)	tree_util)util)
primitives)helpers)tpu_info)types)pallas.)      c                \   t               }t        j                  |      }g fd}	 t        j                  || |d        D cg c]
  }||u rdn| c}t              |j                  k(  sJ t        j                  |      S # t        $ r t	        d|  d| d      dw xY wc c}w )z/Broadcast a prefix pytree to a given full tree.c                j    j                  | gt        j                  |      j                  z         y N)extendr   tree_structure
num_leaves)ixbroadcast_leavess     Z/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/jax/_src/pallas/mosaic/pipeline.py
add_leavesz(_broadcast_pytree_to.<locals>.add_leavesC   s,    	
i&&q)4446    c                
    | d u S r    r   s    r   <lambda>z&_broadcast_pytree_to.<locals>.<lambda>H   s
    d r   is_leafzCannot broadcast tree z to full tree structure .N)objectr   r   tree_map
ValueErrorlenr   tree_unflatten)from_pytree	to_pytreeproxytreedefr   ar   s         @r   _broadcast_pytree_tor1   >   s    
(%$$Y/'6Ez;	24
 :JJAa5jda/J		'"4"4	44	4		!	!'+;	<< 
 E
-k] ;//6iq: ;@DEE Ks   B	 B)	B&c                 >    t        j                         j                  S r   )r   get_tpu_info
generationr!   r   r   _get_tpu_generationr5   Q   s    				 	+	++r   c                   t        |       dk  rt        d|       t        |d      sdt        |       z  S | dd | dd }}|\  }}d|j                  j                  z  }t
        d   }d	t        t               dk        z   |z  }|t        ||      k  r|dz  }|t        ||      k  rg d
t        |      z  |t
        d	   S )ai  Compute a tiling for the given shape and type.

  For a n-dimensional shape, returns (8, 128) for the last 2 dimensions
  and 1 for the leading n - 2. For example, (256, 256) -> (8, 128) and
  (2, 3, 128, 128) -> (1, 1, 8, 128).

  Types are not required to have a dtype, so for such types we return None for
  all dimensions because their tiling is unknown.
     z-Shape must have at least 2 dimensions: shape=dtyper   N   r      r;   )	r*   r)   hasattrr8   itemsize_TILINGintr5   min)	shapetyleading_dims
final_dimssecond_minor_packing
max_tilingsecond_minor_tilings	            r   _make_tilingrK   U   s     	Z!^
EuhG
HH	W	SZ"3BZrs
, /,"""'qz*S!4!6!:;;wFc,
;;1 	c,
;;	E4#l##	E%8	E'!*	EEr   c                L    t        | t              r
| |z  dk(  r| S | | |z  z
  |z   S Nr   
isinstancer@   )smultiples     r   _round_up_to_nearest_multiplerR   u   s2     3ALA-H	
Q\	H	$$r   c                p    t        j                  | |z  |      }t        |t         j                        sJ |S )z(Make a DMA slice with mosaic size hints.)pldsrO   Slice)idxsizeouts      r   _make_block_dsrZ   ~   s1     	cDj$#	C	""	"	*r   c                z   || z  }||z  x}dk(  rt        j                  ||      S |t        d      ||z  dk7  rt        d|d|      t        j                  ||      }| |dz
  k(  }t	        j
                  |t        ||z  |      |      }t        j                  ||      }t        j                  | |z  |      S )Nr   z3If tiling is None, block_size must divide dim_size.*Block size must divide tiling: block_size=	, tiling=r;   )rT   rU   r)   cdivjnpwhererR   multiple_of)	block_index
block_sizedim_sizetilingblock_startdim_rem
num_blocksis_lastrounded_sizes	            r   _create_blocked_slicerk      s     [(+J&&g1,55j))^
J
KK&A
Bzm:fYO
PPwwx,*:>)'#Gj$8&A,
 f5,	{Z'	66r   c                   |||z  dk7  rt        d|d|      |t        j                  | |      S | |z   |kD  }|| z
  }t        j                  |t        ||      |      }t        j                  ||      }t        j                  | |      S )Nr   r\   r]   )r)   rT   rU   r_   r`   rR   ra   )slice_start
slice_sizerc   rd   re   is_oob	remainingrj   s           r   _create_bounded_slicerq      s    
 J/14
Bzm:fYO
PP ^55j)) #h.&$)#Iv6,
 f5,	{L	))r   c                .   |xt         j                  d x\    t        | |j                  ||      S  xt        d x\    t        | |||      S  xt         j
                  d x \    | }|j                  }t        |||||      S  xt         j                  dxP\  } t        | t         j                        st        d      | j                  }| j                  }t        |||||      S  x | S xt         j                  d x\    | S   	 t        d|       )Nr!   zDMust return a pl.ds from the index_map for a BoundedSlice dimension."Unsupported block dimension type: )rT   Blockedrk   rc   r@   Elementrq   BoundedSlicerO   rV   r)   startrX   Squeezed)rb   rc   rX   re   rf   rm   rn   s          r   _make_block_slicery      s(    		";
0E0EtVTT 
	";
D&II 
	k((j"
z:tV  
 
%	$RXX.
 	
  %%k##j"
z:tV  
% 
	 	 		
;J<HIIr   c                    t         j                  j                  t         j                  j                  d | |            }t	        j
                  d |d      S )z+Dynamic index-tuple comparison calculation.c                    | |k7  S r   r!   r   ys     r   r#   z _tuples_differ.<locals>.<lambda>   s
    !q& r   c                    | |z  S r   r!   r|   s     r   r#   z _tuples_differ.<locals>.<lambda>   
    q1u r   Fjaxtreeleavesmap	functoolsreduce)xsysdifferencess      r   _tuples_differr      s<    -@"b IJ+			,k5	AAr   c                     t         j                  j                  t         j                  j                   fd||            }t	        j
                  d |d      S )zBDynamic reduce_all calculation with a user-provided comparison op.c                     | |      S r   r!   )r   r}   binops     r   r#   z"_tuple_all_binop.<locals>.<lambda>   s    %1+ r   c                    | |z  S r   r!   r|   s     r   r#   z"_tuple_all_binop.<locals>.<lambda>   r   r   Tr   )r   r   r   r   s   `   r   _tuple_all_binopr      s=    -Er2 NO+			,k4	@@r   c                    | |k  S r   r!   r|   s     r   r#   r#      s
    QU r   c                f    t        j                  dt         j                        }| D ]  }||z  }	 |S )zDynamic grid size calculation.r;   )r_   arrayint32)gridrX   dims      r   
_grid_sizer      s3    	1cii	 $ cCKD	+r   c                  D    e Zd ZdZdZdZdZdZdZe	d        Z
e	d        Zy	)

BufferTypez5Buffer type for the arguments to an emitted pipeline.r;   r7      r:      c                d    | t         j                  t         j                  t         j                  fv S r   )r   INPUTACCUMULATORINPUT_OUTPUTselfs    r   is_inputzBufferType.is_input   s/      r   c                d    | t         j                  t         j                  t         j                  fv S r   )r   OUTPUTr   r   r   s    r   	is_outputzBufferType.is_output  s/      r   N)__name__
__module____qualname____doc__r   r   r   r   MANUALpropertyr   r   r!   r   r   r   r      sD    =
%&+,&   r   r   c                    d | j                   t        d      t        fd| j                   D              }t        d |D              S )z+Get the block shape for a given block spec.c                    | xt         j                  dx\  } |S  xt         j                  dx\  } |S  xt         j                  dx\  } |S  xt        d x\    | S  x y xt         j
                  d x\    y   	 t        d|        )Nr!   rs   )rT   rt   ru   rv   r@   rx   r)   )bdrc   s     r   _get_dim_sizez'_get_block_shape.<locals>._get_dim_size  s    
!2::! "!2::! "&2??& '35	   "++-  =bTBCCr   zBlock shape must be specified.c              3  .   K   | ]  } |        y wr   r!   ).0r   r   s     r   	<genexpr>z#_get_block_shape.<locals>.<genexpr>  s     GM!,Gs   c              3  &   K   | ]	  }||  y wr   r!   r   r   s     r   r   z#_get_block_shape.<locals>.<genexpr>  s     =Qq}q=s   )block_shaper)   tuple)specblock_shape_nonesr   s     @r   _get_block_shaper     sL    D 

5
66Gd6F6FGG	=+=	==r   c                     e Zd ZdZedd       Zedd       Zedd       Zed        Zed        Z	ed        Z
ed        Zed	        Zd
 ZdddZdddZdddZdddZdddZdddZedd       Zed        Zd Zd Zd Zd dZy)!BufferedRefBasez$Abstract interface for BufferedRefs.c                    t               r   NotImplementedErrorr   s    r   r   zBufferedRefBase.spec$      

r   c                    t               r   r   r   s    r   buffer_typezBufferedRefBase.buffer_type(  r   r   c                     yNFr!   r   s    r   is_bufferedzBufferedRefBase.is_buffered,  s    r   c                .    | j                   j                  S r   )r   r   r   s    r   r   zBufferedRefBase.is_input0  s    $$$r   c                .    | j                   j                  S r   )r   r   r   s    r   r   zBufferedRefBase.is_output4  s    %%%r   c                <    | j                   t        j                  k(  S r   )r   r   r   r   s    r   is_accumulatorzBufferedRefBase.is_accumulator8  s    z5555r   c                <    | j                   t        j                  k(  S r   )r   r   r   r   s    r   is_input_outputzBufferedRefBase.is_input_output<  s    z6666r   c                <    | j                   t        j                  k(  S r   )r   r   r   r   s    r   	is_manualzBufferedRefBase.is_manual@  s    z0000r   c                    t               )Initialize slot indices.r   r   s    r   
init_slotszBufferedRefBase.init_slotsD  r   r   c                    t               )zAdvance the copy in slot.r   r   	predicates     r   advance_copy_in_slotz$BufferedRefBase.advance_copy_in_slotH  r   r   c                    t               )zAdvance the wait in slot.r   r   s     r   advance_wait_in_slotz$BufferedRefBase.advance_wait_in_slotL  r   r   c                    t               )zAdvance the copy out slot.r   r   s     r   advance_copy_out_slotz%BufferedRefBase.advance_copy_out_slotP  r   r   c                    t               )zAdvance the wait out slot.r   r   s     r   advance_wait_out_slotz%BufferedRefBase.advance_wait_out_slotT  r   r   c                    t               )%Load slot information into registers.r   r   s     r   
load_slotszBufferedRefBase.load_slotsX  r   r   c                    t               )%Save slot information from registers.r   r   s     r   
save_slotszBufferedRefBase.save_slots\  r   r   c                .    | j                   j                  S r   r   r   r   s    r   r   zBufferedRefBase.block_shape`      99   r   c                .    | j                   j                  S r   r   	index_mapr   s    r   compute_indexzBufferedRefBase.compute_indexd      99r   c           
         t        |dd       x}t        d| d      t        |      dk  rt        d      t	        ||      } | j
                  | }t        d t        || j                  ||d      D              S )	NrB   zType z does not have a type.r7   zMust use >1D values.c              3  B   K   | ]  \  }}}}t        ||||        y wr   )ry   )r   bibsssts        r   r   z0BufferedRefBase.get_dma_slice.<locals>.<genexpr>  s+      BB 	"b"a(s   Tstrict)	getattrr)   r*   r   rK   r   r   zipr   )r   src_tygrid_indices	src_shapere   block_indicess         r   get_dma_slicezBufferedRefBase.get_dma_sliceh  s    ` VWd33	<vh&<=>>
9~ 677)V,F&D&&5M  4++Yt
  r   c                
    ~~| S )DFor handling VMEM references, the pipeline aliases the existing ref.r!   r   
window_refindicess      r   bind_existing_refz!BufferedRefBase.bind_existing_ref  s    GKr   c                    | S r   r!   r   s    r   unbind_refszBufferedRefBase.unbind_refs  s    Kr   c                    t               )z8Returns a new BufferedRefBase with the given block spec.r   r   r   s     r   	with_speczBufferedRefBase.with_spec  r   r   N)returnpl.BlockSpec)r  r   r  boolT)r   r  r  r   )r   bool | jax.Arrayr  r   r   r  )r  z)Sequence[pl.BlockDim | int | None] | None)r   r  r  r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r!   r   r   r   r   !  s   ,          % % & & 6 6 7 7 1 1        ! !  =~
 r   r   c                    t        | t        j                        r+t        j                  | j
                  | j                        S t        j                  |       j                  S )z@Return the inner of a ref, or a ShapedArray for TransformedRefs.)rB   r8   )
rO   state_typesTransformedRefjax_coreShapedArrayrB   r8   r   typeof
inner_aval)refs    r   _ref_to_value_avalr    sK     
C33	4 #))< ::c?%%r   T)frozenc                  t   e Zd ZU dZ ej
                   ed            Zded<    ej
                   ed            Z	ded<   d	ed
<   d	ed<   d	ed<   d	ed<   d	ed<   d	ed<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   d	ed<   d Z
ed        Zed        ZedId       ZedId        ZedJd!       ZedKd"       Zedd#d$ef	 	 	 	 	 dLd%       ZedMd&       ZedMd'       ZedMd(       ZedMd)       Zed*        Zed+        ZdNd,Z	 dO	 dPd-Z	 	 	 	 dQ	 	 	 	 	 	 	 	 	 dRd.Zed/        Zed0        Z ed1        Z!ed2        Z"ed3        Z#ed4        Z$ed5        Z%ed6        Z&ed7        Z'ed8        Z(d9 Z)d: Z*d; Z+d< Z,dSdTd=Z-dSdTd>Z.dSdTd?Z/dSdTd@Z0dSdUdAZ1dSdVdBZ2dC Z3dD Z4dE Z5dF Z6dWdGZ7dH Z8y#)XBufferedRefav  A helper class to automate VMEM double buffering in pallas pipelines.

  Attributes:
    spec: pallas blockspec.
    buffer_type: enum indicating whether this is an input, output, or in/out
      accumulator buffered reference.
    window_ref: a multiple-buffer to hold the working and dirty buffers used
      to copy into and out of.  In the case of a BufferedRef targeting a VMEM
      reference, this simply points to the existing ref.
    accum_ref: accumulating buffer used by accumulator BufferedRefs.
    copy_in_slot: current slot to copy in for the working buffer.
    copy_out_slot: current slot to copy out for the working buffer.
    wait_in_slot: current slot to wait in for the working buffer.
    wait_out_slot: current slot to wait out for the working buffer.
    next_fetch_smem: Holds the next grid indices to fetch for lookahead. This
      is the SMEM backing buffer used to persist state between pipeline
      invocations.
    next_fetch_sreg: Holds the next grid indices to fetch for lookahead. This
      is the register state used to track the indices within the pipeline loop.
    sem_recvs: Multiple buffered semaphores for input DMAs.
    sem_sends: Multiple buffered semaphores for output DMAs.
    block_shape: passthrough property for the BlockSpec's block_shape.
    compute_index: passthrough property for the BlockSpec's compute_index.
    memory_space: passthrough property for the BlockSpec's memory_space.
    current_ref: points to the current working slice of the double-buffer.
    is_input: whether this BufferedRef acts as a pipeline input.
    is_output: whether this BufferedRef acts as a pipeline output.
    is_accumulator: whether this BufferedRef is an accumulator.
    is_input_output: whether this BufferedRef is an input/output without
      automatic accumulation.
    swap: Tracks whether the BufferedRef slots need to be swapped before next
      copy.
  T)static)metadatar  _specr   _buffer_typezArrayRef | Noner   	accum_refcopy_in_slotwait_in_slotcopy_out_slotwait_out_slotint | jax.Array | None_copy_in_slot_reg_wait_in_slot_reg_copy_out_slot_reg_wait_out_slot_regSequence[jax.Array] | Nonenext_fetch_smemnext_fetch_sregzSemaphoreTuple | None	sem_recvs	sem_sendsswapc                    | j                   r'| j                  dk  rt        d| j                         | j                  r(| j                   r| j                  dkD  rt	        d      y y y )Nr;   z%buffer_count must be at least 1, got r7   z7Buffer count >2 not supported for output buffered refs.)r   buffer_countr)   r   r   r   s    r   __post_init__zBufferedRef.__post_init__  sr    D--11$2C2C1D
E  ~~			d//!3!E
 	
 4	 r   c                    | j                   S r   r  r   s    r   r   zBufferedRef.spec  s    ::r   c                    | j                   S r   )r  r   s    r   r   zBufferedRef.buffer_type  s    r   c                    | j                   | j                  | j                  | j                  g}t	        d |D              S )z)Whether this buffer is multiple-buffered.c              3  $   K   | ]  }|d u 
 y wr   r!   r   s     r   r   z*BufferedRef.is_buffered.<locals>.<genexpr>  s     ,q},s   )r  r  r  r  any)r   slotss     r   r   zBufferedRef.is_buffered	  s>      1 1!3!35E,e,,,r   c                    | j                   duS )z9Whether this buffer allows lookahead for fetching blocks.Nr#  r   s    r   use_lookaheadzBufferedRef.use_lookahead  s     t++r   c                b    | j                   st        d      | j                  j                  d   S )z:Returns the number of buffers used for multiple buffering.zbuffer count is undefinedr   )r   r)   r   rB   r   s    r   r)  zBufferedRef.buffer_count  s.     233??  ##r   c                     t         S r   )r   r!   r   r   buffer_typeszBufferedRef.buffer_types  s    r   NFc	                >   t        |t        j                        r|nt        j                  d|      }	t	        |      }
|t
        j                  u r&t        j                  |	j                  |
            }nd}|j                  t        n|j                  }|t        t        t        fvrt        d|       ||u r9 | di d|d|ddd|d	dd
dddddddddddddddddddddddS |r|t        d      |	j                  |g|
      } | di d|d|d|j                  |      d|d	|j                  rt        dt        j                         ndd
|j                  rt        dt        j                         ndd|j"                  rt        dt        j                         ndd|j"                  rt        dt        j                         ndddddddddd|r1t%        |      D cg c]  }t        dt        j&                         c}ndddd|t
        j(                  u rdnt*        j-                  |f      d|t
        j.                  u rdnt*        j-                  |f      d|rt        dt        j0                        S dS c c}w )a8  Create a BufferedRef.

    Args:
      spec: pallas blockspec.
      dtype_or_type: dtype or aval for buffers. If an aval, the shape is
        ignored.
      buffer_type: enum indicating whether this is an input, output, or in/out
        accumulator buffered reference.
      needs_swap_ref: whether a swap slots tracker needs to be allocated.
      grid_rank: rank of the pipeline grid.
      use_lookahead: whether to enable pipeline lookahead.
      source_memory_space: The memory space of the backing source Ref.

    Returns:
      Initialized BufferedRef
    ){   i  )rB   Nz!Unsupported buffer memory space: r  r  r   r  r  r  r  r  r  r  r   r!  r#  r$  r%  r&  r'  z7grid_rank must be specified when use_lookahead is True.r<   r!   )rO   r  AbstractValuer  r   r   r   VMEM	from_typeupdatememory_spaceSMEMHBMr)   r   r_   uint32r   ranger   r   SemaphoreTypeDMAr   r  )clsr   dtype_or_typer   r)  needs_swap_ref	grid_rankr4  source_memory_spacerC   r   r  buffer_memory_space	buffer_tyrG   s                  r   createzBufferedRef.create   sB   B mX%;%;< 	!!*m<  #4(Kj,,,..!=>ii##+$1B1B 4s"33-.A-B
C  11 "  	
     ! ! " "     !" # ( 
9,E
 	
 ))<">+">)?i " )229= 	
 2=1E1EtD#**-4 2=1E1EtD#**-4 3>2G2GT3::.T 3>2G2GT3::.T ! ! " " + ;@; Q4cii0 04 $ 
 1 11  $$l_5'. 
 0 00  $$l_514 (6tD#((#5 4 <@5 s   8!Jc                J     | j                   ||t        j                  |fi |S r   )rL  r   r   rE  r   rF  r)  kwargss        r   inputzBufferedRef.input  s,    3::mZ--|?E r   c                J     | j                   ||t        j                  |fi |S r   )rL  r   r   rN  s        r   outputzBufferedRef.output  s,    3::mZ..@F r   c                J     | j                   ||t        j                  |fi |S r   )rL  r   r   rN  s        r   accumulatorzBufferedRef.accumulator  s,    3::mZ33\EK r   c                J     | j                   ||t        j                  |fi |S r   )rL  r   r   rN  s        r   input_outputzBufferedRef.input_output  s,    3::mZ44lFL r   c                .    | j                   j                  S r   r   r   s    r   r   zBufferedRef.block_shape  r   r   c                .    | j                   j                  S r   r   r   s    r   r   zBufferedRef.compute_index  r   r   c                0    t        j                  | |      S )z4Returns a new BufferedRef with the given block spec.r,  dataclassesreplacer   s     r   r   zBufferedRef.with_spec  s    t400r   c                0    t        j                  | |      S )N)r$  rZ  )r   
next_fetchs     r   with_next_fetchzBufferedRef.with_next_fetch  s     tZ@@r   c                    | }|t        j                  ||      }|t        j                  ||      }|t        j                  ||      }|t        j                  ||      }|S )z4Returns a new BufferedRef with the given slot index.)r  )r   )r  )r!  rZ  )r   r  r  r  r  new_bufs         r   with_slot_indexzBufferedRef.with_slot_index  sn     G##G|Lg ##GNg##G|Lg ##GNgNr   c                X   t        d | j                  D              }| j                  t        | j                  t              rJ | j
                  s| j                  j                  |   S | j                  r| j                  }n| j                  }| j                  j                  |g|   S )Nc              3  j   K   | ]+  }|'t        |t        j                        st        d        - y wr   )rO   rT   rx   slicer   s     r   r   z*BufferedRef.current_ref.<locals>.<genexpr>  s.      	Z2;;7 	ds   13)
r   r   r   rO   REFr   atr   current_copy_out_slotcurrent_wait_in_slot)r   buffer_sliceslots      r   current_refzBufferedRef.current_ref  s     !! L
 ':doos+KLL__--	))((__ 5 566r   c                X    | j                   | j                   }|S | j                  d   }|S )z8The cumulative number of copy_ins issued on this buffer.r   )r  r  r   vals     r   cumulative_copy_inzBufferedRef.cumulative_copy_in  9     )""c J a cJr   c                |    t        j                  | j                  t        j                  | j
                              S )z;Index in multiple buffer corresponding to the current slot.)r
   remrp  r_   rA  r)  r   s    r   current_copy_in_slotz BufferedRef.current_copy_in_slot  *     774**CJJt7H7H,IJJr   c                X    | j                   | j                   }|S | j                  d   }|S )z9The cumulative number of copy_outs issued on this buffer.r   )r   r  rn  s     r   cumulative_copy_outzBufferedRef.cumulative_copy_out  9     *##c J q!cJr   c                |    t        j                  | j                  t        j                  | j
                              S )z@Index in multiple buffer corresponding to the current copy slot.)r
   rs  rw  r_   rA  r)  r   s    r   rh  z!BufferedRef.current_copy_out_slot  *     774++SZZ8I8I-JKKr   c                X    | j                   | j                   }|S | j                  d   }|S )z8The cumulative number of wait_ins issued on this buffer.r   )r  r  rn  s     r   cumulative_wait_inzBufferedRef.cumulative_wait_in  rq  r   c                |    t        j                  | j                  t        j                  | j
                              S z@Index in multiple buffer corresponding to the current wait slot.)r
   rs  r|  r_   rA  r)  r   s    r   ri  z BufferedRef.current_wait_in_slot  ru  r   c                X    | j                   | j                   }|S | j                  d   }|S )z9The cumulative number of wait_outs issued on this buffer.r   )r!  r  rn  s     r   cumulative_wait_outzBufferedRef.cumulative_wait_out  rx  r   c                |    t        j                  | j                  t        j                  | j
                              S r~  )r
   rs  r  r_   rA  r)  r   s    r   current_wait_out_slotz!BufferedRef.current_wait_out_slot  rz  r   c                    | j                   st        d      | j                  | j                  S t        d | j                  D              S )z?Returns the next grid indices to fetch from if using lookahead.z.Can only get fetch indices if using lookahead.c              3  &   K   | ]	  }|d      ywr   Nr!   )r   smems     r   r   z1BufferedRef.next_fetch_indices.<locals>.<genexpr>  s     :Ta:s   )r4  r)   r$  r   r#  r   s    r   next_fetch_indiceszBufferedRef.next_fetch_indices  sH     GHH'!!!:T%9%9:::r   c                    | j                   s3t        j                  | |j                  | j	                  |               S | S )r   r   )r   r[  r\  rg  compute_slicer   s      r   r   zBufferedRef.bind_existing_ref  s>      
:==););G)DE  Kr   c                L    | j                   st        j                  | d       S | S )Nr  )r   r[  r\  r   s    r   r   zBufferedRef.unbind_refs  s#      $77Kr   c                    | j                   | }t        | j                        t        |      k(  sJ g }t        | j                  |d      D ]  \  }}|xxnxt        j
                  d x\   n  n  |j                  |       9xt        j                  d x\    t        d       xt        j                  d x\    t        d       xt        j                  dx \  } |j                  t        ||              t        d x\   |j                  t        ||              	 t        dt        |              t        |      S )z$Compute DMA slice from grid indices.Tr   r!   z+Element block dimensions are not supported.z0BoundedSlice block dimensions are not supported.rs   )r   r*   r   r   rT   rx   appendru   r)   rv   rt   rZ   r@   typer   )r   r   r   indexerr   rW   rc   s          r   r  zBufferedRef.compute_slice#  s'    d  ,/Gt CL000Gt''> LC!TMBKKM!
..
RZZ\;   R__@   $RZZ#
..Z8
9 $U
..R0
1 ?RzJK
K%L& >r   c                   | j                   sy| j                  r_d| j                  d<   d| j                  d<   | j                  r5t        t        | j                              D ]  }d| j                  |   d<    | j                  rd| j                  d<   d| j                  d<   | j                  d| j                  d<   yy)r   Nr   F)r   r   r  r  r4  rB  r*   r#  r   r  r  r'  )r   r   s     r   r   zBufferedRef.init_slots=  s    V}}dd			s4//01 	)A'($

q
!!
$	)~~ddyydiil r   c                f   | j                   s| S | j                  s| S | j                  | j                  d   n| j                  }t	        j
                  ||dz   |      }| j                  | j                  |      S t        | j                  t        j                        sJ || j                  d<   | S )Switch to the next copy slot.r   r;   r  )
r   r   r  r  r
   selectrb  rO   r   Arrayr   r   current_slotnew_current_slots       r   r   z BufferedRef.advance_copy_in_slotL      D[==k**2 %%a(8<8N8N zz)\A-=|L)!!/?!@@d''333+DaKr   c                f   | j                   s| S | j                  s| S | j                  | j                  d   n| j                  }t	        j
                  ||dz   |      }| j                  | j                  |      S t        | j                  t        j                        sJ || j                  d<   | S )Switch to the next wait slot.r   r;   )r  )
r   r   r  r  r
   r  rb  rO   r   r  r  s       r   r   z BufferedRef.advance_wait_in_slotZ  r  r   c                h   | j                   s| S | j                  s| S | j                  	 | j                  d   n| j                  }t	        j
                  ||dz   |      }| j                  | j                  |      S t        | j                  t        j                        sJ || j                  d<   | S )r  r   r;   )r  )
r   r   r   r  r
   r  rb  rO   r   r  r  s       r   r   z!BufferedRef.advance_copy_out_sloth      D[>>k-1-D-D.D&&q)!%!8!8 zz)\A-=|L*!!0@!AAd((#))444,DqKr   c                h   | j                   s| S | j                  s| S | j                  	 | j                  d   n| j                  }t	        j
                  ||dz   |      }| j                  | j                  |      S t        | j                  t        j                        sJ || j                  d<   | S )r  r   r;   )r  )
r   r   r!  r  r
   r  rb  rO   r   r  r  s       r   r   z!BufferedRef.advance_wait_out_slotv  r  r   c                      j                   s S  fd} fd}t        j                  |||      \  }}}}} j                  ||||      }	|	j                  |	j                  |      }	|	S )r   c                 x   j                   rj                  d   nd } j                  rj                  d   nd }j                   rj                  d   nd }j                  rj
                  d   nd }j                  r1t        fdt        t        j                              D              }nd }| ||||fS )Nr   c              3  B   K   | ]  }j                   |   d      ywr  r3  )r   r   r   s     r   r   z;BufferedRef.load_slots.<locals>._do_load.<locals>.<genexpr>  s%      (!4//215 (s   )r   r  r   r  r  r  r4  r   rB  r*   r#  )copy_incopy_outwait_inwait_outr^  r   s        r   _do_loadz(BufferedRef.load_slots.<locals>._do_load  s    (,!!!$4g*...##A&dh(,!!!$4g*...##A&dh			 (u$$%@' ( (
 
x(J??r   c                    d x} x}x}}d }j                   r$ |j                        }  |j                        }j                  r$ |j                        } |j
                        }j                  rHj                  /t        d t        t        j                              D              }nj                  }nd }| ||||fS )Nc                4    | | S t        j                  d      S rM   )r_   rA  r"   s    r   r#   z:BufferedRef.load_slots.<locals>._no_load.<locals>.<lambda>  s    1 3::a= r   c              3  F   K   | ]  }t        j                  d         ywr  )r_   r   )r   rG   s     r   r   z;BufferedRef.load_slots.<locals>._no_load.<locals>.<genexpr>  s      *aSYYq\ *s   !)r   r  r  r   r   r!  r4  r$  r   rB  r*   r#  )r  r  r  r  _ensure_not_noner^  r   s         r   _no_loadz(BufferedRef.load_slots.<locals>._no_load  s    044g447X I	"4#9#9:"4#9#9:	#D$;$;<#D$;$;<			' *5$&&'4) * ** ++*
x(J??r   )r  r  r  r  )r^  )r   r
   condrb  r#  r_  )
r   r   r  r  r  r  r  r  r^  brefs
   `         r   r   zBufferedRef.load_slots  s    k
@@* 88Ix:\=,!#!#	   D '!!Z!8dKr   c                Z      j                   syt        j                  |       fd       }y)r   Nc                 (   j                   rj                  J j                  j                  d<   j                  J j                  j                  d<   j
                  rPj                  J t        t        j                              D ]!  } j                  |    j                  |    d<   # j                  rOj                  J j                  j                  d<   j                  J j                  j                  d<   y y rM   )r   r  r  r  r  r4  r$  rB  r*   r#  r   r   r  r!  r  )r   r   s    r   rG   z!BufferedRef.save_slots.<locals>._  s   	%%111#55!%%111#55!%%1
11T1123 Aa)-)=)=a)@D  #A&A	&&222 $ 7 71&&222 $ 7 71	 
r   )r   rT   when)r   r   rG   s   `  r   r   zBufferedRef.save_slots  s*    WWY8 8r   c                >   | j                   sJ | j                  sy| j                  t        | j                  t              rJ | j
                  J | j                  d| j                  d<   | j                  }| j                  t        |      |      }t        d t        || j                        D              }t        j                  |j                  |   | j                  j                  |g|   | j
                  j                  |         j!                          y)z3Starts copy of HBM dma slice into the current slot.NTr   c              3     K   | ]C  \  }}|<t        |t        j                        s"t        j                  d|j                         E y wrM   rO   rT   rx   rU   rX   r   rP   r   s      r   r   z&BufferedRef.copy_in.<locals>.<genexpr>  =      Ar
jR[[9 	a   A	A)r   r   r   rO   rf  r%  r'  rt  r   r  r   r   r   tpu_primitivesmake_async_copyrg  rw   )r   src_refr   rk  	src_slice	dst_slices         r   r  zBufferedRef.copy_in  s    ===V':doos+KLL>>%%%yydiil$$D""#5g#>MI D$4$45 I
 ""

9D-9-.$ egr   c                >   | j                   sJ | j                  sy| j                  t        | j                  t              rJ | j
                  J | j                  d| j                  d<   | j                  }| j                  t        |      |      }t        d t        || j                        D              }t        j                  | j                  j                  |g|   |j                  |   | j
                  j                  |         j!                          y)z3Starts copy of HBM dma slice from the current slot.NTr   c              3     K   | ]C  \  }}|<t        |t        j                        s"t        j                  d|j                         E y wrM   r  r  s      r   r   z'BufferedRef.copy_out.<locals>.<genexpr>  r  r  )r   r   r   rO   rf  r&  r'  rh  r   r  r   r   r   r  r  rg  rw   )r   dst_refr   rk  r  r  s         r   r  zBufferedRef.copy_out  s    >>>V':doos+KLL>>%%%yydiil%%D""#5g#>MI D$4$45 I
 ""D-9-.

9$ egr   c                   | j                   sJ | j                  sy| j                  t        | j                  t              rJ | j
                  J | j                  t        |      |      }t        d t        || j                        D              }| j                  }t        j                  |j                  |   | j                  j                  |g|   | j
                  j                  |         j                          y)zWaits for input copy to finish.Nc              3     K   | ]C  \  }}|<t        |t        j                        s"t        j                  d|j                         E y wrM   r  r  s      r   r   z&BufferedRef.wait_in.<locals>.<genexpr>  r  r  )r   r   r   rO   rf  r%  r   r  r   r   r   ri  r  r  rg  wait)r   r  r   r  r  	wait_slots         r   r  zBufferedRef.wait_in  s    ===V':doos+KLL>>%%%""#5g#>MI D$4$45 I
 ))I""

9##	
 	)$ dfr   c                   | j                   sJ | j                  sy| j                  t        | j                  t              rJ | j
                  J | j                  }| j                  t        |      |      }t        d t        || j                        D              }t        j                  | j                  j                  |g|   |j                  |   | j
                  j                  |         j                          y)z Waits for output copy to finish.Nc              3     K   | ]C  \  }}|<t        |t        j                        s"t        j                  d|j                         E y wrM   r  r  s      r   r   z'BufferedRef.wait_out.<locals>.<genexpr>  r  r  )r   r   r   rO   rf  r&  r  r   r  r   r   r   r  r  rg  r  )r   r  r   r  r  r  s         r   r  zBufferedRef.wait_out  s    >>>V':doos+KLL>>%%%**I""#5g#>MI D$4$45 I
 ""I2	23

9)$ dfr   c                      j                   sJ  j                  9 j                  j                   fd} fd}t        j                  |||       yy)z-Set accumulator or zero it out to initialize.Nc                 d    t        j                   j                  d          j                  d<   y N.)r_   
zeros_liker  r   s   r   _initz*BufferedRef.set_accumulator.<locals>._init*  s#    !nnT^^C-@Asr   c                 \    j                   d   j                         j                  d<   y r  )rl  astyper  )accum_dtyper   s   r   _setz)BufferedRef.set_accumulator.<locals>._set,  s'    "..s3::;Gsr   )r   r  r8   r
   r  )r   initr  r  r  s   `   @r   set_accumulatorzBufferedRef.set_accumulator%  sL    ~~!NN((kBH	hhtUD! "r   c                   | j                   sJ | j                  | j                  J t        j                  }| j                  j
                  t        j                  k(  rt        j                  }| j                  d   j                  |      | j                  d   j                  |      z   j                  | j                  j
                        | j                  d<   yy)zAdd into the current slot.N.)	r   r  r   r_   float32r8   r   rl  r  )r   r  s     r   
accumulatezBufferedRef.accumulate0  s    ~~!__(((KKk			#))	+ii 

3

&
&{
3NN3&&{34t$$% s "r   r  r  r@   )r  ztype[BufferType])r   r  rI  z#tpu_core.MemorySpace | Literal[ANY]r  r  )r7   )r   r  r  r  r   )r^  r"  )NNNN)
r  r  r  r  r  r  r  r  r  'BufferedRef'r  )r   r  r  r  )r   r  r  r  r  F)9r   r   r   r   r[  fielddictr  __annotations__r  r*  r   r   r   r   r4  r)  staticmethodr7  classmethodANYrL  rP  rR  rT  rV  r   r   r   r_  rb  rl  rp  rt  rw  rh  r|  ri  r  r  r  r   r   r  r   r   r   r   r   r   r   r  r  r  r  r  r  r!   r   r   r  r    sT    B *))4t3DE%E.[..D8IJ,
J    ++++,,,,----"""" 		
     - - , , $ $    ADee ?e e eN  
  
  
  
 ! !  1
 48A0A .2.2-1.2* , +	
 , & 7 7    K K   L L   K K   L L ; ;
4-^8****<	"&r   r  c                     j                   sJ fdfdfdd  j                   j                  z   |) j                  |z   }t	        j
                  |k  |      fd} fd}	 j                  }
 |
      }t	        j                  ||	|
| j                  f      \  }}} j                  |       |r j                  |        |fS )a  Fetch future blocks using unbounded lookahead.

  Args:
    buffered_ref: the BufferedRef to fetch for.
    src_ref: the source Ref.
    grid: the grid bounds.
    grid_offsets: the grid offsets (used for megacore).
    predicate: a boolean predicate for whether to perform the fetch.
    max_num_fetches: the maximum number of fetches to perform. If None,
      this will continually fetch until all copy_in slots are full.
    update_slots: whether to update the register slot indices.
  c                @    t        d t        | d      D              S )Nc              3  ,   K   | ]  \  }}||z     y wr   r!   r   r   js      r   r   z9fetch_with_lookahead.<locals>.<lambda>.<locals>.<genexpr>S  s      ;1a!e;   Tr   r   r   r   grid_offsetss    r   r#   z&fetch_with_lookahead.<locals>.<lambda>S  s"     ;A|D9; ; r   c                    t        |       S r   )	_tuple_ltr   r   s    r   r#   z&fetch_with_lookahead.<locals>.<lambda>U  s    Ia. r   c                     t        | d      S )NT)allow_overflow)_next_indexr  s    r   r#   z&fetch_with_lookahead.<locals>.<lambda>V  s    AtD I r   c                    t        | t              rt        j                  |       S | j	                  t        j                        S r   )rO   r  r_   rA  r  r"   s    r   	as_uint32z'fetch_with_lookahead.<locals>.as_uint32W  s-    !TZZ]XXcjj!!r   c                :    | \  }}}|k  } |      }|z  |z  S r   r!   )	carryrG   next_indicesrp  within_limit	in_boundsfetch_limitindex_inboundr   s	         r   
_loop_condz(fetch_with_lookahead.<locals>._loop_condf  s4    */'A|'%3Ll+I|#i//r   c                   | \  }}} |      } |       j                   | } j                    }t        ||      }|}j                  |      t        j                  |      fd       }	| |      z   }
 |      }|||
fS )Nr  c                 *     j                         y r   r  )r  next_indices_offsetr  s   r   _startz8fetch_with_lookahead.<locals>._loop_body.<locals>._startw  s    
ll7/0r   )r   r   rb  rT   r  )r  current_indicesr  rp  cur_indices_offsetr   next_block_indiceswill_changepredr  next_copy_innext_next_indicesr  r  
add_offsetr  buffered_refincrement_indicesr  s               @@r   
_loop_bodyz(fetch_with_lookahead.<locals>._loop_bodyn  s    8=5O\#5#O4$\2.L..0BCM3335HI 0BCKD''5G'HDWWT]1 1%	$7L),7*L88r   r  )
r4  r|  r)  rp  r
   r  r  
while_loopr_  rb  )r  r  r   r  r   max_num_fetchesupdate_slotsfetch_once_limitr  r  r  r^  final_indicesrG   final_copy_in_slotr  r  r  r  r  s   `````          @@@@@r   fetch_with_lookaheadr  @  s    $ 
	#	##	#;*.-I" //,2K2KK+ #66H **[+;;(*:<K09 9 !33/ 1*),*
L$C$CD*F&-& --m<,//=O/PL	)	))r   c                "    t        | t              S r   )rO   r   r"   s    r   r#   r#     s    jO4 r   r$   c                &      fd}t        |g| S )z!Maps over all input BufferedRefs.c                2    | j                   r	 | g| S | S r   )r   r  f_argsfs     r   fmapzmap_inputs.<locals>.fmap  s    }}tfKr   	map_brefsr  argsr  s   `  r   
map_inputsr         
4	$	r   c                &      fd}t        |g| S )z"Maps over all output BufferedRefs.c                2    | j                   r	 | g| S | S r   )r   r  s     r   r  zmap_outputs.<locals>.fmap  s    ~~tfKr   r  r  s   `  r   map_outputsr    r  r   c                >    t        d t        | |d      D              S )Nc              3  T   K   | ]   \  }}t        |t              r|d k(  rdn| " yw)r;   r   NrN   )r   r   gs      r   r   z"_filter_indices.<locals>.<genexpr>  s0      
!Q a!q&aa/s   &(Tr   r  )r   r   s     r   _filter_indicesr    s'     
 gtD1 
 r   Fc                   g }d}t        t        t        t        | |d                        D ]w  \  }\  }}t        j
                  j                  ||dz   |      }|r|t        |      dz
  k(  rd}n||k(  }|j                  t        j
                  j                  |d|             y |rt        t        |            S t        t        t        |            |      S )a  Increments the grid indices by one.

  Args:
    indices: the current grid indices.
    grid: the pallas grid.
    allow_overflow: whether to allow the indices to overflow the grid.
      If False (default), indices will wrap around to zero after reaching the
      maximum grid size. If True, the bounds on the first grid position
      will be ignored.

  Returns:
    The next grid indices.
  Tr   r;   Fr   )	enumeratereversedlistr   r   r
   r  r*   r  r   r  )	r   r   r  rY   r  positionr   r  incs	            r   r  r    s    " 	# %#tCd3457 .hA
''..Aq
)C8s4y1}4eQheJJsww~~eQ,-. #5#/66r   c           	     N   g }d}t        t        t        | |d                  D ]a  \  }}t        j                  j                  ||dz
  |      }|dk(  }|j                  t        j                  j                  ||dz
  |             c t        t        t        |            |      S )NTr   r;   )	r"  r#  r   r   r
   r  r  r  r   )r   r   rY   borrowr   r  decs          r   _prev_indexr*    s     	#!&tCd;<= 3da
''..Q
*CBYFJJsww~~fa!eS123 
x}-t	44r   c                      e Zd ZdZ	 	 	 	 d	 	 	 	 	 	 	 	 	 ddZed        Zd Zd Zd Z	d Z
d	 Zd
 Zd ZddZdddZdddZddZdddZdddZddZddZy)	Schedulerz;Sequences input and output copies and waits for a pipeline.Nc
                8   || _         || _        | _        || _        || _        || _        || _        |	| _        t        |      | _	        |dk(  | _
        || j                  dz
  k(  | _        || j                  z  | _        || j                  z  | _        t        d t        |d      D              | _        t        d t        t#        ||      d      D              | _        t'        ||      }
t        d t        |
d      D              | _        fd| _        | j                   | j(                  g| _        |
}t/        | j                  dz
        D ]E  }t'        ||      }| j,                  j1                  t        d	 t        |d      D                     G y
)aX  Initializes scheduler.

    Args:
      step: inner step number.
      indices: current grid indices.
      grid: pallas grid for BufferedRefs.
      grid_offsets: offsets for grid indices (used for megacore).
      num_stages: number of stages in the pipeline.
      first_cycle: whether this is the first invocation of the pipeline.
      last_cycle: whether this is the last invocation of the pipeline.
      init_accumulators: do we zero-initialize accumulator state for this
        invocation of the pipeline.
      trace_scopes: whether to use named_scope to trace blocks in the pipeline.
    r   r;   c              3  ,   K   | ]  \  }}||z     y wr   r!   r  s      r   r   z%Scheduler.__init__.<locals>.<genexpr>  s      !QAr  Tr   c              3  ,   K   | ]  \  }}||z     y wr   r!   r  s      r   r   z%Scheduler.__init__.<locals>.<genexpr>         Aq 	
Ar  c              3  ,   K   | ]  \  }}||z     y wr   r!   r  s      r   r   z%Scheduler.__init__.<locals>.<genexpr>  r0  r  c                @    t        d t        | d      D              S )Nc              3  ,   K   | ]  \  }}||z     y wr   r!   r  s      r   r   z7Scheduler.__init__.<locals>.<lambda>.<locals>.<genexpr>  s      &J1a!e &Jr  Tr   r  r  s    r   r#   z$Scheduler.__init__.<locals>.<lambda>  s(     &JA|CG9I &J !J r   c              3  ,   K   | ]  \  }}||z     y wr   r!   r  s      r   r   z%Scheduler.__init__.<locals>.<genexpr>  s       &1 E&r  N)stepr   r  
num_stagesfirst_cycle
last_cycleinit_accumulatorstrace_scopesr   	num_steps
first_step	last_stepfirst_step_everlast_step_everr   r   r   r*  prev_indicesr  r  r  fetch_indicesrB  r  )r   r5  r   r   r  r6  r7  r8  r9  r:  r  rA  rG   s       `        r   __init__zScheduler.__init__  s   4 DIDI$D DO"D DO.D$D  %DN aiDOT^^a//DN '8D$t~~5D  g|DA DL  GT2LN D w-L l4@ DJDO ,,(9(9:D M4??1$% 	!-6m
 &M<E& ! 		r   c              #     K   | j                   r#t        j                  |      5  d  d d d        y d  y # 1 sw Y   y xY wwr   )r:  r   named_scope)r   names     r   _named_scopezScheduler._named_scope$  s<     ??4     s   "A7AA Ac           	         t        j                  t        t        t         j                  | j
                  | j                                    S r   )pallas_coregrid_envr#  r   GridAxisr   r   r   s    r   rI  zScheduler.grid_env,  s6    S%%t||TYY?@B Br   c                    |j                   st        j                  d      S | j                  | j                  |j
                  z
  dz   k\  S )z2Returns whether there are no more blocks to fetch.Fr;   )r   r_   r  r5  r;  r)  r   r  s     r   out_of_fetchzScheduler.out_of_fetch0  s>     ##XXe_99,*C*CCaGHHr   c                    |j                   sy |j                  | j                   } |j                  | j                   }t	        ||      S r   )r   r   r   r@  r   )r   r  r   r@  s       r   has_changedzScheduler.has_changed8  I    ##(l(($,,7G-<--t/@/@AL'<00r   c                    |j                   sy |j                  | j                   } |j                  | j                   }t	        ||      S r   )r   r   r   r  r   r   r  r   r  s       r   will_change_currentzScheduler.will_change_current?  rP  r   c                
   |j                   sy|j                  dk  r
t                |j                  | j                  |j                  dz
      } |j                  | j                  |j                  dz
      }t        ||      S )NFr7   r;   )r   r)  r   r   rA  r   rR  s       r   will_change_fetchzScheduler.will_change_fetchF  s    ##  1$!!(l((			L55a7	8:G-<--			L55a7	8:L'<00r   c                :    |j                  || j                        S r   )r   r   )r   r  r  s      r   alias_local_refszScheduler.alias_local_refsQ  s    ))#t||<<r   c                "    |j                         S r   )r   rL  s     r   unalias_local_refszScheduler.unalias_local_refsT  s    ##%%r   c           	         |t         } |d          } j                  d|       5  |dk(  r7t        j                   j                        fd       }j                         j                  rj                  scd d d        S |dz   j                  k\  rcd d d        S j                  rl|dk(  r1t        j                  |       fd       }j                  |      nt         j                   j                   j                  |z  d      \  }n|dk(  r|}	 j                  |   nP j                  |    j                  |dz
     }
 j                   } j                  |
 }t!        ||      }||z  }	t        j                  |	      fd       }j                  |	      d d d        S # 1 sw Y   S xY w)	Nprologue_copy_inep_initialize_r   c                 &     j                          y r   r   r  s   r   _init_slotsz.Scheduler.initialize_step.<locals>._init_slotsf      

!
!
#r   r;   c                 \     j                  j                   j                               y r   )r  r  r  r  r   r  s   r   r  z)Scheduler.initialize_step.<locals>._startt  s&      ool==>@r   )r   r  c                 *     j                         y r   r  )r  rA  r  s   r   r  z)Scheduler.initialize_step.<locals>._start  s    


w
6r   )_default_schedulerF  rT   r  r>  r   r   r   r)  r4  r   r  r   r  rA  r   r   )r   r  r  scheduler5  do_copyr`  r  rG   r   prev_grid_indicesr   prev_block_indicesblock_changedrA  s   ```           @r   initialize_stepzScheduler.initialize_step\  s   "h +h)*4wGG			^D62	3 ,D		%%	&	$ 
'	$#..0"",*B*B,D ,D (|00	0,D ,D 
	#	#19777@ @ &::7C,0ii,,w6/, 19),,T2-,,T2-"00:
4,44mD-9|99;LM
(8JK--)			7 
	7#88CY,DZ [,DZ s   AGG+DGGc                     |t         } |d          } j                  d       fd       } j                  d       fd       }t        j                  |||       S )Nr  
ep_wait_inc                      j                   r j                  j                          j                  r j	                  j
                         y y r   )r   r  r   r   r  r9  rc  s   r   _waitz Scheduler.wait_in.<locals>._wait  sD    			Wdll3		$	$ 	$$T%;%;< 
%r   ep_set_accumc                     j                   r;t        j                  j                  j	                        z        fd       } y y )Nc                 <     j                  j                         y r   )r  r9  )r  r   s   r   _set_accumulatorz=Scheduler.wait_in.<locals>._no_wait.<locals>._set_accumulator  s     
&
&t'='=
>r   )r   rT   r  r<  rO  )rs  r  r   s    r   _no_waitz#Scheduler.wait_in.<locals>._no_wait  sA    		$	$	4#3#3L#AA	B	? 
C	? 
%r   )re  rF  r
   r  )r   r  r  rf  r  ro  rt  s   ```    r   r  zScheduler.wait_in  sw    "h8It\7;D	|$= %= 
~&? '? HHT5(#r   c                ^    |t         } |d          }j                  sS j                  r)t         j                   j
                  d      \  }S t        j                  |       j                  d       fd              }j                  |j                  z        S )Nr  T)r   
ep_copy_inc                      j                   r: j                  r- j                  j                   j                  dz
            y y y )Nr;   )r   r   r  rA  r)  rc  s   r   _sendz Scheduler.copy_in.<locals>._send  sG       \%=%=


w|88:;= &> r   )
re  r   r4  r  r   r  rT   r  rF  r   )r   r  r  rf  r  rG   rx  s   ```    r   r  zScheduler.copy_in  s    "h8It\7;D  !!,
D,=,=olA  
wwt}&= ' = "66
&&
&(lr   c                    |t         } |d          }j                  rj                  sy j                  rj	                  t
        j                  j                  t        j                  j                              t        j                  |       fd       }j                  |      t         j                   j                   |d      \  t        j                  |      fd       }y | j"                  z  } j$                  }t'        j(                  dz
        D ]~  } j*                  |dz       j,                  | }	 j,                   }
|dk(  rd}nt/        |	|
      }t        j                  ||z        fd	       }j                  ||z        } j1                          y )
Nprefetchc                 \     j                  j                   j                               y r   )r  r  r$  rc  s   r   r  z"Scheduler.prefetch.<locals>._start  s&    T__\%A%AB	Dr   F)r   r  c                 J    j                        } | j                          y )Nr  )rb  r   )r  r  r
  s    r   rG   zScheduler.prefetch.<locals>._  s!    ++9K+Lr   r;   r   Tc                 *     j                         y r   r  )r  next_grid_indicesr  s   r   rG   zScheduler.prefetch.<locals>._  s    


w(9
:r   )re  r   r   r4  r_  r   r   r   r_   r  r$  rT   r  r   r  r   r  r=  r   rB  r)  rA  r   r   r   )r   r  r  rf  r  r  rG   r   r   r   r  should_prefetchr
  r~  s   ```         @@r   rz  zScheduler.prefetch  s   "h8JlG<D  (@(@!!!11
((,,s~~|'C'C
DFl	wwt}D D "66t<l)=


))


*&l& 
wwt} . ' DNN"d\\l\..23 )! ..qs3222LA7\779JK6 !/*=:LM/	'	(	; 
)	;#889OP()  
r   c                     |t         } |d          }t        j                  |       j                  d       fd              }j	                  |j
                  z        S )Nr  ep_wait_outc                 X     j                   r j                  j                         y y r   )r   r  r@  r  r  r   s   r   ro  z!Scheduler.wait_out.<locals>._wait  s+     
		 	gt'8'89 
 r   )re  rT   r  rF  r   r   )r   r  r  rf  r  ro  s   ```   r   r  zScheduler.wait_out  sn    "h8JlG<DWWT]	}%: & : --d\5K5K.KLLr   c                    |t         } |d          } j                  d       fd       } j                  d       fd       }t        j                  |||       j	                  |j
                  z        S )Nr  ep_copy_outc                      j                   r j                           j                  r j                  j                         y y r   )r   r  r   r  r   r  s   r   _copy_out_and_accumulatez4Scheduler.copy_out.<locals>._copy_out_and_accumulate  s:    		$	$!			gt||4 
 r   ep_accumc                 n    j                   r(t        j                  j                        fd       } y y )Nc                 &     j                          y r   )r  r_  s   r   _accumulatezAScheduler.copy_out.<locals>._just_accumulate.<locals>._accumulate"  ra  r   )r   rT   r  r=  )r  r  r   s    r   _just_accumulatez,Scheduler.copy_out.<locals>._just_accumulate  s2    		$	$ 
	 	$ 
!	$ 
%r   )re  rF  r
   r  r   r   )r   r  r  rf  r  r  r  s   ```    r   r  zScheduler.copy_out  s    "h8JlG<D	}%5 &5 
z"	$ #	$ HHT+-=>--d\5K5K.KLLr   c                     |t         } |d          }t        j                  |       j                  d       fd              }j	                          y )Nepilogue_wait_outep_finalizec                 X     j                   r j                  j                         y y r   )r   r  r   r  s   r   _endz Scheduler.finalize.<locals>._end-  s'     
		gt||4 
 r   )re  rT   r  rF  r   )r   r  r  rf  r  r  s   ```   r   finalizezScheduler.finalize(  s_    "h(8'(|WEDWWT]	}%5 & 5 r   c                j    |t         }|j                  r |d   | ||      }|j                  |      }|S )Nadvance_wait_in)re  r   r   )r   r  rf  r  s       r   advance_slotszScheduler.advance_slots5  sD    "h(X'(|XFd!66t<l r   )NNNT)
r5  	jax.Arrayr   tuple[int | jax.Array, ...]r   r  r  r  r6  r@   rM   r   )r  r  )r   r   r   r   rB  r   rF  rI  rM  rO  rS  rU  rW  rY  rk  r  r  rz  r  r  r  r  r!   r   r   r,  r,    s    C I	I	 +I	 (	I	
 0I	 I	V  BI11	1=&5n421fM*M4	r   r,  c                    | j                   S r   r>  rP   r  rG   s      r   r#   r#   O      (9(9 r   c                >    | j                  |      | j                  z  S r   )rO  r<  r  s      r   r#   r#   Q  s    q}}T2Q\\A r   c                >    | j                  |      | j                  z  S r   rS  r=  r  s      r   r#   r#   R  s    	d#akk1 r   c                J    | j                  |      | j                  |       z  S r   rU  rM  r  s      r   r#   r#   T  )    q2248ANN= <   r   c                Z    | j                  |      | j                  z  | j                   z  S r   )rU  r=  r?  r  s      r   r#   r#   W  s,    	
		T	"Q[[	0Q5E5E4EE r   c                Z    | j                  |      | j                  z  | j                   z  S r   )rO  r<  r>  r  s      r   r#   r#   [  s&    t!4q||!CHYHYGY Y r   c                >    | j                  |      | j                  z  S r   r  r  s      r   r#   r#   ]  s     5 5d ;akk I r   c                    | j                   S r   r?  r  s      r   r#   r#   ^      )9)9 r   )r[  r  r  r  rz  r  r  r  c                    | j                   S r   r  r  s      r   r#   r#   g  r  r   c                >    | j                  |      | j                  z  S r   rO  r>  r  s      r   r#   r#   i  s    q}}T2Q5F5FF r   c                $    | j                  |      S r   )rS  r  s      r   r#   r#   j  s    q'<'<T'B r   c                J    | j                  |      | j                  |       z  S r   r  r  s      r   r#   r#   k  r  r   c                @    | j                  |      | j                   z  S r   )rU  r?  r  s      r   r#   r#   n  s     3 3D 9Q=M=M<M M r   c                @    | j                  |      | j                   z  S r   r  r  s      r   r#   r#   p  s    t!48I8I7I!I r   c                >    | j                  |      | j                  z  S r   )rS  r?  r  s      r   r#   r#   r  s     5 5d ;a>N>N N r   c                    | j                   S r   r  r  s      r   r#   r#   s  r  r   c                Z    i | }dD ]!  }d }t        j                  || |         ||<   # |S )z=Skip input copies in schedule when init_accumulators is True.)r[  r  r  c                     | | }|d   j                   s|d   j                  r%|t        j                  |d   j                        z  }|S )Nr;   r   )r   r   r_   logical_notr9  )original_pred_fnr0   r  s      r   new_predz:skip_input_copies_when_init_accumulators.<locals>.new_pred|  sG    q!d	
1		! 4 4! 6 677kr   )r   partial)rf  new_schedulekr  s       r   (skip_input_copies_when_init_accumulatorsr  w  sG    H,5 a  ''LO 
r   c                f    t         t        d}t        | t              r||    j	                         S | S )zGRetrieve a named pipeline schedule or pass through fully specified one.)defaultfixed)re  _fixed_schedulerO   strcopy)rf  predefined_scheduless     r   get_pipeline_scheduler    s7     # #)..00	/r   r!   in_specs	out_specsshould_accumulate_outrG  r   c                   t        |       }t        | t        t        f      s| f} t        |t        t        f      s|f}t        | t              rt        |       } t        |t              rt        |      }|d| }||d }fd}	t        j
                  j                  |	| |      }
fd}t        j
                  j                  ||||      }g |
|S )a  Create BufferedRefs for the pipeline.

  This function creates buffered refs for an inner pipeline that can be
  created at the top-level of a pallas call such that they may be reused across
  multiple invocations of the inner pipeline.

  Args:
    in_specs: input pallas block specs
    out_specs: output pallas block specs
    should_accumulate_out: booleans to indicate which outputs should be treated
      as accumulators.
    needs_swap_ref: whether a swap slots tracker needs to be allocated.
    grid: grid to use for the pipeline.

  Returns:
    A list of BufferedRefs, one corresponding to each ref specified in the
    in_specs and out_specs.
  Nc           	        d}d}| j                   ,| j                   j                  }| j                   j                  }|rt        d      t	        |      }t
        j                  | ||t              ||j                        S )Nr7   Fz,Grid must be specified when using lookahead.)rG  rH  r4  rI  )	pipeline_moder)  r4  r)   r  r  rP  r*   r>  )in_specin_refr)  r4  in_avalr   rG  s        r   make_input_brefz2make_pipeline_allocations.<locals>.make_input_bref  s    LM(**77l++99mEFF (GWg|,:'*4y+8171D1D	  F Fr   c                8   d}| j                   7| j                   j                  }| j                   j                  rt        d      t	        |      }|r$t
        j                  | |||j                        S t
        j                  | |||j                        S )Nr7   z,Output buffering does not support lookahead.)rG  rI  )	r  r)  r4  r)   r  r  rT  r>  rR  )out_specout_refr  r)  out_avalrG  s        r   make_output_brefz3make_pipeline_allocations.<locals>.make_output_bref  s    L)++88l				-	-GHH!'*H$$Xx4B9@9M9M % O O h,-;292F2F  H Hr   )r*   rO   r#  r   r   r   r   )r  r  r  rG  r   refsnum_in_specsin_refsout_refsr  in_brefsr  	out_brefss      ``        r   make_pipeline_allocationsr    s    6 X,	HtUm	,{H	Ie}	-I$XH	4 i I,',- (F XX\\/8W=(H  hhll	8-BD)	 8	 i	  r   c                x    | dt               z  fS t        |t              r+t        j                  |      }t        j
                  |      }n>t        j                  j                  |      }t        j                  j                  |      }t        |t              st        d|      |dk(  r dt               z  fS |t        ft               z  }t        |      t               k7  rt        d      t        |      D ch c]  \  }}|t        k(  r| }}}|st        d|      t         fd|D              rt        d       |D ch c]"  }t         |   t              r |   |z  dk(  r|$ c}rjfd	t!        t        |            D        ^}}	 |   |z  }
||
z  }t#        j$                   ||
      }t#        j$                  dt               z  ||      }||fS t'         fd
|D              fdt               D        ^}}	t)         |   |      \  }}|dkD  sJ |       t+        j,                  ||k  |dz   |      }t#        j$                   ||      }t+        j,                  ||k  ||z  ||z  |z         }t#        j$                  dt               z  ||      }||fS c c}}w c c}w )Nr   z>Cannot partition grid over dynamic number of cores: core_axis=r;   z4dimension_semantics must be the same length as grid.zRCannot partition over cores without parallel grid dimensions: dimension_semantics=c              3  F   K   | ]  }t        |   t                 y wr   rN   r   r   r   s     r   r   z"_partition_grid.<locals>.<genexpr>  s     C!ZQ%	%Cs   !z?Cannot partition cores over only dynamic grid dimensions: grid=r   c              3  ,   K   | ]  }|v s|  y wr   r!   )r   r   divisible_dimensionss     r   r   z"_partition_grid.<locals>.<genexpr>  s      %a;O6O%s   	c              3  N   K   | ]  }t        |   t              r|     y wr   rN   r  s     r   r   z"_partition_grid.<locals>.<genexpr>%  s,      %B(247C(@ &*!W %Bs   "%c              3  R   K   | ]  \  }}t        |t              r	|k(  r|   y wr   rN   )r   r   dlargest_parallel_dimensions      r   r   z"_partition_grid.<locals>.<genexpr>'  s/      Aqa!'A"A 	
s   $')r*   rO   r@   rT   num_programs
program_idr   r
   	axis_size
axis_indexr   	ARBITRARYr)   r!  PARALLELallrB  jax_utiltuple_updatemaxdivmodr_   r`   )r   	core_axisdimension_semantics	num_corescore_idr   r  parallel_dimensionsfirst_divisible_dimensionrG   partitioned_dim_sizepartitioned_dim_offsetnew_gridoffsetspartition_dimensionbase_num_itersrs  	num_itersgrid_offsetr  r  s   `                  @@r   _partition_gridr    s   
 D	!!!	3	*ImmI&G!!),Igg  +G	Is	#

IylK  !^D	!!!  $,T2	T*
K
LL'01D'E +tq!M  + + 

	!!	#  	C/BCC

JTGL 
 %	DGS	!d1g	&9Q&>  %012%!   9:iG$';;$$')=H ##s4y35KGF 
7	7 "% %B6I %B "BdO!
 !&9!:IFNC7C7 		'C-!);(*I$$T+>	JH
 ))#). 3&K
 ##s4y-{G 
7	}+s   J1'J7c                   t        | t              r"| }t        |t              rt        d      |}d}n!t        |t              st        d      |}| }d}|j                  t	        |      |      }t        d t        ||j                        D              }|r;t        j                  |j                  |   |j                  j                  |          yt        j                  |j                  j                  |   |j                  |          y)z+Perform a synchronous copy from src to dst.z,Only one of src or dst can be a BufferedRef.Fz(One of src or dst must be a BufferedRef.Tc              3     K   | ]C  \  }}|<t        |t        j                        s"t        j                  d|j                         E y wrM   r  r  s      r   r   zsync_copy.<locals>.<genexpr>T  s=      
!RjJr2;;7 	eeAqvvr  N)rO   r  r)   r   r  r   r   r   tpu_helpers	sync_copyrg  rl  )srcdstr   r  hbm_refr  	hbm_slice
bref_slices           r   r   r   C  s     [!D#{#EFFGGc;'ABBDGG  !3G!<gF) y$"2"23 *
 '**Y/**--j9; $**--j9!**Y/1r   )r  r  r  r  core_axis_namer  r:  no_pipeliningc       	           	 t        d D              r t        d D              }
t        d|
       ||t        d      ||n|}t        ||      \  t	              t        t        t        f      sft        t        t        f      sft        t              rt              t        t              rt              t              d }t        j                  j                  f      }t        dgt        ||            ddddd	ddddd
		 	 	 	 	 	 	 d 	fdS )a  Creates a function to emit a manual pallas pipeline.

  This has the same semantics as pallas_call but is meant to be called inside
  pallas_call for nesting grids. This is useful when you need to have separate
  windowing strategies for communication and computation.

  The new argument `should_accumulate_out` can be used to specify which outputs
  we should accumulate into automatically within and across pipeline
  invocations.

  Args:
    body: pallas kernel to set up pipeline for.
    grid: a pallas grid definition.
    in_specs: input pallas block specs
    out_specs: output pallas block specs
    should_accumulate_out: booleans to indicate which outputs should be treated
      as accumulators.
    core_axis: optional int, indicates whether or not to partition the grid
      along the core axis.
    core_axis_name: optional str, indicates whether or not to partition the grid
      along the core axis.
    dimension_semantics: optional tuple of GridDimensionSemantics (e.g. PARALLEL
      or ARBITRARY).
    trace_scopes: optional bool, indicates whether to annotate each region in
      the pipeline using named_scope.
    no_pipelining: If True, turns off pipelining and all copies will be made
      synchronous. This is useful for debugging multiple-buffering related bugs.
  c              3  ^   K   | ]%  }t        |t        t        j                  f        ' y wr   )rO   r@   r   r  r   r  s     r   r   z emit_pipeline.<locals>.<genexpr>  s#     ;ZC+,	,;s   +-c              3  2   K   | ]  }t        |        y wr   )r  r
  s     r   r   z emit_pipeline.<locals>.<genexpr>  s     -1tAw-s   z5Grid must consist of Python integers and JAX Arrays: Nz5core_axis and core_axis_name cannot both be provided.c                N    | "| j                   | j                   j                  S dS )Nr7   )r  r)  )r   s    r   r#   zemit_pipeline.<locals>.<lambda>  s.    	$,,8 $(#5#5#B#B ?@ r   r7   TF)		scratchesallocationsr7  r8  r9  rz  postyeetrf  body_prologuec        	        \   	  d 8duxs du}
t        j                  	 f	dt        	|
d      S t        t              rt              t        d       t        t        t
        f      st        fd      t        t              rt              t        d       fd	 f
d	rd
t              z  } d|      }t        |j                  	      }t        d |       t        d      t        d |D              rt        d      t        j                  t        j                  j                  d||f      	 fd       }yt!        j"                  dkD        	fd       }y)a  
    Run the pipeline.

    Args:
      *ref_args: a list of pallas refs (or more generally a list of pytrees of
        pallas refs)
      scratches: scratch buffers for the inner kernel
      allocations: a list of BufferedRefs, one corresponding to each ref
      first_cycle: boolean indicating if this is the first invocation of the
        inner pipeline cycle.
      last_cycle: boolean indicating if this is the last invocation of the
        inner pipeline cycle.
      init_accumulators: whether to zero-init accumulators during this cycle.
      prefetch: callback called as fn(*brefs, scheduler) that is used to fetch
        the next cycle invocations first inputs.  Called during the inputs phase
        in the final inner step.
      postyeet: callback called as fn(*brefs, scheduler) that is used to finish
        any writes or transfers from the last output of the previous cycle.
        Called during the outputs phase in the first inner step.
      schedule: manually specified pipeline schedules for brefs, None indicates
        default schedule.
      body_prologue: For running code within the grid environment before the
        body is run. Useful for updating manual refs.
    Nr!   c                "   	  	| dS )N)r  r  r7  r8  r9  rz  r  rf  r!   )
r  r7  r9  r8  pipeliner  rz  r  rf  r  s
    r   r#   z1emit_pipeline.<locals>.pipeline.<locals>.<lambda>  s'    h!%%# 1
 r   r  c                     y r   r!   r"   s    r   r#   z1emit_pipeline.<locals>.pipeline.<locals>.<lambda>      r   c                    S r   r!   )r   rf  s    r   r#   z1emit_pipeline.<locals>.pipeline.<locals>.<lambda>  s    X r   c                    t        |      S r   )r  )rG   r   s     r   r#   z1emit_pipeline.<locals>.pipeline.<locals>.<lambda>  s    *1- r   c                ,    t        | |	      S )N)r  r6  r7  r8  r9  r:  )r,  )	r5  r   r7  r   r  r9  r8  max_buffer_countr:  s	     r   make_schedulerz7emit_pipeline.<locals>.pipeline.<locals>.make_scheduler  s+    


#%!-#
 
r   c                
  
 |\  }} | |      j                         5  t        j                  |      t        j                        t        j                        j                  d      5  C| dz
  k(  t        fd       t        j                  fdd        t        fd      d d d        
 
        t        d       }j                  d      5   	g |  d d d        t        j                        t        j                        j                  d	      5  @| d
k(  t        fd       t        j                  fdd        t        fd      d d d        t        j                        t        j                        d d d        t        |      fS # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   exY w# 1 sw Y   <xY w)Nep_prefetchr;   c                &    | j                        S r   r   r   do_prefetchs    r   r#   zDemit_pipeline.<locals>.pipeline.<locals>.loop_body.<locals>.<lambda>      [ 9 r   c                      g   S r   r!   )brefsrz  	schedulers   r   r#   zDemit_pipeline.<locals>.pipeline.<locals>.loop_body.<locals>.<lambda>      H7e7Y7 r   c                      y r   r!   r!   r   r   r#   zDemit_pipeline.<locals>.pipeline.<locals>.loop_body.<locals>.<lambda>  r  r   c                &    | j                        S r   r   r  s    r   r#   zDemit_pipeline.<locals>.pipeline.<locals>.loop_body.<locals>.<lambda>      [(A r   c                    | j                   S r   rl  r"   s    r   r#   zDemit_pipeline.<locals>.pipeline.<locals>.loop_body.<locals>.<lambda>  s
    1== r   ep_run_kernelep_postyeetr   c                &    | j                        S r   r  r   do_postyeets    r   r#   zDemit_pipeline.<locals>.pipeline.<locals>.loop_body.<locals>.<lambda>$  r!  r   c                      g   S r   r!   )r#  r  r$  s   r   r#   zDemit_pipeline.<locals>.pipeline.<locals>.loop_body.<locals>.<lambda>&  r%  r   c                      y r   r!   r!   r   r   r#   zDemit_pipeline.<locals>.pipeline.<locals>.loop_body.<locals>.<lambda>'  r  r   c                &    | j                        S r   r(  r/  s    r   r#   zDemit_pipeline.<locals>.pipeline.<locals>.loop_body.<locals>.<lambda>(  r)  r   )rI  r  rW  r  r  rF  r
   r  r  r  r  rY  r  )r5  r  unaliased_brefsr   current_refsr#  r0  r   r$  bodyr  r   r  r;  r  rz  r  rf  r  s        @@@@r   	loop_bodyz2emit_pipeline.<locals>.pipeline.<locals>.loop_body  s   !&ow w/i *? )44otL)++UD(C)++UD(C ##M2 	J!)a-/K95AHH[7 " A5IE	J $
/ !8%@##O4 	*

)
)y
)	* ),,eT8D),,eT8D##M2 	J!!)K95AHH[7 " A5IE	J )115(C )66>U*?V K...E	J 	J	* 	*	J 	J;*? *?sW   AG9=AG/G92G!=A	G9AG-	5G9G	G9!G*	&G9-G6	2G99Hr  r   c                "    | j                         S r   r^  )r  s    r   r#   z1emit_pipeline.<locals>.pipeline.<locals>.<lambda>6  s    T__. r   zPrefetch/Postyeet not supportedc              3  4   K   | ]  }|j                     y wr   )r   )r   r  s     r   r   z2emit_pipeline.<locals>.pipeline.<locals>.<genexpr>9  s     3TT  3s   zAccumulators not supported)init_valc                   |\  } |       }|j                         5  t        |j                  |      }fd}t        ||       	 	        t        d |      }|j	                  d      5   g |  d d d        fd}t        ||       d d d        t        |j                  |      }|t        
      fS # 1 sw Y   GxY w# 1 sw Y   9xY w)Nc                    t        ||       S r   r   r  r  r   s     r   r#   zEemit_pipeline.<locals>.pipeline.<locals>._loop_body.<locals>.<lambda>D  s    iT7&C r   c                    | j                   S r   r+  r"   s    r   r#   zEemit_pipeline.<locals>.pipeline.<locals>._loop_body.<locals>.<lambda>I  s
    Q]] r   r,  c                    t        | |      S r   r=  r>  s     r   r#   zEemit_pipeline.<locals>.pipeline.<locals>._loop_body.<locals>.<lambda>M  s    ysG'D r   )rI  r  rW  r  rF  r  rY  r  )r5  r  r#  r$  r  r5  r  r   r6  r  r   r  r  r  s          @r   r  z3emit_pipeline.<locals>.pipeline.<locals>._loop_body;  s     w"41	! 	-I66tD%C'
WeT
*&O"#:EB,%%o6 ,+,++, E(
ht
,	- )66>k'4000, ,	- 	-s$   AC2C=CC	CCc                    dt              z  }  	d|       }}|j                         5  t        |j                  dz
        D ]/  }t	        t        j                  |j                  |      |      }1 	 d d d        t        j                  d
|| f      \  }}t        |      } 	
dz
  |      }|j                         5  t	        |j                  |       d d d        y # 1 sw Y   pxY w# 1 sw Y   y xY w)Nr  r   r;   )r5  )r*   rI  rB  r6  r  r   r  rk  r
   	fori_loopr*  r  )initial_indicesr$  r#  r5  r  r	  r  r   r7  r  r;  r  rf  s         r   rG   z*emit_pipeline.<locals>.pipeline.<locals>._R  s
    T*"1o6	! 	' I00145 'di//))6tX'E'		' "mmy)e_%=
|
 $L$7"9q=-@	! 	?
I&&tX
>	? 	?#	' 	'"	? 	?s   AC&C2&C/2C;)r   
run_scopedr  rO   r#  r   r  r*   rW  r   r0  r   r  r   r
   rB  rT   r  )r  r  r7  r8  r9  rz  r  rf  r  r  rG  rC  r$  r#  r  rG   r7  r  r6  r   r  r  r  r  r;  r  r  r  r:  s   ``````````      @@r   r  zemit_pipeline.<locals>.pipeline  s   H i  t+Cxt/Cn""
 
 $!$9+ * +t$+&k>;7hhu.-{;h(D!xh-{HFH ./ ./` s4y(o O4i	22KFe.6		!5!"CDD	3U3	3!">??**Ay#(/":<1 1<1* 
wwy1}? ? ?r   )r  r   r7  CondValr8  rE  r9  rE  )r0  r   r)   r  r   rO   r#  r1   r   r   r   r  r   )r6  r   r  r  r  r  r  r  r:  r  
grid_types
core_axis_get_buffer_countflattened_specsr  r  r;  r  s   `````   ``    @@@@r   emit_pipelinerJ  a  st   R 	;d;;---J

?
|L  
~5
L
MM!*!2~	*&tZ9LM$)	HtUm	,{H	Ie}	-I$XH	4 i I./DiPBHHOOXy$9:/!Ec"2ODEF !&G?G? 	G?
 G? G? G?R 
/r   )r  r  r  c               f    t        j                  t        ||||      }t        | ||||      }||fS )a  Creates pallas pipeline and top-level allocation preparation functions.

  Args:
    body: pallas kernel to set up pipeline for.
    grid: a pallas grid definition.
    in_specs: input pallas block specs
    out_specs: output pallas block specs
    should_accumulate_out: booleans to indicate which outputs should be treated
      as accumulators.

  Returns:
    (emit_pipeline, make_allocations) function pair, where
      - emit_pipeline is the pallas pipeline function.
      - make_allocations is a function to create buffered refs for the inner
        pipeline that can be created at the top-level of a pallas call to be
        reused across multiple invocations of the inner pipeline.
  )r  r  r  r   )r   r  r  r  )r   r  r  rJ  )r6  r   r  r  r  make_allocationsr  s          r   emit_pipeline_with_allocationsrM  o  sM    2 &&'@%'*?	
 
13( 
#	##r   r  )rB   tuple[int, ...]rC   zjax_core.AbstractValuer  ztuple[int | None, ...])rP   int | jax.ArrayrQ   r@   r  rO  )rW   jax.Array | intrX   rP  r  zpl.Slice)rb   rP  rc   r@   rd   r@   re   
int | None)
rm   rP  rn   rP  rc   r@   rd   r@   re   rQ  )
rb   r  rc   zpl.BlockDim | int | NonerX   r@   re   rQ  r  z"pl.Slice | slice | int | jax.Array)r   r  r  rN  )TNT)r   zjax.Array | boolr  rQ  r  r  )r   r  r   r  r  r  r  )r   r  r   r  r  r  r  r  )r  r   )r   r  r  zint | str | Noner  )tuple[GridDimensionSemantics, ...] | Noner  z?tuple[tuple[int | jax.Array, ...], tuple[int | jax.Array, ...]])r  REF | BufferedRefr  rS  )r   r  r  r  r  rQ  r  z
str | Noner  rR  r:  r  r  r  )gr   
__future__r   collections.abcr   
contextlibr   r[  enumr   typingr   r   r   r   r	   r  r
   r   jax._srcr   r  jax._src.pallasrH  r   jax._src.pallas.mosaictpu_corer   r  r  r   jax._src.stater   r	  jax.experimentalr   rT   	jax.numpynumpyr_   MemorySpacer?  r;  r@  r  	MemoryRefrf  GridDimensionSemanticsr  r  rC  r  SemaphoreTupleArrayRefr   GridIndicesr  rE  	BlockSpecPipelineBlockSpecsPipelineRefsr?   r1   r5   rK   rR   rZ   rk   rq   ry   r   r   r  r  r   Enumr   r   r   r  register_dataclass	dataclassr  r  r   r   r  r  r  r  r  r*  r,  r  re  r  r  r  r  r  r   rJ  rM  r!   r   r   <module>rm     s   E " $ %    & & 
     % / 4 3 9 ? + / )      !!!88 	&&cii CIIsN#
		4
 8K$9$9:C?@ Xc]C'( =&,FF 6FF@%%"%%%	 /7 77 7 	7.*&5*&)* %(* #-	*2"J"J(@"JHK"J"J ("JJB
A
 I.0BC	 4>,P  P d d#}	&/ }	& $ }	&F 8<7;.2F* %5F* +5	F*
 (,F*T IHHLL4	
  (0K  !7(70K77 !7B	5(	50K	5 	5h hp 9A3 ZI9! 0 9FB NJN9"$ ==NO :?K  	H!V[
%[[ C[ E	[|1D "' !%EIK &K  K K K CK K Kd %$r   