
    uki                      d Z ddlmZ ddlmZ ddlmZ ddlmZ ddl	Z	ddl
Z
ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZ ddlm Z  ddlm!Z! ddl"m#Z# ddlm$Z$m%Z% ddl&m'Z' ddl&m(Z( ddl&m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z0 ddl1m2Z2 ddl3m4Z4m5Z5m6Z6m7Z7m8Z8 ddl9Z:e;e6cZ<Z;e=e7cZ>Z=dddZ?d  Z@ddd!ZAddd"ZBddd#ZCd$ ZDd% ZEd& ZFd' ZGd( ZHd) ZId* ZJd+ ZKd, ZLd- ZMddd.ZNdd/d0d1ZOddd2ZPdd4ZQdd5ZR	 d	 	 	 	 	 dd6ZSdd8ZTdd9ZUd: ZVd; ZWd< ZXd= ZYd> ZZ	 	 dd?Z[d@ Z\dA Z]dB Z^dC Z_dD Z`dE ZadF Zb ej                  dG      Zdedj                   ee\ede(j                               edj                  e]        ej                  ed eeae(j                  e(j                                ej                  edeb        eeYeddH       ej                  ed<    eeUd7      ej                  ed<    ej                  dI      Zmemj                   ee\eme(j                               emj                   ee^dI              ej                  em eeae(j                  e(j                                eeYemdJ       ej                  em<    eeUd7      ej                  em<    ej                  dK      Zpepj                   ee\epe(j                               epj                   ee^dK              ej                  ep eeae(j                  e(j                                eeYepdL       ej                  ep<    eeUd7      ej                  ep<   dM ZsdN ZtdO ZudP ZvdQ Zw ej                  dR      Zxexj                  ew        ej                  exeu        ej                  exet       evej                  ex<    eeUd3      ej                  ex<    edST       G dU dVej                               Z{ e{       Z|ej                   j                  j                  e{       dW Zej                   j                  e{       dX Z ej                  dY      Zej                   eej                  e             ej                  e        ej                  eedZ[       d\ Z ej                  ee       evej                  e<    eeUd3      ej                  e<   d] Zd^ Z ej                  d_      Zej                  e        ej                  eedZ[       d` Z ej                  ee       evej                  e<    eeUd3      ej                  e<   da Zdb Zdc Z ej                  dd      Zej                  ew        ej                  ee        ej                  eedZ[       eej                  e<    eeUd3      ej                  e<   de Zdf Zdg Zdh Zdi Zdj Zdk Zdl Zdm Z ej                  dn      Zej                  e       ej                  e        ej                  ee        ej                  ee       eej                  e<    eeUd3      ej                  e<   do Zdp Zdq Zdr Zds Zdt Z ej                  du      Zej                  e       ej                  e       eej<                  e<   eej>                  e<    ej                  ee       eej                  e<    eeUd3      ej                  e<   dv Zddd/dwdx	 ddyZdz Zd{ Zdd[d|Zd} Zd~ Zd Zd Zd Z ej                  d      Zej                  e       ej                  e        ej                  ee       dD ]  Z ej                  e eee[      e[       !  ej                  ee        eee      ej                  e<    eeUd3      ej                  e<   dd/dddZ ej                  d      Zd Zej                  e       d Zej                  e       dd[dZ ej                  ee       dD ]  Z ej                  e eee[      e[       ! d Z ej                  ee       d Zeej                  e<    eeUd3      ej                  e<   d Zd Zd Zd Zd Z ej                  d      Zej                  e        ej                  ee       eej                  e<    eeUd3      ej                  e<    ej                  e eee(j                               ddd/ddZd Zd Zd Zd Zd Z ej                  d      Zej                   eej                  e              ej                  ee       ej                  e       eej                  e<    eeUd3      ej                  e<   d Zd Zd Zd Z ej                  d      Zej                  e       ej                  e        ej                  ee«       eej                  e<    eeUd7      ej                  e<   d Z ej                  d      Zd Zej                  eǫ       d Zej                   eeeƐj                               d Z ej                  eeʫ       d Zeej                  e<    eeUd7      ej                  e<   d Z ej                  ee̫       ej                  j                  d         ej                  ej                  d        d Zej                  j                  eΫ       d Z ej                  ej                  eϫ       d Zeej                  ej                  <   dd/dddZ ej                  d      Zd Zej                  eԫ       d Zej                  eի       dd[dZ ej                  ee֫       dD ]  Z ej                  e eee[      e[       ! d Z ej                  ee׫       d Zeej                  e<    eeUd3      ej                  e<   dd/ddZ ej                  d      Zd Zej                  e۫       d Zej                  eܫ       d Z ej                  eeݫ       d Zeej                  e<    eeUd3      ej                  e<   d Z ej                  e eee(j                               d Z ej                  d      Zd Zej                  e       d Z ej                  ee       d Zeej                  e<    eeUd7      ej                  e<   d Z ej                  ee       d Z ej                  d      Zej                  d         ej                  edÄ        dĄ Zej                  e       dń Z ej                  ee       dƄ Zeej                  e<   dǄ Z ej                  dȫ      Zej                  dɄ         ej                  edʄ        d˄ Zej                  e       d̄ Z ej                  ee       d̈́ Zeej                  e<   ej                  j                  d΄         ej                  ej                  dτ        dЄ Zej                  j                  e       dф Z ej                  ej                  e       d҄ Zeej                  ej                  <   ddӄZejJ                  eeej                  dԜZh dգZddքZy)z
Parallelization primitives.
    )annotations)Sequence)partial)	dataclassN)core)config)dispatch)dtypes)effects)	tree_util)SPMDAxisContextShardingContextNamedShardingPartitionSpec)AxisNameShapedArray)ad)batching)mlir)pxla)check_unreduced_args)get_abstract_mesh)abstract_tokenpvary)control_flow)lax)slicing)ir)hlo)
xla_client)Array)canonicalize_axismoveaxissafe_mapsafe_zipunzip2axis_index_groupsc                   t        |t        t        f      s|fn
t        |      s| S fd}t        j                  ||       S )a,  Compute an all-reduce sum on ``x`` over the pmapped axis ``axis_name``.

  If ``x`` is a pytree then the result is equivalent to mapping this function to
  each leaf in the tree.

  Inputs of boolean dtype are converted to integers before the reduction.

  Args:
    x: array(s) with a mapped axis named ``axis_name``.
    axis_name: hashable Python object used to name a pmapped axis (see the
      :func:`jax.pmap` documentation for more details).
    axis_index_groups: optional list of lists containing axis indices (e.g. for
      an axis of size 4, [[0, 1], [2, 3]] would perform psums over the first
      two and last two replicas). Groups must cover all axis indices exactly
      once.

  Returns:
    Array(s) with the same shape as ``x`` representing the result of an
    all-reduce sum along the axis ``axis_name``.

  Examples:
    For example, with 4 XLA devices available:

    >>> x = np.arange(4)
    >>> y = jax.pmap(lambda x: jax.lax.psum(x, 'i'), axis_name='i')(x)
    >>> print(y)
    [6 6 6 6]
    >>> y = jax.pmap(lambda x: x / jax.lax.psum(x, 'i'), axis_name='i')(x)
    >>> print(y)
    [0.         0.16666667 0.33333334 0.5       ]

    Suppose we want to perform ``psum`` among two groups, one with ``device0`` and ``device1``, the other with ``device2`` and ``device3``,

    >>> y = jax.pmap(lambda x: jax.lax.psum(x, 'i', axis_index_groups=[[0, 1], [2, 3]]), axis_name='i')(x)
    >>> print(y)
    [1 1 5 5]

    An example using 2D-shaped x. Each row is data from one device.

    >>> x = np.arange(16).reshape(4, 4)
    >>> print(x)
    [[ 0  1  2  3]
     [ 4  5  6  7]
     [ 8  9 10 11]
     [12 13 14 15]]

    Full ``psum`` across all devices:

    >>> y = jax.pmap(lambda x: jax.lax.psum(x, 'i'), axis_name='i')(x)
    >>> print(y)
    [[24 28 32 36]
     [24 28 32 36]
     [24 28 32 36]
     [24 28 32 36]]

    Perform ``psum`` among two groups:

    >>> y = jax.pmap(lambda x: jax.lax.psum(x, 'i', axis_index_groups=[[0, 1], [2, 3]]), axis_name='i')(x)
    >>> print(y)
    [[ 4  6  8 10]
     [ 4  6  8 10]
     [20 22 24 26]
     [20 22 24 26]]
  c                    t        t        j                  |       d      }|dk(  rt        t	        |       S t        |       S )Nzjax.lax.psum	unreducedr'   )	_get_fromr   typeofNotImplementedErrorunreduced_psum_psum)leaffrom_axesr(   s     P/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/jax/_src/lax/parallel.pybindzpsum.<locals>.bind   sK    dkk$'~>E		&!!D$''41BCC    
isinstancetuplelistr   tree_map)x	axis_namer(   r5   r3   s     ` @r4   psumr>   ;   sH    B )UDMB9,
	
 	HD 
		D!	$$r6   c          	     ,   t        |t        t        f      s|f}|s| S t        d |D              r|t	        d      t        |       t        j                  |       \  }}|D cg c]N  }t        j                  |      t        j                  k(  r$t        j                  |t        j                        n|P }}t        |      }t!        d |D              rg g fx\  }}|D ]$  }|t        |t"                 j%                  |       & fd|rJ t'        |d         nDt)        j*                  |D 	cg c]%  }	t-        j.                         j1                  |	      ' c}	      t        fd|D              }
nmt2        j4                  j6                  r%|D cg c]  }t9        |t        |      |       }
}n.|D cg c]#  }t:        j=                  |t        |      |      % }
}t        j>                  ||
      S c c}w c c}	w c c}w c c}w )Nc              3  <   K   | ]  }t        |t                y wNr8   int.0axiss     r4   	<genexpr>z_psum.<locals>.<genexpr>        54D#	5   >axis_index_groups only supported for sums over just named axesc              3  R   K   | ]  }t        |t        j                          ! y wrA   )r8   r   Tracer)rE   r1   s     r4   rG   z_psum.<locals>.<genexpr>   s     >tZdkk*	*>s   %'c                    s| S t        j                  | D cg c]  }t        |t        | dd             c}      S c c}w )Nndimr   )r   
reduce_sumr"   getattr)r<   rF   pos_axess     r4   
pos_reducez_psum.<locals>.pos_reduce   sI    ^^A,4 6$( !2$618M N  6 7 7  6s   ?
r   c              3  \   K   | ]#  }t        j                  |       |      z   % y wrA   )r   _const)rE   r1   rR   sizes     r4   rG   z_psum.<locals>.<genexpr>   s&     R4SZZd+j.>>Rs   ),r3   r(   ) r8   r9   r:   any
ValueError"_validate_reduce_axis_index_groupsr   tree_flattenr
   dtypenpbool_r   convert_element_typeint32_canonicalize_axis_index_groupsallrC   appendlenmathprodr   get_axis_env	axis_sizer   
_check_vmavaluebind_psum_invariantpsum_pr5   tree_unflatten)r<   r=   r(   leavestreedefl
named_axesaxes_partitionrF   nameout_flatr1   rQ   rR   rU   s               @@@r4   r0   r0      s   	It}	-I	H5955:K:W
U
VV$%67**1-/&';AC67v||A"((* $$Q1012 C& C56GH>v>>,.F2J> 9Zc*+224897
 $\"1%&dYY
S))+55d;STdR6RRH %& &dy1A8IK &h & %& ++dy)90A  C &h & 
	!	!'8	449C" T&&s   'AH8*HH(Hc               j    t        | ||      } t        ||      t        j                  fd|       S )aY  Compute an all-reduce mean on ``x`` over the pmapped axis ``axis_name``.

  If ``x`` is a pytree then the result is equivalent to mapping this function to
  each leaf in the tree.

  Args:
    x: array(s) with a mapped axis named ``axis_name``.
    axis_name: hashable Python object used to name a pmapped axis (see the
      :func:`jax.pmap` documentation for more details).
    axis_index_groups: optional list of lists containing axis indices (e.g. for
      an axis of size 4, [[0, 1], [2, 3]] would perform pmeans over the first
      two and last two replicas). Groups must cover all axis indices exactly
      once, and on TPUs all groups must be the same size.

  Returns:
    Array(s) with the same shape as ``x`` representing the result of an
    all-reduce mean along the axis ``axis_name``.

  For example, with 4 XLA devices available:

  >>> x = np.arange(4)
  >>> y = jax.pmap(lambda x: jax.lax.pmean(x, 'i'), axis_name='i')(x)
  >>> print(y)
  [1.5 1.5 1.5 1.5]
  >>> y = jax.pmap(lambda x: x / jax.lax.pmean(x, 'i'), axis_name='i')(x)
  >>> print(y)
  [0.        0.6666667 1.3333334 2.       ]
  r=   r(   c                    | z  S rA    )vns    r4   <lambda>zpmean.<locals>.<lambda>   s    a!e r6   )r>   
_axis_sizer   r;   )r<   r=   r(   ry   s      @r4   pmeanr|      s4    : 1	5FG!-.!			OQ	//r6   c                   t        t        t        f      sft        d D              rt	        d      t               t              fd}t        j                  ||       S )a#  Compute an all-reduce max on ``x`` over the pmapped axis ``axis_name``.

  If ``x`` is a pytree then the result is equivalent to mapping this function to
  each leaf in the tree.

  Args:
    x: array(s) with a mapped axis named ``axis_name``.
    axis_name: hashable Python object used to name a pmapped axis (see the
      :func:`jax.pmap` documentation for more details).
    axis_index_groups: optional list of lists containing axis indices (e.g. for
      an axis of size 4, [[0, 1], [2, 3]] would perform pmaxes over the first
      two and last two replicas). Groups must cover all axis indices exactly
      once, and on TPUs all groups must be the same size.

  Returns:
    Array(s) with the same shape as ``x`` representing the result of an
    all-reduce max along the axis ``axis_name``.
  c              3  <   K   | ]  }t        |t                y wrA   rB   rD   s     r4   rG   zpmax.<locals>.<genexpr>   rH   rI   rJ   c                L    t        |       } t        j                  |       S NrV   )insert_collective_pvarypmax_pr5   r1   r(   r=   s    r4   r5   zpmax.<locals>.bind   %    "9d3D;;t)?P;QQr6   	r8   r9   r:   rW   rX   rY   r`   r   r;   r<   r=   r(   r5   s    `` r4   pmaxr      j    & 
It}	-I5955:K:W
U
VV$%6756GHR 
		D!	$$r6   c                   t        t        t        f      sft        d D              rt	        d      t               t              fd}t        j                  ||       S )a"  Compute an all-reduce min on ``x`` over the pmapped axis ``axis_name``.

  If ``x`` is a pytree then the result is equivalent to mapping this function to
  each leaf in the tree.

  Args:
    x: array(s) with a mapped axis named ``axis_name``.
    axis_name: hashable Python object used to name a pmapped axis (see the
      :func:`jax.pmap` documentation for more details).
    axis_index_groups: optional list of lists containing axis indices (e.g. for
      an axis of size 4, [[0, 1], [2, 3]] would perform pmins over the first
      two and last two replicas). Groups must cover all axis indices exactly
      once, and on TPUs all groups must be the same size.

  Returns:
    Array(s) with the same shape as ``x`` representing the result of an
    all-reduce min along the axis ``axis_name``.
  c              3  <   K   | ]  }t        |t                y wrA   rB   rD   s     r4   rG   zpmin.<locals>.<genexpr>  rH   rI   rJ   c                L    t        |       } t        j                  |       S r   )r   pmin_pr5   r   s    r4   r5   zpmin.<locals>.bind  r   r6   r   r   s    `` r4   pminr      r   r6   c                x    t        |t        t        f      rt        d|       t	        | t        | |      |      S Nz(pargmin only accepts a single axis, got )r8   r9   r:   	TypeError_axis_index_of_valr   r<   r=   s     r4   pargminr     9    	E4=)
>ykJ
KK	AtAy19	==r6   c                x    t        |t        t        f      rt        d|       t	        | t        | |      |      S r   )r8   r9   r:   r   r   r   r   s     r4   pargmaxr     r   r6   c           
     D   t        |      }|| k(  }t        j                  |t        j                  |j                  |      t        j                  |j                  t        j                  |j                        j                  |j                              }t        ||      S rA   )

axis_indexr   selectfullshaper
   iinfor[   maxr   )r<   valr=   idxmaskvalidxs         r4   r   r     sr    9#
($::dhhtzz3/hhtzz6<<		+B+F+F		RT& 
fi	  r6   c                    | y t        t        d | D                    }| D ch c]  }|D ]  }|  c}}t        |      k7  rt        d      y c c}}w )Nc              3  2   K   | ]  }t        |        y wrA   rc   )rE   groups     r4   rG   z5_validate_reduce_axis_index_groups.<locals>.<genexpr>'  s     CUCs   z5axis_index_groups must cover all indices exactly once)rangesumsetrX   )r(   
axis_spacegis       r4   rY   rY   $  sY    
SC1BCCD*".AA.qa.a.#j/A
L
MM B.s   Ac                :    | y t        t        t         |             S rA   )r9   mapr'   s    r4   r`   r`   +  s    
	s5+,	--r6   c                b    t        j                  t        t        j                  ||      |       S )aM  Perform a collective broadcast and replicate from ``source``.

  This is equivalent to
  ```
  def pbroadcast(x, axis_name, source):
    masked = jnp.where(axis_index(axis_name) == source, x, zeros_like(x))
    return psum(masked, axis_name)
  ```
  but implemented in a hardware optimized way.

  If ``x`` is a pytree then the result is equivalent to mapping this function to
  each leaf in the tree.

  This function is an analog of the CollectiveBroadcast HLO.

  Args:
    x: array(s) with a mapped axis named ``axis_name``.
    axis_name: hashable Python object used to name a pmapped axis (see the
      :func:`jax.pmap` documentation for more details).
    source: int, representing which index into ``axis_name`` that should be copied.

  Returns:
    Array(s) with ``x`` being copied from the ``source`` index slice of ``axis_name``.
  r=   source)r   r;   r   pbroadcast_pr5   )r<   r=   r   s      r4   
pbroadcastr   1  s-    2 
		l9VDa
I Ir6   c                p    t        t        t        f      sffd}t        j                  ||       S )a|  Perform a collective permutation according to the permutation ``perm``.

  If ``x`` is a pytree then the result is equivalent to mapping this function to
  each leaf in the tree.

  This function is an analog of the CollectivePermute HLO.

  Args:
    x: array(s) with a mapped axis named ``axis_name``.
    axis_name: hashable Python object used to name a pmapped axis (see the
      :func:`jax.pmap` documentation for more details).
    perm: list of pairs of ints, representing
      ``(source_index, destination_index)``
      pairs that encode how the mapped axis named ``axis_name`` should be
      shuffled. The integer values are treated as indices into the mapped axis
      ``axis_name``. Any two pairs should not have the same source index or the
      same destination index. For each index of the axis ``axis_name`` that does
      not correspond to a destination index in ``perm``, the corresponding
      values in the result are filled with zeros of the appropriate type.

  Returns:
    Array(s) with the same shape as ``x`` with slices along the axis
    ``axis_name`` gathered from ``x`` according to the permutation ``perm``.
  c           
     z    t        |       } t        j                  | t        t	        t                          S N)r=   perm)r   
ppermute_pr5   r9   r   r1   r=   r   s    r4   r5   zppermute.<locals>.bindi  s0    "9d3D??495UDAQ;R?SSr6   r8   r:   r9   r   r;   r<   r=   r   r5   s    `` r4   ppermuter   N  s5    2 
Ie}	-IT 
		D!	$$r6   c                    t        t        t        f      rt              nffd}t        j                  ||       S )a  Perform a collective send according to the permutation ``perm``.

  If ``x`` is a pytree then the result is equivalent to mapping this function to
  each leaf in the tree.

  This function is an analog of the Send HLO.

  Args:
    x: array(s) with a mapped axis named ``axis_name``.
    axis_name: hashable Python object used to name a pmapped axis (see the
      :func:`jax.pmap` documentation for more details).
    perm: list of pairs of ints, representing ``(source_index,
      destination_index)`` pairs that encode how the mapped axis named
      ``axis_name`` should be shuffled. The integer values are treated as
      indices into the mapped axis ``axis_name``. Any two pairs should not have
      the same source index or the same destination index. For each index of the
      axis ``axis_name`` that does not correspond to a destination index in
      ``perm``, the corresponding values in the result are filled with zeros of
      the appropriate type. The semantics here are platform-specific, and for
      GPU they correspond to NCCL send.

  Returns:
    A compiler token that can be used by precv and lax.optimzation_barrier to
    enforce ordering of collective ops.
  c           
     z    t        |       } t        j                  | t        t	        t                          S r   )r   psend_pr5   r9   r   r   s    r4   r5   zpsend.<locals>.bind  s0    "9d3D<<	c%>N8O<PPr6   r   r   s    `` r4   psendr   o  s;    4 #-Yu"FeIYL)Q 
		D!	$$r6   c                    t        |t        t        f      rt        |      n|f}t        j	                  | t        j                  |j                  |j                        |t        t        t        |                  S )a  Perform a collective recv according to the permutation ``perm``.

  This function is an analog of the Recv HLO.

  Args:
    token: a compiler token, either generated by a matching psend or
      lax.create_token(). This is used to enforce control dependencies between
      collectives.
    out_shape: ShapeDtypeStruct(s) containing the dtype and shape
      of the result.
    axis_name: hashable Python object used to name a pmapped axis (see the
      :func:`jax.pmap` documentation for more details).
    perm: list of pairs of ints, representing ``(source_index,
      destination_index)`` pairs that encode how the mapped axis named
      ``axis_name`` should be shuffled. The integer values are treated as
      indices into the mapped axis ``axis_name``. Any two pairs should not have
      the same source index or the same destination index. For each index of the
      axis ``axis_name`` that does not correspond to a destination index in
      ``perm``, the corresponding values in the result are filled with zeros of
      the appropriate type. The semantics here are platform-specific, and for
      GPU they correspond to NCCL recv.

  Returns:
    Array(s) with the same shape as ``out_shape``.
  )	out_shaper=   r   )
r8   r:   r9   precv_pr5   r   r   r   r[   r   )tokenr   r=   r   s       r4   precvr     sg    4 #-Yu"FeIYL)	  
//9?? UD!" 
 
 r6   c                    t        |      t        t        t        |                  k7  rt        d|       t	        | |t        t        |t        t        |                              S )a0  Convenience wrapper of jax.lax.ppermute with alternate permutation encoding

  If ``x`` is a pytree then the result is equivalent to mapping this function to
  each leaf in the tree.

  Args:
    x: array(s) with a mapped axis named ``axis_name``.
    axis_name: hashable Python object used to name a pmapped axis (see the
      :func:`jax.pmap` documentation for more details).
    perm: list of ints encoding sources for the permutation to be applied to
      the axis named ``axis_name``, so that the output at axis index i
      comes from the input at axis index perm[i]. Every integer in [0, N) should
      be included exactly once for axis size N.

  Returns:
    Array(s) with the same shape as ``x`` with slices along the axis
    ``axis_name`` gathered from ``x`` according to the permutation ``perm``.
  z)`perm` does not represent a permutation: )r   r   rc   rX   r   r:   zip)r<   r=   r   s      r4   pshuffler     sU    & 	Y#eCI&''
@G
HH	!YSuSY/?%@ A	BBr6   c               "    t        | ||||      S )a  Swap the pmapped axis ``axis_name`` with the unmapped axis ``axis``.

  If ``x`` is a pytree then the result is equivalent to mapping this function to
  each leaf in the tree.

  The group size of the mapped axis size must be equal to the size of the
  unmapped axis; that is, we must have
  ``lax.psum(1, axis_name, axis_index_groups=axis_index_groups) == x.shape[axis]``.
  By default, when ``axis_index_groups=None``, this encompasses all the devices.

  This function is a special case of ``all_to_all`` where the pmapped axis of
  the input is placed at the position ``axis`` in the output. That is, it is
  equivalent to ``all_to_all(x, axis_name, axis, axis)``.

  Args:
    x: array(s) with a mapped axis named ``axis_name``.
    axis_name: hashable Python object used to name a pmapped axis (see the
      :func:`jax.pmap` documentation for more details).
    axis: int indicating the unmapped axis of ``x`` to map with the name
      ``axis_name``.
    axis_index_groups: optional list of lists containing axis indices (e.g. for
      an axis of size 4, [[0, 1], [2, 3]] would run pswapaxes over the first
      two and last two replicas). Groups must cover all axis indices exactly
      once, and all groups must be the same size.

  Returns:
    Array(s) with the same shape as ``x``.
  r'   
all_to_all)r<   r=   rF   r(   s       r4   	pswapaxesr     s    : 
Ay$@Q	RRr6   F)r(   tiledc               ^    t              ||ffd	}t        j                  ||       S )a  Materialize the mapped axis and map a different axis.

  If ``x`` is a pytree then the result is equivalent to mapping this function to
  each leaf in the tree.

  In the output, the input mapped axis ``axis_name`` is materialized at the
  logical axis position ``concat_axis``, and the input unmapped axis at position
  ``split_axis`` is mapped with the name ``axis_name``.

  The group size of the mapped axis size must be equal to the size of the
  unmapped axis; that is, we must have
  ``lax.psum(1, axis_name, axis_index_groups=axis_index_groups) == x.shape[axis]``.
  By default, when ``axis_index_groups=None``, this encompasses all the devices.

  Args:
    x: array(s) with a mapped axis named ``axis_name``.
    axis_name: hashable Python object used to name a pmapped axis (see the
      :func:`jax.pmap` documentation for more details).
    split_axis: int indicating the unmapped axis of ``x`` to map with the name
      ``axis_name``.
    concat_axis: int indicating the position in the output to materialize the
      mapped axis of the input with the name ``axis_name``.
    axis_index_groups: optional list of lists containing axis indices (e.g. for
      an axis of size 4, [[0, 1], [2, 3]] would run all_to_all over the first
      two and last two replicas). Groups must cover all axis indices exactly
      once, and all groups must be the same size.
    tiled: when True, all_to_all will divide split_axis into chunks and concatenate
      them along concat_axis. In particular, no dimensions are added or removed.
      False by default.

  Returns:
    When tiled is False, array(s) with shape given by the expression::

      np.insert(np.delete(x.shape, split_axis), concat_axis, axis_size)

    where ``axis_size`` is the size of the mapped axis named ``axis_name`` in
    the input ``x``.

    Otherwise array with shape similar to the input shape, except with split_axis
    divided by axis size and concat_axis multiplied by axis size.
  c           	        t              }r7| j                  |   |z  dk7  rt        d| j                  |    d d| d      || j                  |   k7  r*d}t        |j                  || j                  |               ||k  r|dz  }t	        j
                  | |f      } n"||k(  rnt	        j
                  | |f      } |dz  }t        |       } t        j                  | ||      }s||k7  rt	        j                  ||f      }|S )	Nr   z#The size of all_to_all split_axis (z4) has to be divisible by the size of the named axis z ()z|all_to_all requires the size of the mapped axis axis_name to equal x.shape[split_axis], but they are {} and {} respectively.   )
split_axisconcat_axisr=   r(   r   )
r{   r   rX   formatr   expand_dimsr   all_to_all_pr5   squeeze)	r<   r   r   
group_sizemsgresultr(   r=   r   s	         r4   r5   zall_to_all.<locals>.bind  s7   I'89J	
	z	)Q	.>qwwz?R>S TN%;bA7 8 	8 
qwwz*	*QJ
0CDEE	k	!qOOA~.$OOA~.a
	1-AqZ[)21B%*  ,F Z;.{{6J=1fMr6   )r`   r   r;   )r<   r=   r   r   r(   r   r5   s    `  `` r4   r   r     s1    T 66GH# : 
		D!	$$r6   c          
         t        |t        t        f      s|f}t        |      }t        j                  | |||||||      S )al  Ragged version of :func:`all_to_all` collective.

  We say data are "ragged" when they can be represented as a list of arrays
  whose shapes differ only in the size of the leading axis. For example, these
  data are ragged, comprising four component arrays::

    ragged_data = [jnp.arange(3), jnp.arange(1), jnp.arange(4), jnp.arange(1)]

  We often instead want a contiguous representation, e.g. for batching. But
  because the shapes of the components differ, we can't apply ``jnp.stack`` to
  represent these data by a single rectangular array with the leading axis
  indexing the component arrays. So instead of stacking, we concatenate along
  the leading axis and keep track of offsets and sizes.

  That is, we can represent ragged data contiguously using a triple of dense
  arrays ``(data, offsets, sizes)``:

    * ``data``: the concatenated component arrays,
    * ``offsets``: 1D array of indices into the leading axis of ``data``
      indicating where the data for each component array begins,
    * ``sizes``: 1D array of sizes of the leading axis of each component array.

  We refer to this triple as a ragged array. (Offsets can't be computed from
  sizes in general to allow for internal padding.)

  For example::

    data: f32[8,3] = jnp.array([
        [a,b,c], [d,e,f], [g,h,i], [j,k,l], [m,n,o], [p,q,r], [s,t,u], [v,w,x],
    ])
    offsets: i32[3] = jnp.array([0, 1, 4])
    sizes: i32[3] = jnp.array([1, 3, 4])

    # To extract the first component array, of type f32[1,3]
    data[offsets[0]:offsets[0]+sizes[0]]

    # To extract the second component array, of type f32[3,3]
    data[offsets[1]:offsets[1]+sizes[1]]

    # To extract the third component array, of type f32[4,3]
    data[offsets[2]:offsets[2]+sizes[2]]

  The ``ragged_all_to_all`` collective operation communicates slices of ragged
  arrays between devices. Each caller is both a sender and a receiver. The
  ``input_offsets`` and ``send_sizes`` arguments indicate the slices of the
  caller's ``operand`` to be sent. Received results are returned in an array
  that has the same value of the argument ``output`` except with received values
  written at some slices. The ``output_offsets`` argument does *not* indicate
  the offsets at which all the received results are written; instead,
  ``output_offsets`` indicates the offsets at which the *sent* slices are
  written on their corresponding receivers. The sizes of received slices are
  indicated by ``recv_sizes``. See below for details.

  The arrays ``input_offsets``, ``send_sizes``,``output_offsets``, and
  ``recv_sizes`` must all be the same length, and that length must be divisible
  by the size of the mapped axis ``axis_name``. Moreover, ``send_sizes`` and
  ``recv_sizes`` must satisfy::

    jnp.all(send_sizes == jax.lax.all_to_all(recv_sizes, axis_name, 0, 0, tiled=True))

  Specifically, given a call::

    result = ragged_all_to_all(operand, output, input_offsets, send_sizes,
                               output_offsets, recv_sizes, axis_name)

  the caller sends data like::

    assert len(input_offsets) == len(send_sizes) == len(output_offsets) == len(recv_sizes)
    N = len(input_offsets)
    slices_per_device, leftover = divmod(N, lax.axis_size(axis_name))
    assert not leftover

    for i in range(N):
      dst_idx = i // slices_per_device
      SEND(data=operand[input_offsets[i]:input_offsets[i]+send_sizes[i]],
           axis_name=axis_name, to_axis_index=dst_idx)

  and receives data in ``result`` like::

    result = output
    output_offsets_ = jax.lax.all_to_all(output_offsets, axis_name, 0, 0, tiled=True)
    for i in range(N):
      src_idx = i // slices_per_device
      result = result.at[output_offsets_[i]:output_offsets_[i]+recv_sizes[i]
                    ].set(RECEIVE(axis_name=axis_name, from_axis_index=src_idx))

  where ``SEND`` and ``RECEIVE`` are pseudocode. Notice that a caller's local
  ``output_offsets`` does not indicate the offsets at which its local ``result``
  is updated; instead, it indicates where the corresponding sent slices are
  written on their destination instances. To compute the local offsets at which
  received data are written, we apply an ``all_to_all`` on ``output_offsets``.

  For example, if we apply a ``ragged_all_to_all`` along an axis of size 2, with
  these arguments in each mapped function instance::

    axis index 0:
      operand = [1, 2, 2]
      output = [0, 0, 0, 0]
      input_offsets = [0, 1]
      send_sizes = [1, 2]
      output_offsets = [0, 0]
      recv_sizes = [1, 1]

    axis index 1:
      operand = [3, 4, 0]
      output = [0, 0, 0, 0]
      input_offsets = [0, 1]
      send_sizes = [1, 1]
      output_offsets = [1, 2]
      recv_sizes = [2, 1]

  then::

    axis index 0:
      result = [1, 3, 0, 0]

    axis index 1:
      result = [2, 2, 4, 0]

  Args:
    operand: data array of shape (N, A, B, ...) representing concatenated
      (possibly padded) ragged data to be sent.
    output: data array of shape (M, A, B, ...) to update with received data.
    input_offsets: 1D integer array of shape (K,) representing the offsets of
      leading-axis slices into ``operand`` to be sent.
    send_sizes: 1D integer array array of shape (K,) representing the sizes of
      leading-axis slices into ``operand`` to be sent.
    output_offsets: 1D integer array of shape (K,) representing where the
      corresponding sent data is written on each corresponding receiver.
    recv_sizes: 1D integer array of shape (K,) representing sizes of
      leading-axis slices into ``output`` to update with received data.
    axis_name: name of the mapped axis over which to perform the communication.
    axis_index_groups: optional list of lists containing axis indices (e.g. for
      an axis of size 4, [[0, 1], [2, 3]] would run ragged all to all over the
      first two and last two replicas). Groups must cover all axis indices
      exactly once, and all groups must be the same size. Otherwise, the
      behavior is undefined.

  Returns:
    Array of shape (M, A, B, ...) with the same value as the ``output`` except
    with received data written into slices starting at
    ``all_to_all(output_offsets, axis_name, 0, 0, tiled=True)`` and with size
    ``recv_sizes``.
  ru   )r8   r9   r:   r`   ragged_all_to_all_pr5   )operandoutputinput_offsets
send_sizesoutput_offsets
recv_sizesr=   r(   s           r4   ragged_all_to_allr   9  sS    h 
It}	-I56GH		!	!'6=*"0*,54E 
" 
G Gr6   r=   c                    t        | t        t        f      st        j	                  |       S d}t        j                  d      }t        |       D ]!  }|t        |      |z  z  }|t        |      z  }# |S )ab  Return the index along the mapped axis ``axis_name``.

  Args:
    axis_name: hashable Python object used to name the mapped axis.

  Returns:
    An integer representing the index.

  For example, with 8 XLA devices available:

  >>> mesh = jax.make_mesh((8,), 'i', axis_types=(jax.sharding.AxisType.Explicit,))
  >>> @jax.shard_map(mesh=mesh, in_specs=(), out_specs=jax.P('i'))
  ... def f():
  ...   return lax.axis_index('i')[None]
  ...
  >>> f()
  Array([0, 1, 2, 3, 4, 5, 6, 7], dtype=int32)

  >>> mesh = jax.make_mesh((4, 2), ('i', 'j'),
  ...                       axis_types=(jax.sharding.AxisType.Explicit,) * 2)
  >>> @jax.shard_map(mesh=mesh, in_specs=(), out_specs=jax.P('i', 'j'))
  ... def f():
  ...   return lax.axis_index(('i', 'j'))[None, None]
  ...
  >>> f()
  Array([[0, 1],
         [2, 3],
         [4, 5],
         [6, 7]], dtype=int32)
  r=   r   r   )
r8   r9   r:   axis_index_pr5   r   asarrayreversedr   rg   )r=   
inner_sizeindexrr   s       r4   r   r     su    > 
It}	-y11JKKNE# $z$*,,eIdO#j$ Lr6   c                    t        |       S )aC  Return the size of the mapped axis ``axis_name``.

  Args:
    axis_name: hashable Python object used to name the mapped axis.

  Returns:
    An integer representing the size.

  For example, with 8 XLA devices available:

  >>> mesh = jax.make_mesh((8,), 'i', axis_types=(jax.sharding.AxisType.Explicit,))
  >>> @jax.shard_map(mesh=mesh, in_specs=jax.P('i'), out_specs=jax.P())
  ... def f(_):
  ...   return lax.axis_size('i')
  ...
  >>> f(jnp.zeros(16))
  Array(8, dtype=int32, weak_type=True)

  >>> mesh = jax.make_mesh((4, 2), ('i', 'j'),
  ...                       axis_types=(jax.sharding.AxisType.Explicit,) * 2)
  >>> @jax.shard_map(mesh=mesh, in_specs=jax.P('i', 'j'), out_specs=jax.P())
  ... def f(_):
  ...   return lax.axis_size(('i', 'j'))
  ...
  >>> f(jnp.zeros((16, 8)))
  Array(8, dtype=int32, weak_type=True)
  )r{   r   s    r4   rg   rg     s    8 
I	r6   c               4    t        |      }t        d| |      S )Nr   r'   )r`   r>   ru   s     r4   r{   r{      s     
 66GH	a.?	@@r6   r3   c                v    t        |t        t        f      s|f}t        j	                  | |t        |            S )z>Uses the last positional axis of idx to index into src's axes.r3   )r8   r9   r:   	pgather_pr5   )srcr   r3   s      r4   pgatherr   )  s0    	D5$-	(7D	SuT{	33r6   c                T    ||    }t        |t        t        f      rt        |      S |fS rA   )r8   r9   r:   )pnameparams
axis_namess      r4   _names_in_paramr   2  s,    e}*
UDM*=r6   c                *   j                   |v sJ |rt        t        fd|D              }|r| j                  |||      }| t        u r't        j                  |j                        |z  }|d fS | t        t        fv r|}|d fS t        d|        )Nc              3  B   K   | ]  }|j                   k7  s|  y wrA   rr   )rE   ry   	axis_datas     r4   rG   z&_constant_reduction.<locals>.<genexpr><  s     :a9>>&91:   rV   zUnrecognized reducer: )rr   r.   r9   r5   rk   r   rT   rU   r   r   	Exception)primr   argr3   r(   new_axesouts    `     r4   _constant_reductionr   9  s    	4		11:d::(
))Ch:K)
LC	V^
**S)..
)C
/C
 
d	 
C 
d ,TF3
44r6   c                   |t        d      |t        j                  u s|dk(  r|nt        |d|      }|t        j                  u r" |d|      \  }}| j	                  ||d       }|S  |d|      \  }	}
| j	                  |
|	d       }|S )NzSaxis_index_groups not supported in vmap collectives. Please open a feature request!r   rV   )r.   r   
not_mapped	_moveaxisr5   )r   rx   dr(   transform_unmappedtransform_mappedunmapped_axesunmapped_vals_inunmapped_vals_outmapped_axesmapped_vals_inmapped_vals_outs               r4   "_reduction_with_positional_batcherr  G  s    "
 ? @ @###qAva9Q13E!(

&8A&>#M#		"248 " : 0A 6+~IIn;04  6/	r6   c          	         | j                   rJ t        d D              s| j                  ||      |fS t        | |||fdfd      }||t        j
                  u r|fS dfS )Nc              3  <   K   | ]  }t        |t                y wrA   rB   rD   s     r4   rG   z%_reduction_batcher.<locals>.<genexpr>Z       4tZc"4rI   rV   c                    |fS rA   rw   r  rx   r3   s     r4   rz   z$_reduction_batcher.<locals>.<lambda>^  s    D!9 r6   c                2     t         fdD              |fS )Nc              3  R   K   | ]  }t        |t              r||k\  z   n|   y wrA   rB   )rE   rF   r  s     r4   rG   z7_reduction_batcher.<locals>.<lambda>.<locals>.<genexpr>_  s0      ," 1;40E$$!),4O ,s   $'r9   r  s   ` r4   rz   z$_reduction_batcher.<locals>.<lambda>_  s!    E ,&*, , r6   r   )multiple_resultsrW   r5   r  r   r  )r   rx   r  r3   r(   val_outs      `  r4   _reduction_batcherr  X  sy    """	"	4t4	499QT5F9GJJ.
Aq#' 
qH///!	66Q	66r6   c           	     6   | j                   rJ ||c\  }\  }~~|3j                  v rt        | ||      S | j                  ||      |fS j                  vrt	        | |||      S t        | |||fdfd      }	|	t        j                  fS )NrV   c                R    t        fdD               |j                        fS )Nc              3  B   K   | ]  }|j                   k7  s|  y wrA   r   rE   rF   r   s     r4   rG   zB_batched_reduction_collective.<locals>.<lambda>.<locals>.<genexpr>~  s     J449>>3I$Jr   )r9   rU   )r  rx   r3   r   if_unmappeds     r4   rz   z/_batched_reduction_collective.<locals>.<lambda>~  s%    EJ4JJ9>>24 r6   c                4     t         fdD              |fS )Nc              3  t   K   | ]/  }t        |t              r||k\  z   n|j                  k7  r|n 1 y wrA   )r8   rC   rr   )rE   rF   r   r  s     r4   rG   zB_batched_reduction_collective.<locals>.<lambda>.<locals>.<genexpr>  sH      M?C 1;40E$$!),!Y^^3 LP9:; Ms   58r  )r  rx   r3   r   s   ` r4   rz   z/_batched_reduction_collective.<locals>.<lambda>  s$    E MGKM M r6   )r  rr   r   r5   r  r  r   r  )
r   r  r   vals_indims_inr3   r(   rx   r  r  s
    ``  `    r4   _batched_reduction_collectiver"  e  s     """	"*$1wY~~ y!T;LMMYYqt7HYI1LL^^4a1BD D /
Aq#4	' 
(%%	%%r6   c                    t        j                  | |      }|+|D cg c]  }|D ]  }|D cg c]  }||   	 c}  }}}}|S c c}w c c}}}w rA   )r   axis_groups)axis_envr=   r(   replica_groups
axis_groupaxis_index_groupr   s          r4   _replica_groupsr)    sx    ##Hi8."(6A A$.?A* /??z!}? A? AN A 
 @ As   AAAAc           	         t        j                  t        t        j                  | ddi      t         j
                        j                  }t        j                  j                  t        j                  |            S )N	fillvaluer[   )r\   arrayr:   	itertoolszip_longestint64Tr   DenseIntElementsAttrgetascontiguousarray)r&  groupss     r4   _replica_groups_hlor7    sZ     88D..M"MN(($$%A 			 	 	$	$R%9%9&%A	BBr6   c                   |J t        d |D              st        j                  | |||      S t        d |D              sJ  |||      S )Nc              3  <   K   | ]  }t        |t                y wrA   rB   rD   s     r4   rG   z"_allreduce_impl.<locals>.<genexpr>  r  rI   rV   c              3  <   K   | ]  }t        |t                y wrA   rB   rD   s     r4   rG   z"_allreduce_impl.<locals>.<genexpr>  r  rI   )ra   r	   apply_primitive)r   pos_reducerr   r3   r(   s        r4   _allreduce_implr=    s[    		""	"	4t4	4$$T3T7HJ J	4t4	44	4	S$	r6   c                  t        |d       t        d |D              }t        d |D              }|t        |      dk7  rt        d|       t	        j
                  | gd       t        | gd       t        t        j                  | |      | j                  t        j                  | |            }||D ch c]  }t	        j                  |       c}fS c c}w )Nr>   c              3  B   K   | ]  }t        |t              r|  y wrA   rB   rD   s     r4   rG   z5_allreduce_effectful_abstract_eval.<locals>.<genexpr>       Hd*T32GTH   c              3  B   K   | ]  }t        |t              s|  y wrA   rB   rD   s     r4   rG   z5_allreduce_effectful_abstract_eval.<locals>.<genexpr>       BDJtS,A4BrA  r   zMaxis_index_groups can only be used with reductions over named axes, but got: r   sharding)_check_axis_namesr9   rc   rX   r   check_avals_context_meshr   r   r   _reduce_op_shape_ruler[   _reduce_op_sharding_ruleNamedAxisEffect)avalr3   r(   rp   rQ   out_avalrF   s          r4   "_allreduce_effectful_abstract_evalrM    s    D&!HdHH*BDBB("
8} //3f6 7 7/vv&	84djj++Dx@B( 
:F4D((.F	FFFs   :Cc               n    t         j                  j                  st        |||      S t	        | ||      S )NrV   r   )r   rh   ri   rM  _psum_invariant_abstract_eval)rr   rK  r3   r(   s       r4   _pmin_pmax_abstract_evalrP    s5    				 	 -4+<> >	&tT	==r6   c                    t        d | D              }t        j                         }|D ]%  }|j                  |      rt	        d| d| d       y )Nc              3  B   K   | ]  }t        |t              r|  y wrA   rB   rD   s     r4   rG   z$_check_axis_names.<locals>.<genexpr>  r@  rA  zFound an unbound axis name: z. To fix this, please call z under `jax.shard_map`.)r9   r   rf   axis_exists	NameError)r3   api_namerp   r%  rr   s        r4   rF  rF    sc    HdHH* ( 1d%( /Z.01 11r6   c                    t         j                  j                  rt        j                  S | j
                  j                         S rA   )r   !jax_collectives_common_channel_idri   r   COLLECTIVE_CHANNEL_IDmodule_contextnew_channel_id)ctxs    r4   _get_channelr\    s2    --33%%%,,..r6   c               J    j                   \  }|Edj                  j                  v r-t        |d         t	        fd|D              rt        d      g g fx\  }}|D ]$  }	|t        |	t                 j                  |	       & r't        j                  |d      fd}
 |
||      }|s|gS t        t        j                  j                  ||            j                  j                  }t        |t        t         f       fd} |||      gS )	Ntpur   c              3  :   K   | ]  }t        |      k7    y wrA   r   )rE   r   len_0s     r4   rG   z&_allreduce_lowering.<locals>.<genexpr>  s     
6q3q6U?
6   z<axis_index_groups must all be the same size for TPU loweringFr  c                   | j                  t        j                  t        j                  | j                  t        j
                                    }j                  d | g|g      } ||t                    \  }|S )Nr-  r   	primitiveavals_in	avals_outr   )updater\   deleter.  r   r1  replacer9   )rK  r   aval_outreducer_ctxr   r[  positional_axesreducers        r4   _positional_reducez/_allreduce_lowering.<locals>._positional_reduce  sn    		"((4::RXX>)+  ,h KK$$H:KVk[#E/,BCdcjr6   c           	        r`t        t        j                  j                  t	        
      t
        j                        t        j                  j                  d            }ni }t        j                  |j                  g|gfdi|}t        j                  d| j                  t        | j                  j                   t#                           }t        j$                  |      }|j&                  d   j(                  j+                  ||      }t        j,                  |      5  t        j.                  j0                  d      }
j3                  d |gd	z  |g
      } ||g|j4                   }	t        j6                  t        j8                  |	             d d d        |j:                  S # 1 sw Y   |j:                  S xY w)NTchannel_handleuse_global_device_idsr&  rw   rD  r   Frb     re  )dictr   ChannelHandler4  r\  r   DEVICE_TO_DEVICE_TYPEr   BoolAttrAllReduceOptyper   r   r[   r   rE  meshPaval_to_ir_typeregionsblocksrb   InsertionPoint	lower_funr5   rk  	argumentsreturn_flatten_ir_valuesr   )rK  r<   
other_argsopscalar_avalscalar_typereducer_blocklower_reducerrm  	out_nodesr[  is_spmdr   r&  s             r4   
all_reducez'_allreduce_lowering.<locals>.all_reduce  sr   **..3!;!;= " 57j
 j		
1#
D&4
D8B
DB""
DJJt}}/A/A13!GIK&&{3KJJqM((//[IM			=	) 5nnTYYGmKK$*5):{m   UkFm.E.EFi	kk$((345 995 99s   &A6F00G)rg  rY  	platformsrc   rW   rX   r8   rC   rb   r   r  r7  r)  r%  axis_contextr   r   )r   pos_fnr[  r   r3   r(   aval_inrp   rq   rF   rp  r  r  r  r`  rn  ro  r&  s   ` `          @@@@@r4   _allreduce_loweringr    s    \\('"1C1C1M1M(M!!$%E

6$5
66UVV13R7*o 7d:dC()0067 nnVe<G Wc
*C	5L&c((11:')*. ##00,|o%GH', Wc
"	##r6   c                   g g fx\  }}|D ]$  }|t        |t                 j                  |       & rfd} || |      } t        j	                  | t        |      |      fS )Nc                    t        j                  |      sJ t        |       t         j                  u rt        j                  |j                        S t        j                  | |d       d   S )N)r3   out_shardingr   )r   is_undefined_primalr{  ZerorK  r   _reduce_sum_transpose_rule)ctr   rQ   s     r4   broadcast_positionalz2_psum_transpose_rule.<locals>.broadcast_positional  s_    ##C(((	bRWW	RWWSXX%66++B(9=??@B Br6   rV   )r8   rC   rb   rk   r5   r9   )	ctsr   r3   r(   rp   rq   rF   r  rQ   s	           @r4   _psum_transpose_ruler    s    *,b&0*h 7d:dC()0067 B
 sC
(C ++cj 1(9  ; 
= =r6   r>   c                    || z  S rA   rw   rx   rg   s     r4   rz   rz     s    iRSm r6   r   c                    | S rA   rw   r  s     r4   rz   rz         a r6   r   c                    | S rA   rw   r  s     r4   rz   rz   #  r  r6   c               x   t        | j                  j                  |d       }t        |d         t	        fd|D              \  }}t        |      t        t        |            k(  r t        |      t        t        |            k(  s| d}t        |j                  |            t        j                  t        |      t        |      dft        j                        }t        |      D ]<  \  }	}
t        |
      }
t        |      D ]  \  }\  }}|
|   ||	|df<   |
|   ||	|df<     > |j                  d      }| j                  j                  }t        |t               xr |j"                  }|rEt%        t&        j(                  j+                  t-        |       t.        j0                              }||fS i }||fS )Nr   c              3  8   K   | ]  \  }}|z  |z  f  y wrA   rw   )rE   r   dstr   s      r4   rG   z0_pcollectives_lowering_common.<locals>.<genexpr>*  s$     PxsCsZ'z)9:Ps   z1 sources and destinations must be unique, got {}.ru  r   )r,  ru  rs  )r)  rY  r%  rc   r&   r   rX   r   r\   zerosr1  	enumeratesortedreshaper  r8   r   manual_axesrv  r   rw  r4  r\  r   rx  )r[  r=   r   op_namer&  srcsdstsr   	full_permr   grpjr   r  r  	is_manualr  r   s                    @r4   _pcollectives_lowering_commonr  '  s   "3#5#5#>#>	4P.>!$%*P4PP*$
d)s3t9~
%#d)s3t9~*EIH
IC
SZZ%
&&hhN+SY:BHHE).) $fa
+C"4 $:Cs8i1as8i1a$$
 ()##00,/ #

"
"  ((,,t99
J 
J	 J	J	r6   c                   t        | ||d      \  }}t        j                  |t        j                  |      fi |j
                  S )Nr   r=   r   r  )r  r   CollectivePermuteOpr   dense_int_elementsresults)r[  r<   r=   r   r  r  s         r4   _ppermute_loweringr  G  sM    7	YT:)Z 
	 	 	 	 	+
;/9
;;B7Cr6   c                f    t        |      \  }}t        t        ||            }t        | ||      gS r   )r&   r:   r   r   )tr<   r   r=   r  r  inverse_perms          r4   _ppermute_transpose_ruler  O  s1    d|*$c$o&,
1	
=	>>r6   c                   | j                   | j                  c}||c\  }\  }t        |t        t        f      s|f}| j                  |vrt
        j                  |||      |fS t        fd|D              }|rt
        j                  |||      |fS |d   k(  sJ d       t        |      |k(  sJ d       |t        j                  u r||fS t        j                  |t              }	|D ]
  \  }
}|
|	|<    |j                  |	|      |fS )N)r   r=   c              3  .   K   | ]  }|k7  s	|  y wrA   rw   )rE   rF   
frame_names     r4   rG   z$_ppermute_batcher.<locals>.<genexpr>[  s     J$tz7IJs   
r   z*ppermute batcher called with a wrong axis!z(Permutation doesn't match the axis size!r-  )rU   rr   r8   r9   r:   r   r5   rc   r   r  r\   r  rC   take)r   r   r!  r=   r   rg   rx   r  remaining_axesperm_indicesr   r  r  s               @r4   _ppermute_batcherr  T  s   #..)..)Z*$1	It}	-I^^9$??149?=q@@J)JJ.??14>?BAEE	1	#Q%QQ	#	Ti	K!KK	(

a4K)3/, hc3L	
a	 !	##r6   c               R    t        |d       t        d||        t        | gd       | S )Nr   )rF  collective_vma_ruler   r<   r=   r   s      r4   _raise_to_shaped_abstract_evalr  g  s)    Iz*j)Q/sJ'	
(r6   r   T)frozenc                      e Zd Zd Zd Zd Zy)SingleSideCollectiveEffectc                     y)Nzone-sided communicationrw   )_s    r4   rz   z#SingleSideCollectiveEffect.<lambda>w  s    r6   c                     t        t              S rA   )hashr  )selfs    r4   __hash__z#SingleSideCollectiveEffect.__hash__x  s    *++r6   c                "    t        |t              S rA   )r8   r  )r  others     r4   __eq__z!SingleSideCollectiveEffect.__eq__z  s    e788r6   N)__name__
__module____qualname____str__r  r  rw   r6   r4   r  r  u  s    /',9r6   r  c                   t         fddD              rt        d      t         ||d      \  }}t        j                         }t        j
                  |g|fdt        j                  |      i|} j                  j                  }t        |t              st        d      t        j                         }	t        j                  j                  j                  |	_        t        j"                  ||	       |j$                  S )Nc              3  N   K   | ]  }|j                   j                  v  y wrA   )rY  r  )rE   pr[  s     r4   rG   z&_psend_lowering_gpu.<locals>.<genexpr>  s"     I1#$$..	.Is   "%)cudarocmz+psend is currently only implemented on GPUsr   r  source_target_pairsz-psend currently only supports manual sharding)ra   r.   r  r   create_tokenSendOpr   r  rY  r  r8   r   xc
OpShardingTypeMANUALr{  set_shardingr  )
r[  r<   r=   r   r  r  r   send_opaxis_ctxrE  s
   `         r4   _psend_lowering_gpur    s    I8HII
K
LL7	YT7)Z 


%JJc 11)< 		' ,,(	Ho	.
M
NN]]_(--$$++(-GX&	r6   c               j    t        |d       t        h t        t        j                  |      t
        fS )Nr   )rF  r   r   r   rJ  single_side_collective_effectr  s      r4   _psend_abstract_evalr    s;    Iw'	 
4+# 
 r6   r   gpu)platformc                   t        d      )Nz*psend is currently only implemented on GPUr.   )r[  r<   r=   r   s       r4   _psend_loweringr        HIIr6   c                  t        | ||d      \  }}t        j                  t        j                  |      |j
                  g|fdt        j                  |      i|}| j                  j                  }t        |t              st        d      t        j                         }	t        j                  j                  j                  |	_        t        j                   ||	       |j"                  }
|
d   gS )Nr   r  r  z-precv currently only supports manual shardingr   )r  r   RecvOpr   r~  r{  r  rY  r  r8   r   r.   r  r  r  r  r  r  )r[  r   r   r=   r   r  r  recv_opr  rE  r  s              r4   _precv_lowering_gpur    s    7	YT7)Z JJI&

3 11)< 		' ,,(	Ho	.
M
NN]]_(--$$++(-GX& OO'
!*r6   c               J    |h t        t        j                  |      t        fS rA   )r   r   rJ  r  )r   r   r=   r   s       r4   _precv_abstract_evalr    s2     
 4c$..	: 424 
4 4r6   r   c                   t        d      )Nz*precv is currently only implemented on GPUr  )r[  r   r   r=   r   s        r4   _precv_loweringr    r  r6   c                    t        |      |k(  }t        | |      }t        j                  |t        j                  | |      t        j                  | d            gS Nr   )r   r>   r   r   	full_like)r  r<   r   r=   	is_sourcetsums         r4   _pbroadcast_transpose_ruler    sI    #v-)	a	$
**Ya 6a8K
L	MMr6   c                     j                   }||c\  }\  }t        |t        t        f      s|f} j                  |vrt
        j                  |||      |fS t         fd|D              }|rt        d      |d    j                  k(  sJ d       |dk\  r||k  sJ d       |dk(  r|rt
        j                  |||      |fS |t        j                  u r||fS |j                  |g|z  |      |fS )	Nr   c              3  B   K   | ]  }|j                   k7  s|  y wrA   r   r  s     r4   rG   z&_pbroadcast_batcher.<locals>.<genexpr>  s     N$ty~~7MNr   z.pbroadcast batcher only supports a single axisr   z,pbroadcast batcher called with a wrong axis!z2collective broadcast doesn't fit in the axis size!r   )r   r=   )rU   r8   r9   r:   rr   r   r5   r.   r   r  r  )	r   r   r!  r=   r   rg   rx   r  r  s	   `        r4   _pbroadcast_batcherr    s   nn)*$1	It}	-I^^9$Q)FCQFFN)NN.
N
OO	1	'W)WW	'	1)+a-aa	+!^QvH!KK(

a4K	
9$a	(!	++r6   c                  t        | j                  j                  |d       }fd}|D cg c]
  } ||       }}t        | j                  j                  t
        t        f      }|rDt        j                  j                  t        |       t        j                        }t        |      }	ni }	t        j                  |fdt        |      i|	j                   S c c}w )Nc                R    |    gt        | d        z   t        | dz   d        z   S )Nr   )r:   )r   r   s    r4   source_to_frontz-_pbroadcast_lowering.<locals>.source_to_front  s4    &M?T%.11Dvz{9K4LLLr6   r  r&  )r)  rY  r%  r8   r  r   r   r   rw  r4  r\  r   rx  rv  CollectiveBroadcastOpr7  r  )
r[  r<   r=   r   r&  r  r   r  rs  r  s
      `      r4   _pbroadcast_loweringr    s    "3#5#5#>#>	4P.M8FGuOE*G.G	%%('  &&**<+<+/+E+EGN^4JJ		"	"
+N;
?I
G Hs   Cr   c                    t        |j                        D cg c]
  }|| k7  s	| }}|j                  ||        t        j                  ||      S c c}w rA   )r   rN   insertr   	transpose)r   r  r<   r   r   s        r4   r  r    sG    166]	/a3h!	/$	/++c3	q$	 
0s
   
AAc                    t        |j                        }||    |z  dk(  sJ ||    |f       |||    |z  g|| | dz    |j                  |      S )Nr   r   r:   r   r  )rF   factorr<   	new_shapes       r4   
_splitaxisr    sa    177m)	46	!Q	&A4&(AA	&"IdOv$=>)Da	
9	r6   c                    t        |j                        }|j                  |    |j                  | dz      z  g|| | dz    |j                  |      S )Nr   ru  r  )rF   r<   r  s      r4   	_foldaxisr    sK    177m)GGDMAGGD1H,==>)Da	
9	r6   c          	     ~   ~t        | j                  j                  ||      }t        |d         dk(  r|gS t        |d         t	        fd|D              st        d      t        | j                  j                  t        t        f      }|rDt        j                  j                  t        |       t        j                        }	t!        |	      }
ni }
t        j"                  |gft        j$                  |      t        j$                  |      t        j$                        t'        |      d|
j(                  S )Nr   r   c              3  :   K   | ]  }t        |      k(    y wrA   r   rE   r   split_counts     r4   rG   z'_all_to_all_lowering.<locals>.<genexpr>)       ;q[CF";ra  $Replica groups must be equally sizedr  )split_dimensionconcat_dimensionr  r&  )r)  rY  r%  rc   ra   rX   r8   r  r   r   r   rw  r4  r\  r   rx  rv  
AllToAllOpi64_attrr7  r  )r[  r<   r   r   r=   r(   r   r&  r  rs  r  r  s              @r4   _all_to_all_loweringr    s    "3#5#5#>#>	#46.	q 3JN1%&+	;N;	;
;
<<	%%('  &&**<+<+/+E+EGN^4JJ	C
MM*-]];/k*&~6
 
 'r6   c                &    t        | |||||      fS Nr=   r   r   r(   r   r   )r  r<   r=   r   r   r(   r   s          r4   _all_to_all_transpose_ruler   @  s(     	) 
 r6   c               h    | \  }|\  }t         j                  |||||k  z   |||k  z   ||      }	|	|fS r  )r   r5   )
r   r!  r=   r   r   r(   r   r<   r  r   s
             r4   _all_to_all_batcherr"  K  sV    "!"!qJ/k!12)  & 
r6   c           	     $   |t        d      | j                  | j                  }	}t        |t        t
        f      r|}
n|g}
|	|
vrt        |||||||      S |\  }|\  }|t        j                  u r&t        j                  ||g|j                        }d}t        |t        t
        f      r|j                  |	      }|d | ||dz   d  }}nd\  }}|si|sg||k(  r<|||k  z   }|}t        |||      }|||k  z  }t        |t        |||f||f            |fS t        |t!        |||            }t        |||      |fS t        j"                  t!        |d|      d      d}}|dz  }|dz  }|rt$        j'                  |||d||      }|dk(  sJ t        |||      }|}|||k  z  }|dz  }|rt$        j'                  |||d||      }t        dt        d|            }|dz  }|dz  }|dz  }t        |dz
  t!        d|dz
  |            }|dz  }||fS )	NPlease open a feature request!r  r   r   )rw   rw   )r   ru     ru  )r.   rU   rr   r8   r:   r9   r"  r   r  r   	broadcastr   r   r  r  r#   r  r   r   r5   )r   r   r!  r=   r   r   r(   r   rg   r  
axes_namesr<   r  pos
major_axes
minor_axesrF   d_pre_splitx_concatnew_ds                       r4   _all_to_all_batched_collectiver.  Y  s    "
>
??#..)..Z)	D%=)JJz!w)
1B%Q Q "!"!(

 	a).agg./A	A	D%=)
//*
%C&t_ia.A
J#J
	J[ 1
?+dk
T9a
(aDAIatXa!TT1I>?LL;	![!(DEh
Ix8*DD 
1a+V	4aQ!/*;!#;!z%/Q,= % 	 	'A
 
a-Y*!
%*+,+/*!z%/Q,= % 	 	'A 9Q?#!/*;!#;UaZUa1kAoq!AB!1*%	
E/r6   c                   ~t        |t        t        f      s|f}t        |d       t	        | gd       t        | j
                        }|t        |      nt        |d         }||   |z  dk(  sJ ||   |f       ||xx   |z  cc<   ||xx   |z  cc<   t        d||       }| j                  t        |      d|      }	h t        t        j                  |      }
|	|
fS )Nr   r   F)r   	weak_typevma)r8   r:   r9   rF  r   r   r{   rc   r  ri  r   r   rJ  )
input_avalr=   r   r   r(   r   r   rg   r1  rL  r   s              r4   #_all_to_all_effectful_abstract_evalr3    s     	Ie}	-II|,
|\2
z
 % 
	"  #$ 
 
z	Y	&!	+KeJ.?-KK	+
	!	!L)Z@#U5\UL(3c$&&	23'	7	r6   c                     t        d      )NzGall_to_all must be used within a mapped context like vmap or shard_map.RuntimeErrorargskwargss     r4   _all_to_all_implr:         0 	1 1r6   r   c                  t        | j                  j                  ||      }	t        |	d         t	        fd|	D              st        d      dt        |	      i}
t        | j                  j                  t        t        f      }|rIt        j                  j                  t        j                  j                  d      t!        |             |
d<   t#        j$                  |j&                  g||||||gt        j(                  j                  d      t        j*                  j                  |
      t        j                  j                  t        j                  j                  d      d	      
      j,                  S )Nr   c              3  :   K   | ]  }t        |      k(    y wrA   r   r  s     r4   rG   z._ragged_all_to_all_lowering.<locals>.<genexpr>  r  ra  r  r&  @   
channel_idr          )r   inputscall_target_namebackend_configapi_version)r)  rY  r%  rc   ra   rX   r7  r8   r  r   r   r   IntegerAttrr4  IntegerTypeget_signlessr\  r   CustomCallOpr{  
StringAttrDictAttrr  )r[  r   r   r   r   r   r   r=   r(   r&  ragged_all_to_all_attrsr  r  s               @r4   _ragged_all_to_all_loweringrM    s:    #3#5#5#>#>	#46. N1%&+	;N;	;
;
<< +N; 	%%'IK',.NN,>,>
##B'c):-L) 
		kk]v}j.}}(()<=[[__%<=..$$R^^%@%@%DaH
 Gr6   c                   ~ ~t        j                  |j                  t        j                        st        d      t        j                  |j                  t        j                        st        d      t        j                  |j                  t        j                        st        d      t        j                  |j                  t        j                        st        d      t        |j                        dk7  s|j                  d   dk  r$t        dj                  |j                              t        |j                        dk7  s|j                  d   dk  r$t        dj                  |j                              t        |j                        dk7  s|j                  d   dk  r$t        d	j                  |j                              t        |j                        dk7  s|j                  d   dk  r$t        d
j                  |j                              t        |d       |j                  |j                  d      }h t        t        j                  |      }	||	fS )Nz5ragged_all_to_all input_offsets must be integer type.z2ragged_all_to_all send_sizes must be integer type.z6ragged_all_to_all output_offsets must be integer type.z2ragged_all_to_all recv_sizes must be integer type.r   r   z]ragged_all_to_all input_offsets must be rank 1 with positive dimension size, but got shape {}zZragged_all_to_all send_sizes must be rank 1 with positive dimension size, but got shape {}z^ragged_all_to_all output_offsets must be rank 1 with positive dimension size, but got shape {}zZragged_all_to_all recv_sizes must be rank 1 with positive dimension size, but got shape {}r   F)r   r0  )r
   
issubdtyper[   r\   integerrX   rc   r   r   rF  ri  r   r   rJ  )
r   r   r   r   r   r   r=   r(   rL  r   s
             r4   *_ragged_all_to_all_effectful_abstract_evalrQ    s	     			=..

	;
L
MM			:++RZZ	8
I
JJ			>//	<
M
NN			:++RZZ	8
I
JJ		"m&9&9!&<q&@
	""(&)<)<"=  			a:#3#3A#6#:
	""(&)9)9":  			!#~';';A'>'B
	,,2F>3G3G,H  			a:#3#3A#6#:
	""(&)9)9": 
 I23]]]?(3c$&&	23'	7	r6   c                x   | ^}}}|^}}}t        j                  ||g|i |}	t        |      t        |      cxu rt        j                  u r&n n#t        j                  j                  |	      }
|	|
fS t        j                  |      }t        j                  |      }t        j                  ||g|i |}
|	|
fS rA   )r   r5   r{  r   r  from_primal_valueinstantiate_zeros)primalstangentsr   r   r   sizes_and_offsetsoperand_dot
output_dotr  r   
result_dots              r4   _ragged_all_to_all_jvpr[  	  s    (/%'6% (+zA##v5)5-35&	+$z*5bgg5**62J 
		 &&{3K%%j1J$))Z?"3?7=?J		r6   c          
        t        |       t        j                  u rnt        j                  |      rt        j                  |j                        nd }	t        j                  |      rt        j                  |j                        nd }
ndt        j
                  |j                        }t        ||ddd      }t        ||ddd      }t        j                  | |||||||      }	t        j                  t        j                  | j                  d   dd      j                  |   j                  d      j                  ||z      j!                  d            }t        j"                  |g t%        d| j&                              }t        j(                  || j                  t+        t%        | j&                              	      }t        j,                  |t        j.                  |       |       }
|	|
gd gd
z  z   S )Nr   T)r   ru   r_   r-  r   r,  )r   broadcast_dimensionsrA  )r{  r   r  r  rK  zeros_like_avalr   r   r5   r   cumsumr   r   r   atr   addr   r   rN   broadcast_in_dimr9   r   _zeros)r  r   r   r   r   r   r   r=   r(   	operand_toutput_tzerooutput_offsets_input_offsets_r   s                  r4   _ragged_all_to_all_transposeri    s    
!W)+)?)?)H%dI')'='=f'Erwwv{{#4Hgll+D AqMOy!QdKN#((	4*nj/@ ) BI Qg.11/BFFqI	Oj(	**-#b'3D ??4!55AFF#3!56DAGG%PUVWV\V\P]J^_Dzz$

1q1H
X	$!	++r6   c                N   | j                   |v rt        d      |rt        d      | j                  fd}d }fd}t        t	        t
        j                        |d d |d d       \  }}	|j                  d   |	j                  d   }}
t        ||dd  |dd        \  }}}}|t        j                  |j                        d d d f   |
z  z  }|t        j                  |j                        d d d f   |z  z  }||	||||f} |t        t        ||      d|i      }|d	fS )
Nr$  c                    | j                   dk(  sJ |t        j                  | dd       S |dk(  r| S | j                  S )Nru  r   )rN   r   r&  r2  )r<   r  rU   s     r4   bdim_at_secondz=_ragged_all_to_all_batched_collective.<locals>.bdim_at_second4  sJ    66Q;;45IHq$40 "aA"SS"r6   c                B     | j                   dg| j                  dd   S )Nr,  ru  r  r   )r<   s    r4   mergez4_ragged_all_to_all_batched_collective.<locals>.merge8  s!    yqyy2aggabk22r6   c                F     | j                   dg| j                  dd   S )Nr,  r   rn  )r<   rU   s    r4   splitz4_ragged_all_to_all_batched_collective.<locals>.split9  s$    yqyyr8AGGABK88r6   )rU   ru  r   r=   r   )rr   r.   rU   r   r   r   bdim_at_frontr   r   iotar[   r   )r   r   r!  r=   r(   rl  ro  rq  r   r   NMr   r   r   r   r   rU   s                    @r4   %_ragged_all_to_all_batched_collectiverv  ,  s8   ^^y 
>
??
>
??	$" 38 6 6TBGBQKQXY[Z[Q\]/'6	q	6<<?Q!	.'!"+wqr{3 8-^Z388M//6tQw?!CC-CHH^1148qAAEE.V]J
R'"Cw$7M9MN&	r6   c                     t        d      )NzNragged_all_to_all must be used within a mapped context like vmap or shard_map.r5  r7  s     r4   _ragged_all_to_all_implrx  E  r;  r6   r   c                    t         j                  j                  s|S t        | t              s| fn| } t        j                  |      t        |       j                  z  }t        |t	        fd|D                    }|S )Nc              3  @   K   | ]  }|j                   vs|  y wrA   r1  )rE   ry   rK  s     r4   rG   z*insert_collective_pvary.<locals>.<genexpr>Z  s     A1q/@QAs   )
r   rh   ri   r8   r9   r   get_avalr   r1  r   )r=   r<   names_unionrK  s      @r4   r   r   S  se    				 	 H",Y">ylI)	q	$I)+AuAAAB!	
(r6   varying)r(   rF   r   toc                   ddh}||vrt        d|       |dk(  rt        | ||||      S |dk(  sJ |t        t        | |||      S )a  Gather values of x across all replicas.

  If ``x`` is a pytree then the result is equivalent to mapping this function to
  each leaf in the tree.

  This is equivalent to, but faster than, all_to_all(broadcast(x)).

  Args:
    x: array(s) with a mapped axis named ``axis_name``.
    axis_name: hashable Python object used to name a pmapped axis (see the
      :func:`jax.pmap` documentation for more details).
    axis_index_groups: optional list of lists containing axis indices (e.g. for
      an axis of size 4, [[0, 1], [2, 3]] would run all gather over the first
      two and last two replicas). Groups must cover all axis indices exactly
      once, and all groups must be the same size.
    axis: a positional axis into which the chunks along ``axis_name`` will be
      concatenated.
    tiled: when ``False``, the chunks will be stacked into a fresh positional
      axis at index ``axis`` in the output. When ``True``, ``axis`` has to
      refer to an existing positional dimension and the chunks will be
      concatenated into that dimension.

  Returns:
    Array(s) representing the result of an all-gather along the axis
    ``axis_name``. Shapes are the same as ``x.shape``, but:

    - when ``tiled`` is ``False``, there is a new dimension equal to the
      size of axis ``axis_name`` in position ``axis``,
    - when ``tiled`` is ``True``, the size of dimension in position ``axis``
      is multiplied by the size of axis ``axis_name``.

  For example, with 4 XLA devices available:

  >>> x = np.arange(4)
  >>> y = jax.pmap(lambda x: jax.lax.all_gather(x, 'i'), axis_name='i')(x)
  >>> print(y)
  [[0 1 2 3]
   [0 1 2 3]
   [0 1 2 3]
   [0 1 2 3]]

  An example of using axis_index_groups, groups split by even & odd device ids:

  >>> x = np.arange(16).reshape(4, 4)
  >>> print(x)
    [[ 0  1  2  3]
     [ 4  5  6  7]
     [ 8  9 10 11]
     [12 13 14 15]]
  >>> def f(x):
  ...   return jax.lax.all_gather(
  ...       x, 'i', axis_index_groups=[[0, 2], [3, 1]])
  >>> y = jax.pmap(f, axis_name='i')(x)
  >>> print(y)
  [[[ 0  1  2  3]
    [ 8  9 10 11]]
   [[12 13 14 15]
    [ 4  5  6  7]]
   [[ 0  1  2  3]
    [ 8  9 10 11]]
   [[12 13 14 15]
    [ 4  5  6  7]]]
  r~  reducedzMGot unexpected `to` value for `jax.lax.all_gather`. Allowed `to` values are: )r(   rF   r   rF   r   )rX   _all_gatherr.   all_gather_reduced)r<   r=   r(   rF   r   r  _allowed_ag_tos          r4   
all_gatherr  ]  s    B y).~
	&'	)* * 9_q)7H / / ??$aUCCr6   c                   t        t              sfs| S t              t              fd}t	        j
                  ||       S )Nc           	         t        |       } t        j                  | t        rt	        j
                  |       nt	        j
                  |       dz               S )Nr   all_gather_dimensionr=   r(   rg   r   )r   all_gather_pr5   r"   r\   rN   )r1   rF   r(   r=   rg   r   s    r4   r5   z_all_gather.<locals>.bind  s\    "9d3D.5"''$-bggdma.?A/@5  * *r6   )r8   r9   r`   r{   r   r;   )r<   r=   r(   rF   r   r5   rg   s    ```` @r4   r  r    sT    	Iu	%I	H56GH$56)* * 
		D!	$$r6   c                   t        d      )Nz#Unexpected call to _all_gather_impl)AssertionError)r<   r  r=   r(   rg   r   s         r4   _all_gather_implr    s    <==r6   c                  | j                   \  }| j                  \  }	| j                  j                  }
t	        |
t
        t        f      }|st        |j                        }|j                  |d       t        t        |            D cg c]
  }||k7  s	| }}t        j                  t        j                  |j!                  |            |t        j"                  |            }t%        | j                  j&                  ||      }|r`t)        t        j*                  j-                  t/        |       t        j0                        t2        j4                  j-                  d            }ni }t        j6                  t        j                  |	      g|gft        j8                  |      t;        |      d|j<                  S c c}w )Nr   rd  Trr  )all_gather_dimr&  )rg  rh  rY  r  r8   r   r   r:   r   r	  r   rc   r   rb  r   r~  ri  dense_int_arrayr)  r%  rv  rw  r4  r\  rx  r   ry  AllGatherOpr  r7  r  )r[  r<   r  r=   r(   rg   r   r  x_avalrL  r  r  r  r   r]  r&  r  s                    r4   _all_gather_loweringr    s~    LL'&mm)(##00,|o%GH'	V\\"I)1-',S^'<Z!EY@YAZZV]]];<a12	4A #3#5#5#>#>	$57. ((,,t99; kkood35J
 J	H%&c
--(<=(8
 		
 G	# [s   
G G c           	         t         j                  j                  s
t               S t	        |t
              s|fn|}t        fd|D              rt        d|  dj                   d| d      j                  S )Nc              3  :   K   | ]  }|j                   v  y wrA   r{  )rE   ar  s     r4   rG   z&collective_vma_rule.<locals>.<genexpr>  s     0&**	0ra  zCollective z4 must be applied to a device-varying  type, but got z& for collective acting over axis name z. Please open an issue at https://github.com/jax-ml/jax/issues and as a temporary workaround pass the check_vma=False argument to `jax.shard_map`)	r   rh   ri   	frozensetr8   r9   rW   rX   r1  )	prim_namer=   r  s     `r4   r  r    s    				 	 ;",Y">ylI)0i00

i[ ! ** &# %J	JK K 
r6   c               L   t        |t        t        f      s|f}t        |d       t	        | gd       t        | j
                        }|r||xx   |z  cc<   n|j                  ||       t        d||       }| j                  ||      h t        t        j                  |      fS )Nr  r   r1  )r8   r:   r9   rF  r   r   r	  r  ri  r   r   rJ  )r  r  r=   r(   rg   r   r  out_vmas           r4   #_all_gather_effectful_abstract_evalr    s     
Ie}	-II|,x.6<< )
"#y0#)95i@'
--iW-
5
1C$$i0
1
3 3r6   c               $    t        | ||||      fS )N)r=   scatter_dimensionr(   r   )psum_scatter)r  r<   r  r=   r(   rg   r   s          r4   _all_gather_transpose_ruler     s!    
si)=):"$ 
& &r6   c                   ||c\  }\  }	|	t         j                  ur|	|k  r|dz  }n|s|	dz  }	| t        u rt        j                  ||||||      }
|
|	fS | t        u sJ t        j                  |||||      }
|
|	fS )Nr   r  r  r=   rg   r   )r   r  r  r5   all_gather_invariant_p)r   r   r!  r  r=   r(   rg   r   r<   r  r   s              r4   _all_gather_batcherr    s    *$1h!!!  a1fa	\	 4	+y  F 19))))#((	 4	5 ) *F 19r6   c	           
     |   |j                   |j                  }
}	|
|vrt        | |||||||      S |t        d      ||	k(  sJ d       t	        |t
              s|f}t        |      dkD  rt        d      ||
fk(  sJ d       ||c\  }\  }|t        j                  u rot        t        j                  |            }|j                  ||       t        t        |            D cg c]
  }||k7  s	| }}t        j                  |||      }nt!        |||      }|rt#        ||      }|t        j                  fS c c}w )Nr  'axis_index_groups not supported in vmapaxis size doesn't matchr   r$  #batcher called with wrong axis name)rU   rr   r  r.   r8   r9   rc   r   r  r:   r\   r   r	  r   r   rb  r  r  )r   r   r   r!  r  r=   r(   rg   r   
frame_sizer  r<   r  r   r   broadcast_dimsys                    r4   _all_gather_batched_collectiver    sT    %>>9>>j*y gw5I/@5* * "
G
HH	j	 ;";;	 	Iu	%I^a
>
??	zm	#J%JJ	#*$1(

RXXa[!I)95!&s9~!6TA!?S:SaTNTQ	>:A!)1-A
&*A	
H	 Us   #
D9.D9r  )r  r  r^  r  c                   t        t              sfs| S t        d      t              fd}t	        j
                  ||       S )a[  Gather values of x across all replicas.

  If ``x`` is a pytree then the result is equivalent to mapping this function to
  each leaf in the tree.

  all_gather_invariant differs from all_gather in the following ways:

  * all_gather_invariant is Varying -> Invariant.
    For example: `out: f32[8] = all_gather_invariant(inp: f32[4]{V: x}, 'x')`
    where the size of mesh axis `x` is 2.
    While all_gather is Varying -> Varying.

  * all_gather_invariant transposes to dynamic_slice which is
    Invariant -> Varying. While all_gather transposes to reduce_scatter
    which is Varying -> Varying.
  Nc           	        t        j                  |       j                  }|z
  x}rt        | t	        |            } t
        j                  | t        rt        j                  |       nt        j                  |       dz               S Nr   r  )
r   r-   r1  r   r9   r  r5   r"   r\   rN   )r1   in_vma
vary_namesaxes_rF   r=   rg   r   s      r4   r5   z"all_gather_invariant.<locals>.bind^  s    [[""FV^#z#4z*+d!&&.teRWWT]/1wwt}q/@By	 ' ? ?r6   )r8   r9   r{   r  r   r;   )r<   r=   rF   r   r5   r  rg   s    ``` @@r4   all_gather_invariantr  G  sS    " 
Iu	%I	HD))
I
%? ? 
		D!	$$r6   r  c               >   t        d       t        | gd       t        | j                        }|r||xx   |z  cc<   n|j	                  ||       t        fd| j                  D              }| j                  ||      h t        t        j                        fS )Nr  c              3  ,   K   | ]  }|vs|  y wrA   rw   rE   rx   r=   s     r4   rG   z@_all_gather_invariant_effectful_abstract_eval.<locals>.<genexpr>u       BAq	/AaB   	r  )rF  r   r:   r   r	  r  r1  ri  r   r   rJ  )r  r  r=   rg   r   r  r  s     `    r4   -_all_gather_invariant_effectful_abstract_evalr  k  s     I56x!786<< )
"#y0#)95BBB'
--iW-
5
1C$$i0
1
3 3r6   c                   t         rA   r  r<   r  r=   rg   r   s        r4   _all_gather_invariant_implr  |      r6   c          
     (    t        | |||d |||      S N)r  r=   r(   rg   r   r  r  r[  r<   r  r=   rg   r   r  s          r4   _all_gather_invariant_loweringr    s#    		1#79	
 r6   c                   t        | j                  |   |      \  }}|rJ t        |      |z  }t        j                  | |||      }	|r|	fS t        j                  |	|g      fS )N)
slice_sizerF   )divmodr   r   r   dynamic_slice_in_dimr   r   )
r  r<   r  r=   rg   r   r  remr   r   s
             r4   $_all_gather_invariant_transpose_ruler    sn    399%9:IF/*c.9
*#$$	3:,@	B##Is{{31E0FGIIr6   c                0    t        t        | ||||d ||	      S rA   )r  r  r   r   r!  r  r=   rg   r   s          r4   (_all_gather_invariant_batched_collectiver    s&     
(i';Oy%
) )r6   c                  |j                   \  }|j                  \  }	|j                  d      }
t        |j                  j
                  ||      }t        |j                        }||xx   |z  cc<   |j                  j                  }t        |t        t        f      }|r`t        t        j                  j                  t!        |      t"        j$                        t&        j(                  j                  d            }ni }t        j*                  t#        j,                  |j                  |            |ft#        j.                  |      t1        |      d|}t#        j,                  |
      }|j2                  d   j4                  j7                  ||      }t'        j8                  |      5  t#        j:                  | j<                  d      }|j?                  d |
gd	z  |
g
      } ||g|j@                   }t        jB                  t#        jD                  |             d d d        |r|jF                  S t        jH                  t#        j,                  |	      |jJ                        gS # 1 sw Y   KxY w)Nrw   rd  Trr  )r  r&  r   Frb  ru  re  )&rg  rh  ri  r)  rY  r%  r:   r   r  r8   r   r   rv  r   rw  r4  r\  r   rx  r   ry  ReduceScatterOpr~  r  r7  r  r  rb   r  r  r5   rk  r  r  r  r  r  r   )r   r[  r<   r  r=   r(   rg   r   r  rl  r  r&  scatter_out_shaper  r  r  r  r  r  r  rm  r  s                         r4   _reduce_scatter_loweringr    s    LL'&mm)(B'+"3#5#5#>#>	#46.6<<(%&94&##00,('  ((,,t99; kkood35J
 J

6==/@=AB &78(8	
 	" $$[1+**Q-&&--k;G-	' 3NN499uEM++(3}q'8)4  7K kDM,C,CDIKK&&y123 ::KK,,X6		BCC3 3s   .A6I..I7c                  t        |t        t        f      s|f}t        |d       t	        | gd       t        | j
                        }| j
                  |   }|r"||z  dk7  rt        d| d|       ||z  ||<   n||k7  rt        d| d|       ||= t        d||       }| j                  ||      h t        t        j                  |      fS )Nreduce_scatterr   4tiled reduce_scatter operand scatter dimension size " must be divisible by shard_count .reduce_scatter operand scatter dimension size  must match shard count r  )r8   r:   r9   rF  r   r   rX   r  ri  r   r   rJ  )	r  r=   r  r(   rg   r   r  scatter_dim_input_sizer1  s	            r4   '_reduce_scatter_effectful_abstract_evalr    s
    
Ie}	-II/0x!126<< )!<<(9:
	)Q.M01 2&&/[2 3 3 $:Y#FI *G011I#& ' ' 	#$,i@#
--iS-
1
1C$$i0
1
3 3r6   c               $    t        | ||||      fS )N)r=   r(   rF   r   )r  )r  r<   r=   r  r(   rg   r   s          r4   _reduce_scatter_transpose_ruler    s     
SI'8+5: 
< <r6   c               v    | |c\  }\  }||k  r|dz  }n|s|dz  }t         j                  ||||||      }	|	|fS )Nr   r  r=   r(   rg   r   )reduce_scatter_pr5   )
r   r!  r  r=   r(   rg   r   r<   r  r   s
             r4   _reduce_scatter_batcherr    sd    *$1
FA  )) ! & 
r6   c           	        | j                   | j                  }	}|	|vrt        |||||||      S |t        d      ||k(  sJ d       t	        |t
              s|f}t        |      dkD  rt        d      ||	fk(  sJ d       ||c\  }
\  }|t        j                  u r|
|z  |}}n)t        j                  |
dt        j                  |f      |}}|rt        |||      }||fS )Nr  r  r  r   r$  r  g        )rU   rr   r  r.   r8   r9   rc   r   r  r   reducera  r  )r   r   r!  r  r=   r(   rg   r   r  r  r<   r  r  dys                 r4   _reduce_scatter_collectiver    s    %>>9>>j*y ",=/@5* * "
G
HH	j	 ;";;	 	Iu	%I^a
>
??	zm	#J%JJ	#*$1(

	M,rAJJq"cggt,.?rA
2y!$A	
B,r6   r  r  r(   r   c               x    t        |t              s|fn|s| S fd}t        j                  ||       S )a  
  Like ``psum(x, axis_name)`` but each device retains only part of the result.

  For example, ``psum_scatter(x, axis_name, scatter_dimension=0, tiled=False)``
  computes the same value as ``psum(x, axis_name)[axis_index(axis_name)]``, but
  it is more efficient. Thus the ``psum`` result is left scattered along the
  mapped axis.

  One efficient algorithm for computing ``psum(x, axis_name)`` is to perform a
  ``psum_scatter`` followed by an ``all_gather``, essentially evaluating
  ``all_gather(psum_scatter(x, axis_name))``. So we can think of
  ``psum_scatter`` as "the first half" of a ``psum``.

  Args:
    x: array(s) with a mapped axis named ``axis_name``.
    axis_name: hashable Python object used to name a mapped axis (see the
      :func:`jax.pmap` documentation for more details).
    scatter_dimension: a positional axis into which the all-reduce result along
      ``axis_name`` will be scattered.
    axis_index_groups: optional list of lists of integers containing axis
      indices. For example, for an axis of size 4,
      ``axis_index_groups=[[0, 1], [2, 3]]`` would run reduce-scatter over the
      first two and the last two axis indices. Groups must cover all axis
      indices exactly once, and all groups must be the same size.
    tiled: boolean representing whether to use rank-preserving 'tiled' behavior.
      When ``False`` (the default value), the size of dimension in
      ``scatter_dimension`` must match the size of axis ``axis_name`` (or the
      group size if ``axis_index_groups`` is given). After scattering the
      all-reduce result along ``scatter_dimension``, the output is squeezed by
      removing ``scatter_dimension``, so the result has lower rank than the
      input. When ``True``, the size of dimension in ``scatter_dimension`` must
      be divisible by the size of axis ``axis_name`` (or the group size if
      ``axis_index_groups`` is given), and the ``scatter_dimension`` axis is
      preserved (so the result has the same rank as the input).

  Returns:
    Array(s) with the similar shape as ``x``, except the size of dimension in
    position ``scatter_dimension`` is divided by the size of axis ``axis_name``
    (when ``tiled=True``), or the dimension in position ``scatter_dimension`` is
    eliminated (when ``tiled=False``).

  For example, with 4 XLA devices available:

  >>> x = np.arange(16).reshape(4, 4)
  >>> print(x)
  [[ 0  1  2  3]
   [ 4  5  6  7]
   [ 8  9 10 11]
   [12 13 14 15]]
  >>> y = jax.pmap(lambda x: jax.lax.psum_scatter(x, 'i'), axis_name='i')(x)
  >>> print(y)
  [24 28 32 36]

  if using tiled:

  >>> y = jax.pmap(lambda x: jax.lax.psum_scatter(x, 'i', tiled=True), axis_name='i')(x)
  >>> print(y)
  [[24]
   [28]
   [32]
   [36]]

  An example of using axis_index_groups:

  >>> def f(x):
  ...   return jax.lax.psum_scatter(
  ...       x, 'i', axis_index_groups=[[0, 2], [3, 1]], tiled=True)
  >>> y = jax.pmap(f, axis_name='i')(x)
  >>> print(y)
  [[ 8 10]
   [20 22]
   [12 14]
   [16 18]]
  c                    t        t        j                  |       d      }|dk(  rt        t	        |       S t        |       S )Nzjax.lax.psum_scatterr+   r  r   r  )r,   r   r-   r.   unreduced_psum_scatter_psum_scatter)r1   r2   r3   r(   r  r   s     r4   r5   zpsum_scatter.<locals>.bindy  se    dkk$'/EFE		&!!#
(9H H 49J->eM Mr6   r8   r9   r   r;   )r<   r=   r  r(   r   r5   r3   s     ``` @r4   r  r  *  s;    X (	59)y$	H	M 
		D!	$$r6   c                   t        t              sfs| S t              t              fd}t	        j
                  ||       S )Nc                R    t        |       } t        j                  |       S )N)r=   r  r(   rg   r   )r   r  r5   )r1   r(   r=   rg   r  r   s    r4   r5   z_psum_scatter.<locals>.bind  s8    "9d3D  	5F+y ! O Or6   )r8   r9   r{   r`   r   r;   )r<   r=   r  r(   r   r5   rg   s    ```` @r4   r  r    sV    	Iu	%I	H$56)56GHO O
 
		D!	$$r6   c           
       
 ddl m t        t              r&sJ d       t	              dkD  rt        d      \  |j                  vrt        d       | j                  j                  }t        |j                        j                        }t        |t              r|j                  r|j                  t        |j                  j                         k7  r|j"                  |   dk(  rUt%        j&                  t(        j*                  j-                  t/        j0                  dt.        j2                                    S fd
 t5        j6                  
fd	      |       d   S |j8                  t;        j<                  |j"                        z  }t5        j>                  t/        j@                  |t;        j<                  |j"                  |dz   d        z  t.        jB                              }t5        j>                  t/        j@                  |j"                  |   t.        jB                              }t        |tD        t        f      rt%        jF                         }nt%        jH                         }t%        jJ                  t%        jL                  ||      |      }	t%        jN                  t(        jP                  j-                  g t(        jR                  jU                  d
            |	      S )Nr   )	shard_mapzempty axis namer   zC`axis_index` translation rule does not support multiple axis names.zunbound axis name: r-  c                 0    t         j                         S Nr   )r   r5   r   s   r4   fz)_build_axis_index_lowering_hlo.<locals>.f  s    33r6   c                 8       ddt                            gS )NFrw   )	check_vmain_specs	out_specs)r}  )r  r  s   r4   rz   z0_build_axis_index_lowering_hlo.<locals>.<lambda>  s&     *1%&S* , - r6   r@  )+jax._src.shard_mapr  r8   r9   rc   r.   namesrT  rY  r  r:   r   r   r  r  r|  r   sizesr   constantr   DenseElementsAttrr4  r\   r   r_   r   r  nrepsrd   re   ir_constantr.  uint32r   partition_id
replica_id	remainderdivideconvertRankedTensorTyperG  rH  )r[  r=   r%  r  axis_pos	nreplicasdivmod	device_idunsigned_indexr  r  s    `        @@r4   _build_axis_index_lowering_hlor	    s5   *	5!'''9
9~
OQ QJIhnn$
))5
66##00,(..!''	2( /)L,=,=,H,H"II~~h1$\\"..222::arxx3PQRR4.4>>	-..13346 6 nn		(.. 99)hh
diix!|~ >?
?ryy	#
 	(..":"))LM#@A  "I I==Is!;SA.	b".."="=b"AB
 r6   c               F    t        | || j                  j                        gS rA   )r	  rY  r%  )r[  r=   s     r4   _axis_index_loweringr    s*    
(i),););)D)DF 
G Gr6   c                t   t        j                  |       h}t        | t              s| fn| } t	        | d       t               }t        |t                     }t        j                  j                  r!|j                  rt        |       nt               n	t               }t        dt        j                  ||      |fS )Nr   rw   rE  r1  )r   rJ  r8   r9   rF  r   r   r}  r   rh   ri   _any_axis_manualr  r   r\   r_   )r=   effectr|  rE  r1  s        r4   #_axis_index_effectful_abstract_evalr    s      +,&",Y">ylI)I|,		$4%(## #'"7"7)I
Y[)2 	RH#	>	FFr6   c               b    t        j                  t        j                  | j                        dfS r  )r   rs  r\   r_   rU   )r   r   r!  r=   s       r4   _axis_index_batcherr    s     	"((INN	+Q	..r6   r   c          	        t        d |D              sJ t        | |t        t        |                  }|j                  t        |      d  }|j                  d|z         }d|z   }t        j                  |d      }t        t        |j                  dz
  |j                  |j                  z   dz
              }t        j                  |dd      }t        j                  |||t        |            S )	Nc              3  <   K   | ]  }t        |t                y wrA   rB   rD   s     r4   rG   z _pgather_impl.<locals>.<genexpr>  r  rI   )r,  )r   r   ru  )r   )offset_dimscollapsed_slice_dimsstart_index_map)dimension_numbersslice_sizes)ra   r#   r   rc   r   r  r   r   r9   rN   r   GatherDimensionNumbersgather)	r   r   r3   src_axes_frontnon_axes_shapesrc_one_axis_frontr  r  dnumss	            r4   _pgather_implr     s    	4t4	44	4CuSY'78.!''D	
3.%--en.DE~%+U##eCHHqL#((5G5L5L*Lq*PQR+

(
(%
 
*C5$)+$6
8 8r6   c                   t        |d       t        | j                        }t        d |D        d      D ]  }||=  |j                  t	        |      z   }t        || j                        S )Nr   c              3  B   K   | ]  }t        |t              s|  y wrA   rB   rE   r  s     r4   rG   z)_pgather_abstract_eval.<locals>.<genexpr>  s     <AAs);a<rA  T)reverse)rF  r:   r   r  r9   r   r[   )r   r   r3   r   rF   s        r4   _pgather_abstract_evalr%    sa     D)$
syy/%<<dK dd
))eEl
"%	UCII	&&r6   c                   t        d |D              rt        d       t        j                  t        d      | |||      S )Nc              3  >   K   | ]  }t        |t                 y wrA   rB   rD   s     r4   rG   z-_pgather_parallel_lowering.<locals>.<genexpr>  s     4tZc"	"4   zJpgather only supported in the SPMD lowering.Please open a feature request!Frb  r   )rW   r.   r   r  r   )r[  r   r   r3   s       r4   _pgather_parallel_loweringr)    sG    4t44
 ? @ @	>	>	3$
   r6   c               h  
 |\  }}|\  
}
t         j                  u rt        d      |t         j                  urt        d      t	        
fd|D              }	t        d |D              rt        |||	      t         j                  fS t        j                  |||	      t         j                  fS )Nz;pgather axis {frame.name} is missing from the indexed valuer$  c              3  `   K   | ]%  }|k(  rnt        |t              r||k  z   n| ' y wrA   rB   )rE   rF   dsrcr  s     r4   rG   z._pgather_collective_batcher.<locals>.<genexpr> 	  s?      %   :-4,6tS,A444<( %s   +.c              3  <   K   | ]  }t        |t                y wrA   rB   rD   s     r4   rG   z._pgather_collective_batcher.<locals>.<genexpr>	  s     04D#	0rI   r   )	r   r  rX   r.   r9   ra   r   r   r5   )rg   r  r  r   r!  r3   r   r   didxr   r,  s    `        @r4   _pgather_collective_batcherr/    s    (#s*$	X   
R
SS	$$$
>
?? %  $% %( 	0400 c183F3FFF>>#s>2H4G4GGGr6   r   c                   |t         t        |      }t        j                  |       j                  }||z
  x}rt        | t        |            n| }t        j                  ||      S Nr   )	r.   r  r   r|  r1  r   r9   psum_invariant_pr5   )r1   r3   r(   r  r  pbroadcast_namesr   s          r4   rj   rj   	  sk    "

D/%==""&!&// 
tU+,	-6: 			s		..r6   psum_invariantc               F    t        t        t        j                  | |d       S r   )r=  r2  r   rO   r   r3   s     r4   _psum_invariant_implr7  "	  s    	)3>>3T+/
1 1r6   c          	     t   t        |t              sJ t        |d       t        |      j	                  |j
                        st        d| d|j
                         t        d |D              t        d |D              }t        j                  |g|        t        |g|        t        j                  t        j                  ||      |j                  t        j                  ||      t        fd|j
                  D                    }|D ch c]  }t        j                   |       c}fS c c}w )	Nr>   zpsum is a variant->invariant collective. This means that the axis names mentioned in `axes` passed to `psum` must be present in `jax.typeof(inp).vma`. Got axes= and jax.typeof(inp).vma=c              3  B   K   | ]  }t        |t              r|  y wrA   rB   rD   s     r4   rG   z0_psum_invariant_abstract_eval.<locals>.<genexpr>1	  r@  rA  c              3  B   K   | ]  }t        |t              s|  y wrA   rB   rD   s     r4   rG   z0_psum_invariant_abstract_eval.<locals>.<genexpr>2	  rC  rA  r   c              3  ,   K   | ]  }|vs|  y wrA   rw   )rE   r  rp   s     r4   rG   z0_psum_invariant_abstract_eval.<locals>.<genexpr>8	  s     ?!1J+>A?r  r  )r8   r9   rF  r   intersectionr1  rX   r   rG  r   r   r   rH  r[   rI  r  rJ  )rr   rK  r3   rQ   rL  rF   rp   s         @r4   rO  rO  '	  s   	D%	  	 D&!	T			)
	,,06 2  $z	+, , HdHH*BDBB(-vt$	84djj++Dx@
?txx?
?A( 
:F4D((.F	FFFs   D5c               \    t        t        j                  t        j                  | ||d       S r   r  r   add_prO   r[  r   r3   s      r4   _psum_invariant_lowering_rulerB  =	  s$    	SYYSt/3
5 5r6   c           	     .    t        t        d | |||d       S )Nc                    || z  S rA   rw   r  s     r4   rz   z/_psum_invariant_batching_rule.<locals>.<lambda>D	  s
    Y] r6   )r"  r2  r   r   r!  r3   s       r4   _psum_invariant_batching_rulerF  B	  s     	&:'4
/ /r6   c               `    t        j                  |      sJ t        j                  | |      fS r  )r   r  r   r   r  r   r3   s      r4   _psum_invariant_transpose_rulerI  I	  s*    				$$	$
**SD
)	++r6   c                   | S rA   rw   r6  s     r4   rz   rz   P	  s    3 r6   c                   |gS rA   rw   r[  r<   r3   s      r4   rz   rz   Q	  s    aS r6   c                  t         j                  j                  s| S t        |d       t	        | gd       t        |t              sJ t        |      j                  | j                        rt        d| d|        | j                  | j                  j                  t                     | j                  j                  t        |                  S )Nr   zpvary is a invariant->variant collective. This means that the axis names mentioned in `axes` passed to `pvary` must not be present in `jax.typeof(inp).vma`. Got axes=z and jax.typeof(inp)=)r|  r  )r   rh   ri   rF  r   r8   r9   r   r=  r1  rX   ri  rE  r   unionr  )rK  r3   s     r4   _pvary_abstract_evalrO  S	  s    				 	 KD'"vw'	D%	  	 YDHH%
	,,06 2 6	#$ $
 
dmm228I8K2L	$8 
 
: :r6   c               `    t        j                  |      sJ t        j                  | |      fS r1  )r   r  r2  r5   rH  s      r4   _pvary_transpose_rulerQ  c	  s/    				$$	$


$

/	11r6   c                   t        d |D              rt        | |c\  }\  }t        j                  j	                  ||      }||fS )Nc              3  >   K   | ]  }t        |      t        u   y wrA   )r{  rC   rD   s     r4   rG   z!_pvary_batcher.<locals>.<genexpr>i	  s     ,tds	,r(  r   )rW   r.   r   pvary_pr5   )r   r!  r3   r<   r  r  s         r4   _pvary_batcherrU  h	  sG    ,t,,
*$1
ll%!	
A+r6   c                   t        t              sfs| S t        d       fd}t        j                  ||       S )Nc           	         t         j                  | t        rt        j                  |       nt        j                  |       dz               S r  )all_gather_reduced_pr5   r"   r\   rN   )r1   rF   r=   rg   r   s    r4   r5   z all_gather_reduced.<locals>.bindy	  sL    $$.5"''$-bggdma.?Ay	 % ? ?r6   r8   r9   r{   r   r;   )r<   r=   rF   r   r5   rg   s    ``` @r4   r  r  s	  sB    	Iu	%I	HD))? 
		D!	$$r6   r  c               *   t        d       | j                  st        d| j                  d             | j                  t	              z  st        dd| j                         | j
                  j                  j                  t	              z  r t        d| j                  d       d      t        | j                        }|r||xx   |z  cc<   n|j                  ||       | j
                  }|j                  j                  t              z  }|j                  |j                  j                  |      	      }t        fd
| j                  D              }	| j                  ||	|      h t        t        j                        fS )Nr  z=all_gather_reduced only accepts inputs that are varying. Got TzUall_gather_reduced is a Varying -> Reduced collective. This means that the axis_name=zE passed to `all_gather_reduced` must be present in jax.typeof(x).vma=zRall_gather_reduced's input cannot be reduced across the axis_name provided. Got x= and axis_name=r  specc              3  ,   K   | ]  }|vs|  y wrA   rw   r  s     r4   rG   z>_all_gather_reduced_effectful_abstract_eval.<locals>.<genexpr>	  r  r  r   r1  rE  )rF  r1  rX   	str_shortr   rE  r^  r  r:   r   r	  r  ri  r   r   rJ  )
r  r  r=   rg   r   r  x_aval_snew_reducedr  r  s
     `       r4   +_all_gather_reduced_effectful_abstract_evalrd  	  s    I34	
 &&,&6&6t&<%=? @ @ **s9~
%
	<    &

|	-. . __!!C	N2
	",,T233C	FG G 6<< )
"#y0#)95__(%%	)(<<+hmm&:&:;&:&OP,BBB'
--iW|-
L
1C$$i0
1
3 3r6   c                   t         rA   r  r  s        r4   _all_gather_reduced_implrf  	  r  r6   c          
     (    t        | |||d |||      S r  r  r  s          r4   _all_gather_reduced_loweringrh  	  s%     
	1#79	
 r6   c               "    t        | |||      fS )N)r=   r  r   )r  )r  r<   r  r=   rg   r   s         r4   "_all_gather_reduced_transpose_rulerj  	  s    
 	3G',. 
0 0r6   c                    t        d      Nz<Please file an issue at https://github.com/jax-ml/jax/issuesr  r  s          r4   &_all_gather_reduced_batched_collectiverm  	       	D	F Fr6   r  c                   t        t              sfs| S t        d       fd}t        j                  ||       S )Nc                8    t         j                  |       S )N)r=   r  rg   r   )unreduced_reduce_scatter_pr5   )r1   r=   rg   r  r   s    r4   r5   z$unreduced_psum_scatter.<locals>.bind	  s(    %**	5F5 + * *r6   rY  )r<   r=   r  r   r5   rg   s    ``` @r4   r  r  	  sB    	Iu	%I	HD))* 
		D!	$$r6   unreduced_reduce_scatterc                  t        d       | j                  j                  j                  st	        d| j                  d             | j                  j                  j                  t              z  s/t	        dd| j                  j                  j                         | j                  t              z  r t	        d| j                  d       d      t        | j                        }| j                  |   }|r"||z  dk7  rt	        d	| d
|       ||z  ||<   n||k7  rt	        d| d|       ||= | j                  }|j                  |j                  j                  t        fd|j                  j                  D                          }| j                  t              z  }	| j                  ||	|      h t        t        j                        fS )Nr  zCunreduced_psum_scatter only accepts inputs that are unreduced. Got Tz[unreduced_psum_scatter is a Unreduced -> Varying collective. This means that the axis_name=z] passed to `unreduced_psum_scatter` must be present in jax.typeof(x).sharding.spec.unreduced=zVunreduced_psum_scatter's input cannot be varying across the axis_name provided. Got x=r[  r   r  r  r  r  c              3  ,   K   | ]  }|vs|  y wrA   rw   )rE   r   r=   s     r4   rG   zD_unreduced_reduce_scatter_effectful_abstract_eval.<locals>.<genexpr>	  s     S@R!Sr  r+   r]  r`  )rF  rE  r^  r+   rX   ra  r  r1  r   r:   r   ri  r   r   rJ  )
r  r=   r  rg   r   r  r  rb  r  r  s
    `        r4   1_unreduced_reduce_scatter_effectful_abstract_evalrv  	  s    I/0				'	'
 ((.(8(8(>'?A B B //


(
(9Y+?
?
	&$, '228//2F2F2P2P1Q	S  ZZ#i. 
	",,T233C	FG G 6<< )!<<(9:
	)Q.M01 2&&/[2 3 3 $:Y#FI *G011I#& ' ' 	#$__(hmm&:&:SX]]%<%<SS '; 'U V,JJY''
--iW|-
L
1C$$i0
1
3 3r6   c                   t         rA   r  )r<   r=   r  rg   r   s        r4   _unreduced_reduce_scatter_implrx  
  r  r6   c               "    t        | |||      fS )N)r=   rF   r   )r  )r  r<   r=   r  rg   r   s         r4   (_unreduced_reduce_scatter_transpose_rulerz  
  s    
SI<M#(* 
, ,r6   c                    t        d      rl  r  )r   r   r!  r=   r  rg   r   s          r4   !_unreduced_reduce_scatter_batcherr|  
  rn  r6   c          
     (    t        | ||||||d       S )N)r=   r  rg   r   r(   )r  )r   r[  r<   r=   r  rg   r   s          r4   "_unreduced_reduce_scatter_loweringr~  
  s$    	!
Ci;L$
@ @r6   c                p    t        t        t        f      sfs| S t        j                  fd|       S )Nc                D    t         j                  | t                    S r1  )unreduced_psum_pr5   r9   )r1   r=   s    r4   rz   z unreduced_psum.<locals>.<lambda>*
  s    #((E)4D(E r6   r7   r   s    `r4   r/   r/   $
  s:    	It}	-I	H			Eq
J Jr6   r/   c               ^   t        d       | j                  j                  j                  st	        d| j                  d             | j                  j                  j                  t              z  s/t	        dd| j                  j                  j                         | j                  t              z  r t	        d| j                  d       d      t        d D              rt	        d	       t        j                  | gd
       | j                  }|j                  |j                  j                  t        fd|j                  j                  D                          }| j                  |      }|D ch c]  }t        j                  |       c}fS c c}w )Nr>   z;unreduced_psum only accepts inputs that are unreduced. Got TzPunreduced_psum is a Unreduced -> Invariant collective. This means that the axes=zU passed to `unreduced_psum` must be present in jax.typeof(x).sharding.spec.unreduced=zOunreduced_psum's input cannot be varying across the  axis_name provided. Got x=z
 and axes=c              3  <   K   | ]  }t        |t                y wrA   rB   r#  s     r4   rG   z0_unreduced_psum_abstract_eval.<locals>.<genexpr>?
  s     *As	*rI   z@unreduced_psum does not accept integer axis_name. Got axis_name=r/   c              3  ,   K   | ]  }|vs|  y wrA   rw   )rE   ur3   s     r4   rG   z0_unreduced_psum_abstract_eval.<locals>.<genexpr>F
  s     I1D=!Ir  ru  r]  rD  )rF  rE  r^  r+   rX   ra  r  r1  r   rW   r   rG  ri  rJ  )rK  r3   a_sr  rL  rF   s    `    r4   _unreduced_psum_abstract_evalr  .
  s   D&!				%	%
 ))-)=(>@ A A --


&
&4
8
	!' "226--2D2D2N2N1O	QR R
 
XXD	
	&&*nnT&:%;;	IJ J 	*T**
 ''+f. / / (89#ISXX%7%7II "1 "K L,[[,[/(	4@4D((.@	@@@s   
F*c               \    t        t        j                  t        j                  | ||d       S r   r?  rA  s      r4   _unreduced_psum_loweringr  K
  s&    	SYYS"&$
@ @r6   c                    t         rA   r  rE  s       r4   _unreduced_psum_batcherr  P
      r6   c               L    t        j                  |      sJ t        | |      fS r  )r   r  preducedrH  s      r4   _unreduced_psum_transpose_ruler  U
  s&    				$$	$
3$
'	))r6   c                    t        |t              s|fn|}|s| S t               }|j                  r|nt	        j
                  ||      t              t        |      k(  sJ ~t        j                  fd|       S )Nc                2    t         j                  |       S r1  )
preduced_pr5   )ro   r   s    r4   rz   zpreduced.<locals>.<lambda>e
  s    jooaho&G r6   )	r8   r9   r   emptyr   order_wrt_meshr   r   r;   )r<   r=   r3   cur_meshr   s       @r4   r  r  ]
  sl    '	59)y$	H (^^T)<)<Xt)L(	X#d)	##	#
			G	KKr6   r  c                   | S rA   rw   r6  s     r4   rz   rz   h
  s     r6   c                   |gS rA   rw   rL  s      r4   rz   rz   i
  s    A3 r6   c               B   t        |t              sJ t        |d       | j                  j	                  t        |            rt        d| d| j                         | j                  j                  j                  t        |      z  r t        d| j                  d       d|       | j                  }|j                  j                  t        |      z  }|j                  t               |j                  j                  |            }| j                  |	      }|S )
Nr  zpreduced is a Invariant->Reduced collective. This means that the axis names mentioned in `axes` passed to `preduced` must not be present in `jax.typeof(inp).vma`. Got axes=r9  zFpreduced input cannot be reduced across the axis_name provided. Got x=Tr[  r\  r|  r^  rD  )r8   r9   rF  r1  r=  r   rX   rE  r^  r  ra  r  ri  r   )rK  r3   r  rc  r  rL  s         r4   _preduced_abstract_evalr  k
  s   	D%	  	 D*%	XX3t9%
	77;f =  $z	+, ,
 
]]#d)+
	 NN401	HI I 	#  9T?2+!2!4!$!E  G,[[,[/(	/r6   c               L    t        j                  |      sJ t        | |      fS r  )r   r  r/   rH  s      r4   _preduced_transpose_ruler  
  s&    				$$	$

-	//r6   c                   t         rA   r  r   r!  r3   s      r4   _preduced_batcherr  
  r  r6   c                h    t        |t              s|fn||s| S t        j                  fd|       S )Nc                2    t         j                  |       S r1  )vary_unreduced_cast_pr5   )r1   r3   s    r4   rz   z%vary_unreduced_cast.<locals>.<lambda>
  s    (--d-> r6   r  )r<   r=   r3   s     @r4   vary_unreduced_castr  
  s7    '	59)y$	H			>
C Cr6   r  c                   | S rA   rw   r6  s     r4   rz   rz   
  s    C r6   c                   |gS rA   rw   rL  s      r4   rz   rz   
  s    qc r6   c                  t        t              sJ t        d       t        | gd       | j                  st        d| j                  d             | j                  t              z  st        d d| j                         | j                  j                  j                  t              z  r t        d| j                  d       d       | j                  }|j                  j                  t              z  }|j                  t               |j                  j                  |      	      }t        fd
| j                  D              }| j                  ||      S )Nr  z>vary_unreduced_cast only accepts inputs that are varying. Got Tzvary_unreduced_cast is a Varying->Unreduced collective. This means that the axis names mentioned in `axes` passed to `vary_unreduced_cast` must be present in `jax.typeof(x).vma`. Got axes=z and jax.typeof(x).vma=zSvary_unreduced_cast input cannot be unreduced across the axis_name provided. Got x=r[  ru  r  c              3  ,   K   | ]  }|vs|  y wrA   rw   rE   r   r3   s     r4   rG   z5_vary_unreduced_cast_abstract_eval.<locals>.<genexpr>
  s     ;AQd]a;r  r  )r8   r9   rF  r   r1  rX   ra  r   rE  r^  r+   r  ri  r   )rK  r3   aval_snew_unreducedr  r  s    `    r4   "_vary_unreduced_cast_abstract_evalr  
  s\   	D%	  	 D/0v45	
 &&*nnT&:%;= > > ((SY

	* +/ 0"hhZ		)* * 
]]!!CI-
	 NN401	HI I ==&++'')D/9-$5$7$*KK$6$6$6$O  Q,;;;'	l	88r6   c               `    t        j                  |      sJ t        j                  | |      fS r  )r   r  r   reduced_vary_castr  r<   r3   s      r4   #_vary_unreduced_cast_transpose_ruler  
  s,    				""	"

 
 
5	77r6   c                   t         rA   r  r  s      r4   _vary_unreduced_cast_batcherr  
  r  r6   c                   | S rA   rw   r6  s     r4   rz   rz   
  s    s r6   c                   |gS rA   rw   rL  s      r4   rz   rz   
  s    ! r6   c                  t        t              sJ t        d       | j                  j                  j
                  st        d| j                  d             | j                  j                  j
                  t              z  s/t        d d| j                  j                  j
                         | j                  t              z  r t        d| j                  d       d       | j                  }t        fd|j                  j
                  D              }|j                  t               |j                  j                  |	      
      }| j                  t              z  }| j                  ||      S )Nr  z<reduced_vary_cast only accepts inputs that are reduced. Got Tzreduced_vary_cast is a Reduced->Varying collective. This means that the axis names mentioned in `axes` passed to `reduced_vary_cast` must be present in `jax.typeof(x).sharding.spec.reduced`. Got axes=z) and jax.typeof(x).sharding.spec.reduced=zOreduced_vary_cast input cannot be varying across the axis_name provided. Got x=r[  c              3  ,   K   | ]  }|vs|  y wrA   rw   r  s     r4   rG   z3_reduced_vary_cast_abstract_eval.<locals>.<genexpr>
  s     JATM!Jr  r\  r  r  )r8   r9   rF  rE  r^  r  rX   ra  r   r1  r  ri  r   )rK  r3   r  rc  r  r  s    `    r4    _reduced_vary_cast_abstract_evalr  
  sm   	D%	  	 D-.				#	#
 &&*nnT&:%;= > > --


$
$s4y
0
	< =A6 B0040B0B0J0J/K		MN N 
XXD	
	 NN401	HI I ==&JV[[%8%8JJ+$5$7$*KK$6$6{$6$K  M,HHy&'	l	88r6   c               L    t        j                  |      sJ t        | |      fS r  )r   r  r  r  s      r4   !_reduced_vary_cast_transpose_ruler  
  s&    				""	"
cT
2	44r6   c                   t         rA   r  r  s      r4   _reduced_vary_cast_batcherr  
  r  r6   c           	        | j                   }| j                  j                  j                  }| j                  j                  j                  }||z  |z  }||z  |z  rJ t               }|D ][  }||v r|j                  d       ||v r|j                  d       /||v r|j                  d       E||vsJ |j                  d       ] t        |      dkD  rt        | d|  d| d|       |\  }	|	S )	Nr~  r+   r  	invaryingr   z can only accept axis_name which corresponds to one of varying, unreduced, reduced or invarying state of the input. Got input type: z, axes: z and input state: )	r1  rE  r^  r+   r  r   ra  rc   rX   )
rK  r3   rr   r1  r+   r  vma_urr   r  os
             r4   r,   r,   
  s   #mm  **)MM&&'?W$&Io'(	(# 	aCx	ggi	
i	ggk	
g	ggif__	ggk	 	X\
& vXdV+=cU	DE E "!	
(r6   ))r  r~  )r  r  )r~  r+   )r  r~  >   r  r~  r+   c                   t        |t        t        f      rt        d|d|       t        |t              s|fn||s| S t
        vrt        dt
               fd}t        j                  ||       S )Nz
axis_name=z must be a tuple or a str. Got z4Got unexpected `to` value. Allowed `to` values are: c                    t        t        j                  |       d      }t        j	                  |fd       }|t        d| d       ||       S )Nzjax.lax.pcastzUnsupported pcast from=z, to=)r,   r   r-   _pcast_funcsr4  rX   )r1   r2   funcr3   r  s      r4   r5   zpcast.<locals>.bind  s[    dkk$'?EUBK.D|0v"?@@dr6   )	r8   r   r  r   r9   _allowed_pcast_torX   r   r;   )r<   r=   r  r5   r3   s     ` @r4   pcastr    s    	C+,
zyl"A)M
NN'	59)y$	H  
		 ! ! 
		D!	$$r6   )r=   r   returnr!   )r=   r   r  rC   rA   )r=   r   r(   zSequence[Sequence[int]] | Noner  rC   )r3   zint | AxisName)r   strr   zcore.ParamDictr  z
tuple[str])r&  zSequence[Sequence[int]]r  zir.DenseElementsAttr)r  r  )rF   rC   r   bool)r3   ztuple[AxisName, ...]r  r  )__doc__
__future__r   collections.abcr   	functoolsr   dataclassesr   r/  rd   jax._srcr   r   r	   r
   r   effects_libr   jax._src.sharding_implsr   r   r   r   r}  jax._src.corer   r   jax._src.interpretersr   r   r   r   r   jax._src.meshr   r   r   jax._src.laxr   r   r   jax._src.lib.mlirr   jax._src.lib.mlir.dialectsr   jax._src.libr    r  jax._src.typingr!   jax._src.utilr"   r#   r$   r%   r&   numpyr\   r   
unsafe_mapr   
unsafe_zipr>   r0   r|   r   r   r   r   r   rY   r`   r   r   r   r   r   r   r   r   r   rg   r{   r   r   r   r  r  r"  r)  r7  r=  rM  rP  rF  r\  r  r  	Primitiverk   def_implrO   def_effectful_abstract_evalregister_loweringr@  
deflinear2fancy_primitive_batchersskippable_batchersr   
reduce_maxmax_pr   
reduce_minmin_pr  r  r  r  r  r   def_abstract_evalEffectr  r  control_flow_allowed_effectsadd_typer  lowerable_effectsr  r   r;  r  r  r  r   r  r  r  r  r   r  r  r  r  r   r"  r.  r3  r:  r   rM  rQ  r[  ri  rv  rx  r   primitive_jvpsprimitive_transposesr   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r	  r  r  r  r   r   r%  r)  r/  r   rj   r2  r7  rO  rr   rB  rF  rI  rT  rO  rQ  rU  primitive_batchersr  rX  rd  rf  rh  rj  rm  r  rq  rv  rx  rz  r|  r~  r/   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  reduced_vary_cast_pr  r  r  r,   r  r  r  r  rw   r6   r4   <module>r     sP   # $  !       + H H / $ * & & . + / %      * ) !# # x
Cx
C
 -1 M%^%5P .2 0B -1 %> -1 %>>>
!N.I:%B %F#LC0 8< S> LPW\ H%X $([G|'TB 9=AA5A 		A4"7&@C1C G >1/3$j=$ 
	 @ A  " "#E F   
G'CNNCE f* +	
'1ST 	 ! !& )&-ov&F  F #		 @ A  " "7+CV#L M   
G'CNNCE 

'1GH 	 ! !& )&-ov&F  F # 
	 @ A  " "7+CV#L M   
G'CNNCE 

'1GH 	 ! !& )&-ov&F  F #@C?
$& T^^J'
 
  ; < j2 3   z#5 60A ! !* -*1/;*O  J ' $9 9 9 !; <   ) ) 2 23M N0    & &'A B $..
!   117; <  # #$8 9   w 3e DJ   w 0-> ! !' *'.'L  G $04 $..
!  # #$8 9   w 3e DJ   w 0-> ! !' *'.'L  G $N
,$, t~~l+   = > l6 7   |%9E J2E ! !, /,3O[,Q  L ) 

B	DN,1 t~~l+   & '  ( ()L M   |%9 : l6 72P ! !, /,3O[,Q  L )>%N,*21 %dnn%89    4 5  / /0Z [)?  % &/K  + ,   *,G H9^ ! !"5 63:?K3X  / 0 37Qe"MDMD`%">
 #'D3 &* : t~~l+  ( ()L M   & '   |%9 :	  %A$ !5B"#%% l6 729"L32 ! !, /,3O[,Q  L ) 78u  %D ((>? 3  2 213    : ; LP   -/M N	  %A$/ !?!L"#%%
J $&J K) =e ! !"8 96=o{6[  2 3-D`34< 6 "4>>"23   , ,+  > ?6P ! !"2 3070U  , -   '7CE 56Y%v%(TGG/ t~~l+   gh66E F   |%9 :  ( ()L M2E ! !, /,3O[,Q  L )8 ' H, DNN9%	 	  = ! 	  2 3   y"< =/J ! !) ,)0&)I  I &/ "4>>"23 1   . /G&  , ,)+;+@+@AC5   ')F G/ 7T ! !"2 3070P  , -,  > ?   . /   t||%@ A:   3 42 dll1 2 -;  DLL )
 56U % &t~~&:; 3<  0 0/1   6 7
    +-I J	  %A$- !=J"#%%
0
 "$F GF
 ;a ! !"6 74;O[4Y  0 1
 ?@u 
% ,T^^,FG (3R  6 657  # #$B C, (*R SF
 Ab ! !"< =:A/S^:_  6 7@
   1A399MOJ "4>>"23 A6  , ,-J K@   ')A B6M ! !"2 3070P  , -*  > ?
L T^^J'
 
  , -   z#> ?*   4 50 j2 3*;  J '
C ''>?    7 8   ,.I J94  ' '(J K8 #%H I5Q  1 2    ! !": ;   t//1L M92    * *+K L5 d&&(I J8R  D44 5: #jj&1 22	 8 %r6   