
"""Shared neural network activations and other functions."""

from __future__ import annotations

from collections.abc import Sequence
from functools import partial
import operator
from typing import Any, Literal, overload
import warnings

import numpy as np

from jax._src import api
from jax._src import config
from jax._src import core
from jax._src import custom_derivatives
from jax._src import deprecations
from jax._src import dtypes
from jax._src import lax
from jax._src import numpy as jnp
from jax._src import util
from jax._src.core import AxisName
from jax._src.cudnn.fused_attention_stablehlo import (
    dot_product_attention as cudnn_dot_product_attention, MaskType)
from jax._src.cudnn.scaled_matmul_stablehlo import (
    scaled_matmul_wrapper as cudnn_scaled_matmul,
    scaled_dot_general_wrapper as cudnn_scaled_dot_general,
    BlockScaleConfig)
from jax._src.numpy import einsum as jnp_einsum
from jax._src.numpy import util as numpy_util
from jax._src.ops.special import logsumexp as _logsumexp
from jax._src.sharding_impls import NamedSharding, PartitionSpec as P
from jax._src.typing import Array, ArrayLike, DType, DTypeLike


@api.jit
def identity(x: ArrayLike) -> Array:
  r"""Identity activation function.

  Returns the argument unmodified.

  Args:
    x : input array

  Returns:
    The argument `x` unmodified.

  Examples:
    >>> jax.nn.identity(jax.numpy.array([-2., -1., -0.5, 0, 0.5, 1., 2.]))
    Array([-2. , -1. , -0.5, 0. , 0.5, 1. , 2. ], dtype=float32)
  """
  return numpy_util.ensure_arraylike("identity", x)


@custom_derivatives.custom_jvp
@api.jit
def relu(x: ArrayLike) -> Array:
  r"""Rectified linear unit activation function.

  Computes the element-wise function:

  .. math::
    \mathrm{relu}(x) = \max(x, 0)

  except under differentiation, we take:

  .. math::
    \nabla \mathrm{relu}(0) = 0

  For more information see
  `Numerical influence of ReLU’(0) on backpropagation
  <https://dl.acm.org/doi/10.5555/3540261.3540297>`_.
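
  For instance, this convention makes the gradient at zero evaluate to zero
  exactly (an illustrative doctest; the printed form may vary across JAX
  versions):

    >>> jax.grad(jax.nn.relu)(0.0)  # doctest: +SKIP
    Array(0., dtype=float32, weak_type=True)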

  Args:
    x : input array

  Returns:
    An array.

  Examples:
    >>> jax.nn.relu(jax.numpy.array([-2., -1., -0.5, 0, 0.5, 1., 2.]))
    Array([0. , 0. , 0. , 0. , 0.5, 1. , 2. ], dtype=float32)

  See also:
    :func:`relu6`

  """
  return jnp.maximum(x, 0)
relu.defjvps(lambda g, ans, x: lax.select(x > 0, g, lax.full_like(g, 0)))


@api.jit
def squareplus(x: ArrayLike, b: ArrayLike = 4) -> Array:
  r"""Squareplus activation function.

  Computes the element-wise function

  .. math::
    \mathrm{squareplus}(x) = \frac{x + \sqrt{x^2 + b}}{2}

  as described in https://arxiv.org/abs/2112.11687.
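
  For example, at the origin the default smoothness ``b = 4`` gives
  :math:`(0 + \sqrt{0 + 4})/2 = 1` (an illustrative doctest; the printed form
  may vary):

    >>> jax.nn.squareplus(jax.numpy.array([0.0]))  # doctest: +SKIP
    Array([1.], dtype=float32)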

  Args:
    x : input array
    b : smoothness parameter
  """
  x, b = numpy_util.promote_args_inexact("squareplus", x, b)
  y = x + jnp.sqrt(jnp.square(x) + b)
  return y / 2


@api.jit
def softplus(x: ArrayLike) -> Array:
  r"""Softplus activation function.

  Computes the element-wise function

  .. math::
    \mathrm{softplus}(x) = \log(1 + e^x)
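
  For example, :math:`\mathrm{softplus}(0) = \log 2 \approx 0.693` (an
  illustrative doctest; the printed form may vary):

    >>> jax.nn.softplus(jax.numpy.array([0.0]))  # doctest: +SKIP
    Array([0.6931472], dtype=float32)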

  Args:
    x : input array
  """
  return jnp.logaddexp(x, 0)


@api.jit
def sparse_plus(x: ArrayLike) -> Array:
  r"""Sparse plus function.

  Computes the function:

  .. math::

    \mathrm{sparse\_plus}(x) = \begin{cases}
      0, & x \leq -1\\
      \frac{1}{4}(x+1)^2, & -1 < x < 1 \\
      x, & 1 \leq x
    \end{cases}

  This is the twin function of the softplus activation ensuring a zero output
  for inputs less than -1 and a linear output for inputs greater than 1,
  while remaining smooth, convex, monotonic by an adequate definition between
  -1 and 1.

  Args:
    x: input (float)
  """
  x = numpy_util.ensure_arraylike("sparse_plus", x)
  return jnp.where(x <= -1.0, 0.0, jnp.where(x >= 1.0, x, (x + 1.0)**2 / 4))


@api.jit
def soft_sign(x: ArrayLike) -> Array:
  r"""Soft-sign activation function.

  Computes the element-wise function

  .. math::
    \mathrm{soft\_sign}(x) = \frac{x}{|x| + 1}

  Args:
    x : input array
  """
  x_arr = numpy_util.ensure_arraylike("soft_sign", x)
  return x_arr / (jnp.abs(x_arr) + 1)


@partial(api.jit, inline=True)
def sigmoid(x: ArrayLike) -> Array:
  r"""Sigmoid activation function.

  Computes the element-wise function:

  .. math::
    \mathrm{sigmoid}(x) = \frac{1}{1 + e^{-x}}
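
  For example, :math:`\mathrm{sigmoid}(0) = 0.5` (an illustrative doctest; the
  printed form may vary):

    >>> jax.nn.sigmoid(jax.numpy.array([0.0]))  # doctest: +SKIP
    Array([0.5], dtype=float32)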

  Args:
    x : input array

  Returns:
    An array.

  See also:
    :func:`log_sigmoid`

  )r   logisticr&   s    r(   sigmoidrQ      s    & 
ar)   c                <    dt        j                  | dz   dd      z  S )a  Sparse sigmoid activation function.

  Computes the function:

  .. math::

    \mathrm{sparse\_sigmoid}(x) = \begin{cases}
      0, & x \leq -1\\
      \frac{1}{2}(x+1), & -1 < x < 1 \\
      1, & 1 \leq x
    \end{cases}

  This is the twin function of the ``sigmoid`` activation ensuring a zero output
  for inputs less than -1, a 1 output for inputs greater than 1, and a linear
  output for inputs between -1 and 1. It is the derivative of ``sparse_plus``.

  For more information, see `Learning with Fenchel-Young Losses (section 6.2)
  <https://arxiv.org/abs/1901.02324>`_.

  Args:
    x : input array

  Returns:
    An array.

  See also:
    :func:`sigmoid`
        ?rD   rC          @)r+   clipr&   s    r(   sparse_sigmoidrV      s     < 
sxxCc*	**r)   c                J    t        j                  d|       }|t        |      z  S )aH  SiLU (aka swish) activation function.

  Computes the element-wise function:

  .. math::
    \mathrm{silu}(x) = x \cdot \mathrm{sigmoid}(x) = \frac{x}{1 + e^{-x}}

  :func:`swish` and :func:`silu` are both aliases for the same function.

  Args:
    x : input array

  Returns:
    An array.

  See also:
    :func:`sigmoid`
  silu)r$   r%   rQ   rL   s     r(   rX   rX      s%    ( 
%
%fa
0%		r)   c                p    t        j                  d|       }|t        j                  t	        |            z  S )aM  Mish activation function.

  Computes the element-wise function:

  .. math::
    \mathrm{mish}(x) = x \cdot \mathrm{tanh}(\mathrm{softplus}(x))

  For more information, see
  `Mish: A Self Regularized Non-Monotonic Activation Function
  <https://arxiv.org/abs/1908.08681>`_.

  Args:
    x : input array

  Returns:
    An array.
  mish)r$   r%   r+   tanhr@   rL   s     r(   rZ   rZ     s.    & 
%
%fa
0%	(5/*	**r)   c                H    t        j                  d|       }t        |        S )zLog-sigmoid activation function.

  Computes the element-wise function:

  .. math::
    \mathrm{log\_sigmoid}(x) = \log(\mathrm{sigmoid}(x)) = -\log(1 + e^{-x})

  Args:
    x : input array

  Returns:
    An array.

  See also:
    :func:`sigmoid`
  log_sigmoid)r$   r%   r@   rL   s     r(   r]   r]     s&    $ 
%
%mQ
7%
E6
	r)   c                    t        j                  d|       }t        j                  |dkD  ||t        j                  t        j                  |dkD  d|            z        S )ak  Exponential linear unit activation function.

  Computes the element-wise function:

  .. math::
    \mathrm{elu}(x) = \begin{cases}
      x, & x > 0\\
      \alpha \left(\exp(x) - 1\right), & x \le 0
    \end{cases}

  Args:
    x : input array
    alpha : scalar or array of alpha values (default: 1.0)

  Returns:
    An array.

  See also:
    :func:`selu`
  elur   rC   )r$   r%   r+   rG   expm1)r'   alpharM   s      r(   r_   r_   .  sT    , 
%
%eQ
/%	519399SYYuqy"e%DEE
G Gr)   c                h    t        j                  d|       }t        j                  |dk\  |||z        S )a  Leaky rectified linear unit activation function.

  Computes the element-wise function:

  .. math::
    \mathrm{leaky\_relu}(x) = \begin{cases}
      x, & x \ge 0\\
      \alpha x, & x < 0
    \end{cases}

  where :math:`\alpha` = :code:`negative_slope`.

  Args:
    x : input array
    negative_slope : array or scalar specifying the negative slope (default: 0.01)

  Returns:
    An array.

  See also:
    :func:`relu`
  
leaky_relur   rF   )r'   negative_sloperM   s      r(   rc   rc   I  s2    0 
%
%lA
6%	5A:unu&<	==r)   c           	         t        j                  d|       }t        j                  |dkD  dt        j                  |dk  d|            S )a  Hard :math:`\mathrm{tanh}` activation function.

  Computes the element-wise function:

  .. math::
    \mathrm{hard\_tanh}(x) = \begin{cases}
      -1, & x < -1\\
      x, & -1 \le x \le 1\\
      1, & 1 < x
    \end{cases}

  Args:
    x : input array

  Returns:
    An array.
  	hard_tanhrJ   rF   rL   s     r(   rf   rf   d  s>    & 
%
%k1
5%	519a52:r5!A	BBr)   c                    t        j                  | d      |t        j                  t        j                  | d      |z        z  z   S )a  Continuously-differentiable exponential linear unit activation.

  Computes the element-wise function:

  .. math::
    \mathrm{celu}(x) = \begin{cases}
      x, & x > 0\\
      \alpha \left(\exp(\frac{x}{\alpha}) - 1\right), & x \le 0
    \end{cases}

  For more information, see
  `Continuously Differentiable Exponential Linear Units
  <https://arxiv.org/abs/1704.07483>`_.

  Args:
    x : input array
    alpha : array or scalar (default: 1.0)

  Returns:
    An array.
  rC   )r+   r,   r`   minimum)r'   ra   s     r(   celurj   z  s8    . 
Q	usyyQ1Du1L'MM	MMr)   c                (    d}d}|t        | |      z  S )a
  Scaled exponential linear unit activation.

  Computes the element-wise function:

  .. math::
    \mathrm{selu}(x) = \lambda \begin{cases}
      x, & x > 0\\
      \alpha e^x - \alpha, & x \le 0
    \end{cases}

  where :math:`\lambda = 1.0507009873554804934193349852946` and
  :math:`\alpha = 1.6732632423543772848170429916717`.

  For more information, see
  `Self-Normalizing Neural Networks
  <https://arxiv.org/abs/1706.02515>`_.

  Args:
    x : input array

  Returns:
    An array.

  See also:
    :func:`elu`
  g,x?g2֫?)r_   )r'   ra   scales      r(   selurm     s    8 ,%
+%	Q	r)   c           	        t        j                  d|       \  }|rkt        j                  dt        j                  z        j                  |j                        }ddt        j                  ||d|dz  z  z   z        z   z  }||z  S t        j                  d      j                  |j                        }t        j                  d|z  t        j                  | |z        z  |j                        S )a  Gaussian error linear unit activation function.

  If ``approximate=False``, computes the element-wise function:

  .. math::
    \mathrm{gelu}(x) = \frac{x}{2} \left(\mathrm{erfc} \left(
      \frac{-x}{\sqrt{2}} \right) \right)

  If ``approximate=True``, uses the approximate formulation of GELU:

  .. math::
    \mathrm{gelu}(x) = \frac{x}{2} \left(1 + \mathrm{tanh} \left(
      \sqrt{\frac{2}{\pi}} \left(x + 0.044715 x^3 \right) \right) \right)

  For more information, see `Gaussian Error Linear Units (GELUs)
  <https://arxiv.org/abs/1606.08415>`_, section 2.

  Args:
    x: input array
    approximate: whether to use the approximate or exact formulation.
  gelur9   rS   rD   gHm?   dtype)r$   promote_args_inexactnpr:   piastyperr   r+   r[   arrayr   erfc)r'   approximaterM   sqrt_2_over_picdf	sqrt_halfs         r(   ro   ro     s    , ++FA6'5WWQY'..u{{;N
sxx%(eqj:Q2Q RSS
TC3;##EKK0I99esxx 234EKK r)   axis)static_argnamesrg   c                    t        j                  d|       }|j                  |   }|dz  dk(  sJ d       t        j                  |d|      \  }}|t        |      z  S )a	  Gated linear unit activation function.

  Computes the function:

  .. math::
    \mathrm{glu}(x) =  x\left[\ldots, 0:\frac{n}{2}, \ldots\right] \cdot
      \mathrm{sigmoid} \left( x\left[\ldots, \frac{n}{2}:n, \ldots\right]
        \right)

  where the array is split into two along ``axis``. The size of the ``axis``
  dimension must be divisible by two.

  Args:
    x : input array
    axis: the axis along which the split should be computed (default: -1)

  Returns:
    An array.

  See also:
    :func:`sigmoid`
  glur9   r   z axis size must be divisible by 2)r$   r%   shaper+   splitrQ   )r'   r~   rM   sizex1x2s         r(   r   r     s_    0 
%
%eQ
/%	T	$	Q:::99UAt$&"b	gbk	r)   )r~   keepdimsFc                    t        | |||      }t        | ||||j                        }|t        j                  |      z
  S )a  Log mean exp.

  Computes the function:

  .. math::
    \text{logmeanexp}(x) = \log \frac{1}{n} \sum_{i=1}^n \exp x_i = \text{logsumexp}(x) - \log n

  Args:
    x: Input array.
    axis: Axis or axes along which to reduce.
    where: Elements to include in the reduction. Optional.
    keepdims: Preserve the dimensions of the input.
  Returns:
    An array.
  See also:
    :func:`jax.nn.logsumexp`
  )r~   rG   r   )r~   rG   r   rr   )
_logsumexpr   rr   r+   log)r'   r~   rG   r   lsecounts         r(   
logmeanexpr     s=    0 	14ux@#
UXSYY
O%	swwu~	r)   c                   t        j                  d|       }t        j                  |||t        j
                   d      }||n%t        j                  ||t        j
                         }|t        j                  |      z
  }t        j                  t        j                  t        j                  |      ||d            }||z
  }|&t        j                  ||t        j
                         S |S )a  Log-Softmax function.

  Computes the logarithm of the :code:`softmax` function, which rescales
  elements to the range :math:`[-\infty, 0)`.

  .. math ::
    \mathrm{log\_softmax}(x)_i = \log \left( \frac{\exp(x_i)}{\sum_j \exp(x_j)}
    \right)

  Args:
    x : input array
    axis: the axis or axes along which the :code:`log_softmax` should be
      computed. Either an integer, tuple of integers, or ``None`` (all axes).
    where: Elements to include in the :code:`log_softmax`. The output for any
      masked-out element is minus infinity.

  Returns:
    An array.

  Note:
    If any input values are ``+inf``, the result will be all ``NaN``: this reflects the
    fact that ``inf / inf`` is not well-defined in the context of floating-point math.

  See also:
    :func:`softmax`
  log_softmaxTrG   initialr   rG   r   )r$   r%   r+   maxrt   infrG   r   stop_gradientr   sumexp)	r'   r~   rG   rM   x_maxx_safeshiftedshifted_logsumexpresults	            r(   r   r     s    < 
%
%mQ
7%
''%URVVGd
K%M5syyw'G&S&&u--'gg	ggcgggEDAC&&&
99UFRVVG,,	-r)   c                j    t         j                  j                  rt        | ||      S t	        | ||      S )aF  Softmax function.

  Computes the function which rescales elements to the range :math:`[0, 1]`
  such that the elements along :code:`axis` sum to :math:`1`.

  .. math ::
    \mathrm{softmax}(x) = \frac{\exp(x_i)}{\sum_j \exp(x_j)}

  Args:
    x : input array
    axis: the axis or axes along which the softmax should be computed. The
      softmax output summed across these dimensions should sum to :math:`1`.
      Either an integer, tuple of integers, or ``None`` (all axes).
    where: Elements to include in the :code:`softmax`. The output for any
      masked-out element is zero.

  Returns:
    An array.

  Note:
    If any input values are ``+inf``, the result will be all ``NaN``: this reflects the
    fact that ``inf / inf`` is not well-defined in the context of floating-point math.

  See also:
    :func:`log_softmax`
  )r
   softmax_custom_jvpvalue_softmax_softmax_deprecated)r'   r~   rG   s      r(   softmaxr   C  s3    : $$ AtU##q$..r)   rJ   )nondiff_argnumsc                
   t        j                  | |||d      }|| nt        j                  || |      }t        j                  ||z
        }|t        j                  |||d      z  }|t        j                  ||d      }|S NTr   r   r   )r+   r   rG   r   r   r'   r~   rG   r   r   r   unnormalizedr   s           r(   r   r   i  sz     ''!T$
G%1399UAw#?&%(,#'',EDQQ&
YYufa(F	-r)   c                v    ||c\  }}}\  }}}t        || ||      }|||||z  j                  | |d      z
  z  fS )NTr   )r   r   )	r~   primalstangentsr'   rG   r   x_dot_r=   s	            r(   _softmax_jvpr   w  sS    '.$1eW}q!q$w'!	
A!e)UTJJK	KKr)   c                0   t        j                  | |||d      }|| nt        j                  || |      }t        j                  |t	        j
                  |      z
        }|t        j                  |||d      z  }|t        j                  ||d      }|S r   )r+   r   rG   r   r   r   r   r   s           r(   r   r   }  s    
 ''!T$
G%1399UAw#?&#"3"3E"::;,#'',EDQQ&
YYufa(F	-r)   c                   t        j                  d|        t        j                  d|||       |t        j                  | |d|      }|Xt        j                  t        j
                  |       |d|      t        j
                  |      z
  }t        j                  |d      }t        j                  | |      t        j                  ||z         z  S )ay  Standardizes input to zero mean and unit variance.

  The standardization is given by:

  .. math::

     x_{std} = \frac{x - \langle x\rangle}{\sqrt{\langle(x - \langle x\rangle)^2\rangle + \epsilon}}

  where :math:`\langle x\rangle` indicates the mean of :math:`x`, and :math:`\epsilon` is
  a small correction factor introduced to avoid division by zero.

  Args:
    x: input array to be standardized.
    axis: integer, tuple of integers, or ``None`` (all axes), representing the
      axes along which to standardize. Defaults to the last axis (``-1``).
    mean: optionally specify the mean used for standardization. If not specified,
      then ``x.mean(axis, where=where)`` will be used.
    variance: optionally specify the variance used for standardization. If not
      specified, then ``x.var(axis, where=where)`` will be used.
    epsilon: correction factor added to variance to avoid division by zero; defaults
      to ``1E-5``.
    where: optional boolean mask specifying which elements to use when computing
      the mean and variance.

  Returns:
    An array of the same shape as ``x`` containing the standardized input.
  standardizeT)r   rG   r   )
r$   check_arraylikecheck_arraylike_or_noner+   meanr;   rU   subtractr   rsqrt)r'   r~   r   varianceepsilonrG   s         r(   r   r     s    D ]A.$$]D(EJ	\88Atd%8D
 xx

1td%9;>::d;KLH xx!$H	a	8g+=!>	>>r)   )num_classesrr   r~   c          	        t        j                  |d      }	 t        j                  || j                  dz         }t        j                  |      }t        j                  | |f      }dg| j                  z  }|j                  ||       t!        | j"                  j$                  j&                  t)        d gt+        |      z         }	t        j,                  | j.                  |||	      }
||
k(  j1                  |      S # t
        $ ra t        j                  |      }||k7  rt        d| d| d|       d t        j                  |      }t        j                  | |k(  |      cY S w xY w)N9The error arose in jax.nn.one_hot argument `num_classes`.rJ   z/Expected num_classes to match the size of axis z, but z != rq   )out_sharding)r   concrete_dim_or_errorr   canonicalize_axisndim	TypeErrorr   	axis_size
ValueError
axis_indexr+   asarrayoperatorindexexpand_dimsinsertr   avalshardingmeshPlenbroadcasted_iotarr   rv   )r'   r   rr   r~   output_pos_axisr   axis_idxlhs	rhs_shaperhs_shardingrhss              r(   _one_hotr     sO    **AC+3,,T166A:>O 
	$D7##cAFFl)?K0qvv33QY8O5PQ,QWWi*6	8#
*		U	## 
 3d#IiH O)]$yk; <AEF~~d#H;;qH}E223s   #C= =A'E'&E'rr   r~   c               0   t        j                  |d      }t        j                  |       }t	        j
                  |j                  d      s%t        j                  dd|j                   d       |t	        j                         n|}t        ||||      S )a  One-hot encodes the given indices.

  Each index in the input ``x`` is encoded as a vector of zeros of length
  ``num_classes`` with the element at ``index`` set to one::

    >>> jax.nn.one_hot(jnp.array([0, 1, 2]), 3)
    Array([[1., 0., 0.],
           [0., 1., 0.],
           [0., 0., 1.]], dtype=float32)

  Indices outside the range [0, num_classes) will be encoded as zeros::

    >>> jax.nn.one_hot(jnp.array([-1, 3]), 3)
    Array([[0., 0., 0.],
           [0., 0., 0.]], dtype=float32)

  Args:
    x: A tensor of indices.
    num_classes: Number of classes in the one-hot dimension.
    dtype: optional, a float dtype for the returned values (default :obj:`jnp.float_`).
    axis: the axis or axes along which the function should be
      computed.
  r   integralzjax-nn-one-hot-float-inputz8jax.nn.one_hot input should be integer-typed; got dtype=rJ   )
stacklevelr   )r   r   r+   r   r   isdtyperr   r   warndefault_float_dtyper   )r'   r   rr   r~   rM   s        r(   one_hotr     s    2 **AC+ ++a.%	Z	0"@N +0-&
$
$
&U%	%E	==r)   c                V    t        j                  t        j                  | d      d      S )an  Rectified Linear Unit 6 activation function.

  Computes the element-wise function

  .. math::
    \mathrm{relu6}(x) = \min(\max(x, 0), 6)

  except under differentiation, we take:

  .. math::
    \nabla \mathrm{relu}(0) = 0

  and

  .. math::
    \nabla \mathrm{relu}(6) = 0

  Args:
    x : input array

  Returns:
    An array.

  See also:
    :func:`relu`
  r         @)r+   ri   r,   r&   s    r(   relu6r     s     : 
S[[A&	++r)   c                j    t        j                  |dkD  |dk  z  | t        j                  | d            S )Nr      r0   r3   s      r(   r6   r6     s/    jj!a%AE*As}}Q/BC r)   c                $    t        | dz         dz  S )zHard Sigmoid activation function.

  Computes the element-wise function

  .. math::
    \mathrm{hard\_sigmoid}(x) = \frac{\mathrm{relu6}(x + 3)}{6}

  Args:
    x : input array

  Returns:
    An array.

  See also:
    :func:`relu6`
  g      @r   )r   r&   s    r(   hard_sigmoidr   "  s    $ 
q2v	r)   c                J    t        j                  d|       }|t        |      z  S )aM  Hard SiLU (swish) activation function

  Computes the element-wise function

  .. math::
    \mathrm{hard\_silu}(x) = x \cdot \mathrm{hard\_sigmoid}(x)

  Both :func:`hard_silu` and :func:`hard_swish` are aliases for the same
  function.

  Args:
    x : input array

  Returns:
    An array.

  See also:
    :func:`hard_sigmoid`
  	hard_silu)r$   r%   r   rL   s     r(   r   r   6  s&    * 
%
%k1
5%	e$	$$r)   c                t    t        j                  |       j                  }t        j                  d|z  |       S )Ngffffffrq   )r   finfor   r+   r   )rr   	dtype_maxs     r(   _get_large_negativer   P  s,    ll5!%%)	TI%U	33r)   c                |    t        j                  t        j                  | |ft                    }|d d d d d d f   S )Nrq   )r+   trilonesbool)TSmasks      r(   _get_causal_maskr   T  s3    	#((Aq6.	/$	dD!Q	r)   c                   t        j                  t        |             }t        j                  t        |            }|\  }}|d   |dd d d f   |z   k  }|d   |dd d d f   |z
  k\  }t        j                  ||      d d d d d d f   S )N).N.)r+   rw   rangelogical_and)	r   r   local_window_size	query_poskey_posleft_windowright_window	left_mask
right_masks	            r(   _get_window_maskr   X  s    iia!)IIeAh'/+|	"gc4l&;k&II)#wsD!|'<|'KK*	Y	/dAq0@	AAr)   c                   d}d}|+t        j                  d|       d d d d f   }||d d d d f   k  }|+t        j                  d|      d d d d f   }||d d d d f   k  }t        j                  ||      }|d d d d d d d f   S )NTr   )r+   aranger   )	r   r   q_seqlen	kv_seqlenq_maskkv_mask	q_indices
kv_indicesr   s	            r(   _get_padding_mask_logitsr  `  s    &'

1a q$/I!T4-00FAq!$a-0J9Qd]33G		)$	aq!m	r)   c                n    t        j                  d|       d d d f   }||d d d f   k  }|d d d d d d f   S r/   )r+   r   )r   r   r  r   s       r(   _get_padding_mask_encodedr  l  sB    jjAtQw')	Xag&	&$	aD$	r)   c                J   |
|s|||| S t        j                  | t              }|>|j                  t	        j                  t              k(  sJ t        j
                  ||      }| j                  d   | j                  d   }}|r"t        ||      }t        j
                  ||      }|#t        |||      }t        j
                  ||      }||$t        ||||      }t        j
                  ||      }t        | j                        }	t        j                  || |	      }
|
S )Nrq   r9   rp   )r+   	ones_liker   rr   rt   r   r   r   r   r  r   rG   )logitsr   	is_causalr   r  r   combined_maskr   r   large_negative_numberpadded_logitss              r(   _apply_masksr  q  s   	\)(8Y=NSdSlM--d3-	::$'''OOM48M	a&,,q/Q!Aq!DOOM48M"Aq"34DOOM48MY2#Aq(I>DOOM48M-fll;))M63HI-	r)   c                <   t        j                  | j                  t        j                        }| j                  t
        j                  k(  rt        j                  j                  }n:| j                  t        j                  k(  rt        j                  j                  }nd }	 t        j                  d| |||      }|t        j                  ||j                        z  }|||z   j                  |j                        }t!        ||||||	      }|j                  t        j                        }t#        |d      j                  |j                        }t        j                  d||      }|7t%        |j&                  d   |      }||j                  |j                        z  }|
rSt)        |d      j                  |j                        }t        j*                  |d      }|t        j,                  |      fS |S #  t        j                  d| |d |      }Y _xY w)	NzBTNH,BSNH->BNTS)	precisionpreferred_element_typerq   rg   r}   zBNTS,BSNH->BTNHrJ   r   r9   rJ   )r+   promote_typesrr   rt   float32r   bfloat16r   DotAlgorithmPresetBF16_BF16_F32float16F16_F16_F32
jnp_einsumr   rw   rv   r  r   r  r   r!   	transposer   )querykeyr   biasr   r  rl   r   r  r   return_residuallogits_dtyper  r  r  probsencodedlse_residuals                     r(   _dot_product_attention_corer&    s    ""5;;

;,
 [[FOO#&&44I{{bjj &&22I I
+F  	CIIe6<<00&	tm##FLL1FvtY)02-  &&rzz2-
-b
)
0
0
;%/>'$W]]1%5x@Dt{{7==))G]4;;CIIFL==y9LC%%l333	.?+Fs   "G< <Hc                   | j                   \  }}}|j                   \  }}}z  t        j                  | |||f      } fd} ||      } ||      }t        j                  t
        dd      } || |||||||||	|
      }|
r<|\  }}t        j                  ||||f      }t        j                  |||f      }||fS t        j                  ||||f      }|S )Nc           	         | e| j                   \  }}}}|dk(  r-t        j                  | d d d d d d d d d f   ||||f      } | S |k(  sJ t        j                  | |||f      } | S )NrJ   )r   r+   broadcast_toreshape)ttBtNtTtSGKNs        r(   _reshape_to_groupedz7_dot_product_attention_xla.<locals>._reshape_to_grouped  s    }wwnb"b"	qQq!T1a/02r1b"2EF H QwwKKB1b"-.Hr)   )rp   NNr9   r9   NNNNNNrp   )in_axesout_axes)r   r+   r*  r	   vmapr&  )r  r  r   r   r   r  rl   r   r  r   r!  Br   Hr   r   r3  
vmapped_fnoutputr$  r%  r0  r1  r2  s                        @@@r(   _dot_product_attention_xlar;    s    {{*!Q1yy*!Q11f!
++eaAq!_
-% 
T	"$	T	"$xx!G*
 eS%tY,=P& "G\kk'Aq!Q<0G;;|aAY7LL  KKAq!-'	.r)   .)rl   r  query_seq_lengthskey_value_seq_lengthsr   implementationr!  c                    y N r  r  r   r   r   rl   r  r<  r=  r   r>  r!  s               r(   r   r     s     r)   c                    y r@  rA  rB  s               r(   r   r   	  s     r)   c                  t        j                  |       j                  }|dd }d } ||       } ||      } ||      }| ||      nd}| ||      nd}|t        j                  |      }|t        j                  |      }t        |	t              r|	|	f}		 	 	 	 	 	 dd}|j                  \  }}}} ||||||g|j
                  d        |||dd|g|j
                  d        ||dgdz  t        j
                  t              d        ||dgdz  dd	        |||gt        j
                  d
      d        |||gt        j
                  d
      d       |j                  d   |z  dk7  rt        d|j                  d    d|       |dt        j                  |      z  n|}|
xdk(  r t        ||||||||||	|      }nlxdk(  r? |duxs |du}|ra|6|j                  d   }t        j                  |f|t        j                        }|'t        j                  |f|t        j                        }t        j                  }|r|rt        j                  }n%|rt        j                   }n|rt        j"                  }d}|	2|	\  }}|dk(  s|t        j                   k(  r|dz   }nt        d| d      t%        |||||||||||      }|r`|\  }}t        j&                  |d      j)                  |j
                        }||f}n't        ||||||||||	|      }n	 t        d|
       |r1|\  }}t        j*                  ||      t        j*                  ||      fS t        j*                  ||      S )a  Scaled dot product attention function.

  Computes the following for each head:

  .. math::

    \mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left( \frac{QK^T}{\sqrt{d}} + B \right) V

  where
  :math:`Q` is the query matrix,
  :math:`K` is the key matrix,
  :math:`V` is the value matrix,
  :math:`d` is the dimension of each individual query and key,
  and :math:`B` is the bias matrix (optional).

  Throughout this function, we utilize the following uppercase letters to
  represent the shape of array::

    B = batch size
    S = length of the key/value (source)
    T = length of the query (target)
    N = number of attention heads
    H = dimensions of each attention head
    K = number of key/value heads
    G = number of groups, which equals to N // K

  Args:
    query: query array; shape :code:`(BTNH|TNH)`
    key: key array: shape :code:`(BSKH|SKH)`. When `K` equals `N`, multi-headed
      attention (MHA https://arxiv.org/abs/1706.03762) is performed. Otherwise,
      grouped query attention (GQA https://arxiv.org/abs/2305.13245) is
      performed if `N` is a multiple of `K`, and multi-query attention (MQA
      https://arxiv.org/abs/1911.02150) is performed if `K == 1` (a special case
      of GQA).
    value: value array, should have the same shape as the `key` array.
    bias: optional, bias array to be added to logits; The shape must be 4D and
      be broadcastable to :code:`(BNTS|NTS)`.
    mask: optional, mask array used to filter out logits. It is a boolean mask
      where `True` indicates the element should take part in attention. For an
      additive mask, users should pass it to `bias`. The shape must be 4D and be
      broadcastable to :code:`(BNTS|NTS)`.
    scale: scale for the logits. If None, the scale will be set to 1 divided by
      the square root of query's head dimension (i.e. H).
    is_causal: If true, causal attention will be applied. Note, some
      implementations like `xla` will generate a mask tensor and apply it to the
      logits to mask out the non-causal parts of the attention matrix, but other
      implementations like `cudnn` will avoid computing the non-causal regions,
      providing speedups.
    query_seq_lengths: `int32` array of sequence lengths for query; shape
      :code:`(B)`
    key_value_seq_lengths: `int32` array of sequence lengths for key and value;
      shape :code:`(B)`
    local_window_size: Window sizes to make self attention to attend to each
      token's local window. If set, this specifies the (left_window_size,
      right_window_size) for each token. E.g., if local_window_size == (3, 2)
      and the sequence is [0, 1, 2, 3, 4, 5, c, 7, 8, 9], token `c` can attend
      to [3, 4, 5, c, 7, 8]. If a single int is given, it will be interpreted as
      a symmetric window (window_size, window_size).
    return_residual: Whether to return the logsumexp tensor of shape BTN
      or BNT to users. See section 3.1.1 in the FlashAttention-2 paper:
      https://arxiv.org/pdf/2307.08691 to find the definition of logsumexp.
    implementation: A string to control which implementation backend to use.
      Supported strings are `xla`, `cudnn` (cuDNN flash attention). It defaults
      to `None`, which currently falls back to `xla`.
      Note, `cudnn` supports only a subset of shapes/dtypes, and an exception
      will be thrown if its not supported.

  Returns:
    If return_residual is False, returns an array of the attention output with
    the same shape as :code:`query`. If return_residual is True, returns a tuple
    of (output, residual). The residual is the shape of BTN|TN.
  Nrg   c                    t        j                  |       } d| j                  z
  }|dkD  r)t        j                  | t	        t        |                  S | S )NrE   r   r}   )r+   r   r   r   tupler   )r+  dims_to_adds     r(   
_ensure_4dz)dot_product_attention.<locals>._ensure_4ds  sC    AAaff*KQ__QU5+=%>??Hr)   c                   | y | j                   t        |      k7  r&t        | dt        |       d| j                          |,| j                  |k7  rt        | d| d| j                         t	        | j                         D ]=  }||   dk7  s| j
                  |   ||   k7  s"t        | d| d| j
                          y )Nz ndim should be z
, but got z dtype should be rg   z shape should be z
: but got )r   r   r   rr   r   r   )r+  r   rr   nameis        r(   _check_shape_and_dtypez5dot_product_attention.<locals>._check_shape_and_dtype  s    yvvU$/E
|:affXNOOQWW-$0z!''KLL166] O	qRAGGAJ%(2D6!25'AGG9MNNOr)   r   r  rE   r   r   int32r<  r=  r   zIThe number of query heads must be a multiple of key/value heads, but got z vs rD   xla)r  rl   r   r  r   r!  cudnnrJ   rq   z$cuDNN doesn't support right window: z when causal mask is not used.)rl   	mask_typesliding_window_lengthr!  r  z#Unsupported implementation option: )
r+  Array | Noner   zSequence[int]rr   zDType | NonerJ  strreturnNone)r+   r   r   
isinstanceintrr   rt   r   r   r:   r;  fullrM  r   NO_MASKPADDING_CAUSALCAUSALPADDINGcudnn_dot_product_attentionr  rv   r*  ) r  r  r   r   r   rl   r  r<  r=  r   r>  r!  output_shaperesidual_shaperH  	query_arrkey_arr	value_arrrL  r7  r   r1  r8  	scale_valoutuse_paddingr   rQ  sliding_windowl_windowr_windowresiduals                                    r(   r   r     s   n U#)),$. )sO')!-D	4$!-D	4$"$56&KK(=>!3'*,=>
O$0
O8;
O@D
O }}*!Q1Q1aL'--IQBNGMM7Ktax$@taxv6*QC'1B,..RXXg5F02__R1!
 11:1D0ET!N O O %*MsRWWQZu)	&
Wity$5)-)c 
D(M,A,M  
$ooa !!hhtQbhh?
 ("%((A4"(("C
""i	++	OO	$$	 n		&.(q=I8#a<.A( L; ; < < (
Wit5F
yI .c
 
 X==95<<SYYGHo	&
Wity$5)-)c 
<^<LMNNMC;;sL)3;;x+PPP	S,	''r)   c           
     x   | |||f\  }}}}t        d ||||fD              st        d      |j                  \  }	}
}|j                  \  }}}||k7  s|	|k7  r%t        d|j                   d|j                         |j                  \  }}}|j                  \  }}}||k7  s||k7  r%t        d|j                   d|j                         ||
k7  s||k7  r?t        d|j                   d|j                   d|j                   d	|j                         t        j                  |d
      }t        |||||      }|S )a  Scaled matrix multiplication function.

    Performs block-scaled matmul of `a` and `b` using `a_scales` and `b_scales`.
    The last dim is the contracting dim, and block size is inferred.

    Mathematically, this operation is equivalent to::

      a_block_size = a.shape[-1] // a_scales.shape[-1]
      b_block_size = b.shape[-1] // b_scales.shape[-1]
      a_scaled = a * jnp.repeat(a_scales, a_block_size, axis=-1)
      b_scaled = b * jnp.repeat(b_scales, b_block_size, axis=-1)
      jnp.einsum('BMK,BNK->BMN', a_scaled, b_scaled)

    Args:
      lhs (Array): Operand a, shape (B, M, K).
      rhs (Array): Operand b, shape (B, N, K).
      lhs_scales (Array): Shape (B, M, K_a), where `K % K_a == 0`.
      rhs_scales (Array): Shape (B, N, K_b), where `K % K_b == 0`.
      preferred_element_type (DTypeLike, optional): Defaults to `jnp.float32`.

    Returns:
      Array of shape (B, M, N).

    Notes:
      - We currently do not support user-defined `precision` for customizing the
        compute data type. It is fixed to `jnp.float32`.
      - Block size is inferred as `K // K_a` for `a` and `K // K_b` for `b`.
      - To use cuDNN with Nvidia Blackwell GPUs, inputs must match::

          # mxfp8
          a, b: jnp.float8_e4m3fn | jnp.float8_e5m2
          a_scales, b_scales: jnp.float8_e8m0fnu
          block_size: 32
          # nvfp4
          a, b: jnp.float4_e2m1fn
          a_scales, b_scales: jnp.float8_e4m3fn
          block_size: 16

    Examples:

      Basic case:

      >>> a = jnp.array([1, 2, 3]).reshape((1, 1, 3))
      >>> b = jnp.array([4, 5, 6]).reshape((1, 1, 3))
      >>> a_scales = jnp.array([0.5]).reshape((1, 1, 1))
      >>> b_scales = jnp.array([0.5]).reshape((1, 1, 1))
      >>> scaled_matmul(a, b, a_scales, b_scales)  # doctest: +SKIP
      Array([[[8.]]], dtype=float32)

      Using fused cuDNN call on Blackwell GPUs:

      >>> dtype = jnp.float8_e4m3fn
      >>> a = jax.random.normal(jax.random.PRNGKey(1), (3, 128, 64), dtype=dtype)
      >>> b = jax.random.normal(jax.random.PRNGKey(2), (3, 128, 64), dtype=dtype)
      >>> a_scales = jnp.ones((3, 128, 4), dtype=jnp.float8_e8m0fnu)
      >>> b_scales = jnp.ones((3, 128, 4), dtype=jnp.float8_e8m0fnu)
      >>> scaled_matmul(a, b, a_scales, b_scales)  # doctest: +SKIP
    c              3  :   K   | ]  }|j                   d k(    yw)rp   N)r   ).0r'   s     r(   	<genexpr>z scaled_matmul.<locals>.<genexpr>&  s     ?qqvv{?s   z<scaled_matmul requires all inputs to be 3-dimensional arrayszmscaled_matmul requires inputs a and b to have matching batch (B) and contract (K) dimensions, but got shapes z and zescaled_matmul requires scales to have matching batch (B) and contract (K) dimensions, but got shapes z\scaled_matmul requires scales to match non-contract dimensions of inputs, but got shapes a: z, b: z, a_scales: z, b_scales: scaled_matmulr  )allr   r   r   !check_and_canonicalize_user_dtypecudnn_scaled_matmul)r   r   
lhs_scales
rhs_scalesr  ar<   a_scalesb_scalesB_aM_aK_aB_bN_bK_bB_asM_asK_asB_bsN_bsK_bsre  s                         r(   ro  ro    s   B  #CZ?Aq(H?Q8X$>??J
 	
 GGMCcGGMCc
czSCZ;;<77)5wwi
 	
  ~~D$~~D$t|tt|77?~~6Fe~~ 
 	
 s{dck))*	qwwi|~~l8>>*:<
 	
 $EE 		5C Jr)   c                D   | dk(  r\t        j                  dt        j                        }t	        ddt
        j                  t
        j                  ||d      S |d      S | dk(  r-t	        dd	t
        j                  t
        j                  dd      S t        d
|        )zGet quantization configs for scaled_dot_general.

    Create quantization configs for the `jax.nn.scaled_dot_general`.

    See Also:
      - :func:`jax.nn.scaled_dot_general`: Scaled dot general function.
    nvfp4r   rq      NF)mode
block_size	data_type
scale_typeglobal_scale
infer_onlymxfp8    zUnsupported mode: )
r+   r   rt   r  r   r   float4_e2m1fnfloat8_e4m3fnfloat8_e8m0fnur   )r  r  ones      r(   get_scaled_dot_general_configr  P  s     whht2::.**++ , 4
 	

 ;G
 	
 
**,,
 	
 -dV455r)   c                    |t        j                  dt               |t        j                  | |||      S t        | ||||      }|S )a
  Scaled dot general operation.

  Performs a generalized dot product with block-scaled quantization on the
  lhs and rhs inputs. This operation extends `lax.dot_general` to support
  user-defined scaling configurations.

  Essentially, the operation follows::

      a, a_scales = quantize(lhs, configs[0])
      b, b_scales = quantize(rhs, configs[1])
      c = jax.nn.scaled_matmul(a, b, a_scales, b_scales)

  Args:
    lhs (ArrayLike): Input array.
    rhs (ArrayLike): Input array.
    dimension_numbers (DotDimensionNumbers): A tuple of two tuples specifying
      the contraction and batch dimensions:
      `((lhs_contracting_dims, rhs_contracting_dims), (lhs_batch_dims, rhs_batch_dims))`.
    preferred_element_type (DTypeLike, optional): Output data type of the dot
      product. Defaults to `jnp.float32`. Other valid types include
      `jnp.bfloat16` and `jnp.float16`.
    configs (list of BlockScaleConfig, optional): Scaling configurations for
      lhs, rhs, and gradients. Users can obtain valid configurations via
      `jax.nn.get_scaled_dot_general_config`. Currently, `nvfp4` and `mxfp8`
      are supported. If `None`, falls back to `lax.dot_general`.
    implementation: str
      (Deprecated) Backend selector, now ignored. The system chooses the backend
      automatically. Scheduled for removal in future releases.

  Returns:
    Array: The resulting tensor, with batch dimensions first, followed by
    non-contracting/non-batch dimensions of lhs, and then those of rhs.

  See Also:
    - :func:`jax.nn.scaled_matmul`: Scaled matmul function.
    - :func:`jax.lax.dot_general`: General dot product operator.

  Notes:
    - Unlike `nn.scaled_matmul`, which assumes quantized low-precision
      inputs with explicit scaling factors, this operator takes high-precision
      inputs, applies quantization internally, and handles the backward pass.

  Examples:

    Creating config for mxfp8:

    >>> configs = [jax.nn.get_scaled_dot_general_config('mxfp8')] * 3

    Creating config for nvfp4:

    >>> global_scale = jnp.array([0.5], jnp.float32)
    >>> configs = [jax.nn.get_scaled_dot_general_config('nvfp4', global_scale)] * 3

    Using scaled_dot_general with the configs:

    >>> import functools
    >>> scaled_dot_general_fn = functools.partial(jax.nn.scaled_dot_general, configs=configs)
    >>> lhs = jax.random.normal(jax.random.PRNGKey(1), (3, 128, 64))
    >>> rhs = jax.random.normal(jax.random.PRNGKey(2), (3, 128, 64))
    >>> out = scaled_dot_general_fn(lhs, rhs, (((2,), (2,)), ((0,), (0,))))  # doctest: +SKIP
  zLBackend selector, now ignored. The system chooses the backend automatically.rp  )r  configs)warningsr   DeprecationWarningr   dot_generalcudnn_scaled_dot_general)r   r   dimension_numbersr  r  r>  re  s          r(   scaled_dot_generalr  p  se    H MM +,>@ _??3%62HJ J 	!	3!3	# 
*r)   c           	     ,   t        j                  d|       } t        j                  d      }t        j                  | |k  t        j                  t        j
                  |               t        j                  t        j                  |                     S )u  Numerically stable calculation of :math:`\log(1 - \exp(-x))`.

  This function is undefined for :math:`x < 0`.

  Based on `TensorFlow's implementation <https://www.tensorflow.org/probability/api_docs/python/tfp/math/log1mexp>`_.

  References:
    .. [1] Martin Mächler. `Accurately Computing log(1 − exp(−|a|)) Assessed by the Rmpfr package.
      <https://cran.r-project.org/web/packages/Rmpfr/vignettes/log1mexp-note.pdf>`_.
  log1mexprT   )r$   r%   r+   r   rG   r`   log1pr   )r'   cs     r(   r  r    sk     !!*a0!	ggcl!	!e	ggsyy!}n	ii!
 r)   c                2    | t        j                  |      z  S r@  )r+   r`   r3   s      r(   r6   r6     s    1syy|#3 r)   )r'   r   rU  r   )rE   )r'   r   r<   r   rU  r   )rD   )r'   r   ra   r   rU  r   )g{Gz?)r'   r   rd   r   rU  r   )T)r'   r   ry   r   rU  r   )rg   )r'   r   r~   rX  rU  r   )NNF)
r'   r   r~   r   rG   ArrayLike | Noner   r   rU  r   )rg   N)r'   r   r~   r   rG   r  rU  r   )
r'   r   r~   r   rG   r  r   r   rU  r   )rg   NNgh㈵>N)r'   r   r~   r   r   r  r   r  r   r   rG   r  rU  r   )
r'   r   r   rX  rr   r    r~   int | AxisNamerU  r   )
r'   r   r   rX  rr   z
Any | Noner~   r  rU  r   )r   rX  r   rX  r   ztuple[int, int])F)r  r   r  r   r   r   r   rS  r   rS  r  r   rl   floatr   rS  r  rS  r   ztuple[int, int] | Noner!  r   )NN)r  r   r  r   r   r   r   r  r   r  rl   float | Noner  r   r<  r  r=  r  r   int | tuple[int, int] | Noner>  Literal['xla', 'cudnn'] | Noner!  zLiteral[False]rU  r   )r  r   r  r   r   r   r   r  r   r  rl   r  r  r   r<  r  r=  r  r   r  r>  r  r!  zLiteral[True]rU  ztuple[Array, Array])r  r   r  r   r   r   r   r  r   r  rl   r  r  r   r<  r  r=  r  r   r  r>  r  r!  r   )r   r   r   r   rt  r   ru  r   r  r    rU  r   r@  )r  zLiteral['nvfp4', 'mxfp8']r  rS  )r  zlist[BlockScaleConfig] | Noner>  zLiteral['cudnn'] | None)j__doc__