
    uki
                         d Z ddlZddlZddlmZ ddlmZ ddl	m
Z defdZ ej                  ej                  g d      d	d
ddddej                   dededededej                   fd       Zy)zPallas softmax kernel.    N)pallas)triton	block_rowc                H   | j                   d   }t        j                  |      |k  }t        j                  | j
                  t        j                  d|         |t        d             }t        j                  |d      }t        j                  ||z
  j                  t        j                              }t        j                  |d      }t        j                  |j
                  t        j                  d|         ||z  j                  |j                        |       y )Nr   inf)maskother)axis)r	   )shapejnparangeplgpuloadatpldsfloatmaxexpastypefloat32sumstoredtype)		input_ref	probs_refr   row_lenr	   rowrow_max	numeratordenominators	            b/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/jax/experimental/pallas/ops/gpu/softmax.py_vmappable_softmax_kernelr$      s     OOB'	I		($

ll255I&'d5<-	# GGCa 'ggsW},,S[[9:)	*+++ll255I&';&&y7    )r   	num_warps	interpretdebug)static_argnamesr      Fxr   r&   r'   r(   returnc                4   |dk\  r|nt        | j                        |z   }|t        | j                        dz
  k7  rt        d      | j                  d   }t        j                  |      }t        j                  |f| j                        }t        j                  t        |      }t        j                  |t        j                  |d      d|||	      }	t        t        | j                        dz
        D ]  }
t        j                  |	      }	  |	|       S )
a  Computes the softmax of the input array along the specified axis.

  Args:
    x: input array
    axis: the axis along which to perform the computation
    num_warps: the number of warps to use for executing the Triton kernel
    interpret: whether to interpret the kernel using pallas
    debug: whether to use pallas in debug mode

  Returns:
    The result of the softmax operation over the specified axis of x.
  r      z3reductions along non-trailing dimension unsupportedr   )r   r   )r   )r&   
num_stages )compiler_paramsgrid	out_shaper(   r'   )lenr   NotImplementedErrorr   next_power_of_2jaxShapeDtypeStructr   	functoolspartialr$   pallas_callr   CompilerParamsrangevmap)r+   r   r&   r'   r(   r   r   r3   kernelf_s              r#   softmaxrB   4   s    $ AGGt 3$	S\A
=? ? GGBK'  ))""'177C)6)L&nn**!-! QWW!" aA 
1+r%   )__doc__r9   r7   	jax.numpynumpyr   jax.experimentalr   r   jax.experimental.pallasr   r   intr$   r:   jitArrayboolrB   r0   r%   r#   <module>rL      s      
  ) 3 8 377 -6 7 "$a5(
yy((03(($(( 	YY(7(r%   