
    ukiN                         d Z ddlmZ ddlZddlmZ ddlZddlZddl	m
Z
 ddl	mZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ 	 ddlmZ eZej6                  j8                  Zej6                  j:                  Zdededededef
dZ dededededef
dZ!dededededef
dZ"dededededef
dZ#dedededede$e   f
dZ%dededededef
dZ&dededededef
dZ'd Z(dededededede)e*eef   e*eef   e*eef   e*eef   f   fdZ+dejX                  defdZ- eed       	 dAd!ed"ed#eded$edededed%e.dedejX                  de)eeef   fd&       Z/ eej`                  d'(      d!ed"ed#ed)e*eef   d*e*eef   d+e*eef   d,e*eef   d$edededed%e.dede)eeef   fd-       Z1d$ede)e)eef   ef   fd.Z2d/ed$efd0Z3d1ed$edefd2Z4d!ed"ed#ed3ed$edededed%e.dedejX                  fd4Z5dededed%e.ded5efd6Z6 e
jn                  d7      Z8d8e8_9        e8ju                   eejv                  e8             e8jy                  e6       erG ejz                  e8ej|                  d9:        e?ed;      r ejz                  e8ej                  d<:       dededed%e.dedejX                  fd=ZAdededed%e.ded5efd>ZB e
jn                  d?      ZCd8eC_9        eCju                   eejv                  eC             eCjy                  eB       erG ejz                  eCej                  d9:        e?ed@      r ejz                  eCej                  d<:       e/j                  e5eA       y# e$ r dZY (w xY w)Ba
  `jax.experimental.rnn`: GPU accelerated RNN

----------------------------------------------

This module provides experimental support to CUDNN-backed LSTM.

Currently, the only supported RNN flavor is LSTM with double-bias. We use
notations and variable names similar to
https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#torch.nn.LSTM

and CUDNN_LSTM entry in
https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnRNNMode_t.

Note that a bidirectional LSTM is treated as having twice the number of layers,
where a forward layer i is followed by a reverse layer i. Each direction has
its own associated weights. We use pseudo-layer to denote such layers
following CUDNN documentation
https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnGetRNNWeightParams.

CUDNN takes an opaque 1D weight array that densely packs all the weight arrays
in a sparsely documented layout. Through trial-and-error and testing, we believe
the layout is the following. Assume 2-layer bi-LSTM with double-bias, so 4
pseudo-layers in total (forward-0, reverse-0, forward-1, reverse-1).

There are 4 kinds of weights: W_ih, W_hh, b_ih and b_hh, where

W_ih = (W_ii, W_if, W_ig, W_io) concatenated on leading axis,
W_hh = (W_hi, W_hf, W_hg, W_ho) concatenated on leading axis,
b_ih = (b_ii, b_if, b_ig, b_io) concatenated on leading axis,
b_hh = (b_hi, b_hf, b_hg, b_ho) concatenated on leading axis.

Say W_ih^0 denotates W_ih from pseudo-layer 0. The linear weights are packed
together from all pseudo-layers followed by bias weights from all pseudo-layers.
In particular, for each layer, W_ih is followed by W_hh and b_ih by b_hh.

(W_ih^0, W_hh^0, W_ih^1, W_hh^1, W_ih^2, W_hh^2, W_ih^3, W_hh^3,
 b_ih^0, b_hh^0, b_ih^1, b_hh^1, b_ih^2, b_hh^2, b_ih^3, b_hh^3)

See `get_params_shapes_in_lstm`.

Example usage:
```
  x = jax.random.normal(
      k1, (batch_size, seq_len, input_size), dtype=jnp.float32)
  h_0 = jax.random.normal(
      k2, (num_directions * num_layers, batch_size, hidden_size),
      dtype=jnp.float32)
  c_0 = jax.random.normal(
      k3, (num_directions * num_layers, batch_size, hidden_size),
      dtype=jnp.float32)
  seq_lengths = jnp.ones((batch_size,), dtype=jnp.int32) * seq_len
  weights = rnn.init_lstm_weight(k4, input_size, hidden_size, num_layers,
                                 bidirectional)
  y, h_n, c_n = rnn.lstm(
      x,
      h_0,
      c_0,
      weights,
      seq_lengths=seq_lengths,
      input_size=input_size,
      hidden_size=hidden_size,
      num_layers=num_layers,
      dropout=False,
      bidirectional=bidirectional)
```

TODO:
  - Add support for input and weight dtypes other than float32.
  - Support ragged inputs.
  - Support RNNs other than LSTM.
    )partialN)Any)core)dispatch)mlir)
custom_vjp)ArrayShape)lax)gpu_rnnlayer_i
input_sizehidden_sizebidirectionalreturnc                 H    | dk(  s| dk(  r	|rd|z  |fS |rdnd}d|z  ||z  fS )zSShape of W_ii|W_if|W_ig|W_io.

  Note that layer_i is an index of pseudo-layers.
  r             )r   r   r   r   num_directionss        O/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/jax/experimental/rnn.py_W_ih_lr   l   s?     \gl}OZ(('QQNO^k9::    c                     d|z  |fS )zShape of W_hi|W_hf|W_hg|W_ho.r   r   r   r   r   r   s       r   _W_hh_lr   y   s     k/;	''r   c                     d|z  fS )zShape of b_ii|b_if|b_ig|b_io.r   r   r   s       r   _b_ih_lr           k/	r   c                     d|z  fS )zShape of b_hi|b_hf|b_hg|b_ho.r   r   r   s       r   _b_hh_lr"      r    r   
num_layersc                    g }|rdnd}||z  }t         t        g}t        |      D ]%  }|D ]  }	 |	|| ||      }
|j                  |
         ' t        t
        g}t        |      D ]%  }|D ]  }	 |	|| ||      }
|j                  |
         ' |S )z?Get flat param shapes in LSTM. See module docstring for layout.r   r   )r   r   rangeappendr   r"   )r   r   r#   r   layer_shapesr   num_pseudo_layerslinear_weightsiw_kindlayer_shapebias_weightss               r   _get_params_shapes_in_lstmr.      s     ,%11. >1W%."# 'a  '1j+}Ek+&''
 7#,"# 'a '1j+}Ek+&'' 
r   c                 F    t        | |||      }t        d |D              }|S )zGet param count in LSTM.c              3   F   K   | ]  }t        j                  |        y wN)mathprod).0shapes     r   	<genexpr>z)get_num_params_in_lstm.<locals>.<genexpr>   s     ?DIIe$?s   !)r.   sum)r   r   r#   r   r'   param_counts         r   get_num_params_in_lstmr9      s-     ,JZ,9;,?,??+	r   rngc                     t        ||||      }t        j                  d|z        }t        j                  j                  | |ft        j                  | |      S )zDRandom initialize LSTM weights from U(-k, k), k=sqrt(1/hidden_size).g      ?)r5   dtypeminvalmaxval)r9   npsqrtjaxrandomuniformjnpfloat32)r:   r   r   r#   r   r8   ks          r   init_lstm_weightrG      s[     'z;
'46+ggcK !				+s{{A2a 
 
I Ir   c           
      
   t        j                  |       } t        ||||      }|rdnd}d}t        |      D ]  }t        |      D ]  }	dD ]  }
|j	                  d      }t        j                  |      }| |||z    j                  |      }t        j                  |dd      }t        j                  |d   |d   |d   |d   gd      }| j                  |||z    j                  |j                               } ||z  }   | S )zCSwaps the weights for the input and output gates for an LSTM model.r   r   r   W_ihW_hhb_ihb_hhr   axis   )rD   asarrayr.   r%   popr2   r3   reshapesplitconcatenateatsetflatten)weightsr   r   r#   r   flat_shapesr   	w_offsetsl	direction	gate_namer5   	num_elemsmatrixgatesswapped_matrixs                   r   swap_lstm_gatesrc      s   KK '*:{JP]^+%11.) a>* 	7 )"IIe$	9y#89AA%H 		&!!,%(E!HeAha)QXYZ **Yy9'<=AA.BXBXBZ[Y	  
.r   rY   c                    t        ||||      }d}d}|rdnd}||z  }	i }
i }t        |	      D ]I  }|
|fD ]@  }||   }|dz  }t        j                  |      }| |||z    j	                  |      ||<   ||z  }B K i }i }t        |	      D ]I  }||fD ]@  }||   }|dz  }t        j                  |      }| |||z    j	                  |      ||<   ||z  }B K |
|||fS )a  Unpack cudnn LSTM weights into individual weights.

  CUDNN LSTM weight layout: (num_layers, num_directions, W_ih, W_hh, b_ih, b_hh)
  Returns W_ih, W_hh, b_ih, b_hh. e.g. W_ih[2][1] is the concat weights of
  4 weights (W_ii, W_if, W_ig, W_io), each of shape (hidden_size, input_size)
  at 2nd layer for the reverse direction. See notations from
  https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#torch.nn.LSTM.
  r   r   r   )r.   r%   r2   r3   rS   )rY   r   r   r#   r   rZ   flat_shapes_offsetr[   r   r(   rJ   rK   r\   r+   r5   r_   rL   rM   s                     r   unpack_lstm_weightsrf      sI    +:{J+8:+)%11. >1$$"# a, ,-eA))E"i)I	$9:BB5IfQi9i $$"# a, ,-eA))E"i)I	$9:BB5IfQi9i 
tT4	r   	precisionc                    t        j                  |       } | t        | t              rt	        |       dk(  sy| x  r& dk(  r!\  t         j
                  j                  k(  r  y x  r& dk(  r!\  t         j
                  j                  k(  r  y   r/ dk(  r*\  t         j
                  j                  k(  r t        d       	 t        d|        )Nr   TFz)bfloat16 support not implemented for LSTMz%Unexpected precision specifier value )r   canonicalize_precision
isinstancetuplelen	PrecisionHIGHESTHIGHDEFAULTNotImplementedError
ValueError)rg   s    r   _lstm_cudnn_allow_tf32rs      s     ((3)z)U;IRS@S	#	##--

	# 
$	 	 #--

a	  
!	#	##--

 KLL 
$	
>ykJKKr   )            	   
   )nondiff_argnumsxh_0c_0seq_lengthsdropoutc                 F    t        | |||||||||	|
      \  \  }}}}|||fS )a`  LSTM via CuDNN or HIPDNN (not-yet-supported).

  Assume batch-first inputs.

  Arguments:
    x: (batch_size, max_seq_length, input_size)
    h_0: (num_directions * num_layers, batch_size, hidden_size)
    c_0: (num_directions * num_layers, batch_size, hidden_size)
    weights: (num_params,) where num_params = get_num_params_in_lstm(...)
    seq_lengths: (batch_size,)
  Returns: (y, h_n, c_n, reserve_space).
    y: (batch_size, max_seq_length, hidden_size * num_directions)
    h_n: (num_directions * num_layers, batch_size, hidden_size)
    c_n: (num_directions * num_layers, batch_size, hidden_size)
  )r   r   r#   r   r   rg   )lstm_fwd)r{   r|   r}   rY   r~   r   r   r#   r   r   rg   yh_nc_n_s                  r   lstmr     sH    & 		!-1c3 
Cr   )rw   rx   ry         )static_argnumsrJ   rK   rL   rM   c           	         |j                   t        j                   d      k7  rt        d      |dk7  rt        d      d }d }| j                  ddd	      }|sg }g }t	        |
      D ]  }t        |||   ||   ||   ||   
      }t        ||      }t        j                  j                  |||   ||   f|      }t        ||      \  \  }}}|j                  |       |j                  |        t        j                  |      }t        j                  |      }|j                  ddd	      ||fS g }g }t	        |
d	z        D ]  }t        |||   ||   ||   ||   
      }t        ||      }|d	z  dk(  r<t        j                  j                  |||   ||   f|      }t        ||      \  \  }}}nlt        ||      }t        j                  j                  |||   ||   f|      }t        ||      \  \  }}}t        ||      }t        j                  |gd      }|j                  |       |j                  |        t        j                  |      }t        j                  |      }|j                  ddd	      ||fS )zReference implementation of LSTM.

  See https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#lstm
  https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnRNNMode_t
  int32 `seq_lengths` can only be int32.g        zWDropout not supported in LSTM reference because we cannot determine CUDNN dropout mask.c                   | \  }}t        j                  |dd      \  }}	}
}t        j                  |dd      \  }}}}t        j                  |dd      \  }}}}t        j                  |dd      \  }}}}t        ||j                  z  |d    z   ||j                  z  z   |d    z         }t        ||	j                  z  |d    z   ||j                  z  z   |d    z         }t	        ||
j                  z  |d    z   ||j                  z  z   |d    z         }t        ||j                  z  |d    z   ||j                  z  z   |d    z         }||z  ||z  z   }|t	        |      z  }||f|fS )Nr   r   rN   )rD   rT   sigmoidTtanh)carryr{   rJ   rK   rL   rM   hcW_iiW_ifW_igW_ioW_hiW_hfW_hgW_hob_iib_ifb_igb_iob_hib_hfb_hgb_hor*   fgos                               r   	lstm_cellzlstm_ref.<locals>.lstm_cellI  sg   DAq YYtQQ7D$d YYtQQ7D$d YYtQQ7D$d YYtQQ7D$dDFF
T$Z'!dff*4tDzABADFF
T$Z'!dff*4tDzABAQZ$t*$q466z1DJ>?ADFF
T$Z'!dff*4tDzABA	AAA	DGAq619r   c                 &     | ||      \  }}|||ffS r1   r   )cellr   r{   r   s       r   scan_fnzlstm_ref.<locals>.scan_fnZ  s     E1~HE15!*r   r   r   r   rI   rN   )r<   rD   rq   	transposer%   r   rA   r   scan_extract_outputr&   stack_flip_sequencerU   )r{   r|   r}   rJ   rK   rL   rM   r~   r   r   r#   r   r   r   r   seq_first_yfinal_hfinal_cr\   r   cell_fnouth_tc_tr   r   seq_first_y_fwdseq_first_y_reversedseq_first_y_bwds                                r   lstm_refr   0  s    #))G,,
@
AA^
a " Aq!$+	GG: 
$q'Qd1gDGMd&gGGLL3q63q6"2-8:c /S AjsC+nnSnnS ))G
C
))G
C  Aq)333 ''a  aQd1gDG$q'KDgt$G1uzGGLL3q63q6"21<>c$3K$E!jsC/ ,KEGGLL
CFCF#%9;c$3K$E!jsC/&DoOO_o$FRPkNN3NN3'( 			'#		'#			q!Q	'c	11r   c                    |\  }\  \  }}}t        ||       }t        ||       }| d    t        j                  |j                  d   t        j                        d d d f   kD  }t        j
                  |d   |d      }||f|fS )Nr   )r<   ).N)_select_last_carryrD   aranger5   r   where)	r~   r   r   hscsr   r   r   masks	            r   r   r     s    "!hr22{+#2{+# 
T	SZZ(9(9!(<CIINqRVwW	W$		
9o	+ s[	  r   	carry_seqc                 V    | |dz
  t        j                  | j                  d         f   S )Nr   )rD   r   r5   )r   r~   s     r   r   r     s(    	;?CJJyq/A$BB	CCr   	sequencesc                     | j                   d   }||z
  } t        j                  t        t        j
                  d      dd      | |      d d d   S )Nr   rN   )r   r   r   )in_axesout_axesr   )r5   rA   vmapr   rD   roll)r   r~   	max_stepsroll_amountss       r   r   r     sW    ooa )[(,
'#((+V
\
++/R4
1 1r   wc                     |j                   t        j                   d      k7  rt        d      t        |
      }t        j                  | |||||||||	|      \  }}}}|||f| ||||||ffS )Nr   r   r   r   r#   r   r   cudnn_allow_tf32)r<   rD   rq   rs   	rnn_fwd_pbind)r{   r|   r}   r   r~   r   r   r#   r   r   rg   r   r   r   r   reserve_spaces                   r   r   r     s     #))G,,
@
AA+I6(~~		!'  .  )!S#} S#CaaG	GGr   r   c           
      .   | j                   d   | j                   d   }}|	rdnd}||||z  f}t        j                  || j                        }t	        j
                  |||||||	|
      \  }}t        j                  |ft        j                        }||||fS )Nr   r   r   )r5   r   ShapedArrayr<   r   )compute_rnn_workspace_reserve_space_sizesrD   rE   )x_avalh_0_avalc_0_avalw_avalseq_lengths_avalr   r   r#   r   r   r   
batch_sizemax_seq_lengthr   output_shapeoutput_avalr   reserve_space_sizereserve_space_avals                      r   rnn_abstract_evalr     s      &||AQn*%11.nn{.JK,  v||<+77
k:z>
="24 ! '');(=s{{K	h*<	<<r   rnn_fwdTcuda)platformmiopen_rnn_loweringrocmc                     t        |      }|\  }	}
}}}}}|\  }}}t        j                  ||||	|
|||||| |||||      \  }}}}||||t        j                  |      fS )Nr   )rs   	rnn_bwd_pr   rD   
zeros_like)r   r   r#   r   r   rg   	residuals	gradientsr   r{   r|   r}   r   r~   r   r   dydh_ndc_ndxdh_0dc_0dws                          r   lstm_bwdr     s     ,I62;/!S#q+q-."dD ~~

		!'! & )"dD"" dD"cnn[9	::r   c                     ||||fS r1   r   )dy_avaldhn_avaldcn_avalr   h0_avalc0_avalr   y_avalr   r   r   r   r#   r   r   r   s                   r   rnn_bwd_abstract_evalr     s    
 
'6	))r   rnn_bwdmiopen_rnn_bwd_loweringr1   )G__doc__	functoolsr   r2   typingr   rA   numpyr?   jax._srcr   r   jax.interpretersr   jax._src.custom_derivativesr   jax._src.typingr	   r
   jax._src.laxr   	jax.numpyrD   jax._src.libr   ImportErrorPRNGKeyArraynnr   r   intboolr   r   r   r"   listr.   r9   rG   rc   rk   dictrf   PrecisionLikers   floatr   jitr   r   r   r   r   r   	Primitiver   multiple_resultsdef_implapply_primitivedef_abstract_evalregister_loweringcudnn_rnn_loweringhasattrr   r   r   r   cudnn_rnn_bwd_loweringr   defvjpr   r   r   <module>r     s8  FN    
    ! 2 (  " 
&&..
vv{{
;S 
;c 
; 
;
;$)
;(S (c ( (($)(S c  $)S c  $)3 S +..27;E{*s  # *.36I, IC Ic I!$I59I4' '  #' 25' CF' '  4U
T#u*-tCJ/?cFKGL BM M N' TLc&7&7 LD L0 	%89 >BE  E E  '*8;FK),):):FKESXZ_L_F` :B 	!34W2 W2E W2 W2T#u*5E W2U
#W2+/U
+;W2U
#W227W2EHW2 W2 ,/W2 :?W2 !	W2 &+5%+>%?	W2 5W2r! !eE5L6I56P0Q !D% De D1e 1% 1E 1H HE H H% He HH+.H<?HJOH H-0->->H*="%=47=EH=$=59= )-=  DNN9%	!	  	  7833Y? @ 	  - .
$G$>$>PW+,D9g&A&AFS; ;3 ;C ;% ; ;-0->->;4*8;*JM* '** 5:* KO* -1	* DNN9%	!	  	  7833Y? @ 	  1 2
$//&BW/0D722VE Hh w  's   M, ,M76M7