
    bia              '       .   d dl mZ d dlZd dlmZ d dlmZ d dlZd dl	m
Z
mZmZmZ ddlmZ ddlmZmZ  ed	d
      dej&                  dej&                  fd       Z edd
      dej&                  dej&                  dej&                  fd       Zdej&                  dej&                  dej&                  fdZ edd
      	 	 dSdej&                  dej&                  dej&                  deej,                     deej&                     dej&                  fd       Z edd
      dTdej&                  fd       Z edd
      	 dTdej&                  deej&                  ej&                  ej&                  ej&                  eej&                     f   fd       Z	 dTdej&                  deej&                  eej&                     f   fdZ edd
      dej&                  dej&                  d edeej&                  ej&                  f   fd!       Z ed"d
      dej&                  d#ej&                  dej&                  d edej,                  dej&                  fd$       Z ed%d
      dej&                  d#ej&                  dej&                  d edej,                  dej&                  ddfd&       Zdej&                  d#ej&                  dej&                  d edej,                  dej&                  ddfd'Z ed(d
      dej&                  d ed)ed*ej,                  deej&                  ej&                  f   f
d+       Z ed,d
      dej&                  d#ej&                  d ed)ed-ee   dej,                  dej&                  fd.       Z ed/d
      dej&                  d#ej&                  d ed)ed-ee   dej,                  dej&                  ddfd0       Zdej&                  d#ej&                  d ed)edej,                  dej&                  ddfd1Z ed2d
      dej&                  dej&                  d3ee   d#ej&                  dej&                  d edej&                  fd4       Z ed5d
      dej&                  dej&                  d3ee   d#ej&                  dej&                  d edej&                  ddfd6       Zdej&                  dej&                  d3ee   d#ej&                  dej&                  d edej&                  ddfd7Z	 ej<                  ej>                  ej@                  fejB                  ejD                  fejF                  ejH                  fejJ                  ejL                  ejN                  fejP                  ejR                  fej<                  ej>                  ej@                  fejT                  ejV                  ejX                  fd8Z-ej\                  ej^                  ej`                  fejb                  ejd                  ejf                  fejh                  ejj                  ejl                  fejn                  ejp                  ejr                  fejt                  ejv                  ejx                  fejz                  ej|                  ej~                  fd9Z@	 dUd:ed;ej&                  d<ej&                  d=ej&                  d>eej&                     d?eej&                     d@eAdAeAdBeAdCeAdDeAdEeAdFeAdGeAdHedIeAdJeAddf$dKZB	 dUd:ed;ej&                  d<ej&                  d=ej&                  d>eej&                     dBeAdCeAdDeAdEeAdFeAdHedIeAdLej&                  dMeej&                     dNej&                  dOeej&                     dGeAdJeAddf&dPZC  edQd
      eC         edRd
      eB       y)V    )SequenceN)prod)Optional)CUBLAS_Context_cuda_device_of_get_tensor_streamget_ptr   )register_kernel)HIP_ENVIRONMENTlibz bitsandbytes::int8_linear_matmulcudaABc                     t        j                  g | j                  d d |j                  d   | j                  t         j                        }t        | ||      S )Nr   devicedtype)torchemptyshaper   int32_int8_linear_matmul_implr   r   outs      Y/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/bitsandbytes/backends/cuda/ops.py_r      sH    
++11aggaj1!((%++
VC#Aq#..    z$bitsandbytes::int8_linear_matmul.outr   c                     t        | ||       y )N)r   r   s      r   r   r      s    Q3'r   c                    || }} | j                   |j                   t        j                  | j                  t        j                  k(  d        t        j                  |j                  t        j                  k(  d        t        j                  | j
                  dk(  d        t        j                  |j
                  dv d        t        j                  t              dkD  fd       t        j                  j                  t        j                  k(         g d d	 d   t        j                  j                   k(  fd
       \  }}t        d d	       }d	   }d	   }d	   }t        j                  ||k(  fd       |dz  dk7  rnt        j                  |j                         | j                         j                               j                  t        j                        }	j                  |	      S t        |       5  t        j                         j!                  | j"                        }
t%        |       }t%        |      }t%              }d }t'        j(                  |      }t'        j(                  |      }t'        j(                  |      }t'        j(                  |      }t'        j(                  |      }t'        j(                  |      }t+        |       }t-        j.                  |
|||||||||||      }d d d        r0|dk(  rt1        d      t3        dddd|||fd|||f
      S # 1 sw Y   =xY w)Nc                       y)NzB must be int8 r#   r   r   <lambda>z*_int8_linear_matmul_impl.<locals>.<lambda>       r   c                       y)NzA must be int8r#   r#   r   r   r$   z*_int8_linear_matmul_impl.<locals>.<lambda>    r%   r      c                       y)Nz:Only two dimensional matrices are supported for argument Br#   r#   r   r   r$   z*_int8_linear_matmul_impl.<locals>.<lambda>!   r%   r   )r'   r
   c                       y)NzCOnly two or three dimensional matrices are supported for argument Ar#   r#   r   r   r$   z*_int8_linear_matmul_impl.<locals>.<lambda>"   r%   r   r   c                      d  S )Nz(Input tensor dimensions need to be > 0: r#   )shapeBs   r   r$   z*_int8_linear_matmul_impl.<locals>.<lambda>#   s    -UV\U]+^ r   r   c                  (    d j                    d S )NzOutput shape z does not match expected shape r   )r   shapeCs   r   r$   z*_int8_linear_matmul_impl.<locals>.<lambda>'   s    cii[Hghngo.p r   c                      d d  S )NzQint8_linear_matmul only supports B^T @ A. Inner dimensions do not match: B @ A = z @ r#   )shapeAr+   s   r   r$   z*_int8_linear_matmul_impl.<locals>.<lambda>1   s    cdjckknounvw r      d   z#int8_linear_matmul not implemented!z$cublasLt ran into an error!
	shapeA=z	, shapeB=z	, shapeC=z
	(lda, ldb, ldc)=z
	(m, n, k)=)r   r   _checkr   int8ndimr   r   matmulfloatttocopy_r   r   get_instanceget_contextr   r	   ctc_int32r   r   cigemmlt_32NotImplementedErrorRuntimeError)r   r   r   kmnldaldbldcresultctxptrAptrBptrCptrRowScalestream	has_errorr0   r+   r.   s     `              @@@r   r   r      s   aqAWWFWWF	LLEJJ&(@A	LLEJJ&(@A	LL1bc	LL6!#pq	LLf!#^_	LLekk)*&vcr{&F1I&F	LLf$&pqDAqVCR[A
*C
*C
*C	LLs
w Qw!|aggi7::5;;Gyy  		 h))+77Aqzqzs|JJqMJJqMJJqMjjojjojjo#A&OOCAq$dKQTVY[^`fg	h   &&KLL9&*VIZyPeVY[^`cUdTffulmoprsktjvw  J5h hs   C:MMzbitsandbytes::int8_mm_dequant	row_stats	col_statsr   biasreturnc                     t        j                   j                  t         j                  k(   fd       t        j                  j                  t         j                  k(  fd       t        j                  j                  t         j                  k(  fd       t        j
                   t         j                        }t               }t        |      }t              }t              }	t        j                  t         j                  d d             }
t        j                   j                  d         }|(|j                  t         j                  k(  rt        |      nd }t               5  t        j                  |||	|||
|t                      d d d        |.|j                  t         j                  k7  r|j!                  |       |j#                  |xs t         j                        S # 1 sw Y   \xY w)Nc                  "    d j                    S )NzA must be int32, got r   r   s   r   r$   z_.<locals>.<lambda>`       3H	1R r   c                  "    d j                    S )Nzrow_stats must be float32, got rV   )rP   s   r   r$   z_.<locals>.<lambda>a       =\]f]l]l\m;n r   c                  "    d j                    S )Nzcol_stats must be float32, got rV   )rQ   s   r   r$   z_.<locals>.<lambda>b   rZ   r   rV   r   )r   r3   r   r   float32
empty_likefloat16r	   r=   r>   r   r   r   r   cdequant_mm_int32_fp16r   add_r9   )r   rP   rQ   r   rR   r   rJ   ptrOutptrRowStatsptrColStatsnumRowsnumColsptrBiass   ```          r   r   r   X   si    
LLEKK')RS	LLEMM13no	LLEMM13no 

1EMM
2C1:DS\F)$K)$Kjjaggcrl+,Gjj%G  $/DJJ%--4OgdmUYG		 
""+{FGWgOabcOd	

 DJJ%--766%(5==))
 
s   5&G66G?z#bitsandbytes::int8_vectorwise_quantc                 :    t        j                   j                  t         j                  k(   fd       t        j                  |dk\  d        t	         j
                  d d       } j
                  d   }t        j                  | j                  t         j                        }t        j                   j
                   j                  t         j                        }d }|dkD  r j                         |k\  }|j                         r5t        j                  |j                  d            j                  d      }n0t        j                  d j                  t         j                        }t               5  t!        j"                  t%               t%        |      t%        |      t'        j(                  |      t'        j*                  |      t'        j*                  |      t-                      d d d        |dkD  r|	d|d d |f<   |||fS # 1 sw Y   xY w)	Nc                  "    d j                    S )NzA must be float16, got rV   rW   s   r   r$   z_.<locals>.<lambda>   s    5LQWWI3V r           c                       y)Nzthreshold must be non-negativer#   r#   r   r   r$   z_.<locals>.<lambda>   r%   r   r   r   r   )dim   )r   r3   r   r^   r   r   r   r   r\   r4   absanyargwhereviewint64r   r   cint8_vector_quantr	   r=   c_floatr>   r   )r   	thresholdrowscolsrP   out_rowoutlier_colsoutlierss   `       r   r   r      s   	LLEMM)+VW	LLc!#KLD772;DDGIkk!''!((%**EGL3557i'<<> >>(,,1,*=>CCBGL !;;qML		 	
AJGIJJy!JJtJJtq!	
	
 axL,#$< I|++!	
 	
s   ;A9HHzbitsandbytes::int8_double_quantc                    t         j                  j                  j                  j	                  | |      \  }}}t        | |      \  }}|dkD  r|| j                  |d      } t        j                  | j                  d      |j                  d      z        j                  t         j                        }||||j                         j                         |fS )N)rt   ri   g     _@r   )r   opsbitsandbytesint8_vectorwise_quantdefault_get_col_absmaxmasked_fillroundmul	unsqueezer9   r4   flattenr7   )r   rt   	quant_rowrP   rx   rQ   outlier_mask	quant_cols           r   r   r      s     */)?)?)U)U)])]	 *^ *&Iy, .a9EI|3<3MM,,AEE%L9+>+>q+AABEEejjQIiI,=,=,?,E,E,GUUr   c                 (   t        j                  | j                                d }| j                         j	                  d| j
                  d         }|dkD  r||k\  }|j                  |d       |j                  dd      j                         }||fS )Nr   ri   r   F)rk   keepdim)	r   r3   is_floating_pointrm   rp   r   masked_fill_amaxr7   )r   rt   r   absArQ   s        r   r   r      s     
LL$$&'L557<<AGGBK(D3y(,, 		a	/557Il""r   z bitsandbytes::quantize_blockwisecode	blocksizec           
      8   t        j                  |       t        rt        j                  |dv        nt        j                  |dv        t        j                  j                  t         j
                  k(  fd       | j                         }|| z   }t        j                  |f| j                  t         j
                        }t        j                  | t         j                        }t        |       5  t              t        |       t        |      t        |      t        j                  |      t        j                  | j                               f}| j                  t         j                   k(  rt#        j$                  |  nx| j                  t         j&                  k(  rt#        j(                  |  nH| j                  t         j
                  k(  rt#        j*                  |  nt-        d| j                         d d d        ||fS # 1 sw Y   ||fS xY w)N                  r   r   r   r   r   r   @   c                  "    d j                    S )Nzcode must be float32, got rV   )r   s   r   r$   z_.<locals>.<lambda>   s    8RSWS]S]R^6_ r   r   rV   z?Blockwise quantization only supports 16/32-bit floats, but got )r   _check_is_sizer   r3   r   r\   numelr   r   r]   uint8r   r	   r=   r>   c_intr^   r   cquantize_blockwise_fp16bfloat16cquantize_blockwise_bf16cquantize_blockwise_fp32
ValueError)r   r   r   rD   blocksabsmaxr   argss    `      r   r   r      s   	#Y"CCDY"GGH	LLu}},._`		AYJF[[&1885==IF


1EKK
0C		 jDMAJFOCLJJy!HHQWWY
 77emm#(($/WW&(($/WW%(($/^_`_f_f^ghii#j& ;'j& ;s   8D	HHz"bitsandbytes::dequantize_blockwiser   c                 V    t        j                  | |      }t        | |||||       |S )NrV   r   )r   r]   _dequantize_blockwise_implr   r   r   r   r   r   s         r   r   r      s+    


1E
*Cq&$	5cJJr   z&bitsandbytes::dequantize_blockwise.outc                      t        j                  j                  k(  fd       t        j                  j                   j                  k(   fd       t	         |||       y )Nc                  (    d  dj                    S NzExpected out.dtype == , got rV   r   r   s   r   r$   z_.<locals>.<lambda>      /EeWFSVS\S\R]-^ r   c                  <    d j                    dj                    S NExpected out.shape == r   r-   r   r   s   r   r$   z_.<locals>.<lambda>	  "    1GyPVWZW`W`Va/b r   r   )r   r3   r   r   r   r   s   `   ``r   r   r      sK     
LLe#%^_	LLagg%'bcq&$	5cJr   c           
      `    t         rt        j                  |dv        nt        j                  |dv        t        j                   j                  t        j                  k(   fd       t        j                  t        j
                  t        j                  t        j                  fv fd       t               5  t        |      t               t        |      t        |      t        j                  |      t        j                   j                               t               f}t        j
                  k(  rt        j                  |  nKt        j                  k(  rt        j                   |  n%t        j                  k(  rt        j"                  |  d d d        y # 1 sw Y   y xY w)Nr   r   c                  "    d j                    S )NzA must be uint8, got rV   rW   s   r   r$   z,_dequantize_blockwise_impl.<locals>.<lambda>  rX   r   c                      d  S )NzGBlockwise dequantization only supports 16bit/32bit floating types, got r#   rV   s   r   r$   z,_dequantize_blockwise_impl.<locals>.<lambda>  s    YZ_Y`a r   )r   r   r3   r   r   r^   r   r\   r   r	   r=   r   r   r   r   cdequantize_blockwise_fp16cdequantize_blockwise_bf16cdequantize_blockwise_fp32)r   r   r   r   r   r   r   s   `   `  r   r   r     s*    Y"CCDY"GGH	LLEKK')RS	LL%--??a
 
	 2DMAJFOCLHHYHHQWWYq!
 EMM!**D1enn$**D1emm#**D1!2 2 2s   ?CF$$F-zbitsandbytes::quantize_4bit
quant_typequant_storagec           	          t         rt        j                  |dv        nt        j                  |dv        t        j                  |dv        t        j                   j                  t        j                  t        j
                  t        j                  fv  fd        j                         }|| z   }t        j                  |f j                  t        j                        }t        j                  |dz   |j                  dz  z  df j                  |      }t               5  d t               t        |      t        |      t        j                  |      t        j                  |      f} j                  t        j                  k(  r+|dk(  rt!        j"                  |  nt!        j$                  |  n j                  t        j
                  k(  r+|dk(  rt!        j&                  |  nZt!        j(                  |  nG j                  t        j                  k(  r*|dk(  rt!        j*                  |  nt!        j,                  |  d d d        ||fS # 1 sw Y   ||fS xY w)	Nr   r   fp4nf4c                  "    d j                    S )NzDBlockwise 4bit quantization only supports 16/32-bit floats, but got rV   rW   s   r   r$   z_.<locals>.<lambda>:  s    VWXW^W^V_` r   r   rl   r'   r   )r   r   r3   r   r   r^   r\   r   r   r   itemsizer   r	   r=   r>   r   r   cquantize_blockwise_bf16_fp4cquantize_blockwise_bf16_nf4cquantize_blockwise_fp16_fp4cquantize_blockwise_fp16_nf4cquantize_blockwise_fp32_fp4cquantize_blockwise_fp32_nf4)	r   r   r   r   rD   r   r   r   r   s	   `        r   r   r   .  s    Y"CCDY"GGH	LL~-.	LL	ENNEMM5==AA`
 	
	AYJF[[&1885==IF
++A=#9#9A#=>B188[h
iC		 8AJFOCLJJy!HHQK
 77enn$U"00$700$7WW%U"00$700$7WW%U"00$700$7184 ;584 ;s   *D!II#zbitsandbytes::dequantize_4bitr   c                 l    t        j                  ||| j                        }t        | |||||       |S )N)r   r   r   )r   r   r   _dequantize_4bit_implr   r   r   r   r   r   r   s          r   r   r   _  s1     ++e5
:C!VY
EsKJr   z!bitsandbytes::dequantize_4bit.outc                     t        j                  j                  k(  fd       t        j                  j                  k(  fd       t	        | |||       y )Nc                  (    d d j                    S r   r-   )r   r   s   r   r$   z_.<locals>.<lambda>w  r   r   c                  (    d  dj                    S r   rV   r   s   r   r$   z_.<locals>.<lambda>x  r   r   r   )r   r3   r   r   r   r   s       ```r   r   r   m  sG     
LLe#%^_	LLe#%^_!VY
EsKr   c           
         t         rt        j                  |dv        nt        j                  |dv        t        j                  |dv        t        j                  t        j                  t        j                  t        j
                  fv fd       t        |       5  d t        |       t        |      t        |      t        j                  |      t        j                  |j                               t        |       f}|j                  t        j                  k(  r+|dk(  rt        j                  |  nt        j                  |  n|j                  t        j                  k(  r+|dk(  rt        j                   |  nZt        j"                  |  nG|j                  t        j
                  k(  r*|dk(  rt        j$                  |  nt        j&                  |  d d d        y # 1 sw Y   y xY w)Nr   r   r   c                      d  S )NzFBlockwise 4bit dequantization only supports 16/32-bit floats, but got r#   rV   s   r   r$   z'_dequantize_4bit_impl.<locals>.<lambda>  s    XY^X_` r   r   )r   r   r3   r   r^   r\   r   r	   r=   r   r   r   r   r   cdequantize_blockwise_bf16_fp4cdequantize_blockwise_bf16_nf4cdequantize_blockwise_fp16_fp4cdequantize_blockwise_fp16_nf4cdequantize_blockwise_fp32_fp4cdequantize_blockwise_fp32_nf4)r   r   r   r   r   r   r   s       `  r   r   r   |  si    Y"CCDY"GGH	LL~-.	LL%..%--??`
 
	 :AJFOCLHHYHHSYY[!q!
 99&U"22D922D9YY%--'U"22D922D9YY%--'U"22D922D93: : :s   !D9G##G,zbitsandbytes::gemv_4bitr+   c           	          g | j                   d d |d   }t        j                  || j                  | j                        }t        | ||||||       |S )Nr   r   r   r   )r   r   r   r   r   _gemv_4bit_impl)r   r   r+   r   r   r   r   r   s           r   r   r     sT     'aggcrl&F1I&E
++eAHHAGG
<CAq&&$	sCJr   zbitsandbytes::gemv_4bit.outc           	      
    t        j                  j                  g  j                  d d d   k(   fd       t        j                  j                   j                  k(   fd       t	         ||||       y )Nr   r   c                  R    dg  j                   d d d    dj                    S )Nr   r   r   r   r-   )r   r   r+   s   r   r$   z_.<locals>.<lambda>  s3    ()C1773B<)C)C(DF399+V r   c                  <    d j                    dj                    S r   rV   r   s   r   r$   z_.<locals>.<lambda>  r   r   r   )r   r3   r   r   r   )r   r   r+   r   r   r   r   s   ` `   `r   r   r     sk     
LL		/qwws|/VAY//V 
LLagg%'bcAq&&$	sCr   c                 ^   t        j                  |       t        j                  |d         }t        j                  d      }t        j                  |d         }	|}
t        j                  | j                  d   dz   dz        }|}t        |       }t        |       5  | j                  t         j                  k(  rbt        j                  |||	t        |       t        |      t        |      t        |      t        |      |
||t        j                  |      |       n| j                  t         j                  k(  rbt        j                  |||	t        |       t        |      t        |      t        |      t        |      |
||t        j                  |      |       n~| j                  t         j                  k(  rat        j                  |||	t        |       t        |      t        |      t        |      t        |      |
||t        j                  |      |       d d d        y # 1 sw Y   y xY w)Nr   rl   r   r'   )r   r   r=   r>   r   r   r   r   r^   r   cgemm_4bit_inference_naive_fp16r	   r   cgemm_4bit_inference_naive_bf16r\   cgemm_4bit_inference_naive_fp32)r   r   r+   r   r   r   r   rC   rD   rB   rE   rF   rG   rN   s                 r   r   r     s    
#$ 	

6!9A


1A


6!9A
C
**aggbkAo!+
,C
C"F		 077emm#//



9% WW&//



9% WW%//



9%E0 0 0s   E=H##H,)adammomentumrmsproplionadagradlambademamix)r   r   r   r   r   r   optimizer_namegpstate1state2	unorm_vec	max_unorm
param_normbeta1beta2beta3alphaepsweight_decaysteplrgnorm_scalec                    t         j                  | d       }|,t        d|  dt        t        j                                      |j                  t        j                  k(  r|d   }ny|j                  t        j                  k(  r|d   }nV|j                  t        j                  k(  rt        |      dk(  r|d   }n%t        d|j                   d|j                         t        |      5   |t        |      t        |      t        |      t        |      t        |      t        j                  |      t        j                  |      t        j                  |      t        j                  |	      t        j                  |
      t        j                  |      t        j                  |      t        j                  |      t        j                   |      t        j                  |      t        j                  |      t        j"                  |      t        j                   |j%                                      d d d        y # 1 sw Y   y xY w)	NUnsupported optimizer name: . Supported optimizers: r   rl   r
   r'   zAGradient+optimizer bit data type combination not supported: grad z, optimizer )str2optimizer32bitgetr   liststr2optimizer8bit_blockwisekeysr   r   r\   r^   r   lenr   r	   r=   rs   r>   c_boolr   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   
skip_zeros	optim_fns
optim_funcs                       r   _optimizer_update_32bit_implr  e  s   ( #&&~t<I*>*::RSWXsXxXxXzS{R|}
 	
 	ww%--q\
	
EMM	!q\
	
ENN	"s9~':q\
OPQPWPWyXdekeqeqdrs
 	
 
	 
AJAJFOFOIJJy!JJz"JJuJJuJJuJJuJJsOJJ|$JJtJJrNJJ{#IIj!JJqwwy!%	

 
 
s   ,EII
qmap1qmap2absmax1absmax2c                 P   t         j                  |       }|,t        d|  dt        t         j	                                      |j
                  t        j                  k(  r|d   }n_|j
                  t        j                  k(  r|d   }n<|j
                  t        j                  k(  r|d   }nt        d|j
                   d      t        |      5   |t        |      t        |      t        |      t        |      t        j                  |      t        j                  |      t        j                  |      t        j                  |      t        j                  |	      t        j                  |
      t        j                  |      t        |      t        |      t        |      t        |      t        j                  |      t        j                  |      t        j                  |      t        j                  |j!                                      d d d        y # 1 sw Y   y xY w)Nr   r   r   rl   r'   zUnsupported gradient dtype: z@. Supported dtypes: torch.float32, torch.float16, torch.bfloat16)r   r   r   r   r   r   r   r\   r^   r   r   r	   r=   rs   r>   r  r   )r   r   r   r   r   r   r   r   r   r   r   r   r  r  r	  r
  r   r   r  optimizer_fnsoptimizer_fns                        r   %_optimizer_update_8bit_blockwise_implr    s   h 033NCM*>*::RSWXsXxXxXzS{R|}
 	
 	ww%--$Q'	
EMM	!$Q'	
ENN	"$Q'*177)3st
 	
 
	 
AJAJFOFOJJuJJuJJuJJuJJsOJJtJJrNENENGGJJ|$JJ{#IIj!JJqwwy!'	

 
 
s   EHH%z-bitsandbytes::optimizer_update_8bit_blockwisez$bitsandbytes::optimizer_update_32bit)NN)ri   )F)Dcollections.abcr   ctypesr=   mathr   typingr   r   bitsandbytes.functionalr   r   r   r	   _opsr   
cextensionr   r   Tensorr   r   r   tupler   intr   strr   r   cadam32bit_grad_fp32cadam32bit_grad_fp16cadam32bit_grad_bf16cmomentum32bit_grad_32cmomentum32bit_grad_16crmsprop32bit_grad_32crmsprop32bit_grad_16clion32bit_grad_fp32clion32bit_grad_fp16clion32bit_grad_bf16cadagrad32bit_grad_32cadagrad32bit_grad_16cademamix32bit_grad_fp32cademamix32bit_grad_fp16cademamix32bit_grad_bf16r   cadam_8bit_blockwise_grad_fp32cadam_8bit_blockwise_grad_fp16cadam_8bit_blockwise_grad_bf16"cmomentum_8bit_blockwise_grad_fp32"cmomentum_8bit_blockwise_grad_fp16"cmomentum_8bit_blockwise_grad_bf16!crmsprop_8bit_blockwise_grad_fp32!crmsprop_8bit_blockwise_grad_fp16!crmsprop_8bit_blockwise_grad_bf16clion_8bit_blockwise_grad_fp32clion_8bit_blockwise_grad_fp16clion_8bit_blockwise_grad_bf16!cadagrad_8bit_blockwise_grad_fp32!cadagrad_8bit_blockwise_grad_fp16!cadagrad_8bit_blockwise_grad_bf16"cademamix_8bit_blockwise_grad_fp32"cademamix_8bit_blockwise_grad_fp16"cademamix_8bit_blockwise_grad_bf16r   r7   r  r  r#   r   r   <module>r;     s\
   $     ` ` # . 3V</ /%,, / =/
 7@( (%,, (U\\ ( A(< < <ELL <~ 0&9
 $(#'#*||#*||#* ||#* EKK 	#*
 5<<
 #* \\#* :#*L 6?&, &, @&,R 2F; V||V 5<<u||U\\8ELLCYYZV <V* #||# 5<<%,,//0#* 3V<" "U\\ "c "eELLRWR^R^D^>_ " ="J 5v> u|| 5<< C X]XcXc hmhtht  ? 96B
K||
KLL
K ,,
K 	
K
 ;;
K 

K 

K C
K2||2"\\2162JM2V[VaVa2hmhtht2	2B .7-||- #-14-EJ[[-
5<<%&- 8-` 0&9
||
LL
 
 	

 C=
 ;;
 \\
 :
 4f=L||LLLL L 	L
 C=L ;;L 
L 
L >L,:||,:LL,: ,: 	,:
 ;;,: 
,: 
,:^ *F3||.6smEJ\\Y^YeYeru
\\ 4 .7D||D||D SMD LL	D
 ,,D D 
D 
D 8D"U||U||U SMU LL	U
 ,,U U 
U 
Up ! 	       	""""
 	!!!!
 	       	!!!!
 	       	$$$$$$9! J 	****** 	...... 	------ 	****** 	------ 	......5 h %8
8
||8
 ||8
 LL	8

 U\\"8
 %8
 8
 8
 8
 8
 8
 8
 
8
 8
 8
  	!8
" #8
& 
'8
\ 'Z
Z
||Z
 ||Z
 LL	Z

 U\\"Z
 Z
 Z
 Z
 Z
 
Z
 Z
 	Z
 <<Z
 ELL!Z
 \\Z
  ell#!Z
" #Z
$ %Z
( 
)Z
z I? HIn o ?6 ?@\ ]r   