
    bi?              (       (   d dl Z d dlmZ d dlZd dlZd dlmZ ddlm	Z	m
Z
mZmZ d ZdZdZdZdZdZeeeeeed	Zej(                  d
ej*                  dej*                  dej*                  dej*                  dej*                  dej*                  dej*                  dej*                  fd       Zej(                  d
ej*                  dej*                  dej*                  dej*                  dej*                  dej*                  dej*                  fd       Zej(                  dej*                  d
ej*                  dej*                  dej*                  dej*                  dej*                  dej*                  dej*                  dej*                  fd       Zej(                  dej*                  d
ej*                  dej*                  dej*                  dej*                  dej*                  dej*                  dej*                  dej*                  fd       ZeedeedeedeedeedeeddZ	 	 d8dedej8                  dej8                  dej8                  deej8                     deej8                     deded
eded ed!ededed"ed#eded$df$d%Zej@                  d&ej8                  d'ej8                  d(ej8                  d)ed*ejB                  d$ej8                  fd+       Z"ej@                  d&ej8                  d(ej8                  d)ed$e#ej8                  ej8                  f   fd,       Z$dej8                  dej8                  dej8                  deej8                     d
eded ed!eded"ed#ed-ej8                  d.eej8                     d/ej8                  d0eej8                     deded1e%ded$df(d2Z&dej8                  dej8                  dej8                  deej8                     d
eded ed!eded"ed#ed-ej8                  d.eej8                     d/ej8                  d0eej8                     deded1e%ded$df(d3Z'ej(                  d
ej*                  dej*                  dej*                  d4ej*                  dej*                  dej*                  fd5       Z(ej(                  d
ej*                  dej*                  dej*                  dej*                  dej*                  d4ej*                  dej*                  dej*                  fd6       Z)e(e(e(e)e(e)d	Z*	 	 	 d9dedej8                  dej8                  dej8                  deej8                     d
eded ed!eded"ed#ed-ej8                  d.eej8                     d/ej8                  d0eej8                     deded$df&d7Z+e+Z+y):    N)Optional   )dequant_8bit_blockwise"dequant_8bit_blockwise_kernel_util#quantize_8bit_blockwise_kernel_utilquantize_blockwise_triton            )momentumrmspropadagradadamlionademamixbeta1beta2epsweight_decaygnorm_scaleOPTIMIZER_ID
BLOCK_SIZEN_PER_THc                 z   t        j                  d      }||z  }||z  t        j                  d||z        z   }||k  }t        j                  | |z   |d      }t        j                  ||z   |d      }t        j                  ||z   |d      }||z  }dd|
z
  z  }dd|z
  z  }|dk(  rJ||z  d|z
  |z  z   }||z  d|z
  |z  |z  z   }||z  }||z  }|t        j                  |      |z   z  }||z  }n|dk(  r|}t        j
                  t        j                  |d            }t        j                  ||       y)	zBPreprocessing optimizer, computing update norm (2-state optimizer)r   axis        maskother      ?r
   r   Ntl
program_idarangeloadsqrtsumwhere
atomic_add)g_ptrp_ptr
state1_ptr
state2_ptr	unorm_ptrr   r   r   r   step
beta1_step
beta2_steplrr   
n_elementsr   r   r   pidblock_start_idxoffsetsr    g_valss1_valss2_valscorrection1correction2update_valsupdate_norm
total_norms                                 e/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/bitsandbytes/backends/triton/kernels_optim.py$_optimizer_precondition_2state_32bitrB   #   sW   , --Q
CHnO
*RYYq*x:O-PPGZDWWUW_4s;Fggj7*SAGggj7*SAG6!Fz)*Kz)*KqE/S5[F$::E/S5[F$:V$CCK'K'!1C!78!K/		{C89JMM)Z(    c                    t        j                  d      }||z  }||z  t        j                  d||z        z   }||k  }t        j                  | |z   |d      }t        j                  ||z   |d      }||z  }|dk(  r|	dk(  r|}n||z  |z   }||z  }nz|dk(  r||z  d|z
  |z  z   }|}nd|dk(  r2||z  d|z
  |z  |z  z   }|t        j                  |      |z   z  }||z  }n-|dk(  r(|||z  z   }|t        j                  |      |z   z  }||z  }t        j
                  t        j                  |d            }t        j                  ||       y	)
zBPreprocessing optimizer, computing update norm (1-state optimizer)r   r   r   r   r   r   r"   r	   Nr#   )r,   r-   r.   r/   r0   r   r   r   r   r1   r2   r3   r4   r   r5   r   r   r   r6   r7   r8   r    r9   r:   r?   r>   r@   s                              rA   $_optimizer_precondition_1state_32bitrE   Z   sr   , --Q
CHnO
*RYYq*x:O-PPGZDWWUW_4s;Fggj7*SAG6!Fq19Go.G'		E/S5[F$::		E/S5[F$:V$CC 03 67!K/		FVO+ 03 67!K/{C89JMM)Z(rC   	max_unormc                    t        j                  d      }||z  }||z  t        j                  d||z        z   }||k  }t        j                  | |z   |d      j	                  t         j
                        }t        j                  ||z   |d      j	                  t         j
                        }t        j                  ||z   |d      }t        j                  ||z   |d      }|dk(  rt        j                  ||z   |z   |d      }||z  }d} |dkD  r8t        j                  t        j                  |            }!|!||z  kD  r||z  |!z  } |dk(  r||z  d|z
  |z  z   }||z  d|z
  |z  |z  z   }d|z
  }"t        j                  d|z
        }#| |#z  |"z  }$|dkD  r|d||z  z
  z  }| |$z  |t        j                  |      ||#z  z   z  z  }%||%z   }n|dk(  r||z  d|z
  |z  z   }|	z  d|	z
  |z  z   }||z  d|z
  |z  |z  z   }d|z
  }"t        j                  d|z
        }#|dkD  r|d||z  z
  z  }||"z  |
|z  z   }&t        j                  |      |#z  |z   }'|||&|'z  z  z
  }t        j                  ||z   ||       t        j                  ||z   ||       t        j                  ||z   ||       |dk(  rt        j                  ||z   |z   |       y	y	)
z2-state optimizer kernelr   r   r   r   r   r"   r
   r    N)r$   r%   r&   r'   tofloat32r(   store)(r,   r-   r.   r/   r0   rF   
param_normr   r   beta3alphar   r   r1   r2   r3   r4   r   
skip_zerosr5   r   r   r   r6   r7   r8   r    r9   p_valsr:   r;   s3_valsupdate_scalecurrent_unormr<   r=   	step_size
update_valmixed_momentumadaptive_terms(                                           rA   ,_optimizer_update_2state_32bit_triton_kernelrX      s   6 --Q
CHnO
*RYYq*x:O-PPGZDWWUW_4s;>>rzzJFWWUW_4s;>>rzzJFggj7*SAGggj7*SAGq''*z1G;$cR6!FL3	 239z11%
2mCLqE/S5[F$::E/S5[F$:V$CCJ&ggcJ./C+%3	#sR,%667F!I-BGGG<LsU`O`<`1ab
*$		E/S5[F$::E/S5[F$::E/S5[F$:V$CCJ&ggcJ./#sR,%667F!K/EGOD)K73>" >??HHUW_f40HHZ'!76HHZ'!76q
j(72G$G rC   c           
         t        j                  d      }||z  }||z  t        j                  d||z        z   }||k  }t        j                  | |z   |d      j	                  t         j
                        }t        j                  ||z   |d      j	                  t         j
                        }t        j                  ||z   |d      }||z  }|dkD  r|||z  z   }d}|dkD  r>t        j                  t        j                  |            }|||z  |z   kD  r||z  |z   |z  }|dk(  r|dk(  r|}n||z  |z   }|| |z  z  } || z   }n|dk(  rZ||z  d|z
  |z  z   }!||z  t        j                  |!dkD  dt        j                  |!dk  dd            z  } || z
  }||z  d|z
  |z  z   }nm|dk(  r8||z  d|z
  |z  |z  z   }||z  |z  t        j                  |      |z   z  } || z
  }n0|d	k(  r+|||z  z   }||z  t        j                  |      |z   z  } || z
  }t        j                  ||z   ||
       t        j                  ||z   ||
       y)z1-state optimizer kernelr   r   r   r   r"   r   r         r	   rH   N)	r$   r%   r&   r'   rI   rJ   r(   r*   rK   )"r,   r-   r.   r/   r0   rF   rL   r   r   rM   rN   r   r   r1   r2   r3   r4   r   rO   r5   r   r   r   r6   r7   r8   r    r9   rP   r:   rR   rS   rU   momentum_updates"                                     rA   ,_optimizer_update_1state_32bit_triton_kernelr\      sw   6 --Q
CHnO
*RYYq*x:O-PPGZDWWUW_4s;>>rzzJFWWUW_4s;>>rzzJFggj7*SAG6!Fc&<//L3	 239z1C77%
2S8MILq19Go.G!bS7]3
*$		!E/S5[F,BB!B&/A2EsBHHUdghUhjnpsLt)uu
*$E/S5[F$::		E/S5[F$:V$CC!B&/27773Cc3IJ
*$		FVO+&[BGGG$4s$:;
*$HHUW_f40HHZ'!76rC   )
preprocessupdate)r   r   r   r   r   r   optimizer_namegpstate1state2	unorm_vecrL   rM   rN   r1   r4   returnc                    |rt        d      d}d}t        j                  |j                         ||z        f}t        |    }t
        |    d   }t
        |    d   }||z  }|	|z  }| dk(  rt ||   |||||||||	|
||||||||||j                         |||d       |d	kD  r=|j                           ||   |||||||	||||||||j                         |||d       y
y
|d	kD  r<|j                           ||   |||||||	||||||||j                         |||d        ||   |||||||||	|
||||||||||j                         |||d       y
)z0
    32-bit optimizer implemented by Triton
    &skip_zeros is not supported on XPU yet   r   r]   r^   r   r	   )	num_warpsr   N)NotImplementedErrortritoncdivnumelname2optimizer_idname2optimizer_32bit_fnzero_)r_   r`   ra   rb   rc   rd   rF   rL   r   r   rM   rN   r   r   r1   r4   r   rO   r   r   gridoptimizer_idfn_preprocess	fn_updater2   r3   s                             rA   optimizer_update_32bit_implru   S  s    . !"JKKJHKK	:#89;D$^4L+N;LIM'7AI JJ	$GGI1	
6 s?OOM$	' 2 s?OOM$	', 		$GGI1	
rC   Aabsmaxcode	blocksizedtypec                    | j                         dk(  rt        j                  | |      S | j                         }|j                         }|j	                  | j
                        |j                            j	                  |      }t        j                  ||z        }||z  |z
  }	|	dkD  r,t        j                  j                  j                  |d|	f      }|j                  ||      }
|
|j                  d      j	                  |      z  }|j                         }|	dkD  r|d|	  }|j                  | j                        S )zN
    Pure PyTorch reference implementation for block-wise dequantization.
    r   rz   r   N)rm   torch
empty_likeflattenrI   devicelongmathceilnn
functionalpadreshape	unsqueezeshape)rv   rw   rx   ry   rz   A_flatnum_elementsdequantized_flat
num_blockspad_lendequantized_blocksrescaled_blocksrescaled_flats                rA   _dequantize_blockwise_pytorchr     s    	wwyA~//YY[F<<>Lwwqxx(7::5A<)34J9$|3G{ 88..223Ca\R)11*iH(6+;+;A+>+A+A%+HHO#++-M{%ix0  ))rC   c                    | j                         dk(  rUt        j                  | t        j                        t        j                  dt        j
                  | j                        fS | j                         }|j                         }t        j                  ||z        }||z  |z
  }|dkD  r,t        j                  j                  j                  |d|f      }|j                  ||      }t        j                  t        j                  |      dd      d   }d||dk(  <   ||z  }	t        j                  |	j!                  d      |j#                  | j                        z
        }
t        j$                  |
d	      j#                  t        j                        }|j                         }|dkD  r|d
|  }|j                  | j&                        |j                         fS )zL
    Pure PyTorch reference implementation for block-wise quantization.
    r   r|   )rz   r   r   T)dimkeepdimr"   r	   )r   N)rm   r}   r~   uint8emptyrJ   r   r   r   r   r   r   r   r   maxabsr   rI   argminr   )rv   rx   ry   r   r   r   r   A_blocksrw   scaled_blocksdiffquantized_indicesquantized_flats                rA   _quantize_blockwise_pytorchr   
  s}    	wwyA~5u{{1EMMbcbjbj7kkkYY[F<<>L<)34J9$|3G{$$((!W>~~j)4HYYuyy*4@CFF6Q;v%M 99],,Q/$''!((2CCDDTq144U[[A&..0N{'	'2!!!''*FNN,<<<rC   qmap1qmap2absmax1absmax2rO   c                   |rt        d      d}t        j                         5  |dk(  rm|j                  dk(  r^t	        |d   |d   ||t        j
                        }t	        |d   |d   ||t        j
                        }t        j                  ||g      }nt	        ||||t        j
                        }d}|t	        ||||t        j
                        }|j                         |z  }| j                  j                         }|dk(  r|j                  |      j                  |d	|z
  
       |j                  |      j                  ||d	|z
         d	||	z  z
  }d	||	z  z
  }|j                         t        j                  |      z  j                  |      }|dkD  r|j                  d	|
|z  z
         |j                  |||
 |z         n|dk(  r
|d   |d   }}|}|j                  |      j                  |d	|z
  
       |j                  |      j                  |d	|z
  
       |j                  |      j                  ||d	|z
         d	||	z  z
  }t        j                  d	||	z  z
        }||z  ||z  z   |j                         |z  |z   z  } |dkD  r|j                  d	|
|z  z
         |j                  | |
 
       t        j                  ||g      }n|dk(  r`|j                  ||
       |	dk(  r|j                  |       n |j                  |      j                  |       |j                  ||
 
       nt|dk(  rm|j                  ||
       |j                  |      j                  ||d	|z
         |j                  ||j                         j                  |      |
        n|dk(  r|dkD  r|j                  d	|
|z  z
         t        j                   |j#                  |      |j#                  d	|z
        z         }!|j                  |!|
 
       |j                  |      j                  |d	|z
  
       nn|dk(  rZ|j                  ||
       |j                  ||d	       |j                  ||j                         j                  |      |
        nt%        d| d      | j                  j                  |       |dk(  rt'        |d   ||      \  }"}#t'        |d   ||      \  }$}%|d   j                  |"       |d   j                  |$       |d   j                  |#       |d   j                  |%       t'        |||      \  }&}'|j                  |&       |j                  |'       nft'        |||      \  }(})|j                  |(       |j                  |)       |2t'        |||      \  }&}'|j                  |&       |j                  |'       ddd       y# 1 sw Y   yxY w)
    Pure PyTorch implementation of the 8-bit block-wise optimizer update step.
    This version ensures high-precision updates for float16 parameters.
    z'skip_zeros is not supported on XPU yet.rh   r   r	   r   r   Nr   r"   rN   valuer   r   r   r   r   +Pure PyTorch implementation for optimizer '' is not available.)
ValueErrorr}   no_gradndimr   rJ   stackfloatdatamul_add_addcmul_r(   r   addcdiv_copy_signmulrj   r   )*ra   r`   rb   rc   r   r   rM   rN   r   r1   r4   r   r   r   r   r   r   rO   r_   ry   	s1_1_fp32	s1_2_fp32state1_fp32state2_fp32gradp_fp32bias_correction1bias_correction2denomm1_fp32m2_fp32nu_fp32r^   
update_dirnew_m1_8bitnew_absmax_m1new_m2_8bitnew_absmax_m2new_state2_8bitnew_absmax2new_state1_8bitnew_absmax1s*                                             rA   'optimizer_update_8bit_blockwise_pytorchr   2  s   6 BCCI	 j+Z'GLLA,=5fQiUT]_d_l_lmI5fQiUT]_d_l_lmI++y)&<=K7PY[`[h[hiK7PY[`[h[hiKwwy;& V#U#((S5[(AU#,,T4sU{,K"UD[0"UD[0 %%'$))4D*EEKKCPEc!C"|"334OOKrc<L6LOMz)*1~{1~WG!GLL$$Tu$=LL$$Tu$=LL((t3;(G"UD[0#yyud{):; 0057?Bw||~XhGhknGnoFc!C"|"334KKrcK*++w&89Kz)IIfLI1qy!!$'  ',,T2KKB3K/y(IIfLI1U#,,T4sU{,KOOD+"2"2"4"9"9#">rcOJv%c!C"|"334KOOE$:TXXcEk=R$RSJKK
2#K.U#((S5[(Ay(IIfLI1  t3 7OOD+"2"2"4"9"9#">rcOJ &=n=MM`a 
 	
V Z')D[QR^UZ\e)f&K)D[QR^UZ\e)f&K1IOOK(1IOOK(AJ]+AJ]++F{TY[d+e(O[LL)MM+&+F{TY[d+e(O[LL)MM+&&/J;X]_h/i,_-k*Uj+ j+ j+s   WW..W7c                   |rt        j                  |      syd}|j                         |z  }t        j                         5  | j                  j                         }|dk(  ro|j
                  dk(  r`t        |d   |d   ||t         j                        }t        |d   |d   ||t         j                        }t        j                  ||g      }nt        ||||t         j                        }d}|t        ||||t         j                        }|dk(  r|d	kD  r|j                  d
|
|z  z
         |j                  |      j                  |d
|z
         |j                  |      j                  ||d
|z
         d
||	z  z
  }d
||	z  z
  }|j                         t        j                  |      z  j                  |      }|j                  |||
 |z         n|dk(  r
|d   |d   }}|}|j                  |      j                  |d
|z
         |j                  |      j                  |d
|z
         |j                  |      j                  ||d
|z
         d
||	z  z
  }t        j                  d
||	z  z
        }||z  ||z  z   |j                         |z  |z   z  } |d	kD  r|j                  d
|
|z  z
         |j                  | |
        t        j                  ||g      }n|dk(  r`|j                  ||       |	dk(  r|j                  |       n |j                  |      j                  |       |j                  ||
        nt|dk(  rm|j                  ||       |j                  |      j                  ||d
|z
         |j                  ||j                         j                  |      |
        n|dk(  r|d	kD  r|j                  d
|
|z  z
         t        j                   |j#                  |      |j#                  d
|z
        z         }!|j                  |!|
        |j                  |      j                  |d
|z
         nn|dk(  rZ|j                  ||       |j                  ||d
       |j                  ||j                         j                  |      |
        nt%        d| d      | j                  j                  |       |dk(  rt'        |d   ||      \  }"}#t'        |d   ||      \  }$}%|d   j                  |"       |d   j                  |$       |d   j                  |#       |d   j                  |%       t'        |||      \  }&}'|j                  |&       |j                  |'       nft'        |||      \  }(})|j                  |(       |j                  |)       |2t'        |||      \  }&}'|j                  |&       |j                  |'       ddd       y# 1 sw Y   yxY w)r   Nrh   r   r	   r   r|   r   r   r   r"   r   r   r   r   r   r   r   r   )r}   anyr   r   r   r   r   rJ   r   r   r   r   r(   r   r   r   r   r   rj   r   )*ra   r`   rb   rc   r   r   rM   rN   r   r1   r4   r   r   r   r   r   r   rO   r_   ry   r   r   r   r   r   r   r   r   r   r   r   r   r^   r   r   r   r   r   r   r   r   r   s*                                             rA   ,optimizer_update_8bit_blockwise_triton_quantr     s   6 %))A,I779{"D	 i+ Z'GLLA,=.vay'!*eY^c^k^klI.vay'!*eY^c^k^klI++y)&<=K0%Z_ZgZghK0%Z_ZgZghK V#c!C"|"334U#((S5[(AU#,,T4sU{,K"UD[0"UD[0 %%'$))4D*EEKKCPEOOKrc<L6LOMz)*1~{1~WG!GLL$$Tu$=LL$$Tu$=LL((t3;(G"UD[0#yyud{):; 0057?Bw||~XhGhknGnoFc!C"|"334KKrcK*++w&89Kz)IIfLI1qy!!$'  ',,T2KKB3K/y(IIfLI1U#,,T4sU{,KOOD+"2"2"4"9"9#">rcOJv%c!C"|"334KOOE$:TXXcEk=R$RSJKK
2#K.U#((S5[(Ay(IIfLI1  t3 7OOD+"2"2"4"9"9#">rcOJ &=n=MM`a 
 	
V Z')B;q>SXZc)d&K)B;q>SXZc)d&K1IOOK(1IOOK(AJ]+AJ]++D[RWYb+c(O[LL)MM+&+D[RWYb+c(O[LL)MM+&&/HV[]f/g,_-k*Si+ i+ i+s   V2W==XBLOCK_SIZE_Nc           	         t        j                  d      }||z  }||z  t        j                  d||z        z   }||k  }t        j                  ||z   |d      j	                  t         j
                        |z  }t        j                  | |z   |d      j	                  t         j
                        }t        ||||||      }|dkD  r|dk(  r|d||z  z
  z  }n|dkD  r|||z  z  }|dk(  r|	dk(  r|}n||z  |z   }|||z  z  }n|dk(  r3||z  d|z
  |z  |z  z   }|||t        j                  |      |z   z  z  z  }n|dk(  r*|||z  z  }|||t        j                  |      |z   z  z  z  }n[|dk(  rV||z  d|z
  |z  z   }t        j                  |dkD  dt        j                  |dk  d	d            }|||z  z  }||z  d|z
  |z  z   }t        j                  | |z   |j	                  | j                  j                        |
       t        ||d||      \  } }!t        j                  ||z   | |
       t        j                  ||z   t        j                  d|      z   |!       y)zy
    Triton kernel for 8-bit optimizers that use one momentum state.
    Supports: Momentum, RMSprop, Adagrad, Lion.
    r   r   r   r   r	   r"   r   r   rZ   rH   rh   N)r$   r%   r&   r'   rI   rJ   r   r(   r*   rK   rz   
element_tyr   )"r-   r,   r.   r/   r   r   rM   rN   r   r1   r2   r3   r4   	qmap1_ptr	qmap2_ptrabsmax1_ptrabsmax2_ptrr   r   r5   r   r   r   r6   r7   r8   r    r`   ra   s1valr^   s1_codesr   s"                                     rA   5_optimizer_update_1state_8bit_blockwise_triton_kernelr   W  s~   B --Q
CHnO,ryyL8<S/TTGZD 	d#699"**ESA
d#699"**EA	+JKY]_k	lB cla/	S2$$$			Q q19BeaB	R"W 
	%Z3;!+a//	R1c)*++ 
	
a!e	R1c)*++ 
	5jC%K1,,#)S"((39dC*HI	R&[%Z3;!++ HHUW_add5;;#9#9:F?IsT`bjkHkHHZ'!8$7HH[?*RYYq(-CC[QrC   c                    t        j                  d      }||z  }||z  t        j                  d||z        z   }||k  }t        j                  ||z   |d      j	                  t         j
                        |z  }t        j                  | |z   |d      j	                  t         j
                        }|dk(  rt        ||||||      }t        ||||||      }||z  d|z
  |z  z   }||z  d|z
  |z  |z  z   }d|
z
  }d|z
  } |dkD  r|d||z  z
  z  }t        j                  |      t        j                  |       z  |z   }!|||z  ||!z  z  z  }t        j                  | |z   |j	                  | j                  j                        |       t        ||d||      \  }"}#t        j                  ||z   |"|       t        j                  ||z   t        j                  d|      z   |#       t        ||d||      \  }$}%t        j                  ||z   |$|       t        j                  ||z   t        j                  d|      z   |%       y
|d	k(  r!t        ||||||      }&t        ||z   |||||z  z   ||      }'t        ||||||      }(|&|z  d|z
  |z  z   }&|'|z  d|z
  |z  z   }'|(|z  d|z
  |z  |z  z   }(d|
z
  }t        j                  d|z
        } |&|z  ||'z  z   t        j                  |(      | z  |z   z  })|dkD  r|d||z  z
  z  }|||)z  z  }t        j                  | |z   |j	                  | j                  j                        |       t        |&|d||      \  }*}+t        j                  ||z   |*|       t        j                  ||z   t        j                  d|      z   |+       t        |'|d||      \  },}-t        j                  ||z   |z   |,|       t        j                  ||z   t        j                  d|      z   ||z  z   |-       t        |(|d||      \  }.}/t        j                  ||z   |.|       t        j                  ||z   t        j                  d|      z   |/       y
y
)zh
    Triton kernel for 8-bit optimizers that use two momentum states.
    Supports: Adam, AdEMAMix.
    r   r   r   r   r
   r"   rH   rh   r   N)r$   r%   r&   r'   rI   rJ   r   r(   rK   rz   r   r   )0r-   r,   r.   r/   r   r   rM   rN   r   r1   r2   r3   r4   r   r   r   r   r   r   r5   r   r   r   r6   r7   r8   r    r`   ra   r   s2r   r   r   r   r   s2_codesr   m1m2nur^   m1_codesr   m2_codesr   nu_codesnew_absmax_nus0                                                   rA   5_optimizer_update_2state_8bit_blockwise_triton_kernelr     s   F --Q
CHnO,ryyL8<S/TTGZD 	d#699"**ESA
d#699"**EA q/
GYP[]acop/
GYP[]acop%Z3;!++%Z3;!+a// ++#rL(((Abgg&677#=	b##U
33 	!$$u{{'='=">TJ !DB	SVXdfn o+
g%xd;
.1h1GGU CB	SVXdfn o+
g%xd;
.1h1GGU		/
GYP[]acop/#*44
 0
GYP[]acop%Z3;!++%Z3;!++%Z3;!+a// +773#34''%"*4GW9WZ]9]^#rL(((A	R&[ 	!$$u{{'='=">TJ #Fb)UXZfhp"q-
g%xd;
.1h1GGW"Eb)UXZfhp"q-
j(72H4H
/)BIIa,BBZS_E__	

 #Fb)UXZfhp"q-
g%xd;
.1h1GGWa 
rC   c                    |rt        d      | dk(  rz|j                         dk  s|j                  d   dk7  rt        d|j                         |j                         dk  s|j                  d   dk7  rt        d|j                         d}d}t	        j
                  |j                         ||z        f}t        |    }t        |    }||
z  }||
z  } ||   |||||||||	|
||||||||||j                         |||d	       y )
Nrg   r   r	   r   zIFor ademamix, state1 must be a stacked tensor of shape (2, ...), but got zJFor ademamix, absmax1 must be a stacked tensor of shape (2, ...), but got rh   r   )r   r   r   ri   )	rj   r   r   r   rk   rl   rm   name2optimizer_fnrn   )r_   r`   ra   rb   rc   r   r   rM   rN   r   r1   r4   r   r   r   r   r   r   rO   r   r   rq   fnrr   r2   r3   s                             rA   $optimizer_update_8bit_blockwise_implr   4  sD   * !"JKK#::<!v||A!3[\b\h\h[ij  ;;=1a 0A 5\]d]j]j\kl  JHKK	:#89;D	>	*B$^4L JJBtH		
		!1rC   )r"   F)r   r"   F),r   typingr   r}   rk   triton.languagelanguager$   kernels_8bit_quantr   r   r   r   MOMENTUMRMSPROPADAGRADADAMLIONADEMAMIXrn   jit	constexprrB   rE   rX   r\   ro   strTensorr   intru   compilerz   r   tupler   boolr   r   r   r   r   r    rC   rA   <module>r      s	         

   3) <<3) <<3) 
3) ,,3) 3)  ,,!3)" #3)$ ll%3) 3)l 6) <<6) <<6) 
6) 6)  ,,!6)" #6)$ ll%6) 6)r QH ||QH <<QH <<QH 
QH ,,QH$ %QH* ,,+QH, -QH. ll/QH QHh J7 ||J7 <<J7 <<J7 
J7 ,,J7$ %J7* ,,+J7, -J7. ll/J7 J7^ ;>
 ;>
 ;>
 ;>
 ;>
 ;>+ Z %L
L
||L
 ||L
 LL	L

 U\\"L
 %L
 L
 L
 L
 L
 L
 L
 
L
 L
 L
  	!L
" #L
& 
'L
h *||*LL* ,,* 	*
 ;;* \\* *D #=||#=
,,#= #= 5<<%&	#= #=NJ+||J+||J+ LLJ+ U\\"	J+
 J+ J+ J+ J+ 
J+ J+ 	J+ <<J+ ELL!J+ \\J+ ell#J+  !J+" #J+$ %J+* +J+, 
-J+fJ+||J+||J+ LLJ+ U\\"	J+
 J+ J+ J+ J+ 
J+ J+ 	J+ <<J+ ELL!J+ \\J+ ell#J+  !J+" #J+$ %J+* +J+, 
-J+d OR <<OR <<OR 
OR. ,,/OR0 ll1OR2 ,,3OR ORd |X <<|X <<|X 
|X* ,,+|X, -|X2 ,,3|X4 ll5|X6 ,,7|X |X@ FDDAAE 6 'GG||G ||G LL	G
 U\\"G G G G G 
G G 	G <<G ELL!G \\G  ell#!G" #G$ %G( 
)G\ (L $rC   