
    biF              &       	   d dl mZ d dlmZmZ d dlmZ d dlZddlm	Z	 ddl
mZ  e	d	d
      	 	 dOdej                  dej                  dej                  deej                     deej                     dej                  fd       Z e	dd
      	 	 dOdej                  dej                  dej                  dej                  dej                  deej                     deej                     deej                  eej                     f   fd       Z e	dd
      	 	 dOdej                  dej                  dej                  dej                  deej                     deej                     dej                  fd       Z e	dd
      dej                  dej                  fd       Z e	dd
      dej                  dej                  dej                  fd        ZdPdej                  dej                  deej                     fd!Z e	d"d
      dQdej                  fd#       Z e	d$d
      dej                  d%ej                  d&edeej                  ej                  f   fd'       Z e	d(d
      dej                  d)ej                  d%ej                  d&edej                  dej                  fd*       Z e	d+d
      dej                  d&ed,ed-ej                  deej                  ej                  f   f
d.       Z e	d/d
      dej                  d)ej                  d&ed,ed0ee   dej                  dej                  fd1       Z e	d2d
      dej                  dej                  d3ee   d)ej                  d%ej                  d&edej                  fd4       Zd Zd5ZdZdZd6Zd7Zeeeeeed8Zej4                  d9ej                  d:ej                  d;ej                  d<eej                     d=ej                  d>ed?ed@edAedBedCedDedEefdF       Zej4                  d9ej                  d:ej                  d;ej                  d<eej                     d=eej                     dGedHed>ed?edIedJed@edAedBedCedDedEef"dK       Z e	dLd
      	 	 dRdMed9ej                  d:ej                  d;ej                  d<eej                     d=eej                     dGedHed>ed?edIedJed@edAedBedCedDeddf$dN       Zy)S    )Sequence)prodsqrt)OptionalN   )register_kernel   )CODEzbitsandbytes::int8_mm_dequantdefaultA	row_stats	col_statsdtypebiasreturnc                 h    t        j                   j                  t         j                  k(   fd       t        j                  j                  t         j                  k(  fd       t        j                  j                  t         j                  k(  fd        j                  d j                  d         }j                  d      j                  d      j                  d      j                  d      |z  z  dz  }|||z  }|j                  |xs t         j                        S )Nc                  "    d j                    S )NzA must be int32, got r   r   s   \/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/bitsandbytes/backends/default/ops.py<lambda>z_.<locals>.<lambda>       3H	1R     c                  "    d j                    S )Nzrow_stats must be float32, got r   )r   s   r   r   z_.<locals>.<lambda>       =\]f]l]l\m;n r   c                  "    d j                    S )Nzcol_stats must be float32, got r   )r   s   r   r   z_.<locals>.<lambda>   r   r   r   g D1@?)torch_checkr   int32float32viewshapereshape	unsqueezetofloat16)r   r   r   r   r   A_calcouts   ```    r   _r*      s     
LLEKK')RS	LLEMM13no	LLEMM13noVVB$F!!"%//3I!!"%//2I
I	)
*\
9Ct66%(5==))r   z"bitsandbytes::int8_mixed_scaled_mmCACBSCASCBoutlier_colsc                 H   d }||j                         r| d d |f   j                         }t        j                  j                  j
                  j                  |d d |f   j                         |      j                  | j                        j                         }n,t        j                  d| j                  | j                        }t        j                  j                  j                  j                  |||||| j                        }	||	j                  ||      }	|	|fS )Nr   devicer   )r   r   )numel
contiguousr   opsbitsandbytesint8_vectorwise_dequantr   r&   r   temptyr2   int8_scaled_mmaddmm)
r   r+   r,   r-   r.   r/   r   subBsubAoutputs
             r   r*   r*   "   s     DL$6$6$8L!,,. II""::BB2aoCVCaCaCcehiR[QS 	 {{1QXXQWW= YY##22::2r3RV^_^e^e:fFdD)4<r   zbitsandbytes::int8_scaled_mmBc                     t         j                  j                  j                  j	                  | |      }t         j                  j                  j
                  j	                  ||||xs t         j                  |      S )N)r   r   )r   r5   r6   int8_linear_matmulr   int8_mm_dequantr'   )r   r?   r   r   r   r   out_i32s          r   r*   r*   I   sf     ii$$77??1EG99!!1199$u}} :  r   z bitsandbytes::int8_linear_matmulc                     t        | |      S N)_int8_linear_matmul_impl)r   r?   s     r   r*   r*   \   s    #Aq))r   z$bitsandbytes::int8_linear_matmul.outr)   c                 ~    t        j                  |j                  t         j                  k(         t	        | ||       y rE   )r   r   r   r    rF   )r   r?   r)   s      r   r*   r*   a   s'    	LLekk)*Q3'r   c                     t        j                  | j                         |j                         j                               j	                  t         j
                        }||j                  |      }|S rE   )r   matmulfloatr8   r&   r    copy_)r   r?   r)   results       r   rF   rF   g   sK    \\!'')QWWY[[]366u{{CF
6"Mr   z#bitsandbytes::int8_vectorwise_quantc                    t        | j                  d d       }d }d }|dkD  r| j                         |k\  }|j                         rMt	        j
                  |j                  d            j                  d      }| |   j                         }d| |<   n0t	        j                  d| j                  t        j                        }t	        j                  | j                         d      j                  j                         }t	        j                  | d|j                  d      z  z        j!                  t        j"                        }|dkD  r|	d|d d |f<   ||| <   |||fS )Nr           r   dimr1      g     _@)r   r#   absanyr   argwherer"   cloner9   r2   int64maxvaluesrJ   roundr%   r&   int8)r   	thresholdrowsr/   outlier_restoreoutliersr   out_rows           r   r*   r*   o   s2   DLO3557i'<<> !>>(,,1,*=>CCBGLk//1OAhK !;;qML 		!%%'q)00668I kk!uy':':2'>>?@CCEJJOG axL,#$<  "%(I|++r   z bitsandbytes::quantize_blockwisecode	blocksizec                 @   t        j                  |       | j                         }||z  }|dkD  }||z  |z   }t        j                  |f| j                  t         j
                        }| j                  |      }|d ||z
   }	|	j                  ||z  |      }
t        j                  |
      j                  d      d   |d ||z
   t        j                  |
d|d ||z
   j                  dd      z  z  dd      }|j                  d      }|rkt        j                  |||z
  d        j                         |d<   t        j                  |||z
  d  d|d   z  z  dd      }t        j                  ||gd      }t        j                  |j                  d      |j                  |j                        z
        }t        j                  |d      j                  t         j                        j                  |j                        j                  | j                         }||fS )Nr   r1   r   rO   rQ   )r   _check_is_sizer3   zerosr2   r!   r$   rR   rW   clampr"   catr%   r&   argminuint8r#   )r   r`   ra   nremhas_remblocksabsmax
A_reshapedA_comA_com_reshapedscaled_Ascaled_A_remdiffr)   s                  r   r*   r*      s   	#		A
i-CAgG)^g%F[[&1885==IF1JyS!E]]1	>9=N!&>!:!>!>2!>!Fq!IFVg{{>Q8J&7:J1K1P1PQSUV1W-WXZ\^_`H#HYYz!c')4599;r
{{:a#gi#8Ar
N#KRQRS99h51=99X''+dgghoo.FFGD
,,t
$
'
'
4
7
7
H
P
PQRQXQX
YC;r   z"bitsandbytes::dequantize_blockwiserm   c                 f    t        j                  |       t        j                   j                  t         j                  k(   fd       | j                  d      j                            }|j                  d   |z  }|j                  d   |z  }|dk7  r2t         j                  j                  j                  |d||z
  fdd      }|j                  d|      |j                  dd      z  j                  |      j                  d      }|d ||z  |z    }|j                   j                        }|S )Nc                  "    d j                    S )NzA must be uint8, got r   r   s   r   r   z_.<locals>.<lambda>   r   r   r   r   constant)modevaluerQ   )r   rc   r   r   rh   r$   intr#   nn
functionalpadr"   r&   )r   rm   r`   ra   r   r)   rl   ress   `       r   r*   r*      s   	#	LLEKK')RS
qyy}  "
#CYYr]i'F
))B-)
#C
axhh!!%%cAy3+?jXY%Z88B	"V[[Q%77
;
;E
B
J
J2
NC
("S(
)C
++agg
CJr   zbitsandbytes::quantize_4bit
quant_typequant_storagec                     t        j                  |       t        j                  dv fd       t        j                   j                  t         j                  t         j
                  t         j                  fv  fd        j                         }||z  }||z  }|r|dz   n|}t        j                  |f j                  t         j                        } j                  |      }	|	d ||z
   j                  ||z  |      }
t        j                  |
      j                  d      d   |d | t        j                  |
d|d | j                  dd      z  z  dd      j                  d      }|re|	| d  }t        j                  |      j                         |d<   t        j                  |d|d   z  z  dd      }t        j                  ||gd      }t            j#                  |j                        j#                  |j                        }t        j$                  t        j                  |j                  dd      |z
        dd	
      j#                  t         j&                        }|d d d   dz  |dd d   z  }|t         j&                  k7  r.|j)                         j                  |      j+                  d      }||j-                         fS )Nnf4fp4c                      d  S Nz#quant_type must be nf4 or fp4, got  r~   s   r   r   z_.<locals>.<lambda>       9\]g\h7i r   c                  "    d j                    S )NzDBlockwise 4bit quantization only supports 16/32-bit floats, but got r   r   s   r   r   z_.<locals>.<lambda>   s    VWXW^W^V_` r   rQ   r1   r   rO   r   T)rP   keepdimr	      )r   rc   r   r   bfloat16r'   r!   r3   rd   r2   r$   rR   rW   re   r"   rf   r
   r&   rg   rh   squeezer%   rJ   )r   ra   r~   r   ri   full_blocksrj   rl   rm   A_flattenedA_full_blocksscaledA_rem
scaled_remr`   	quantizedpackeds   ` `              r   r*   r*      s_    
#	LL~-/ij	LL	ENNEMM5==AA`
 	
	Ay.K
i-C #[1_F[[&1885==IF))A,K  	!c'*221	>9MM 99]377B7?BF<K[[!f\k.B.G.GA.N*NOQSUVW__`bcF SDE"YYu%))+r
[[!fRj.!92qA
FJ/Q7 
v}}-00>DUYYv{{2q'9D'@ArSWX[[\a\g\ghI ss^q 9QTT?2F#!&&}5??B6<<>!!r   zbitsandbytes::dequantize_4bitr#   c                 B   t        j                  |       t        j                  dv fd       t        j                  t         j                  t         j                  t         j
                  fv fd       | j                  t         j                  k7  r| j                  t         j                        } | j                  d      } t        j                  | j                  d      dz  t         j                  | j                        }|j                         }| dz  |d	d d<   | d
z	  |d d d<   t           j!                        j!                  | j                        }||   }|j                         |k7  r0|j                         |d	z   k(  sJ t        j"                  |dd|      }||z  }	|	||z  dkD  rd	ndz  }	||z  }
|
dkD  }t        j                  || j                        j                  d      }|r[|d ||
z
   j                  d|      |d |	|z
   j                  dd	      z  j                  d      |d ||
z
   |||
z
  d  |d   z  |||
z
  d  n%|j                  d|      |j                  dd	      z  } |j                  dg|d	d   j!                        }|S )Nr   c                      d  S r   r   r   s   r   r   z_.<locals>.<lambda>   r   r   c                      d  S )NzFBlockwise 4bit dequantization only supports 16/32-bit floats, but got r   r   s   r   r   z_.<locals>.<lambda>   s    XY^X_` r   r   r   r	   )r   r2      rQ   r   )r   rc   r   r   r'   r!   r   rh   r"   r$   r9   sizer    r2   r3   r
   r&   narrow)r   rm   ra   r~   r#   r   out_dqri   r`   rl   rj   rk   r)   s      ` `       r   r*   r*      sY    
#	LL~-/ij	LL%..%--??` 	ww%++FF5;;			"A[[Qekk!((KFAs7F14a4Lq&F3Q3K
u%((2D&\F ||~||~Q&&&faA.)^F
1y=1$a!+F
i-CAgG
++e5
:
B
B2
FC 1s7+00Y?&I[6T[K[B\BaBabdfgBhhqqrtuIa#gC	*VBZ7AGIkk"i(6;;r1+==
#++b
%59
%
(
(
/CJr   zbitsandbytes::gemv_4bitshapeBc                     |d   dkD  rdnd}t         j                  j                  j                  j	                  |||||| j
                        }t         j                  j                  j                  | |d       S )NrQ   r   r   r   )r   )	r   r5   r6   dequantize_4bitr   r   rz   r{   linear)r   r?   r   rm   r`   ra   r~   B_dqs           r   r*   r*     ss     q'A+5J99!!1199!VYPZ\bdedkdklD88%%	 &  r   rQ   r      )momentumrmspropadagradadamlionademamixgpstate1state2	unorm_vecbeta1beta2epsweight_decaysteplrgnorm_scaleoptimizer_idc                 ^   || z  }|dk(  r`dd||	z  z
  z  }dd||	z  z
  z  }||z  d|z
  |z  z   }||z  d|z
  |z  |z  z   }||z  }||z  }|t        j                  |      |z   z  }||z  }n|dk(  r|}n|dk(  r|	dk(  r|}n||z  |z   }||z  }nz|dk(  r||z  d|z
  |z  z   }|}nd|dk(  r2||z  d|z
  |z  |z  z   }|t        j                  |      |z   z  }||z  }n-|dk(  r(|||z  z   }|t        j                  |      |z   z  }||z  }t        j                        }|j                  |       y)	z.Preprocessing optimizer, computing update normr         ?r   r   rQ   r   r	   N)r   r   sumadd_)r   r   r   r   r   r   r   r   r   r   r   r   r   g_valscorrection1correction2s1_valss2_valsupdate_valsupdate_norm
total_norms                        r   _optimizer_precondition_32bitr   D  s   $ 1_FqS5$;./S5$;./5.C%K6#995.C%K6#9F#BBK'K'G!4s!:;!K/				19Gunv-G'		5.C%K6#99		5.C%K6#9F#BB

7 3c 9:!K/		6F?*

7 3c 9:!K/;'JNN:r   	max_unorm
param_normbeta3alphac                 6   |j                         }|| z  j                         }|dv r|dkD  r|||z  z   }d}|dkD  r@t        j                  |      }|dv r|||z  |z   kD  r||z  |z   |z  }n|||z  kD  r||z  |z  }|dk(  r||z  d|z
  |z  z   }||z  d|z
  |z  |z  z   }d||z  z
  }t        d||z  z
        }| |z  |z  }|dkD  r|d||z  z
  z  }||z  |t        j                  |      ||z  z   z  z  }||z   }|j                  |       |j                  |       n|dk(  r|d   }|d   }|}||z  d|z
  |z  z   }||	z  d|	z
  |z  z   }||z  d|z
  |z  |z  z   }d||z  z
  }t        d||z  z
        }|dkD  r|d||z  z
  z  }||z  |
|z  z   }t        j                  |      |z  |z   } |||| z  z  z
  }|d   j                  |       |d   j                  |       |j                  |       n|dk(  r0|dk(  r|}n||z  |z   }|| |z  z  }||z   }|j                  |       n|dk(  rN||z  d|z
  |z  z   }!||z  t        j                  |!      z  }||z
  }||z  d|z
  |z  z   }|j                  |       n|dk(  rI||z  d|z
  |z  |z  z   }||z  |z  t        j                  |      |z   z  }||z
  }|j                  |       nA|d	k(  r<|||z  z   }||z  t        j                  |      |z   z  }||z
  }|j                  |       |j                  |       y
)zUnified optimizer update kernel)r   rQ   r	   r   rN   r   r   r   r   rQ   r   r	   N)rJ   r   r   rK   sign)"r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   p_valsr   update_scalecurrent_unormr   r   r   r   	step_size
update_vals3_valsm1m2numixed_momentumadaptive_termmomentum_updates"                                     r   _optimizer_update_32bitr     s   , WWYFAo$$&F|#s(:&<//L3

9-<'y:5;; )J 6 <My:55 )J 6-Gq5.C%K6#995.C%K6#9F#BBE4K'3,-C+%3	#sR,%667F!I-EJJw<ORUXcRc<c1de
*$WW		))u_ev55u_ev55u_ev5>>E4K'3,-#sR,%667F{*urz:B+5<" >??q	q	R		19Gunv-G!bS7]3
*$W		 5.C%K6+AA!B&O)DD
*$5.C%K6#99W		5.C%K6#9F#BB!B&/5::g3F3LM
*$W		6F?*&[EJJw$7#$=>
*$WGGFOr   z$bitsandbytes::optimizer_update_32bitoptimizer_namec                 Z   |rt        d      t        |    }| dk(  rIt        |||||||||	|
|||||||       |dkD  r(|j                          t	        |||||||	||||||       yy|dkD  r'|j                          t	        |||||||	||||||       t        |||||||||	|
|||||||       y)zE
    32-bit optimizer implemented by PyTorch with @torch.compile
    zskip_zeros is not supported yetr   rN   N)NotImplementedErrorname2optimizer_idr   zero_r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   
skip_zerosr   s                      r   r*   r*     s   0 !"CDD$^4L#	
( s?OO)1ffisLRVXZ\giu  s?OO)1ffisLRVXZ\giu 	 #	
r   )NNrE   )rN   )r   F)collections.abcr   mathr   r   typingr   r   _opsr   utilsr
   Tensorr   r*   tuplerF   ry   strMOMENTUMRMSPROPADAGRADADAMLIONADEMAMIXr   compilerJ   r   r   r   r   r   <module>r      sw   $    #  0)<
 $(#'*||*||* ||* EKK 	*
 5<<
 * \\* =*, 5yA ,0#'#||## 	# 
	#
 
# 5<<(# 5<<
 # 5<<%,,//0# B#L /; $(#'|||| || ||	
 5<<
  EKK  \\ <$ 3Y?* *%,, * @* 7C( (%,, (U\\ ( D(
  HU\\DZ  6	B", ", C",J 3Y? U\\ c eELLRWR^R^D^>_  @2 5yA u|| 5<< C X]XcXc hmhtht  B  .	:'"||'" #'"14'"EJ[['"
5<<%&'" ;'"T 0)</||/LL/ / 	/
 C=/ ;;/ \\/ =/d *I6|||| SM LL	
 ,,  \\ 7& 

   9||9||9 LL9 U\\"	9
 ||9 9 9 
9 9 9 	9 9 9 9x m||m||m LLm U\\"	m
 %m m m m m m m 
m m m 	m  !m" #m m` 7C$ %O
O
||O
 ||O
 LL	O

 U\\"O
 %O
 O
 O
 O
 O
 O
 O
 
O
 O
 O
  	!O
" #O
& 
'O
 DO
r   