
    bi&              '       "   d dl mZ d dlmZ d dlZddlmZmZmZ  e	ed      r$ej                  j                         j                  ndZ eeeej                        Zdej"                  d	ej"                  d
edeej"                  ej"                  f   fdZdej"                  dej"                  d	ej"                  d
edej*                  dej"                  fdZdej"                  dej"                  d	ej"                  d
edej*                  dej"                  ddfdZdej"                  d
ededej*                  deej"                  ej"                  f   f
dZdej"                  dej"                  d
ededee   dej*                  dej"                  fdZdej"                  dej"                  d
ededee   dej*                  dej"                  ddfdZdej"                  dej"                  dee   dej"                  d	ej"                  d
edej"                  fdZej:                  Z	 	 	 d2dedej"                  dej"                  dej"                  deej"                     d ed!ed"ed#ed$ed%ed&ed'ej"                  d(eej"                     d)ej"                  d*eej"                     d+ed,eddf&d-Z	 d3dedej"                  dej"                  dej"                  deej"                     d.eej"                     d/ed0ed ed!ed"ed#ed$ed+ed%ed&ed,eddf$d1Z y)4    )Sequence)OptionalN   )kernels_4bitkernels_8bit_quantkernels_optimacceleratorcudaAcode	blocksizereturnc                     t        j                  |       t        j                  | j                        5  t	        j
                  | ||      \  }}||j                         fcd d d        S # 1 sw Y   y xY w)N)torch_check_is_sizetorch_accelerator_moduledevicer   quantize_blockwise_tritonfloat)r   r   r   outabsmaxs        [/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/bitsandbytes/backends/triton/ops.pyquantize_blockwiser      s]    	#	!	(	(	2 #(BB1dIVVFLLN"# # #s   ,A++A4r   dtypec                 8    t        j                  |       t        j                   j                  t         j                  k(   fd       t
        j                   j                        5  t        j                   ||||      }d d d        |S # 1 sw Y   S xY w)Nc                  "    d j                    S NzA must be uint8, got r   r   s   r   <lambda>z&dequantize_blockwise.<locals>.<lambda>       3H	1R     r   )	r   r   _checkr   uint8r   r   r   dequant_8bit_blockwiser   r   r   r   r   r   s   `     r   dequantize_blockwiser'      s~     
#	LLEKK')RS	!	(	(	2 
 77

 J
 Js   *BBr   c           	      L    t        j                  |       t        j                   j                  t         j                  k(   fd       t        j                  j
                   j
                  k(   fd       t        j                  j                   j                  k(   fd       t        j                  j                  k(  fd       t        j                   j                        5  t        j                   |||       d d d        y # 1 sw Y   y xY w)Nc                  "    d j                    S r   r   r   s   r   r    z.dequantize_blockwise_inplace.<locals>.<lambda>3   r!   r"   c                  <    d j                    dj                    S NzExpected out.shape == , got shaper   r   s   r   r    z.dequantize_blockwise_inplace.<locals>.<lambda>4   s"    1GyPVWZW`W`Va/b r"   c                  <    d j                    dj                    S )NzExpected out.device == r,   )r   r/   s   r   r    z.dequantize_blockwise_inplace.<locals>.<lambda>5   s"    3J188*TZ[^[e[eZf1g r"   c                  (    d  dj                    S NzExpected out.dtype == r,   r   r   r   s   r   r    z.dequantize_blockwise_inplace.<locals>.<lambda>6       /EeWFSVS\S\R]-^ r"   r3   )
r   r   r#   r   r$   r.   r   r   r   r%   r&   s   `   ``r   dequantize_blockwise_inplacer5   *   s     
#	LLEKK')RS	LLagg%'bc	LLqxx')gh	LLe#%^_	!	(	(	2 
11	

 
 
s   5DD#
quant_typequant_storagec           
          t        j                  |       t        j                   j                  t         j                  t         j
                  t         j                  fv  fd        j                         }||dz   z   }t        j                  |dz  f j                   j                        }t        j                  |dz  df j                  t         j                        }t        j                   j                        5  t        j                   ||||||       d d d        |}|t         j                  k7  r.|j                         j                  |      j!                  d      }||j#                         fS # 1 sw Y   ^xY w)Nc                  "    d j                    S )NzDBlockwise 4bit quantization only supports 16/32-bit floats, but got r   r   s   r   r    zquantize_4bit.<locals>.<lambda>J   s    VWXW^W^V_` r"      )r   r   r   )num_elementsquantized_out)r   r   r#   r   bfloat16float16float32numelemptyr   r$   r   r   quantize_4bit_blockwise_tritonsqueezeview	unsqueezer   )	r   r   r6   r7   nblocksr   r   packeds	   `        r   quantize_4bitrI   C   s0    
#	LL	ENNEMM5==AA`
 	
	A
 i!m$$%F[[&1*qxxqwwGF
++qAvqk!((%++
FC	!	(	(	2 
33y*ff1TW	

 F###M2<<Q?6<<>!!
 
s   FF
r.   c           	      X   t        j                  |       t        j                  t         j                  t         j                  t         j
                  fv fd       | j                  t         j                  k7  r<| j                         j                  t         j                        j                  d      } t        j                  || j                        }t        j                  | j                        5  t        j                  | ||||       d d d        |S # 1 sw Y   |S xY w)Nc                      d  S )NzFBlockwise 4bit dequantization only supports 16/32-bit floats, but got  r   s   r   r    z!dequantize_4bit.<locals>.<lambda>o   s    XY^X_` r"   r   r   r   r   )r   r   r#   r=   r>   r?   r   r$   rC   rD   rE   rA   r   r   r   dequantize_4bit_implr   r   r   r6   r.   r   r   s        ` r   dequantize_4bitrQ   c   s     
#	LL%..%--??` 	ww%++IIKU[[)33A6
++e5
:C	!	(	(	2 \))!VY
EWZ[\ J\ Js   9DD)c           	      D   t        j                  j                  k(  fd       t        j                  j                  k(  fd       t        j                  | j
                        5  t        j                  | |||       d d d        y # 1 sw Y   y xY w)Nc                  (    d d j                    S r+   r-   )r   r.   s   r   r    z)dequantize_4bit_inplace.<locals>.<lambda>   r4   r"   c                  (    d  dj                    S r2   r   r3   s   r   r    z)dequantize_4bit_inplace.<locals>.<lambda>   r4   r"   rN   )r   r#   r.   r   r   r   r   rO   rP   s       ```r   dequantize_4bit_inplacerU      sy     
LLe#%^_	LLe#%^_	!	(	(	2 \))!VY
EWZ[\ \ \s   1BBBshapeBc           	         |j                   t        j                  k7  r<|j                         j	                  t        j                        j                  d      }t        j                  || j                   | j                        }t        j                  | j                        5  t        j                  ||||| j                   |       t        j                  j                  j                  | |d       cd d d        S # 1 sw Y   y xY w)Nr   rM   r3   )bias)r   r   r$   rC   rD   rE   rA   r   r   r   !dequantize_4bit_impl_passing_codenn
functionallinear)r   rV   rW   r   r   r   B_dq_tritons          r   	gemv_4bitr_      s     	ww%++IIKU[[)33A6++fAGGAHHEK	!	(	(	2 
66''	
 xx"")) * 

 
 
s   %AD  D	optimizer_namegpstate1state2beta1beta2beta3alphaepssteplrqmap1qmap2absmax1absmax2weight_decaygnorm_scalec                     t         j                  |j                        5  t        di d| d|d|d|d|d|d|d|d	|d
|	d|
d|d|d|d|d|d|d|d| d d d        y # 1 sw Y   y xY w)Nr`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   
skip_zerosrL   )r   r   $optimizer_update_8bit_blockwise_impl)r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rs   s                      r   optimizer_update_8bit_blockwiseru      s    j 
"	(	(	7 
, 	
)	
	
 	
 		

 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
  !	
" &#	
$ $%	
& "'	

 
 
s   AA,,A5	unorm_vec	max_unorm
param_normc                     t         j                  |j                        5  t        j                  di d| d|d|d|d|d|d|d|d	|d
|	d|
d|d|d|d|d|d|d| d d d        y # 1 sw Y   y xY w)Nr`   ra   rb   rc   rd   rv   rw   rx   re   rf   rg   rh   ri   rp   rj   rk   rq   rs   rL   )r   r   r   optimizer_update_32bit_impl)r`   ra   rb   rc   rd   rv   rw   rx   re   rf   rg   rh   ri   rp   rj   rk   rq   rs   s                     r   optimizer_update_32bitr{     s    ( 
"	(	(	7 
11 	
)	
	
 	
 		

 	
  	
  	
 "	
 	
 	
 	
 	
 	
 &	
 	
  !	
" $#	
$ "%	

 
 
s   A
A33A<)g        g      ?F)F)!collections.abcr   typingr   r    r   r   r   hasattrr	   current_acceleratortypedevice_typegetattrr
   r   Tensorinttupler   r   r'   r5   strrI   rQ   rU   r_   rt   r   ru   r{   rL   r"   r   <module>r      s   $   = = ?Fe]>[e335::ag"5+uzzB #%,, #ell #s #uUZUaUachcocoUoOp #||"\\16JMV[VaVa
\\"
||
LL
 ,,
 	

 ;;
 

 

2"||" #"14"EJ[["
5<<%&"@||LL  	
 C= ;; \\:\||\LL\ \ 	\
 C=\ ;;\ 
\ 
\
||
||
 SM
 LL	

 ,,
 
 \\
D (5'Y'Y $( 'J
J
||J
 ||J
 LL	J

 U\\"J
 J
 J
 J
 J
 
J
 J
 	J
 <<J
 ELL!J
 \\J
  ell#!J
" #J
$ %J
( 
)J
~ %(
(
||(
 ||(
 LL	(

 U\\"(
 %(
 (
 (
 (
 (
 (
 (
 
(
 (
 (
  	!(
" #(
& 
'(
r"   