
    bi@                     f   d dl mZ d dlmZ d dlmZmZ d dlZd dlmZ d dl	Z	d dl
mZ d dlmZ 	  G d d      Z ed	e
      dee	j"                  ge	j"                  f   deeef   fd       Ze	j*                  j,                  Z ed	e
      de	j"                  de	j0                  de	j"                  fd       Ze G d d             Z G d de	j6                  j8                        Z G d de	j6                  j8                        Z G d de	j6                  j8                        Z	 	 	 	 d"de	j"                  de	j"                  dee	j"                     dee   dee	j"                     f
dZ 	 	 d#de	j"                  de	j"                  d ejB                  dee	j"                     dee	j"                     f
d!Z"y)$    )	dataclass)prod)CallableOptionalN)warn)
deprecatedc                   8    e Zd ZdZd Zd Zed        Zd Zd Z	y)GlobalOutlierPoolerNc                     t        d      )NzCall get_instance() instead)RuntimeErrorselfs    [/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/bitsandbytes/autograd/_functions.py__init__zGlobalOutlierPooler.__init__   s    899    c                 0    t               | _        d | _        y N)setoutliers	model_dimr   s    r   
initializezGlobalOutlierPooler.initialize   s    r   c                     | j                   0| j                  |       | _         | j                   j                          | j                   S r   )	_instance__new__r   )clss    r   get_instancez GlobalOutlierPooler.get_instance!   s6    == KK,CMMM$$&}}r   c                     | j                   || _         || j                   k7  ry | j                  j                  |j                                y r   )r   r   updatetolist)r   outlier_idxfeature_dims      r   add_outliersz GlobalOutlierPooler.add_outliers(   s=    >>!(DN$..([//12r   c                     t        j                  t        | j                              j	                  t         j
                        S r   )torchTensorlistr   toint64r   s    r   get_current_outlier_idxz+GlobalOutlierPooler.get_current_outlier_idx0   s)    ||D/033EKK@@r   )
__name__
__module____qualname__r   r   r   classmethodr   r"   r)    r   r   r
   r
      s0    I:  3Ar   r
   zDThis function is deprecated and will be removed in a future release.)categorytransform_tile	tile_sizec                 j   |\  }}d||z  cxk  rdk  sJ  J t        j                  ||z  t         j                        j                  ||      }t        j                  |      }t        d      D ]  }t        j                  |d|z  d      dz  }|dz
  j                  t         j                        j                         }t        j                  |j                         dz   |k(        sJ d	        | |      }	|	j                  |j                        dz   }
||
d|z  z  z  }||z  d|z  k  s |S  |S )
a  
    Compute a permutation of indices that invert the specified (tiled) matrix transformation

    :param transform_tile: a function that applies forward transform to a tensor of shape [dim1, dim2]
    :param tile_size: higher-level tile dimensions, i.e. (8, 32) for Turing and (32, 32) for Ampere
    :note: we assume that tile_transform applies to a cpu-based int8 tensor of shape tile_size
    :example: transform_tile function for the turing layout (bitsandbytes.functional as F)
    :returns: indices
    r   l            dtype      trunc)rounding_mode   zint overflow)r$   aranger(   view
zeros_likerangedivr'   int8
contiguousallintr4   )r0   r1   d1d2tile_indicespermuted_tile_indicesiith_dim_indicessample_tile_ipermuted_tile_iith_permuted_indicess              r   get_inverse_transform_indicesrL   4   s7   " FBrBw<<Ru{{;@@RHL!,,\:1X 	))L#q&PSVV(3.225::>IIKyy**,s2oEFVVF(7.11,2D2DEK!5a!@@7S!V  	 ! r   permuted_tensorrE   returnc                    | j                   |j                   c\  }}\  }}||z  ||z  cxk(  rdk(  sJ d        J d       | j                  d|j                               j                         }t	        j
                  |      }|||j                         <   |j                  ||||z  ||z        }|j                  dddd      }|j                  ||      j                         S )a  
    Undo a tiled permutation such as turing or ampere layout

    :param permuted_tensor: torch tensor in a permuted layout
    :param tile_indices: reverse transformation indices, from get_inverse_transform_indices
    :return: contiguous row-major tensor
    r   z+tensor must contain a whole number of tiles         )	shapereshapenumeltr$   
empty_likeflattenpermuter@   )rM   rE   rowscols	tile_rows	tile_colstensoroutputss           r   undo_layoutra   Z   s     ,;+@+@,BTBT(LT4(9i)ti/414c6cc4c6cc4$$R););)=>@@BFv&G&,GL  "#ooiDI4EtyGXYGooaAq)G??4&1133r   c                      e Zd ZU dZeej                     ed<   dZe	ed<   dZ
eej                     ed<   dZeej                     ed<   dZeej                     ed<   dZeej                     ed<   dZeej                     ed	<   dZeej                     ed
<   dZeej                     ed<   dZeej                     ed<   dZee   ed<   dZdZdZeej                     ed<   dZdZdZdZd Zed        Zy)MatmulLtStateN_tile_indicesFforce_no_igemmltCBCxBSBSCBCxBtSBtCBtsubBoutlier_pool        idxTrowc                 f    d | _         d | _        d | _        d | _        d | _        d | _        d | _        y r   )rf   rg   rh   ri   rj   rk   rl   r   s    r   reset_gradszMatmulLtState.reset_grads   s3    	r   c                     t        d      )Nz$tile_indices is no longer supported.)
ValueErrorr   s    r   rE   zMatmulLtState.tile_indices   s    ?@@r   )r*   r+   r,   rd   r   r$   r%   __annotations__re   boolrf   rg   rh   ri   rj   rk   rl   rm   rn   r
   has_accumulated_gradients	thresholdrp   is_traininghas_fp16_weightsuse_poolformatBrs   propertyrE   r.   r   r   rc   rc   p   s   ,0M8ELL)0"d"!%B%"&C%,,	&!%B%"&C%,,	&#'D(5<<
 '"&C%,,	&"&C%,,	&#'D(5<<
 '26L(./6 %I"&C%,,	&KHG A Ar   rc   c                   F   e Zd Ze	 	 	 ddej
                  j                  j                  dej                  dej                  de	ej                     de	ej                     de	e
   fd       Zedej
                  j                  j                  d	ej                  fd
       Zy)MatMul8bitLtNctxABoutbiasstatec           	      	   |xs
 t               }d| _        t        |j                        dk(  rd| _        || _        || _        || _        |j                  d   |j                  d   k(  rIt        j                  |j                  d d |j                  dd  z   |j                  |j                        S t        j                  |j                  d d |j                  d d z   |j                  |j                        S |j                  }|j                  t        j                  k7  r-t               s#t        j                  d|j                   d       t        |j                        d	k(  r|j!                  d|j                  d         }| j"                  d   rEt%        j&                  |j)                  t        j                        |j*                  
      \  }}}	}
}nFt%        j,                  |j)                  t        j                        |j*                  
      \  }}	}d x}}
d}|j.                  s|j0                  t3        |dd       d u}|j5                          xr! |j                  d   |j7                  d      k(  }|r|j9                         }|j:                  r|r|j0                  |j<                  P|j?                          t%        j,                  |j)                  t        j                              \  |_        |_        }|j*                  dkD  rN||_         t        jB                  jD                  jG                  |||j0                  |	|j<                  ||      \  }}nYt        jB                  jD                  jH                  jK                  ||j0                  |	|j<                  ||j                        }d }|| _&        || _'        |j                  | _(        |d n|j                  | _)        tU        | j"                  d d       r|||f| _+        |
|j@                  f| _,        n"g d| _+        d| _,        | j[                  d d        g |d d |j0                  j                  d   }t        |      d	k(  r|j!                  |      S |S )NFr   TrP   rS   r4   devicez'MatMul8bitLt: inputs will be cast from z to float16 during quantizationrQ   )ry   gradro   )r   r4   rR   NNNNN).rc   is_emptyr   rT   r   r   r   r$   emptyr4   r   float16_is_compilingwarningsr   lenrU   needs_input_gradFint8_double_quantr'   ry   int8_vectorwise_quantr{   rf   getattris_contiguousstrider@   rz   ri   rs   rp   opsbitsandbytesint8_mixed_scaled_mmint8_scaled_mmdefaultr   
grad_shapedtype_A
dtype_biasanytensorstensor_statessave_for_backward)r   r   r   r   r   r   input_shapeCACAtSCASCAtoutlier_colshas_gradis_transposed_outputsubAoutput_shapes                     r   forwardzMatMul8bitLt.forward   s    ( =ACLCECECHwwr{aggaj({{1773B<!''!"+#=QWWUVU]U]^^{{1773B<!''"1+#=QWWUVU]U]^^gg 77emm#MOMMCAGG9Lklmqww<1		"aggbk*A "/0/B/B144CVbgbqbq/r,BS$ %&$;$;ADD<O[`[j[j$k!B\C$!!UXX%5q&$/t;H ! 11OaggajAHHQK6OMLLN!!(uxx7G599K\!!# *+)@)@emmAT)U&%)Q ??S $EI !9911FF		LFD YY++::BBEHHc5994qww C F D 	$gg!%4::s##BQ'(a.CK!%uyy 1C,CK ,C!!$-=Sb)=588>>!+<={q >>,//r   grad_outputc                    | j                   rn| j                  d nt        j                  | j                        }t        j                  | j                        t        j                  | j
                        d |d fS | j                  \  }}}}}| j                  \  }}}	| j                  \  }
}| j                  }d x}x}}|r|j                  d| j                        }t        |j                        dk(  r-|j                  d|j                  d         j                         }|rt!        j"                  |j%                  t        j&                              \  }}}}}t        j(                  j*                  j,                  j/                  |j1                         j                         |j1                         ||
t        j&                        }|j2                  dkD  rH|F|j5                         dkD  r3|d d |fxx   t        j6                  |j1                         |      z  cc<   |r|j8                  |j8                  j%                  | j:                  d      j=                  |j>                  jA                  d      jC                  d	            }t        j6                  |j%                  | j:                        |      jE                  | jF                        }ntI        d
      ||d |d fS )Nr   r3   rQ   rP   ro   TcopyrS   @ ?)State must contain CB matrix for backward)%r   r   r$   r<   r   r   r   r   r   r   sumr   r   rT   rU   r@   r   r   r'   r   r   r   r   r   rW   ry   rV   matmulrf   r   mul_ri   	unsqueezemulr;   r   	Exception)r   r   	bias_grad	req_gradA	req_gradBr   req_gradBiasr   r   r   r   rp   r   grad_Agrad_B	grad_biasCgradSCgradtrf   s                      r   backwardzMatMul8bitLt.backward   sK   << # 0e6F6Fsxx6PI##CEE*E,<,<SUU,CT9VZZZ363G3G0	9aq{{T1%%	c"yy&***)#@I {  !Q&%--b+2C2CB2GHSSUK&'&9&9+..:W&X#E1a!YY++::BB	$$&mm C F $)9djjlQ>Nq#v%,,{}}"EExx#XX[[4[8==eii>Q>QRS>T>X>XYd>efknnS[[&A2FKKCNN[ KLLvtY44r   r   )r*   r+   r,   staticmethodr$   autogradfunctionFunctionCtxr%   r   rc   r   r   r.   r   r   r   r      s    
 '+'+)-`^^$$00`<<` <<` ell#	`
 u||$` &` `D (5enn--99 (5 (5 (5r   r   c                   4    e Zd Zeddefd       Zed        Zy)MatMul8bitFpNc                    |j                   s|j                  t        |dd       d u}|j                          xr! |j                  d   |j                  d      k(  }|r|j                         }|j                  r|r|j                  |j                  \|j                          t        j                  |j                  t        j                              \  |_        |_        }|j                  }|j                  j                  j                  |j                         j#                  |j                  j%                  d      j'                  d            }	t        j(                  j*                  j-                  ||	|      }
|| _        |j                   | _        |j                  | _        || _        |	d | _        |
S |j                   | _        |
S )Nr   r   rS   r   )r{   rf   r   r   rT   r   r@   rz   ri   rs   r   r   r'   r$   r   datar4   r   r   r   nn
functionallinearr   r   r   r   r   )r   r   r   r   r   r   r   r   r   rf   r   s              r   r   zMatMul8bitFp.forward/  s]   !!UXX%5q&$/t;H ! 11OaggajAHHQK6OMLLN!!(uxx7G599K\!!#)*)@)@emmAT)U&%)QHHXX]]agg&++EII,?,?,B,F,F{,ST$$++Ar48	gg!% 48::r   c                     | j                   \  }}}}}| j                  }| j                  }d x}x}	}
|r|j                  d| j                        }
t        |j                        dk(  r-|j                  d|j                  d         j                         }|r2t        j                  |j                         |      j                         }	|r|j                  |j                  j                  | j                  d      j                  |j                   j#                  d      j%                  d            }t        j                  |j                  | j                        |      j'                  | j(                        }nt+        d	      ||	d |
d fS )
Nr   r3   rQ   rP   Tr   rS   r   r   )r   r   r   r   r   r   rT   rU   r@   r$   r   rW   rf   r'   r   r   ri   r   r   r;   r   r   )r   r   r   r   r   r   r   r   r   r   r   rf   s               r   r   zMatMul8bitFp.backwardE  sF   363G3G0	9aqEE		&***)#@I {  !Q&%--b+2C2CB2GHSSUK\\!##%5779Fxx#XX[[4[8==eii>Q>QRS>T>X>XYd>efknnS[[&A2FKKCNN[ KLLvtY44r   )r*   r+   r,   r   rc   r   r   r.   r   r   r   r   )  s1     #$m  * 5 5r   r   c                   N    e Zd Zeddeej                     fd       Zed        Zy)
MatMul4BitNquant_statec                 X   d| _         t        |j                        dk(  rd| _         || _        || _        || _        |j                  }|j                  d   |d   k(  r?t        j                  |j                  d d |dd  z   |j                  |j                        S t        j                  |j                  d d |d d z   |j                  |j                        S t        j                  j                  j                  |t        j                  ||      j                  |j                        j!                         |      }|| _        |j                  |j                  |d n|j                  c| _        | _        | _        t+        | j,                  d d       rd |f| _        |S d| _        |S )	NFr   TrP   rS   r   rR   r   )r   r   rT   r   r   r   r$   r   r4   r   r   r   r   r   dequantize_4bitr'   rW   r   r   dtype_Br   r   r   r   )r   r   r   r   r   r   B_shaper   s           r   r   zMatMul4Bit.forwardd  sn    =ACLCECECH!''Gwwr{gaj({{1773B<'!"+#=QWWUVU]U]^^{{1773B<'"1+#=QWWUVU]U]^^ $$++Aq/@/@K/P/S/STUT[T[/\/^/^/`bfg  	3477AGGT\T_c_i_i0S[#.s##BQ'()CK  'CKr   c                 J   | j                   rn| j                  d nt        j                  | j                        }t        j                  | j                        t        j                  | j
                        d |d fS | j                  \  }}}}}| j                  \  }}d\  }}}	|r|j                  d| j                        }	|r[t        j                  |t        j                  || j                        j                  |j                        j!                               }||d |	d fS )Nr   r   r3   )r   r   r$   r<   r   r   r   r   r   r   r   r   r   r   r'   r4   rW   )
r   r   r   r   r   r   r   r   r   r   s
             r   r   zMatMul4Bit.backward  s    << # 0e6F6Fsxx6PI##CEE*E,<,<SUU,CT9VZZZ+.+?+?(	1aq{{1$4!	#@I \\+q/@/@CII/N/Q/QR]RcRc/d/f/f/hiFvtY44r   r   )	r*   r+   r,   r   r   r   
QuantStater   r   r.   r   r   r   r   `  s:     Xall=S  : 5 5r   r   r   r   r   r   r   c                     |xs
 t               }|dkD  r||_        |j                  r1| j                  j                  dv rt
        j                  | ||||      S t        j                  | ||||      S )Nro   )cpuxpu)rc   ry   rz   r   typer   applyr   )r   r   r   r   ry   r   s         r   r   r     si     $]_E3#88==N*%%aCu==aCu55r   r   c                    |J | j                         | j                  d   k(  r| j                  dk(  r| j                  j                  dk7  r| j                  d   |j
                  z  dk7  r>t        d|j
                   d| j                          t        j                  | ||||      S t        j                  | |j                         ||      }|||z  }|S t        j                  | ||||      S )NrP   Fhpur   z4Some matrices hidden dimension is not a multiple of z^ and efficient inference kernels are not supported for these (slow). Matrix input size found: )r   )rV   rT   requires_gradr   r   	blocksizer   r   r   r   	gemv_4bitrW   )r   r   r   r   r   s        r   matmul_4bitr     s     """wwyAGGBKAOOu$<RWAW772;...!3F{G\G\F]  ^|  }~  }D  }D  |E  F ##Aq#t[AA++a;?CtJ1c4==r   )NNro   Nr   )#dataclassesr   mathr   typingr   r   r   r   r$   typing_extensionsr   bitsandbytes.functionalr   r   r
   FutureWarningr%   tuplerB   rL   compileris_compilingr   
LongTensorra   rc   r   Functionr   r   r   r   r   r   r.   r   r   <module>r      s   !  %    ( #A A: J!ell^U\\9:!S#X!	!> ++ J4 4U=M=M 4RWR^R^ 4	4$ %A %A %APM55>>** M5`455>>** 45n65(( 65x #'%)#'6||6||6 
%,,	6 M"	6 5<<
 6, #'#'>||>||> > 
%,,		>
 5<<
 >r   