
    bif                        d dl Z d dlmZmZmZmZmZ d dlZd dlZd dlm	Z	m
Z
mZmZ d dlmc mZ d dlZd dlmZ d dlmZ d dlmZ d dlmZmZ  edd	
      Z G d dej                  j8                        Z G d dej                  j8                        Z G d dej                  j<                        Zded   fdZ  G d dejB                        Z" G d de"      Z# G d de"      Z$ G d dej                  j<                        Z%d Z& G d dej8                        Z' G d d ej8                        Z( G d! d"e(      Z) G d# d$e(      Z* G d% d&ejB                        Z+ G d' d(ejB                        Z, G d) d*ejB                        Z-y)+    N)AnyOptionalTypeVarUnionoverload)Tensordevicedtypenn)HIP_ENVIRONMENT)
QuantState)GlobalOptimManager)*INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPINGOutlierTracerTztorch.nn.Module)boundc                        e Zd ZdZ	 	 	 	 	 	 	 	 ddededee   dee   deded	ed
ee   ddf fdZ	ddZ
	 ddZdedefdZ xZS )StableEmbeddinga  
    Custom embedding layer designed to improve stability during training for NLP tasks by using 32-bit optimizer states. It is designed to reduce gradient variations that can result from quantization. This embedding layer is initialized with Xavier uniform initialization followed by layer normalization.

    Example:

    ```
    # Initialize StableEmbedding layer with vocabulary size 1000, embedding dimension 300
    embedding_layer = StableEmbedding(num_embeddings=1000, embedding_dim=300)

    # Reset embedding parameters
    embedding_layer.reset_parameters()

    # Perform a forward pass with input tensor
    input_tensor = torch.tensor([1, 2, 3])
    output_embedding = embedding_layer(input_tensor)
    ```

    Attributes:
        norm (`torch.nn.LayerNorm`): Layer normalization applied after the embedding.

    Methods:
        reset_parameters(): Reset embedding parameters using Xavier uniform initialization.
        forward(input: Tensor) -> Tensor: Forward pass through the stable embedding layer.
    Nnum_embeddingsembedding_dimpadding_idxmax_norm	norm_typescale_grad_by_freqsparse_weightreturnc                     t         |   |||||||||	|

       t        j                  j	                  ||	      | _        t        j                         j                  | dddi       ya  
        Args:
            num_embeddings (`int`):
                The number of unique embeddings (vocabulary size).
            embedding_dim (`int`):
                The dimensionality of the embedding.
            padding_idx (`Optional[int]`):
                Pads the output with zeros at the given index.
            max_norm (`Optional[float]`):
                Renormalizes embeddings to have a maximum L2 norm.
            norm_type (`float`, defaults to `2.0`):
                The p-norm to compute for the `max_norm` option.
            scale_grad_by_freq (`bool`, defaults to `False`):
                Scale gradient by frequency during backpropagation.
            sparse (`bool`, defaults to `False`):
                Computes dense gradients. Set to `True` to compute sparse gradients instead.
            _weight (`Optional[Tensor]`):
                Pretrained embeddings.
        r	   weight
optim_bits    N)	super__init__torchr   	LayerNormnormr   get_instanceregister_module_override)selfr   r   r   r   r   r   r   r   r	   r
   	__class__s              R/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/bitsandbytes/nn/modules.pyr%   zStableEmbedding.__init__0   sp    @ 		
 HH&&}V&D	'')BB4T`bdSef    c                     t         j                  j                  j                  | j                         | j                          y Nr&   r   initxavier_uniform_r!   _fill_padding_idx_with_zeror+   s    r-   reset_parametersz StableEmbedding.reset_parameters_   (    %%dkk2((*r.   c                     | j                   Ft        j                         5  | j                  | j                      j	                  d       d d d        y y # 1 sw Y   y xY wNr   r   r&   no_gradr!   fill_r5   s    r-   r4   z+StableEmbedding._fill_padding_idx_with_zeroj   S    ' 7D,,-33A67 7 (7 7   )AAinputc           	      ^   t        j                  || j                  | j                  | j                  | j
                  | j                  | j                        }|j                  t        j                               }| j                  |      j                  | j                  j                        S r0   )F	embeddingr!   r   r   r   r   r   tor&   get_default_dtyper(   r
   r+   r?   embs      r-   forwardzStableEmbedding.forwardo   s}    kkKKMMNN##KK
 ffU,,./yy~  !2!233r.   )NN       @FFNNNr   N)__name__
__module____qualname____doc__intr   floatboolr   r%   r6   r4   rG   __classcell__r,   s   @r-   r   r      s    : &*$(#($(-g-g -g c]	-g
 5/-g -g !-g -g &!-g 
-g^+7
4V 4 4r.   r   c                        e Zd ZdZ	 	 	 	 	 	 	 ddededee   dee   deded	ed
ee   dee	   ddf fdZ
ddZ	 ddZdedefdZ xZS )	EmbeddingzS
    Embedding class to store and retrieve word embeddings from their indices.
    Nr   r   r   r   r   r   r   r   r	   r   c
                     t         
|   |||||||||		       t        j                         j	                  | dddi       yr   )r$   r%   r   r)   r*   )r+   r   r   r   r   r   r   r   r   r	   r,   s             r-   r%   zEmbedding.__init__   sV    > 	 	 
	
 	'')BB4T`bdSefr.   c                     t         j                  j                  j                  | j                         | j                          y r0   r1   r5   s    r-   r6   zEmbedding.reset_parameters   r7   r.   c                     | j                   Ft        j                         5  | j                  | j                      j	                  d       d d d        y y # 1 sw Y   y xY wr9   r:   r5   s    r-   r4   z%Embedding._fill_padding_idx_with_zero   r=   r>   r?   c           	          t        j                  || j                  | j                  | j                  | j
                  | j                  | j                        }|S r0   )rA   rB   r!   r   r   r   r   r   rE   s      r-   rG   zEmbedding.forward   sH    kkKKMMNN##KK
 
r.   )NNrH   FFNNrI   )rJ   rK   rL   rM   rN   r   rO   rP   r   r	   r%   r6   r4   rG   rQ   rR   s   @r-   rT   rT      s     &*$(#($(#'*g*g *g c]	*g
 5/*g *g !*g *g &!*g  *g 
*gX+7
V  r.   rT   c                   &    e Zd Zddddddej                  ddf	deej                     dee   dee   de	d	e
d
ej                  ded   de	dd fdZd Zd Zd Zd Ze	 	 	 d%dej                  dee
ef   de	ded   dd f
d       Zd Zd Zd&deeeee
f      de	fdZd&deeeee
f      de	fdZe	 	 	 d'dedeeeef      deeee
f      de	def
d       Zed(dedeee
f   de	defd        Zed(ded!ede	defd"       Z fd#Zed) fd$	       Z xZS )*
Params4bitNFTfp4dataquant_state	blocksizecompress_statistics
quant_typequant_storagemodule
Linear4bitbnb_quantizedr   c
                     |t        j                  d      }|
t        sdnd}t         j                  j	                  | ||      }
||
_        ||
_        ||
_        ||
_        ||
_	        |	|
_
        ||
_        ||
_        |
S )Nr   @      )r&   emptyr   r   _make_subclassr^   r_   r`   r]   ra   rd   r\   rb   )clsr\   requires_gradr]   r^   r_   r`   ra   rb   rd   r+   s              r-   __new__zParams4bit.__new__   s~     <;;q>D"1sI||**3mD"#6 $&**	r.   c                 v    | j                   j                         }| j                  |d<   | j                  |d<   |S )Nr\   rk   )__dict__copyr\   rk   r+   states     r-   __getstate__zParams4bit.__getstate__   s6    ""$		f!%!3!3or.   c                     |d   | _         |d   | _        |d   | _        |d   | _        |d   | _        |d   | _        |d   | _        |d   | _        |d	   | _        y )
Nrk   r^   r_   r`   r]   r\   ra   rd   rb   )	rk   r^   r_   r`   r]   r\   ra   rd   rb   rp   s     r-   __setstate__zParams4bit.__setstate__   sr    "?3{+#()>#? - /&M	"?3"?3Hor.   c                    t        |       j                  t        |             }| j                         }|j                  |       t	        j
                  |d         |_        t	        j
                  |d         |_        |S )Nr]   r\   )typerl   rr   rt   ro   deepcopyr]   r\   )r+   memonew_instancerq   s       r-   __deepcopy__zParams4bit.__deepcopy__   sg    Dz))$t*5!!#!!%(#'==}1E#F  MM%-8r.   c                     t        |       j                  t        |             }| j                         }|j                  |       |S r0   )rv   rl   rr   rt   )r+   ry   rq   s      r-   __copy__zParams4bit.__copy__  s<    Dz))$t*5!!#!!%(r.   quantized_statsrk   c                    t         j                  j                  | |j                  |            }||_        t        j                  ||      |_        |j                  j                  |_        |j                  j                  |_
        |j                  j                  |_        d|_        |j                  |_        ||_        |j                  |j                  |j                  _        |S )N)qs_dictr	   T)r&   r   ri   rC   rk   r   	from_dictr]   r^   nestedr_   r`   rd   r
   ra   rb   )rj   r\   r}   rk   r	   rb   kwargsr+   s           r-   from_prequantizedzParams4bit.from_prequantized  s     ||**3@*%//PVW))33#'#3#3#:#: **55!!ZZ;;"&*&6&6DKK#r.   c                 Z   | j                   j                         j                  |      }t        j                  j                  || j                  | j                  | j                  | j                        \  }}|| _         || _
        | j                  || j                  _
        d| _        | S )N)r^   r_   r`   ra   T)r\   
contiguousrC   bnb
functionalquantize_4bitr^   r_   r`   ra   r]   rb   rd   )r+   r	   ww_4bitr]   s        r-   	_quantizezParams4bit._quantize&  s    II  "%%f-!nn::nn $ 8 8,, ; 
 	&;;"&1DKK#!r.   c                 &    | j                  d      S Ncpur    rC   r5   s    r-   r   zParams4bit.cpu6      wwew$$r.   r	   non_blockingc                 <    | j                  |d|      S ||      S Ncudar	   r   r   r+   r	   r   s      r-   r   zParams4bit.cuda9  '    wwfQ]w^^FQ]w^^r.   c                 <    | j                  |d|      S ||      S Nxpur   r   r   s      r-   r   zParams4bit.xpu<  '    wwv~eP\w]]6P\w]]r.   r+   r
   c                      y r0    r+   r	   r
   r   s       r-   rC   zParams4bit.to?       r.   c                      y r0   r   r+   r
   r   s      r-   rC   zParams4bit.toG      NQr.   tensorc                      y r0   r   r+   r   r   s      r-   rC   zParams4bit.toJ      DGr.   c           
         t        j                  j                  j                  |i |\  }}}}|,|j                  dk7  r| j
                  s| j                  |      S | j                  | j                  j                  |       t        t        | !  |||      | j                  | j                  | j                  | j                  | j                  | j                  | j
                        }|S )Nmetar	   r
   r   )rk   r]   r^   r_   r`   ra   rd   )r&   _C_nn	_parse_torv   rd   r   r]   rC   rZ   r$   rk   r^   r_   r`   ra   )	r+   argsr   r	   r
   r   convert_to_format	new_paramr,   s	           r-   rC   zParams4bit.toM  s    9>9O9OQU9`Y_9`6|%6&++"7@R@R>>&))+  ##F+"
&L
Q"00 ,,..$($<$<??"00"00	I r.   c                     |i }|t         j                  t         j                  fv r|d   t           ||||      }t        |t              rt         fd|D              S   |j                  j                  j                  j                  j                  j                  j                  j                  	      S t           ||||      S )Nr   c              3      K   | ]e  } |j                   j                  j                  j                  j                  j
                  j                  j                   	       g yw)	r\   rk   r]   r^   r_   r`   ra   rb   rd   N)rk   r]   r^   r_   r`   ra   rb   rd   ).0chunkrj   r   s     r-   	<genexpr>z0Params4bit.__torch_function__.<locals>.<genexpr>n  sn        "&,&:&:$*$6$6"("2"2,2,F,F#)#4#4&,&:&:%}}&,&:&:
 
s   A+A.r   )r&   r   splitr$   __torch_function__
isinstancetuplerk   r]   r^   r_   r`   ra   rb   rd   )rj   functypesr   r   resultr   r,   s   `     @r-   r   zParams4bit.__torch_function__c  s    >FEKK--!WFW/eT6JF&%(  "(   "("6"6 & 2 2$..(.(B(B%00"("6"6!=="("6"6
 
 w)$tVDDr.   )Fr   NNF....)r   N) rJ   rK   rL   r&   uint8r   r   r   rN   rP   strr
   rl   rr   rt   rz   r|   classmethoddictr   r   r   r   r   r	   r   r   r   r   rC   r   rQ   rR   s   @r-   rZ   rZ      s=    (,,0#'$(%*[[)-#u||$ j)	
 C= "  {{ &  
:	& 
 $)-ll c3h 	 & 
 2 %_8E#vs*:$;< _SW _^(5fc)9#:; ^RV ^  03-0 	sF{+, eSj)* 	
 
  QQ5,QDQ1Q QGGFG$GG G, %E %Er.   rZ   rb   )Embedding4bitrc   c                 l   t        | j                  dd       y t        | dd       t        j                  d       | j                  j                  d   dk(  sJ t        | j                  t              s't        | j                  | j                  d      | _        | j                  | j                  _        y )Nr]   zhFP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first.   T)ra   rd   )	getattrr!   warningswarnshaper   rZ   ra   r]   )rb   s    r-   'fix_4bit_weight_quant_state_from_moduler     s    v}}mT2>v}d+3v	
 ==q!Q&&&fmmZ0"6==@T@Tdhi & 2 2FMMr.   c                   t     e Zd ZdZddddej
                  df fd	Zd Z fdZdej                  fd	Z
 xZS )
rc   a  
    This class is the base module for the 4-bit quantization algorithm presented in [QLoRA](https://arxiv.org/abs/2305.14314).
    QLoRA 4-bit linear layers uses blockwise k-bit quantization under the hood, with the possibility of selecting various
    compute datatypes such as FP4 and NF4.

    In order to quantize a linear layer one should first load the original fp16 / bf16 weights into
    the Linear4bit module, then call `quantized_module.to("cuda")` to quantize the fp16 / bf16 weights.

    Example:

    ```python
    import torch
    import torch.nn as nn

    import bitsandbytes as bnb
    from bnb.nn import Linear4bit

    fp16_model = nn.Sequential(
        nn.Linear(64, 64),
        nn.Linear(64, 64)
    )

    quantized_model = nn.Sequential(
        Linear4bit(64, 64),
        Linear4bit(64, 64)
    )

    quantized_model.load_state_dict(fp16_model.state_dict())
    quantized_model = quantized_model.to(0) # Quantization happens here
    ```
    TNr[   c	                     t         	|   ||||       t        | j                  j                  d||||       | _        || _        |du| _        d| _        || _        y)aw  
        Initialize Linear4bit class.

        Args:
            input_features (`str`):
                Number of input features of the linear layer.
            output_features (`str`):
                Number of output features of the linear layer.
            bias (`bool`, defaults to `True`):
                Whether the linear class uses the bias term as well.
        Frk   r_   r`   ra   rb   N)	r$   r%   rZ   r!   r\   compute_dtypecompute_type_is_setr]   ra   )
r+   input_featuresoutput_featuresbiasr   r_   r`   ra   r	   r,   s
            r-   r%   zLinear4bit.__init__  sg    , 	$G KK 3!'
 +#0#< *r.   c                 h   |j                   t        j                  t        j                  fv r|j                   | _        y |j                   t        j
                  k(  r| j                  d t        j                  fv rL|j                         |j                  d   k(  r,t        j                  d       t        j                  dd       | j                  d t        j                  fv rN|j                         |j                  d   k7  r-t        j                  d       t        j                  dd       y y y y )NzInput type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference.ignorez.*inference.)messagezInput type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed.z.*inference or training)r
   r&   float32bfloat16r   float16numelr   r   r   filterwarningsr+   xs     r-   set_compute_typezLinear4bit.set_compute_type  s    77u}}enn55 "#DWW%!!dEMM%::	QWWUW[@X  Y ''.I!!dEMM%::	QWWUW[@X k '':ST	 AY: &r.   c                 
   t         |   |||       t        | j                  dd      Z| j                  j                  j                  d      j                         D ]"  \  }}|r|n|j                         ||dz   |z   <   $ yy)zc
        save weight and bias,
        then fill state_dict with components of quant_state
        r]   NT)packedzweight.)r$   _save_to_state_dictr   r!   r]   as_dictitemsdetach)r+   destinationprefix	keep_varskvr,   s         r-   r   zLinear4bit._save_to_state_dict  s    
 	#KC4;;t4@//77t7DJJL U1;Da!((*FY.23U Ar.   r   c                    t        |        | j                  a| j                  j                  |j                  k7  r>| j                  j                  j	                  |j                        | j                  _        | j
                  s| j                  |       d| _        |j                  }| j                  |j	                  | j                        }| j                  d n$| j                  j	                  | j                        }| j                  j                         }t        j                  |||| j                  j                        j	                  |      S )NT)r   r]   )r   r   r
   r\   rC   r   r   r   r!   tr   matmul_4bitr]   )r+   r   	inp_dtyper   r!   s        r-   rG   zLinear4bit.forward  s    /5 99 TYY__%?!YY^^..qww7DIIN''!!!$'+D$GG	)T''(Ayy(tdiill4;M;M.Nq&tAXAXY\\]fggr.   )rJ   rK   rL   rM   r&   r   r%   r   r   r   rG   rQ   rR   s   @r-   rc   rc     sE    H  kk#+JU(	Uh hr.   rc   c                   B     e Zd ZdZdddej
                  df fd	Z xZS )	LinearFP4z'
    Implements the FP4 data type.
    TNc           
      2    t         |   |||||d||       y)Q  
        Args:
            input_features (`str`):
                Number of input features of the linear layer.
            output_features (`str`):
                Number of output features of the linear layer.
            bias (`bool`, defaults to `True`):
                Whether the linear class uses the bias term as well.
        r[   Nr$   r%   	r+   r   r   r   r   r_   ra   r	   r,   s	           r-   r%   zLinearFP4.__init__  *    & 			
r.   rJ   rK   rL   rM   r&   r   r%   rQ   rR   s   @r-   r   r     s'      kk
 
r.   r   c                   B     e Zd ZdZdddej
                  df fd	Z xZS )	LinearNF4a"  Implements the NF4 data type.

    Constructs a quantization data type where each bin has equal area under a standard normal distribution N(0, 1) that
    is normalized into the range [-1, 1].

    For more information read the paper: QLoRA: Efficient Finetuning of Quantized LLMs (https://arxiv.org/abs/2305.14314)

    Implementation of the NF4 data type in bitsandbytes can be found in the `create_normal_map` function in
    the `functional.py` file: https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L236.
    TNc           
      2    t         |   |||||d||       y)r   nf4Nr   r   s	           r-   r%   zLinearNF4.__init__G  r   r.   r   rR   s   @r-   r   r   ;  s'    	  kk
 
r.   r   c                       e Zd Z	 	 	 	 	 ddeej
                     deej
                     deej
                     fdZ fdZd Zddee	e
eef      defd	Zddee	e
eef      defd
Zd Ze	 	 	 ddedee	e
ef      dee	eef      dedef
d       Zeddede	eef   dedefd       Zeddedededefd       Z fdZ xZS )
Int8Paramsr\   CBSCBc                     |t        j                  d      }t         j                  j                  | ||      }||_        ||_        ||_        |S r9   )r&   rh   r   ri   r   r   has_fp16_weights)rj   r\   rk   r   r   r   objs          r-   rl   zInt8Params.__new__g  sI     <;;q>Dll))#t]C/
r.   c                     | j                   rt        | 	  |      S | j                  j	                         j                  |t
        j                        }t        j                  j                  |      \  }}}|| _        || _
        || _        | S )Nr	   r
   )r   r$   rC   r\   r   r&   r   r   r   int8_vectorwise_quantr   r   )r+   r	   Br   r   _r,   s         r-   r   zInt8Params._quantizew  sw      7:f%% II  "%%V5==%I^^99!<
C	r.   c                 &    | j                  d      S r   r   r5   s    r-   r   zInt8Params.cpu  r   r.   r	   r   c                 <    | j                  |d|      S ||      S r   r   r   s      r-   r   zInt8Params.cuda  r   r.   c                 <    | j                  |d|      S ||      S r   r   r   s      r-   r   zInt8Params.xpu  r   r.   c                 4   t        |       j                  t        |       t        j                  | j                  |      | j
                  | j                  t        j                  | j                  |      t        j                  | j                  |            }|S )N)r\   rk   r   r   r   )	rv   rl   ro   rw   r\   rk   r   r   r   )r+   rx   ry   s      r-   rz   zInt8Params.__deepcopy__  sr    Dz))Jtyy$/,,!22}}TWWd+dhh- * 
 r.   r+   r
   r   c                      y r0   r   r   s       r-   rC   zInt8Params.to  r   r.   c                      y r0   r   r   s      r-   rC   zInt8Params.to  r   r.   r   c                      y r0   r   r   s      r-   rC   zInt8Params.to  r   r.   c                 &   t        j                  j                  j                  |i |\  }}}}| j                  j
                  t         j                  k(  }|sE|C|j                  dk7  r4| j                  j                  j                  dk(  r| j                  |      S t        t        	| 1  |||      | j                  | j                        }|r?|j                  |_        | j                   "| | j                   j                  |      |_        |S )Nr   r   r   )rk   r   )r&   r   r   r   r\   r
   int8rv   r	   r   r   r$   rC   rk   r   r   r   )
r+   r   r   r	   r
   r   r   is_quantizedr   r,   s
            r-   rC   zInt8Params.to  s    9>9O9OQU9`Y_9`6|%6yy%**4 2v{{f7LQUQZQZQaQaQfQfjoQo >>&)) GJfEJM,,!22
	 $>>ILxx#(: $F 3	r.   )NTFNNr   r   r   )rJ   rK   rL   r   r&   r   rl   r   r   r   rN   r	   r   rP   r   r   rz   r   r   r
   rC   rQ   rR   s   @r-   r   r   f  sq    (,%)&*u||$
 U\\" ell# %_8E#vs*:$;< _SW _^(5fc)9#:; ^RV ^
  03-0 	sF{+, eSj)* 	
 
  QQ5,QDQ1Q QGGFG$GG G r.   r   c                 d   | j                  | d      }|y | j                  | dd      }t        |t        j                        r|j                         }t        |t              r|t        vrt        d|       t        |t              r|t        v r	t        |   }|dk7  rt        d|       y )Nr!   weight_formatrowz'Expected supported weight format - got z+Only 'row' weight format is supported, got )	getpopr   r&   r   itemrN   r   
ValueError)	
state_dictr   local_metadatastrictmissing_keysunexpected_keys
error_msgsr!   r  s	            r-   maybe_rearrange_weightr    s    ^^vhf-.F~NNfX]#;UCM-.%**, -%-?i*iB=/RSS	M3	'M=g,gB=QF}oVWW r.   c                   :     e Zd ZdZd fd	Zd ZdedefdZ xZS )Embedding8bita  
    This class implements [LLM.int8()](https://arxiv.org/abs/2208.07339) algorithm for embedding layer

    Quantization API is similar to Linear8bitLt:
    ```python
    import torch
    import torch.nn as nn

    from bitsandbytes.nn import Embedding8bit

    fp16_module = nn.Embedding(128, 64)
    int8_module = Embedding8bit(128, 64)

    int8_module.load_state_dict(fp16_module.state_dict())

    int8_module = int8_module.to(0) # Quantization happens here
    ```
    c                     t         |   ||||       | j                  j                  j                  | _        t        | j                  j                  dd      | _        y )Nr  Fr   rk   )r$   r%   r!   r\   r
   r   )r+   r   r   r	   r
   r,   s        r-   r%   zEmbedding8bit.__init__  sK    vUS[[%%++
 !1!1EY^_r.   c                     t        d      )Nz.Saving Embedding8bit module is not implementedNotImplementedErrorr+   r   r   r   s       r-   r   z!Embedding8bit._save_to_state_dict      !"RSSr.   r?   r   c                    t        | j                  d      st        d      | j                  j                  }| j                  j                  }|j
                  | j                  | j                  fk(  sJ |j
                  | j                  fk(  sJ t        j                  ||      }t        j                  ||j                  | j                  d            }||dz  z  }|j                  | j                        S )Nr   zKEmbedding layer is not quantized. Please call .cuda() or .to(device) first.r   g     _@)hasattrr!   RuntimeErrorr\   r   r   r   r   rA   rB   viewrC   r
   )r+   r?   rows	row_statscompressed_outputcompressed_output_statsoutputs          r-   rG   zEmbedding8bit.forward  s    t{{E*lmm{{KKOO	zzd1143E3EFFFF4#6#6"8888KKt4"#++eY^^DDWDWYZ5["\"&=&EFyy$$r.   )NN)	rJ   rK   rL   rM   r%   r   r   rG   rQ   rR   s   @r-   r   r     s'    &`T%V % %r.   r   c                   b     e Zd ZdZddej
                  df fd	ZdefdZd Z	dedefd	Z
 xZS )
r   a3  
    This is the base class similar to Linear4bit. It implements the 4-bit quantization algorithm presented in
    [QLoRA](https://arxiv.org/abs/2305.14314) for embeddings.

    Quantization API is similar to Linear4bit:
    ```python
    import torch
    import torch.nn as nn

    from bitsandbytes.nn import Embedding4bit

    fp16_module = nn.Embedding(128, 64)
    quantized_module = Embedding4bit(128, 64)

    quantized_module.load_state_dict(fp16_module.state_dict())

    quantized_module = quantized_module.to(0) # Quantization happens here
    ```
    Nr[   c                 @   t         |   ||||       | j                  j                  j                  | _        t        | j                  j                  dd |||       | _        | j                  j                  }||z  dk7  rt        j                  d| d| d       y y )Nr  Fr   r   zEmbedding size z  is not divisible by block size z#. This will lead to slow inference.)	r$   r%   r!   r\   r
   rZ   r^   r   r   )	r+   r   r   r
   r`   ra   r	   r^   r,   s	           r-   r%   zEmbedding4bit.__init__  s     	vUS[[%%++
 KK $!'
 KK))	9$)MM!-0PQZP[ \4 4 *r.   r?   c                    | j                   | j                  j                  j                  z  dk(  sJ | j                  j                  j                  t        j                        j                  | j                  | j                   z  dz  d      }t        j                  j                  j                  |j                  | j                  | j                   dz        |      j                  dd      }|j                  |j                         | j                   z  dz  dfk(  sJ | j                   | j                  j                  z  }| j                  j                  j                  }|j                  | j                  |z  fk(  sJ t        j                  j                  j                  |j                  | j                  |      |      j                  d      }|j                  |j                         |z  fk(  sJ t        j                   | j                  j                        }||_        t        j"                  g |j                  | j                         |_        t$        j                  j'                  ||      }|j                  g |j                  | j                   k(  sJ |j)                  | j*                        S )Nr      r   r!   r?   r   )r   r!   r]   r^   r\   r+  r&   r   r   r   r   rB   r   r   absmaxro   rw   Sizer   dequantize_4bitrC   r
   )	r+   r?   w_4bit_uint8output_4bitblocks_per_embr6  output_absmaxoutput_quant_stater0  s	            r-    _forward_with_partial_dequantizez.Embedding4bit._forward_with_partial_dequantize8  sI   !!DKK$;$;$E$EEJJJ{{'',,U[[9>>t?R?RUYUgUg?gkl?lnophh))33$$T%8%8$:L:LPQ:QR 4 
 $r1+ 	   U[[]T5G5G%G1%La$PPPP++t{{/D/DD((//|| 3 3n DFFFF++55;;t22NC 6 
 $
 	 ""u{{}~'E&GGGG!]]4;;+B+BC$1!#(::.P.PT=O=O.P#Q //=OP||AAd.@.@AAAAyy$$r.   c                     t        d      )Nz.Saving Embedding4bit module is not implementedr$  r&  s       r-   r   z!Embedding4bit._save_to_state_dictY  r'  r.   r   c                    t        |        | j                  | j                  j                  j                  z  dk(  r| j                  |      S t        j                  j                  | j                  j                  | j                  j                        }t        j                  j                  j                  ||      j                  | j                        S )Nr   r5  )r   r   r!   r]   r^   r>  r   r   r8  r\   r&   r   rB   rC   r
   )r+   r?   dequantized_weights      r-   rG   zEmbedding4bit.forward\  s    /5 7 7 A AAQF88?? ^^;;DKK<L<LdkkNeNefxx"",,% - 
 "TZZ.	r.   )rJ   rK   rL   rM   r&   r   r%   r   r>  r   rG   rQ   rR   s   @r-   r   r     sE    0 kk:%f %BTV  r.   r   c                   :     e Zd Zdej                  df fd	Z xZS )EmbeddingFP4Nc                 0    t         |   |||d||       y )Nr[   r
   r`   ra   r	   r   r+   r   r   r
   ra   r	   r,   s         r-   r%   zEmbeddingFP4.__init__k  )     	' 	 	
r.   rJ   rK   rL   r&   r   r%   rQ   rR   s   @r-   rC  rC  j      
 kk
 
r.   rC  c                   :     e Zd Zdej                  df fd	Z xZS )EmbeddingNF4Nc                 0    t         |   |||d||       y )Nr   rE  r   rF  s         r-   r%   zEmbeddingNF4.__init__~  rG  r.   rH  rR   s   @r-   rK  rK  }  rI  r.   rK  c                   |     e Zd ZdZ	 	 	 	 	 ddedef fdZ fdZ fdZd Z fdZ	d	e
j                  fd
Z xZS )Linear8bitLtaZ  
    This class is the base module for the [LLM.int8()](https://arxiv.org/abs/2208.07339) algorithm.
    To read more about it, have a look at the paper.

    In order to quantize a linear layer one should first load the original fp16 / bf16 weights into
    the Linear8bitLt module, then call `int8_module.to("cuda")` to quantize the fp16 weights.

    Example:

    ```python
    import torch
    import torch.nn as nn

    import bitsandbytes as bnb
    from bnb.nn import Linear8bitLt

    fp16_model = nn.Sequential(
        nn.Linear(64, 64),
        nn.Linear(64, 64)
    )

    int8_model = nn.Sequential(
        Linear8bitLt(64, 64, has_fp16_weights=False),
        Linear8bitLt(64, 64, has_fp16_weights=False)
    )

    int8_model.load_state_dict(fp16_model.state_dict())
    int8_model = int8_model.to(0) # Quantization happens here
    ```
    r   r   c                 V   t         |   ||||       t        j                         | _        || _        || j                  _        || j                  _        |dkD  r|sd| j                  _        t        | j                  j                  ||      | _
        | j                  t               y)ay  
        Initialize Linear8bitLt class.

        Args:
            input_features (`int`):
                Number of input features of the linear layer.
            output_features (`int`):
                Number of output features of the linear layer.
            bias (`bool`, defaults to `True`):
                Whether the linear class uses the bias term as well.
                Tr"  N)r$   r%   r   MatmulLtStaterq   index	thresholdr   use_poolr   r!   r\   "_register_load_state_dict_pre_hookr  )	r+   r   r   r   r   rS  rR  r	   r,   s	           r-   r%   zLinear8bitLt.__init__  s    * 	$G&&(

(

&6

#s?#3"&DJJ !1!1DTdtu//0FGr.   c                    t         	|   |||       d}t        | j                  |      }t        | j                  |      }|| z   }|dz   }| j                  j
                  s|@|r|n|j                         ||<   t        j                  dt        j                        ||<   y |@|r|n|j                         ||<   t        j                  dt        j                        ||<   y y y )Nr   r  r   )r
   )
r$   r   r   r!   rq   r   r   r&   r   r   )
r+   r   r   r   scb_nameparam_from_weightparam_from_statekey_nameformat_namer,   s
            r-   r   z Linear8bitLt._save_to_state_dict  s    #KC  $DKK:"4::x8xj) .zz** ,=F(9L]LdLdLfH%+0<<+MK(!-<E(8K[KbKbKdH%+0<<+MK( .	 +r.   c           	         t         |   |||||||       t        |      }|D ]  }	|	t        |      d  }
|
dk(  s| j                  j
                  t        d      ||	   }| j                  j
                  j                  |       | j                  j
                  %| j                  j
                  | j                  _        |j                  |	        y )Nr   zLoading a quantized checkpoint into non-quantized Linear8bitLt is not supported. Please call module.cuda() before module.load_state_dict())
r$   _load_from_state_dictlistlenr!   r   r*  copy_rq   remove)r+   r  r   r  r  r  r  r  unexpected_copykey
input_nameinput_paramr,   s               r-   r]  z"Linear8bitLt._load_from_state_dict  s     	%	
 /" 	,CS[]+JU";;??*&c 
 )o%%k2::>>-%)[[__DJJN&&s+!	,r.   c                     | j                   j                  | j                  _        | j                   j                  | j                  _        d | j                   _        d | j                   _        y r0   r!   r   rq   r   r5   s    r-   init_8bit_statezLinear8bitLt.init_8bit_state  >    



r.   c                    t        |   |i |}t        j                  j                  j
                  |i |\  }}}}||j                  j                  4|j                  j                  j                  |      |j                  _        |j                  j                  4|j                  j                  j                  |      |j                  _        |S r0   )	r$   rC   r&   r   r   r   rq   r   r   )	r+   r   r   r   r	   r
   r   r   r,   s	           r-   rC   zLinear8bitLt.to  s    T,V,9>9O9OQU9`Y_9`6|%6 ||*"(,,//"4"4V"<||+#)<<#3#3#6#6v#> r.   r   c                 r   | j                   | j                  _        | j                  j                  | j                          | j                  a| j                  j                  |j                  k7  r>| j                  j                  j                  |j                        | j                  _        t        j                  || j                  | j                  | j                        }| j                  j                  s;| j                  j                  %| j                  j                  | j                  _        |S N)r   rq   )trainingrq   is_trainingr!   r   rh  r   r
   r\   rC   r   matmulr   )r+   r   outs      r-   rG   zLinear8bitLt.forward&  s    !%

;;>>%  " 99 TYY__%?!YY^^..qww7DIINjjDKKdiitzzJzz**tzz}}/H#zz}}DKK
r.   )TTrP  NN)rJ   rK   rL   rM   rN   r%   r   r]  rh  rC   r&   r   rG   rQ   rR   s   @r-   rN  rN    s[    F  H H  HDN0%,N r.   rN  c                   2     e Zd Zd fd	Zd Zd Zd Z xZS )OutlierAwareLinearc                 F    t         |   ||||       d | _        d| _        y r   )r$   r%   outlier_dimr  )r+   r   r   r   r	   r,   s        r-   r%   zOutlierAwareLinear.__init__8  s&    $G!r.   c                     t        d      )NzJPlease override the `forward_with_outliers(self, x, outlier_idx)` functionr$  )r+   r   outlier_idxs      r-   forward_with_outliersz(OutlierAwareLinear.forward_with_outliers=  s    !"noor.   c                     t        d      )NzEPlease override the `quantize_weights(self, w, outlier_idx)` functionr$  )r+   r   rv  s      r-   quantize_weightz"OutlierAwareLinear.quantize_weight@  s    !"ijjr.   c                 |   | j                   Qt        j                         }|j                         st	        d       |j                  | j                        }|| _         | j                  sS| j                  | j                  | j                         }| j                  j                  j                  |       d| _        y y )NzTPlease use OutlierTracer.initialize(model) before using the OutlierAwareLinear layerT)rt  r   r)   is_initializedprintget_outliersr!   r  ry  r\   r`  )r+   r   tracerrv  r   s        r-   rG   zOutlierAwareLinear.forwardC  s    #"//1F((*lm --dkk:K*D  $$T[[$2B2BCAKK""1% $D !r.   )TN)rJ   rK   rL   r%   rw  ry  rG   rQ   rR   s   @r-   rr  rr  7  s    "
pk%r.   rr  c                   8     e Zd Z	 	 	 	 	 	 d fd	Zd Zd Z xZS )SwitchBackLinearBnbc	                 N   t         	|   ||||       t        j                         | _        || _        || j                  _        || j                  _        || j                  _        |dkD  r|sd| j                  _	        t        | j                  j                  ||      | _        y )NrP  Tr"  )r$   r%   r   rQ  rq   rR  rS  r   memory_efficient_backwardrT  r   r!   r\   )
r+   r   r   r   r   r  rS  rR  r	   r,   s
            r-   r%   zSwitchBackLinearBnb.__init__S  s     	$G&&(

(

&6

#/H

,s?#3"&DJJ !1!1DTdtur.   c                     | j                   j                  | j                  _        | j                   j                  | j                  _        d | j                   _        d | j                   _        y r0   rg  r5   s    r-   rh  z#SwitchBackLinearBnb.init_8bit_statej  ri  r.   c                 0   | j                   | j                  _        | j                  j                  | j                          t        j                  |j                         | j                  j                         d | j                        | j                  z   S rl  )
rm  rq   rn  r!   r   rh  r   matmul_mixedhalfr   r   s     r-   rG   zSwitchBackLinearBnb.forwardp  sf    !%

;;>>%  "$++*:*:*<4tzzZ]a]f]fffr.   )TTFrP  NN)rJ   rK   rL   r%   rh  rG   rQ   rR   s   @r-   r  r  R  s(    
 "'v.gr.   r  ).ro   typingr   r   r   r   r   r   r&   r   r	   r
   r   torch.nn.functionalr   rA   bitsandbytesr   bitsandbytes.cextensionr   bitsandbytes.functionalr   bitsandbytes.optimr   bitsandbytes.utilsr   r   r   rT   r   	ParameterrZ   r   Linearrc   r   r   r   r  r   r   rC  rK  rN  rr  r  r   r.   r-   <module>r     s\  
  : :   + +    3 . 1 XC()g4ehh(( g4TL"" L^zE## zEz3E:W4X 3"wh wht!

 !
H(

 (
VY## YxX*,%BLL ,%^aBLL aH
= 
&
= 
&d299 dN% %6$g")) $gr.   