
    bi>             ,       D   d dl mZ d dlZd dlZd dlmZ d dlmZm	Z	m
Z
 d dlZd dlZd dlmZ d dlmZ d dlmZmZ dd	lmZmZ i Z	 ej0                  ej2                  fej4                  ej6                  fej8                  ej:                  fej<                  ej>                  fej0                  ej2                  fej4                  ej6                  fd
Z  G d d      Z! G d d      Z" G d d      Z# ejH                  dd       Z%ejL                  jO                         dkD  rdej                  fdZ(nd dl)Z)dej                  fdZ(ejT                  e%ddZ+ddej                  fdZ,ddZ-ddZ.ddZ/ddZ0ddZ1dd Z2dd!Z3d"ee	ej                        fd#Z4d$ed%ejj                  fd&Z6de	e   d%e	ejj                     fd'Z7 G d( d)      Z8	 	 	 	 	 ddej                  d*e	ej                     d+e	ej                     d,e	ej                     d%e9ej                  e8f   f
d-Z:	 	 	 	 	 	 ddej                  d.e	e8   d+e	ej                     d*e	ej                     d,e	ej                     d/e;d%ej                  fd0Z<dd1Z=ddddej|                  fdej                  d+e	ej                     d,e	ej                     fd2Z?ddddej|                  fdej                  d+e	ej                     d,e	ej                     fd3Z@ddddd4ej|                  fdej                  d+e	ej                     d,e	ej                     d%e9ej                  e8f   fd5ZA	 	 	 	 ddej                  d.e	e8   d+e	ej                     d,e	ej                     d/e	e;   d%ej                  fd6ZB	 	 	 	 ddej                  d.e	e8   d+e	ej                     d,e	ej                     d/e	e;   d%ej                  fd7ZC	 	 	 	 	 ddej                  d.e	e8   d+e	ej                     d,e	ej                     d/e	e;   d%ej                  fd8ZD ed9eE:      	 	 dded*e	ej                     d,e	ej                     d%e9ee9eef   f   fd;       ZF ed9eE:      	 	 	 	 dded<e	e9eef      d+e	ej                     d*e	ej                     d,e	ej                     d%efd=       ZG ed9eE:      dded*ed,e	ej                     d%efd>       ZH ed9eE:      dded*ed,e	ej                     d%efd?       ZI	 	 	 	 	 	 	 	 	 dd@eJdAedBedCedDeKdEeKdFe;dGeKdHe	ej                     dIeKdJeKdKeKdLeKdMeKdNe	ej                     dOeKd%df"dPZL edQeE:      	 	 	 	 dd@eJdAedBedCedHe	ej                     dDeKdIeKdEeKdFe;dGeKdRedSe	ej                     dTedUe	ej                     dVedWe	ej                     dLeKdMeKdNe	ej                     dOeKd%df*dX       ZM	 	 	 dd@eJdAedBedCedHe	ej                     dDeKdIeKdJeKdKeKdEeKdFe;dGeKdRedSe	ej                     dYedZe	ej                     dLeKdMeKd%df&d[ZN ed9eE:      dd\ed]edFe;d^e;fd_       ZOej                  fd`ZQ	 	 	 	 ddedaed,e	ej                     fdbZR	 	 	 ddedaed,e	ej                     fdcZS	 	 	 ddedaed,e	ej                     fddZTdej                  fdej                  daej                  d,e	ej                     fdeZV	 	 ddej                  dfej                  dgej                  d,e	ej                     dhe	ej                     f
diZW ed9eE:      	 	 	 	 ddej                  dfe	ej                     dge	ej                     dje	ej                     d%e9ej                  ej                  e	ej                     f   f
dk       ZX ed9eE:      ddej                  fdl       ZY G dm dn      ZZ G do dp      Z[ G dq dr      Z\ds Z]dt Z^ej                  fduZ`	 	 	 	 	 ddej                  dge	ej                     dfe	ej                     dve	ej                     dwe	ej                     f
dxZadej                  dyej                  fdzZbddej                  fd{Zc	 dd|e
eZej                  f   daej                  d,e	ej                     fd}Zddd~ZedZfy)    )IterableN)prod)AnyOptionalUnion)Tensor)
deprecated)pack_dict_to_tensorunpack_tensor_to_dict   )HIP_ENVIRONMENTlib)adammomentumrmsproplionlamblarsc                   4    e Zd ZdZd Zd Zed        ZddZy)GlobalPageManagerNc                     t        d      NzCall get_instance() insteadRuntimeErrorselfs    R/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/bitsandbytes/functional.py__init__zGlobalPageManager.__init__6       899    c                     g | _         y N)paged_tensorsr   s    r   
initializezGlobalPageManager.initialize9   s
    r    c                     | j                   0| j                  |       | _         | j                   j                          | j                   S r"   	_instance__new__r$   clss    r   get_instancezGlobalPageManager.get_instance<   6    == KK,CMMM$$&}}r    c                 J    | j                   d d d   D ]  }t        ||        y )N)r#   prefetch_tensor)r   to_cputs      r   prefetch_allzGlobalPageManager.prefetch_allC   s,     ##DbD) 	'AAv&	'r    F)	__name__
__module____qualname__r'   r   r$   classmethodr+   r2    r    r   r   r   3   s*    I:   'r    r   c                   2    e Zd ZdZd Zd Zed        Zd Zy)CUBLAS_ContextNc                     t        d      r   r   r   s    r   r   zCUBLAS_Context.__init__N   r   r    c                     i | _         y r"   )contextr   s    r   r$   zCUBLAS_Context.initializeQ   s	    r    c                     | j                   0| j                  |       | _         | j                   j                          | j                   S r"   r&   r)   s    r   r+   zCUBLAS_Context.get_instanceT   r,   r    c                    |j                   | j                  vrt        j                  j	                         }t        j                  j                  |       t        j                  t        j                               | j                  |j                   <   t        j                  j                  |       | j                  |j                      S r"   )
indexr=   torchcudacurrent_device
set_devicectc_void_pr   get_context)r   deviceprev_devices      r   rG   zCUBLAS_Context.get_context[   s}    <<t||+**335KJJ!!&))+S__5F)GDLL&JJ!!+.||FLL))r    )	r4   r5   r6   r'   r   r$   r7   r+   rG   r8   r    r   r:   r:   K   s*    I:  *r    r:   c                   ,    e Zd ZdZd Zd Zed        Zy)Cusparse_ContextNc                     t        d      r   r   r   s    r   r   zCusparse_Context.__init__g   r   r    c                 \    t        j                  t        j                               | _        y r"   )rE   rF   r   get_cusparser=   r   s    r   r$   zCusparse_Context.initializej   s    {{3#3#3#56r    c                     | j                   0| j                  |       | _         | j                   j                          | j                   S r"   r&   r)   s    r   r+   zCusparse_Context.get_instancem   r,   r    )r4   r5   r6   r'   r   r$   r7   r+   r8   r    r   rK   rK   d   s%    I:7  r    rK   rB   )r@   ac                 @    t         j                  j                  |       S r"   )rA   rB   	device_ofrP   s    r   _cuda_device_ofrT   }   s    zz##A&&r    c                 *    t        j                         S r"   )
contextlibnullcontextrS   s    r   rT   rT      s    %%''r    dtyperH   c                    | j                   t        |      z  }t        j                  t	        j
                  |            }t	        j                  |t	        j                  t        j                              }t        j                  j                  ||      }t        j                  || t        |            j                  |      }d|_        |j                   |_        |S )N)shape)rY   countT)itemsizer   r   cget_managed_ptrrE   c_size_tcastPOINTERc_intnp	ctypeslibas_arrayrA   
frombufferviewis_pagedr@   page_deviceid)rY   rH   r[   	num_bytescuda_ptrc_ptr	new_arrayouts           r   	get_pagedro      s    e,I##BKK	$:;HGGHbjj23E%%e5%9I


9Ee
E
J
J5
QCCLCJr    FAc                     | j                   sJ d       |rd}n| j                  }t        j                  t	        |       t        j                  | j                        t        j                  |             y )Nz%Only paged tensors can be prefetched!r.   )	rh   ri   r   	cprefetchget_ptrrE   r_   nbytesc_int32)rp   r0   deviceids      r   r/   r/      sO    ::>>>:??MM'!*bkk!((3RZZ5IJr    c           	         d }|j                   t        j                  k(  r+t        t        d|  dd       }t        j                  |      }nG|j                   t        j                  k(  r*t        t        d|  dd       }t        j                  |      }|t        d|        t        |dd      }|r|rt        |       |t        |        |t        |      t        |      t        j                  |j                                      |j                  s|j                  rt        j                  j!                          y y )Nc_fp32_uint8zFunction not implemented: 
is_managedF)rY   rA   float32getattrr   rE   c_floatuint8c_uint8NotImplementedErrorr/   rs   c_int64numelrh   rB   synchronize)	func_namerp   Bvalueprefetchfunccvaluer{   s           r   elementwise_funcr      s    Dww%--sa	{%0$7E"	
EKK	sa	{&148E"|!$>yk"JKKL%0Jh=AWQZAGGI)>?zzQZZ
 	

   r    c                      t        d| d |       y )Nfillr   )rp   r   rH   r   s       r   r   r      s    VQe,r    c                      t        d| |d       y )N_mulr   r   )rp   r   rH   s      r   r   r      s    VQ1%r    c                 N   | rdnd}d|z  }|s|dk  r| sd|z  nd|z  dz
  }t        j                  |d|      }d|j                         z
  }|dk(  r|S |j                         dz  }t        j                  |d | j	                         dg|z  z   ||d  j	                         z         S )	N                    r         ?   r   )rA   linspacer   r   tolist)signed
total_bitsadd_zerosigntotal_valuesvaluesgapls           r   create_linear_mapr      s    4sDj=L:>
 -3q*}:8I^^D#|4F

C
axLLNa||F2AJ--/1#);fQRj>O>O>QQRRr    c                    	 ddl m} |rv|j                  t	        j
                  | dd      d d       j                         }dgdz  }|j                  t	        j
                  | dd      d d        j                         }nu|j                  t	        j
                  | dd      d d       j                         }dgd	z  }|j                  t	        j
                  | dd      d d        j                         }||z   |z   }t	        j                  |      }|j                         j                  }||j                         z  }|j                         d
k(  sJ |S # t        $ r}t        d      |d }~ww xY w)Nr   )normzZScipy is required for `create_normal_map`. Install `bitsandbytes` with the `[test]` extra.g      ?	   r.      r      r   )scipy.statsr   ImportErrorppfrA   r   r   r   sortr   maxr   )	offsetuse_extra_valuer   iev1v2v3vr   s	            r   create_normal_mapr      sR   $ XXennVS!4Sb9:AACSHxxvsA6s;<<DDFXXennVS!4Sb9:AACSHxxvsA6s;<<DDF
R"A\\!_F[[]!!F
fjjlF<<>S   M/  h
	s   E 	E0E++E0c                 P   |}|}| rdnd}||z   ||z
  k(  sJ g }t        t        d||z
  z   d||z
  z  d            D ]  \  }}	|j                  d|	z          g }
t        t	        j
                  ddg|            }d|dz
  z  }t        d|z        D ]  }|D ]z  }|dk7  rdnd}t        t        |            D ]  \  }}||d|dz    z  z  z  } |dk(  r
|d| z  z  }n|d||z
  dz
   z  z  }|
j                  |       | si|
j                  |        |  t        |
      d|z  k(  sJ |
j                          |dk  r/dt        |
      z
  }t        |      D ]  }|
j                  d        |
j                          t        j                  |
      }||j                         z  }|S )Nr   r   r   )repeatr   r   )	enumeraterangeappendlist	itertoolsproductlenr   rA   tensorr   )r   exponent_bitsprecision_bitsr   ephas_signevaluesivalr   lstbiasevaluebit_patternr   pvalr   codes                      r   create_fp8_mapr      s   AAqHq5J))))GEA-(*B$C"DaM\dLdFeghij 3q#v F
y  !Q?
@C"#Dm,- & 	&K1A!E$T+%67 04Ah//0{T7
* v}q'8%9 99MM% uf%	&& v;!Z-'''
KKMA~CKs 	AMM!	
KKM<<DDHHJDKr    c                    g }|dz
  }d||z
  z  dz
  }t        |      D ]  }t        | rd||z   |z
  z  dz   nd||z   |z
  dz   z  dz         }t        j                  dd|t        j                        }|dd |dd z   dz  }	|d|dz
   |z   z  |	z  j                         z  }| s|d|dz
   |z   z   |	z  j                         z  } |d	kD  r{t        j                  dd|dz   t        j                        }|dd |dd z   dz  }	|d|dz
   z   z  |	z  j                         z  }| r!|d|dz
   |z   z   |	z  j                         z  }|j                  d	       |j                  d
       t        |      d|z  k(  sJ dt        |      z
  }
t        |
      D ]  }|j                  d	        |j                          t        j                  |t        j                        S )a+  
    Creates the dynamic quantiztion map.

    The dynamic data type is made up of a dynamic exponent and
    fraction. As the exponent increase from 0 to -7 the number
    of bits available for the fraction shrinks.

    This is a generalization of the dynamic type where a certain
    number of the bits and be reserved for the linear quantization
    region (the fraction). n determines the maximum number of
    exponent bits.

    For more details see
    (8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561]
    r   r   g?rY   Nr.          @
   r   r   r   )
r   intrA   r   r|   r   r   r   r   r   )r   max_exponent_bitsr   datanon_sign_bitsadditional_itemsr   fraction_items
boundariesmeansr   s              r   create_dynamic_mapr     s-   " D NM]->>?!C$% 
O !m#&7781<q=(+<<q@AAE

 ^^CN%--P
CR:ab>1S8",q01A56%?GGIIr 1A 56:;<uDLLNND
O !^^C,<q,@V
CR:ab>1S8",q01A56%?GGIIr 1A 56:;<uDLLNNDKKNKKt9:%%%
D	/C3Z A 	IIK<<EMM22r    tensorsc                    d}t               }| D ]j  }|t        |dd      r||j                  j                  dk7  z  }|j	                  |j                  j                  |j                  j
                  f       l |s2t        d| D cg c]  }|j                  |j                  f c}       t        |      dkD  r2t        d| D cg c]  }|j                  |j                  f c}       |S c c}w c c}w )ap  Verifies that the input tensors are all on the same device.

    An input tensor may also be marked as `paged`, in which case the device placement is ignored.

    Args:
        tensors (`Iterable[Optional[torch.Tensor]]`): A list of tensors to verify.

    Raises:
        `RuntimeError`: Raised when the verification fails.

    Returns:
        `Literal[True]`
    Trh   FcpuzZAll input tensors need to be on the same GPU, but found some tensors to not be on a GPU:
 r   zcInput tensors need to be on the same GPU, but found the following tensor and device combinations:
 )	setr}   rH   typeaddr@   r   r[   r   )r   on_gpugpu_idsr1   s       r   	is_on_gpur   O  s9    FeG 9=J!>ahhmmu,,FKK78	9 i  IP  kQ  DElmlslsuvu}u}k~  kQ  jR  S
 	
 7|ar  RY  tZ  MNuvu|u|~  G  G  uH  tZ  s[  \
 	
 M kQ
 tZs   C2
C7
r   returnc                 L   | j                   j                  dk(  rFt        j                  t        j
                  j                  | j                   j                              S t        j                  t        j
                  j                  | j                   j                              S )Nxpu)	rH   r   rE   rF   rA   _C_xpu_getCurrentRawStreamr@   _cuda_getCurrentRawStream)r   s    r   _get_tensor_streamr   s  sd    }}U"{{588<<V]]=P=PQRR;;uxx99&--:M:MNOOr    c                 N    | yt        j                  | j                               S )zGets the memory address of the first element of a tenso

    Args:
        A (`Optional[Tensor]`): A PyTorch tensor.

    Returns:
        `Optional[ct.c_void_p]`: A pointer to the underlying tensor data.
    N)rE   rF   data_ptr)rp   s    r   rs   rs   z  s!     	y;;qzz|$$r    c                       e Zd ZdZdZeD  cg c]  }d| 	 c}} Zg dZ	 	 	 	 	 	 	 ddZd Ze	de
eef   d	ej                  d
d fd       ZddZd Zd Zyc c}} w )
QuantStatezWcontainer for quantization state components to work with Params4bit and similar classes)fp4nf4bitsandbytes__)absmax	quant_mapnested_absmaxnested_quant_mapquant_state
quant_type	blocksizerY   r[   nested_blocksizenested_dtypenested_offsetNc	                     || _         || _        || _        || _        || _        || _        || _        || _        |d u| _        y r"   )	r   r[   r   rY   r   r   r   state2nested)	r   r   r[   r   r   r   rY   r   r   s	            r   r   zQuantState.__init__  sH     
	
"$D(r    c                 B   | j                   rU| j                  | j                  | j                  | j                  | j
                  | j                  g| j                  g}||   S | j                  | j                  | j                  | j                  d| j                  g}||   S )a$  
        ensures compatibility with older quant state scheme with nested lists.
        assumes the following layout:
        state = [qabsmax, input_shape, A.dtype, blocksize, [offset, state2], quant_type]
        state2 = [absmax, input_shape, A.dtype, blocksize, None, quant_type]
        N)r   r   r[   rY   r   r   r   r   )r   idx	list_reprs      r   __getitem__zQuantState.__getitem__  s     ;;



dkk*I ~ djj$**dnndTXTcTcdI~r    qs_dictrH   r   c                    |j                         D cg c]'  \  }}d|v st        |t        j                        s&|) }}}t	        |      sd|vrt        d      t	        |      dk7  s#|d   j                  d      d   | j                  vrt        d| j                   d	| d      t	        |      dk(  r.|d   }|j                  t        |j                  |                   |j                         D ci c]  \  }}|j                  d      d   | }}}t        |j                               j                  | j                        sJ d
|v rut        j                  t!        |d               j#                  |      } | |d
   j#                  |      |d   |d   j#                  |      t%        t        |d               }nd\  }} | |d   |d   j#                  |      |d   |d   j#                  |      t%        t        |d         |d   t        j&                  |d         nd||      }	|	S c c}}w c c}}w )aO  
        unpacks components of state_dict into QuantState
        where necessary, convert into strings, torch.dtype, ints, etc.

        qs_dict: based on state_dict, with only relevant keys, striped of prefixes.

        item with key `quant_state.bitsandbytes__[nf4/fp4]` may contain minor and non-tensor quant state items.
        r   r   z<Expected packed or unpacked quant_state items, found neitherr   r   .r.   z@There should be exactly one `quant_state` item with ending from z.
Detected r   r   r   r   r   )r   r   r   rY   NNr   r   r   rY   r[   N)r   r   r   r   rY   r[   r   r   )items
isinstancerA   r   r   
ValueErrorsplitvalid_qs_type_keysupdater   popr   keysissubsetvalid_qs_keysr   floattor}   Size)
r*   r  rH   kr   qs_keyfirst_qs_keyr   r   r   s
             r   	from_dictzQuantState.from_dict  s9    !(f1=A3E*UVX]XdXdJe!ff6{|7:[\\[A!5b!9AWAW!WRSVSiSiRjjvw}v~~  A 
 v;!!!9LNN0\1JKL3:==?C41a1773<#Q&CC7<<>"++C,=,=>>>g%\\%(@"ABEEfMF/226:!"45/033F;eW^%<=	F (NFF|,8$''/k*%((0%!1229'2B2N%**WW-.TX	
 K g Ds   H7H7H79H=c                    | j                   | j                  | j                  | j                  t	        | j
                        j                  d      t        | j                        d}| j                  r|j                  | j                  j                  | j                  j                  | j                  j                  j                         t	        | j                  j
                        j                  d      | j                  j                         d       |s|S |j                         D ci c]#  \  }}t!        |t"        j$                        s!||% }}}|j                         D ci c]#  \  }}t!        |t"        j$                        r!||% }}}t'        |      |d| j                   z   <   |S c c}}w c c}}w )z
        returns dict of tensors and strings to use in serialization via _save_to_state_dict()
        param: packed -- returns dict[str, torch.Tensor] for state_dict fit for safetensors saving
        ztorch.)r   r   r   r   rY   r[   )r   r   r   r   r   zquant_state.bitsandbytes__)r   r   r   r   strrY   striptupler[   r   r  r   cloner   itemr  r  rA   r   r
   )r   packedr  r  r   qs_packed_dictnon_tensor_dicts          r   as_dictzQuantState.as_dict  sT    //kk_**844::&
 ;;NN%)[[%7%7(,(=(=(,(8(8(>(>(@$'(9(9$:$@$@$J%)[[%5%5%7 N ,3==?Z41ajELL>Y!Q$ZZ,3MMO_DAq:aQVQ]Q]C^1a4__NabqNr84??JK [_s   /"F5F5-"F;F;c                    | j                   j                  |      | _         | j                  j                  |      | _        | j                  r| j                  j                  |      | _        | j
                  j                  j                  |      | j
                  _        | j
                  j                   j                  |      | j
                  _         y y r"   )r   r  r   r   r   r   )r   rH   s     r   r  zQuantState.to  s    IILL(	kknnV,;;++..0DK!%!3!3!6!6v!>DKK#{{//226:DKK r    c                    t        |t              syt        j                  | j                  |j                  d      xr/ | j
                  |j
                  k(  xr t        j                  | j                  |j                  d      xr | j                  |j                  k(  xr | j                  |j                  k(  xr | j                  |j                  k(  xr | j                  %|j                  | j                  |j                  k(  n| j                  |j                  u xrI | j                  %|j                  | j                  |j                  k(  S | j                  |j                  u S )NFgư>)atol)r  r   rA   allcloser   r[   r   rY   r   r   r   r   )r   others     r   __eq__zQuantState.__eq__"  s&   %, NN4;;4@ 

ekk)tyy%**4@ 

ekk) %//1	
 5#3#33 ;;*u||/G u||+[[ELL0 ;;*u||/G u||+	
 [[ELL0	
r    )NNNNNNNr3   )r4   r5   r6   __doc__valid_quant_typesr  r  r   r  r7   dictr  r   rA   rH   r  r!  r  r'  ).0xs   00r   r   r     s    a&8IJ1N1#.JM$ )*( 0S#X 0 0 0 0d@;
k Ks   Ar   r   r   rn   c                    |;dt         vr*t               j                  | j                        t         d<   t         d   }t        j
                  j                  j                  j                  | |j                  | j                        |      \  }}|r]|j                         }||z  }t        ||d      \  }	}
t        |	|j                  | j                  d      || j                  ||
      }n4t        ||j                  | j                  d      || j                        }||j                  |      n|}| |j                  |j                        |_        ||fS )aW  Quantize a tensor in blocks of values.

    The input tensor is quantized by dividing it into blocks of `blocksize` values.
    The the absolute maximum value within these blocks is calculated for scaling
    the non-linear quantization.

    Args:
        A (`torch.Tensor`): The input tensor. Supports `float16`, `bfloat16`, or `float32` datatypes.
        code (`torch.Tensor`, *optional*):
            A mapping describing the low-bit data type. Defaults to a signed 8-bit dynamic type.
            For more details, see  (8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561].
        absmax (`torch.Tensor`, *optional*): A tensor to use to store the absmax values.
        out (`torch.Tensor`, *optional*): A tensor to use to store the result.
        blocksize (`int`, *optional*):
            The size of the blocks. Defaults to 4096.
            Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
        nested (`bool`, *optional*): Whether to additionally quantize the absmax values. Defaults to False.

    Raises:
        ValueError: Raised when the input data type is not supported.

    Returns:
        `Tuple[torch.Tensor, QuantState]`: A tuple containing the quantization results.
        - `torch.Tensor`: The quantized tensor.
        - [`QuantState`]: The state object used to undo the quantization.
    dynamicF)r   r   T)copy)r   r   r   rY   r   r   r   r   r   rY   )	name2qmapr   r  rH   rA   opsbitsandbytesquantize_blockwisedefaultmeanr   rY   copy_r   )rp   r   r   rn   r   r   _out_absmaxr   qabsmaxr   r   s               r   r4  r4  :  s:   F |I%#5#7#:#:188#DIi #II**==EE	MD' 6,W	RWX -''
 !dggahhTg6R^gopovovw !_#))D/$C #\\+*<*<=r    r   r   c                 v   ||J |=|;dt         vr*t               j                  | j                        t         d<   t         d   }|t	        |||t
        j                        }|j                  }|j                  r\t        |j                  |j                        }||j                  z  }|j                  t
        j                  k7  r|j                         }|rt
        j                  j                  j                  j!                  | ||j"                  j                  | j                        |j$                  |j                  |       |S t
        j                  j                  j                  j'                  | ||j"                  j                  | j                        |j$                  |j                        S )a  Dequantize a tensor in blocks of values.

    The input tensor is dequantized by dividing it into blocks of `blocksize` values.
    The the absolute maximum value within these blocks is used for scaling
    the non-linear dequantization.

    Args:
        A (`torch.Tensor`): The quantized input tensor.
        quant_state ([`QuantState`], *optional*):
            The quantization state as returned by [`quantize_blockwise`].
            Required if `absmax` is not provided.
        absmax (`torch.Tensor`, *optional*):
            A tensor containing the scaling values.
            Required if `quant_state` is not provided and ignored otherwise.
        code (`torch.Tensor`, *optional*):
            A mapping describing the low-bit data type. Defaults to a signed 8-bit dynamic type.
            For more details, see  (8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561].
            Ignored when `quant_state` is provided.
        out (`torch.Tensor`, *optional*): A tensor to use to store the result.
        blocksize (`int`, *optional*):
            The size of the blocks. Defaults to 4096.
            Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
            Ignored when `quant_state` is provided.

    Raises:
        ValueError: Raised when the input data type is not supported.

    Returns:
        `torch.Tensor`:
            The dequantized tensor. The datatype is indicated by `quant_state.dtype` and defaults to `torch.float32`.
    r.  r0  rn   )r1  r   r  rH   r   rA   r|   r   r   dequantize_blockwiser   r   rY   r  r2  r3  rn   r   r   r5  )rp   r   r   r   rn   r   r   s          r   r=  r=    sw   R "f&888|+I%#5#7#:#:188#DIi # TYV[VcVcdF%k&8&8+:L:LM+$$$<<5==(\\^F
		3377)!! 	8 	
 
99!!66>>	AHH% r    c                 d   |d}d }| dk(  r	 g d}n4| dk(  rg d}n*| dk(  rg d}n | dk(  r|d	k(  rg d
d d d   }nt        d      |t        d|  d      t        j                  ||      }|j                  |j	                         j                                |j                         dk(  sJ |S )NrB   r   )r   g    6Gg    fg    TFٿg   I4ҿg   ০ǿg    Or   g   __?g   `\?g   ?g   @g?g    4?g   ` ?g   `v"?r   r   )r   g      ?g       @g      (@g      @g      @r   g      @r   g      g       g      (g      g      g       g      int4)               r   r   r   r   r.   iiaf4@   )r   g|8geg:Kڞ׿gH2퓊cпg}Yu-ÿgQ	#(Dr   gF?g`_?g
0E?gL_߹E?gƶ=?ga@?gкv-?r   r.   z94-bit AbnormalFloats currently only support blocksize 64.z	Typename z not supportedrH      )r   rA   r   div_absr   r   )typenamerH   r   r   s       r   get_4bit_typerP    s    ~D5		
$ 
U	 l	V	G	U	 ?" d#D& &&abb|!IhZ~"FGG<<V,DIIdhhjnn::<2Kr    c           	      <    |
t         sdnd}t        | ||||d|      S NrJ     r   r   quantize_4bitrp   r   rn   r   compress_statisticsquant_storages         r   quantize_fp4rY     ,     -B3	FC4GP]^^r    c           	      <    |
t         sdnd}t        | ||||d|      S NrJ  rS  r   rT  rV  s         r   quantize_nf4r]  -  rZ  r    r   c           
         |
t         sdnd}| j                  }t        j                  j                  j
                  j                  | |||      \  }}	t        || j                        }
|rB|	j                         }t        |	|z
  d      \  }}~	t        ||| j                  ||
|||      }nt        |	|| j                  ||
|      }||j                  |      n|}| |j                  |j                        |_        ||fS )a  Quantize tensor A in blocks of 4-bit values.

    Quantizes tensor A by dividing it into blocks which are independently quantized.

    Args:
        A (`torch.Tensor`): The input tensor. Supports `float16`, `bfloat16`, or `float32` datatypes.
        absmax (`torch.Tensor`, *optional*): A tensor to use to store the absmax values.
        out (`torch.Tensor`, *optional*): A tensor to use to store the result.
        blocksize (`int`, *optional*):
            The size of the blocks. Defaults to 128 on ROCm and 64 otherwise.
            Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
        compress_statistics (`bool`, *optional*): Whether to additionally quantize the absmax values. Defaults to False.
        quant_type (`str`, *optional*): The data type to use: `nf4` or `fp4`. Defaults to `fp4`.
        quant_storage (`torch.dtype`, *optional*): The dtype of the tensor used to store the result. Defaults to `torch.uint8`.

    Raises:
        ValueError: Raised when the input data type is not supported.

    Returns:
        Tuple[`torch.Tensor`, `QuantState`]: A tuple containing the quantization results.
        - `torch.Tensor`: The quantized tensor with packed 4-bit values.
        - [`QuantState`]: The state object used to undo the quantization.
    rJ  rS  rK  r   )r   )r   r[   rY   r   r   r   r   r   )r   r[   rY   r   r   r   )r   r[   rA   r2  r3  rU  r5  rP  rH   r6  r4  r   rY   r7  r   )rp   r   rn   r   rW  r   rX  input_shaper8  r9  r   r   r:  r   states                  r   rU  rU  :  s   B -B3	''KII**88@@		MD' AHH5D,Wv-=M''!	
 ''!
 !_#))D/$C ||ELL1:r    c                 :    |
t         sdnd}t        | ||||d      S rR  r   dequantize_4bitrp   r   r   rn   r   s        r   dequantize_fp4re    (     -B3	1k63	5IIr    c                 :    |
t         sdnd}t        | ||||d      S r\  rb  rd  s        r   dequantize_nf4rh    rf  r    c           	      .   |
t         sdnd}|+||J t        ||j                  |j                  ||      }n|j                  }|j
                  r\t        |j                  |j                        }||j                  z  }|j                  t        j                  k7  r|j                         }|ct        j                  j                  j                  j                  | ||j                   |j"                  |j                  |j                  |       n`t        j                  j                  j                  j%                  | ||j                   |j"                  |j                  |j                        }| j                  d   dk(  r|j'                         S |S )a  Dequantizes a packed 4-bit quantized tensor.

    The input tensor is dequantized by dividing it into blocks of `blocksize` values.
    The the absolute maximum value within these blocks is used for scaling
    the non-linear dequantization.

    Args:
        A (`torch.Tensor`): The quantized input tensor.
        quant_state ([`QuantState`], *optional*):
            The quantization state as returned by [`quantize_4bit`].
            Required if `absmax` is not provided.
        absmax (`torch.Tensor`, *optional*):
            A tensor containing the scaling values.
            Required if `quant_state` is not provided and ignored otherwise.
        out (`torch.Tensor`, *optional*): A tensor to use to store the result.
        blocksize (`int`, *optional*):
            The size of the blocks. Defaults to 128 on ROCm and 64 otherwise.
            Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
        quant_type (`str`, *optional*): The data type to use: `nf4` or `fp4`. Defaults to `fp4`.

    Raises:
        ValueError: Raised when the input data type or blocksize is not supported.

    Returns:
        `torch.Tensor`: The dequantized tensor.
    rJ  rS  )r   r[   rY   r   r   r<  r   r   )r   r   r[   rY   r   r   r=  r   r   rA   r|   r  r2  r3  rc  rn   r   r   r5  r1   )rp   r   r   rn   r   r   s         r   rc  rc    sb   F -B3	!co55 ))))!
 ##%k&8&8+:L:LM+$$$<<5==(\\^F
		..22v{,,k.D.DkFWFWYdYjYjps 	3 	
 ii$$44<<!!""
 	wwqzQuuwJr    zDThis function is deprecated and will be removed in a future release.)categoryc                    |Vdt         vr*t               j                  | j                        t         d<   t         d   }|j                  | j                        }t	        j
                  |       j                         }|j                  t        j                  k7  r|j                         }| |z  }t        |||      }|||ffS )Nr.  )r1  r   r  rH   rA   rN  r   rY   r|   r  quantize_no_absmax)rp   r   rn   r   inps        r   quantizern    s     |I%#5#7#:#:188#DIi #wwqxx YYq\F||u}}$
f*C
S$
,Cr    r`  c                     ||J |X|Vdt         vr*t               j                  | j                        t         d<   t         d   }|j                  | j                        }|||f}t	        | |d   |      }||d   z  S )Nr.  r   r   )r1  r   r  rH   dequantize_no_absmax)rp   r`  r   r   rn   s        r   
dequantizerq    s      222|I%#5#7#:#:188#DIi #wwqxx }
q%(C
0Cq>r    c           
      X   t        |       5  |%t        j                  | t        j                        }t	        | |g       t        j                  t        |      t        |       t        |      t        j                  | j                                      ddd       |S # 1 sw Y   |S xY w)a  
    Quantizes input tensor to 8-bit.

    Quantizes the 32-bit input tensor `A` to the 8-bit output tensor
    `out` using the quantization map `code`.

    Parameters
    ----------
    A : torch.Tensor
        The input tensor.
    code : torch.Tensor
        The quantization map.
    out : torch.Tensor, optional
        The output tensor. Needs to be of type byte.

    Returns
    -------
    torch.Tensor:
        Quantized 8-bit tensor.
    Nr   )rT   rA   
zeros_liker   r   r   	cquantizers   rE   rb   r   )rp   r   rn   s      r   rl  rl    s    , 
	 T;""1EKK8C1c(gdmWQZrxx	?RS	T JT Js   B	BB)c           
      r   t        |       5  |%t        j                  | t        j                        }t	        || |g       t        |       }t        j                  t        |      t        |       t        |      t        j                  | j                               |       ddd       |S # 1 sw Y   |S xY w)a  
    Dequantizes the 8-bit tensor to 32-bit.

    Dequantizes the 8-bit tensor `A` to the 32-bit tensor `out` via
    the quantization map `code`.

    Parameters
    ----------
    A : torch.Tensor
        The 8-bit input tensor.
    code : torch.Tensor
        The quantization map.
    out : torch.Tensor
        The 32-bit output tensor.

    Returns
    -------
    torch.Tensor:
        32-bit output tensor.
    Nr   )rT   rA   rs  r|   r   r   r   cdequantizers   rE   rb   r   )rp   r   rn   streams       r   rp  rp  8  s    , 
	 ^;""1EMM:C4C.!#A&wqz73<!'')ATV\]^ J^ Js   BB,,B6optimizer_namegr   state1beta1epssteplrr   beta2beta3alphaweight_decaygnorm_scale	unorm_vec	max_unormc                     d}|dkD  r-t        j                  |j                  j                               }t	        |||||g       t         j
                  j                  j                  | |||||||||	|
|||||||       y)az  
    Performs an inplace optimizer update with one or two optimizer states.

    Universal optimizer update for 32-bit state and 32/16-bit gradients/weights.

    Parameters
    ----------
    optimizer_name : str
        The name of the optimizer: {adam}.
    g : torch.Tensor
        Gradient tensor.
    p : torch.Tensor
        Parameter tensor.
    state1 : torch.Tensor
        Optimizer state 1.
    beta1 : float
        Optimizer beta1.
    eps : float
        Optimizer epsilon.
    weight_decay : float
        Weight decay.
    step : int
        Current optimizer step.
    lr : float
        The learning rate.
    state2 : torch.Tensor
        Optimizer state 2.
    beta2 : float
        Optimizer beta2.
    beta3 : float
        Optimizer beta3.
    alpha : float
        Optimizer alpha.
    gnorm_scale : float
        The factor to rescale the gradient to the max clip value.
    unorm_vec : torch.Tensor
        The tensor for the update norm.
    max_unorm : float
        The maximum update norm relative to the weight norm.
    skip_zeros : bool
        Whether to skip zero-valued gradients or not (default: False).
    r   N)rA   r   r   r  r   r2  r3  optimizer_update_32bit)rx  ry  r   rz  r{  r|  r}  r~  r   r  r  r  r  r  r  r  
skip_zeros
param_norms                     r   r  r  X  s    | J3ZZ/
q!VVY/0	II11		
%r    zyThis function is deprecated and will be removed in a future release. Please use optimizer_update_8bit_blockwise instead. qmap1qmap2max1max2new_max1new_max2c                 h   d}|dkD  r-t        j                  |j                  j                               }t	        |      5  t        ||||||
|||||g       |j                  t         j                  k(  rt|j                  t         j                  k(  rVt        |    d   t        |      t        |      t        |      t        |      t        |      t        j                  |      t        j                  |      t        j                  |      t        j                  |      t        j                  |      t        j                  |      t        j                  |	      t        |
      t        |      t        |      t        |      t        |      t        |      t        j                  |      t        j                  |      t        j                  |j                                      n|j                  t         j                  k(  rs|j                  t         j                  k(  rUt        |    d   t        |      t        |      t        |      t        |      t        |      t        j                  |      t        j                  |      t        j                  |      t        j                  |      t        j                  |      t        j                  |      t        j                  |	      t        |
      t        |      t        |      t        |      t        |      t        |      t        j                  |      t        j                  |      t        j                  |j                                      n%t!        d|j                   d|j                         ddd       y# 1 sw Y   yxY w)a  
    Performs an inplace Adam update.

    Universal Adam update for 32/8-bit state and 32/16-bit gradients/weights.
    Uses AdamW formulation if weight decay > 0.0.

    Parameters
    ----------
    optimizer_name : str
        The name of the optimizer. Choices {adam, momentum}
    g : torch.Tensor
        Gradient tensor.
    p : torch.Tensor
        Parameter tensor.
    state1 : torch.Tensor
        Adam state 1.
    state2 : torch.Tensor
        Adam state 2.
    beta1 : float
        Adam beta1.
    beta2 : float
        Adam beta2.
    eps : float
        Adam epsilon.
    weight_decay : float
        Weight decay.
    step : int
        Current optimizer step.
    lr : float
        The learning rate.
    qmap1 : torch.Tensor
        Quantization map for first Adam state.
    qmap2 : torch.Tensor
        Quantization map for second Adam state.
    max1 : torch.Tensor
        Max value for first Adam state update.
    max2 : torch.Tensor
        Max value for second Adam state update.
    new_max1 : torch.Tensor
        Max value for the next Adam update of the first state.
    new_max2 : torch.Tensor
        Max value for the next Adam update of the second state.
    gnorm_scale : float
        The factor to rescale the gradient to the max clip value.
    unorm_vec : torch.Tensor
        The tensor for the update norm.
    max_unorm : float
        The maximum update norm relative to the weight norm.
    r   r   r   zAGradient+optimizer bit data type combination not supported: grad z, optimizer N)rA   r   r   r  rT   r   rY   r|   r   str2optimizer8bitrs   rE   r~   ru   r   float16r	  )rx  ry  r   rz  r   r{  r  r|  r}  r~  r  r  r  r  r  r  r  r  r  r  r  s                        r   optimizer_update_8bitr    s   Z J3ZZ/
		 51aE5$hX`ab77emm#(Cn-a0

	"

9%

:&

5!

5!

3

4 

2!!

<(

;'

1779%+. WW%&,,%++*En-a0

	"

9%

:&

5!

5!

3

4 

2!!

<(

;'

1779%+0 STUT[T[S\\hioiuiuhvw g5 5 5s    MN((N1absmax1absmax2c                     t        ||||||||g       t        j                  j                  j	                  | |||||||||	|
||||||||       y r"   )r   rA   r2  r3  optimizer_update_8bit_blockwise)rx  ry  r   rz  r   r{  r  r  r  r|  r}  r~  r  r  r  r  r  r  r  s                      r   r  r  :  sn    * q!VVUE7GDE	II::		
'r    grad	gnorm_vec
percentilec           
      6   t        |       5  t        | |g       | j                  t        j                  k(  r_t        j                  t        |       t        |      t        j                  |      t        j                  | j                                      n| j                  t        j                  k(  r_t        j                  t        |       t        |      t        j                  |      t        j                  | j                                      nt        d| j                   d      ddd       t        j                  ||dz           }t        j                  |      \  }}t        j                  ||         }d}||kD  r||z  }|||fS # 1 sw Y   exY w)a  Applies percentile clipping

    grad: torch.Tensor
        The gradient tensor.
    gnorm_vec: torch.Tensor
        Vector of gradient norms. 100 elements expected.
    step: int
        The current optimization steps (number of past gradient norms).

    zGradient type z not supported!Nd   r   )rT   r   rY   rA   r|   r   cpercentile_clipping_g32rs   rE   ru   r   r  cpercentile_clipping_g16r	  sqrtr   )	r  r  r}  r  current_gnormvalsr   
clip_valuer  s	            r   percentile_clippingr  h  sF    
	 K4#$::&((	"

4 

4::<(	 ZZ5==(((	"

4 

4::<(	 ~djj\IJJ#K& JJy45M

9%ID#D,-JKz! =0*k117K Ks   DFFc                    t         j                  j                         st         j                  j                          | j                  |k7  s|j                  |k7  r%t        d| j                   d|j                         | j                  }|j                  }|}|}	d}
t        |      dk(  rt        |      dk(  r|s%|	s#| j                  d   |j                  d   k7  rd}
n|r%|	s#| j                  d   |j                  d   k7  rd}
n|r%|	r#| j                  d   |j                  d   k7  rd}
n|s|	r| j                  d   |j                  d   k7  rld}
nht        |      dk(  rt        |      dk(  r|s%|	s#| j                  d   |j                  d   k7  rd}
n%|r$|	s"| j                  d   |j                  d   k7  rd}
n|r$|	r"| j                  d   |j                  d   k7  rd}
n|s|	r| j                  d   |j                  d   k7  rd}
nt        |      dk(  rt        |      dk(  r|s$|	s"| j                  d   |j                  d   k7  rd}
nq|r$|	s"| j                  d   |j                  d   k7  rd}
nK|r$|	r"| j                  d   |j                  d   k7  rd}
n%|s#|	r!| j                  d   |j                  d   k7  rd}
|a|j                  }|
syt        |      dk(  rjt        |      dk(  r[|d   |d   k(  rO|d   |d   k(  rC|d   |d   k(  r7|d   |d   k(  r+d}
n't        |      dk(  rJt        |      dk(  r<|s|	s|d   |d   f}n|r|	r|d   |d   f}n|r|	s|d   |d   f}n|s|	r|d   |d   f}nt        |      dk(  rZt        |      dk(  rL|s|	s|d   |d   |d   f}n|r|	r|d   |d   |d   f}n|r|	s|d   |d   |d   f}nz|sx|	rv|d   |d   |d   f}ngt        |      dk(  rYt        |      dk(  rK|s|	s|d   |d   |d   f}n8|r|	r|d   |d   |d   f}n%|r|	s|d   |d   |d   f}n|s|	r|d   |d   |d   f}|
st        d	| d
| d| d
|	 d	      S )Nz3Expected torch.int8 input tensors A and B, but got  and Tr   r   r   FrD  z?Tensor dimensions incorrect for matrix mulitiplication: A x B:  x z with transpose for A x B: r  )	rA   rB   is_initializedinitrY   	TypeErrorr[   r   r	  )rp   r   rn   transposed_Atransposed_Bexpected_typesAsBtAtBcorrectsouts               r   check_matmulr    sr   ::$$&

ww-177m#;MaggYV[\]\c\c[deff	
B	
B	B	BG
2w!|B1"qwwqz!9GqwwqzQWWQZ7GB1771:3GqwwqzQWWQZ7G	RA#b'Q,"qwwqz!9GqwwqzQWWQZ7GB1771:3GqwwqzQWWQZ7G	RA#b'Q,"qwwqz!9GqwwqzQWWQZ7GB1771:3GqwwqzQWWQZ7G
yy3r7a<CGqLAw"Q%DGr!u$4A"Q%BqEUWXYUZNr7a<CGqLb1r!u~1r!u~B1r!u~B1r!u~W\c"glb1r!ube,1r!ube,B1r!ube,B1r!ube,W\c"glb1r!ube,1r!ube,B1r!ube,B1r!ube,MbTQTUWTXXstvswwz{}z~~  A
 	
 Kr    r   c           	         |t        d      |j                  }|j                  r#t        ||j                        |j
                  z   }|Zt        j                  j                  j                  j                  | ||j                  ||j                  |j                  |       |S t        j                  j                  j                  j                  | ||j                  ||j                  |j                        S )NzIstate cannot be None. gemv_4bit() requires the state from quantize_4bit()r<  )r	  r   r   r=  r   r   rA   r2  r3  	gemv_4bitrn   r[   r   r   r5  )rp   r   rn   r  r  r`  r   s          r   r  r    s     }dee\\F||%fell;ellJ
		((,,KKJJOO 	- 	
 
99!!++33		

 r    c                    t        | ||||      }|0t        j                  |t        j                  | j                        }t        | j                        dk(  rct        |j                        dk(  rK| j                  d   |j                  d   k(  r,| j                  d   |j                  d   k(  rt        | ||      S | j                  }|j                  }|rt        |      dk(  r|d   |d   f}n|rt        |      dk(  r|d   |d   |d   f}|rt        |      dk(  r|d   |d   f}n|rt        |      dk(  r|d   |d   |d   f}t        |      dk(  rx|j                         d   |j                  d   k(  rd}n%|j                         d   |j                  d   k(  rd}t        | j                        dk(  rL| j                         d   | j                  d   k(  rd}nq| j                         d   | j                  d   k(  rNd}nK| j                         d   | j                  d   k(  rd}n%| j                         d   | j                  d   k(  rd}t        |      dk(  r|d   }| j                         |rdnd   }	n,t        |      dk(  rt        |      dk(  r|d   |d   z  }|d   }	|d   }
|d   }|j                         |rdnd   }|d   }ngt        |      dk(  rYt        |      dk(  sJ |d   |d   k(  r|d   |d   k(  st        d| d	|       d}d}|d   }
|d   }|d   |d   z  }|
}|d   }	|
}t        j                         j                  | j                        }t        || |g       t        j                  |t!        j"                  |      t!        j"                  |      t!        j$                  
      t!        j$                        t!        j$                        t'        |      t'        |       t'        |      t!        j$                        t!        j$                  	      t!        j$                               |S )
NsizerY   rH   rD  r   r   r   FTzMOnly bsi,bso->io supported for tensor contractions, but dims for A x B were: r  )r  rA   zerosint32rH   r   r[   batched_igemmstrider	  r:   r+   rG   r   r   cigemmrE   c_boolru   rs   )rp   r   rn   r  r  r  r  r  nldbmr  ldaldcptrs                  r   igemmr    s    1c<>D
{kkt5;;qxxH
177|qS\Q.771:#
aggaj(@ As++	
B	
BB1eRU^	#b'Q,eRUBqE"B1eRU^	#b'Q,eRUBqE" 2w!|88:a=AGGAJ& LXXZ]aggaj(Lqww<1xxz!}
*$A!''!*,#xxz!}
*$A!''!*,#r7a<1A((*,QA6CW\c"gl11AQ%CqEqEhhj|!4e	RA2w!||1A2a5BqE>_`b_ccfgifjk  qEqEqEBqEMe

%
%
'
3
3AHH
=C q!SkJJ
		,
		,


1


1


1




3


3


3 Jr    c                    t        | j                        dk(  rt        |j                        dk(  s%t        d| j                   d|j                         t        | ||||      }|0t	        j
                  |t        j                  | j                        }|j                         r|j                         d   }d}n|j                         }|d   |j                  d   k7  r$|j                         }|j                         d   }n|d   |j                  d   k(  rd	}|j                         d   }n{|d   dk(  r$|j                         }|j                         d   }nO|d   dk(  r$|j                         }|j                         d   }n#|j                         }|j                         d   }| j                         r| j                         d   }d}n| j                         }|d   | j                  d   k7  r&| j                         } | j                         d   }d}nP|d   | j                  d   k(  r| j                         d   }d	}n%| j                         } | j                         d   }d}| j                  d   }	| j                  d   }
|j                  d   }|j                  d   }|}|j                  d   |j                  d   z  }| j                  d   | j                  d   z  }| j                  d   |j                  d   z  }t        j                         j                  | j                        }t        || |g       t        j                   |t#        j$                  |      t#        j$                  |      t#        j&                  |      t#        j&                  |
      t#        j&                  |      t)        |      t)        |       t)        |      t#        j&                  |      t#        j&                  |      t#        j&                  |      t#        j*                  |      t#        j*                  |      t#        j*                  |      t#        j,                  |	             |S )
NrD  z@Expected 3-dimensional tensors for bmm, but got shapes A and B: r  r  r   Fr   r   T)r   r[   r	  r  rA   r  r  rH   is_contiguousr  
contiguousr:   r+   rG   r   r   cbatched_igemmrE   r  ru   rs   c_longc_uint32)rp   r   rn   r  r  r  r  sr  	num_batchr  r  r  r  strideAstrideBstrideCr  s                     r   r  r  p  so    qww<1CLA$5[\]\c\c[ddijkjqjqirstt1c<>D
{kkt5;;qxxHhhjmHHJQ41771:A((*Q-CqTQWWQZL((*Q-CtqyLLNhhjm1LLNhhjmLLNhhjmhhjmHHJQ41771:A((*Q-C LqTQWWQZ((*Q-CLA((*Q-C L 
I	
A	
A	
A
Cggaj1771:%Gggaj1771:%Gggaj1771:%G

%
%
'
3
3AHH
=Cq!Sk
		,
		,


1


1


1




3


3


3
		'
		'
		'
I!$ Jr    c                     |7t         j                  j                  j                  j	                  | ||       |S t         j                  j                  j                  j                  | |      S )aL  Performs an 8-bit integer matrix multiplication.

    A linear transformation is applied such that `out = A @ B.T`. When possible, integer tensor core hardware is
    utilized to accelerate the operation.

    Args:
        A (`torch.Tensor`): The first matrix operand with the data type `torch.int8`.
        B (`torch.Tensor`): The second matrix operand with the data type `torch.int8`.
        out (`torch.Tensor`, *optional*): A pre-allocated tensor used to store the result.
        dtype (`torch.dtype`, *optional*): The expected data type of the output. Defaults to `torch.int32`.

    Raises:
        `NotImplementedError`: The operation is not supported in the current environment.
        `RuntimeError`: Raised when the cannot be completed for any other reason.

    Returns:
        `torch.Tensor`: The result of the operation.
    )rA   r2  r3  int8_linear_matmulrn   r5  )rp   r   rn   rY   s       r   r  r    sS    & 		1155aC@
99!!44<<QBBr    	row_stats	col_statsr   c                     t         j                  j                  j                  j	                  | ||t         j
                  |      }||j                  |      S |S )a  Performs dequantization on the result of a quantized int8 matrix multiplication.

    Args:
        A (`torch.Tensor` with dtype `torch.int32`): The result of a quantized int8 matrix multiplication.
        row_stats (`torch.Tensor`): The row-wise quantization statistics for the lhs operand of the matrix multiplication.
        col_stats (`torch.Tensor`): The column-wise quantization statistics for the rhs operand of the matrix multiplication.
        out (`torch.Tensor`, *optional*): A pre-allocated tensor to store the output of the operation.
        bias (`torch.Tensor`, *optional*): An optional bias vector to add to the result.

    Returns:
        `torch.Tensor`: The dequantized result with an optional bias, with dtype `torch.float16`.
    )rY   r   )rA   r2  r3  int8_mm_dequantr5  r  r7  )rp   r  r  rn   r   results         r   r  r    sR    & YY##33;;Ay)[`[h[hos;tF yy  Mr    nnz_block_ptrc                 0   | j                         sJ d}||z| j                         j                  d| j                  d         }|dkD  r||k\  }|j	                  |d       |t        | |      }|!|j                  dd      j                         }|||fS )a   "Determine the quantization statistics for input matrix `A` in accordance to the `LLM.int8()` algorithm.

    The row-wise and column-wise absmax values are determined.

    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).

    <Tip>
    This function is useful for training, but for inference it is advised to use [`get_row_absmax`] instead.
    The column-wise quantization scales are not typically needed in inference scenarios.
    </Tip>

    Args:
        A (`torch.Tensor` with dtype `torch.float16`): Input tensor.
        row_stats (`torch.Tensor`, *optional*): If provided, calculation of row statistics is skipped.
        col_stats (`torch.Tensor`, *optional*): If provided, calculation of column statistics is skipped.
        nnz_block_ptr (`torch.Tensor`, *optional*): Not used.
        threshold (`float`, *optional*):
            An optional threshold for sparse decomposition of outlier features.
            No outliers are held back when 0.0. Defaults to 0.0.

    Returns:
        `Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]`: A tuple containing quantization statistics.
        - `torch.Tensor` with dtype `torch.float32`: The row-wise quantization statistics.
        - `torch.Tensor` with dtype `torch.float32`: The column-wise quantization statistics.
        - `torch.Tensor` with dtype `torch.bool`, *optional*: A mask indicating the locations of outliers in the input tensor.
    Nr.   r   r   F)dimkeepdim)is_floating_pointrN  rg   r[   masked_fill_get_row_absmaxamaxr  )rp   r  r  r  	thresholdoutlier_maskabsAs          r   get_colrow_absmaxr    s    D    LI-uuw||B,s?9,LlC0 'q)4I		a	7==?Ii--r    c                 *   | j                   t        j                  k(  sJ t        | j                  dd       }| j                  d   }t        j
                  |ft        j                  | j                        }t        | g       t        |       5  t        j                  t        |       t        |      t        j                  |      t        j                  |      t        j                  |      t!        |              ddd       |S # 1 sw Y   |S xY w)aT  Determine the quantization statistics for input matrix `A` in accordance to the `LLM.int8()` algorithm.

    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).

    Args:
        A (`torch.Tensor` with dtype `torch.float16`): The input matrix.
        threshold (`float`, *optional*):
            An optional threshold for sparse decomposition of outlier features.
            No outliers are held back when 0.0. Defaults to 0.0.

    Returns:
        `torch.Tensor` with dtype `torch.float32`: The absolute maximum value for each row, with outliers ignored.
    Nr.   rX   )rY   rA   r  r   r[   emptyr|   rH   r   rT   r   cget_row_statsrs   rE   r~   ru   r   )rp   r  rowscolsr  s        r   r  r  @  s      77emm###D772;DTG5==JIqcN		 
AJIJJy!JJtJJtq!	

 
 s   A/DDc                   h    e Zd Zdedededej
                  dej
                  dej
                  fdZy)	COOSparseTensorr  r  nnzrowidxcolidxr   c                    |j                   t        j                  k(  sJ |j                   t        j                  k(  sJ |j                   t        j                  k(  sJ |j	                         |k(  sJ |j	                         |k(  sJ |j	                         |k(  sJ || _        || _        || _        || _        || _	        || _
        y r"   )rY   rA   r  r  r   r  r  r  r  r  r   )r   r  r  r  r  r  r   s          r   r   zCOOSparseTensor.__init__g  s     ||u{{***||u{{***||u}},,,||~$$$||~$$$||~$$$		r    N)r4   r5   r6   r   rA   r   r   r8   r    r   r  r  f  sE    "),6;llLQLLbgbnbnr    r  c                       e Zd Zd Zy)CSRSparseTensorc                    |j                   t        j                  k(  sJ |j                   t        j                  k(  sJ |j                   t        j                  k(  sJ |j	                         |k(  sJ |j	                         |k(  sJ |j	                         |dz   k(  sJ || _        || _        || _        || _        || _	        || _
        y Nr   )rY   rA   r  r  r   r  r  r  rowptrr  r   )r   r  r  r  r  r  r   s          r   r   zCSRSparseTensor.__init__z      ||u{{***||u{{***||u}},,,||~$$$||~$$$||~)))		r    Nr4   r5   r6   r   r8   r    r   r  r  y      r    r  c                       e Zd Zd Zy)CSCSparseTensorc                    |j                   t        j                  k(  sJ |j                   t        j                  k(  sJ |j                   t        j                  k(  sJ |j	                         |k(  sJ |j	                         |k(  sJ |j	                         |dz   k(  sJ || _        || _        || _        || _        || _	        || _
        y r  )rY   rA   r  r  r   r  r  r  colptrr  r   )r   r  r  r  r  r  r   s          r   r   zCSCSparseTensor.__init__  r  r    Nr  r8   r    r   r  r    r  r    r  c                    t        j                  | j                  d      \  }}|j                  d       t        j                  | j
                  dz   ft         j                  | j                  j                        }|j                  |j                         |j                         d       |j                  d       t        | j
                  | j                  | j                  || j                  | j                         S NTreturn_countsr   rX   r   )r@   srcr  )rA   uniquer  add_r  r  r  rH   scatter_longr   cumsum_r  r  r  r  r   )cooAr   countsr  s       r   coo2csrr    s    \\$++TBNFF
KKN[[$))a-)T[[EWEWXF
OO&++-VZZ\qOA
NN1499dii64;;PTP[P[\\r    c                 F   t        j                  | j                        \  }}| j                  |   }| j                  |   }t        j
                  |d      \  }}|j                  d       t        j                  | j                  dz   ft         j                  | j                  j                        }|j                  |j                         |j                         d       |j                  d       t        | j                   | j                  | j"                  |||      S r  )rA   r   r  r  r   r   r  r  r  r  rH   r  r  r   r  r  r  r  )r  r   
col2rowidxr  r   	colvaluesr  r  s           r   coo2cscr    s    jj-OC[[$F[[$FS=IvNN1[[$))a-)T[[EWEWXF
OO)..*

!OD
NN1499dii666RRr    c                     t        j                  |ft         j                  |      }t        j                  |ft         j                  |      }t        j                  |f||      }t        | |||||      S )NrX   )rA   r  r  r  )r  r  r  rH   rY   r  r  r   s           r   	coo_zerosr    s[    [[#u{{6BF[[#u{{6BF[[#uV<F4sFFFCCr    out_colout_rowc                     |t        d      |t        d      |t        d      |t        d      t        j                  j                  j                  j                  | |      S )aL  Determine the quantization statistics for input matrix `A` in accordance to the `LLM.int8()` algorithm.

    The statistics are determined both row-wise and column-wise (transposed).

    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).

    <Tip>
    This function is useful for training, but for inference it is advised to use [`int8_vectorwise_quant`] instead.
    This implementation performs additional column-wise transposed calculations which are not optimized.
    </Tip>

    Args:
        A (`torch.Tensor` with dtype `torch.float16`): The input matrix.
        col_stats (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the column-wise quantization scales.
        row_stats (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the row-wise quantization scales.
        out_col (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the column-wise quantized data.
        out_row (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the row-wise quantized data.
        threshold (`float`, *optional*):
            An optional threshold for sparse decomposition of outlier features.

            No outliers are held back when 0.0. Defaults to 0.0.

    Returns:
        `Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]`: A tuple containing the quantized tensor and relevant statistics.
        - `torch.Tensor` with dtype `torch.int8`: The row-wise quantized data.
        - `torch.Tensor` with dtype `torch.int8`: The column-wise quantized data.
        - `torch.Tensor` with dtype `torch.float32`: The row-wise quantization scales.
        - `torch.Tensor` with dtype `torch.float32`: The column-wise quantization scales.
        - `torch.Tensor` with dtype `torch.int32`, *optional*: A list of column indices which contain outlier features.
    zUrow_stats must be None. int8_double_quant() does not support pre-allocated row_stats.zUcol_stats must be None. int8_double_quant() does not support pre-allocated col_stats.zQout_col must be None. int8_double_quant() does not support pre-allocated out_col.zQout_row must be None. int8_double_quant() does not support pre-allocated out_row.)r  )r	  rA   r2  r3  int8_double_quantr5  )rp   r  r  r  r  r  s         r   r  r    ss    N pqqpqqlmmlmm99!!33;;A;SSr    statsc                 j    t         j                  j                  j                  j	                  | |      S )aY  Dequantizes a tensor with dtype `torch.int8` to `torch.float32`.

    Args:
        A (`torch.Tensor` with dtype `torch.int8`): The quantized int8 tensor.
        stats (`torch.Tensor` with dtype `torch.float32`): The row-wise quantization statistics.

    Returns:
        `torch.Tensor` with dtype `torch.float32`: The dequantized tensor.
    )rA   r2  r3  int8_vectorwise_dequantr5  )rp   r  s     r   r  r    s'     99!!99AA!UKKr    c                 j    t         j                  j                  j                  j	                  | |      S )aw  Quantizes a tensor with dtype `torch.float16` to `torch.int8` in accordance to the `LLM.int8()` algorithm.

    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).

    Args:
        A (`torch.Tensor` with dtype `torch.float16`): The input tensor.
        threshold (`float`, *optional*):
            An optional threshold for sparse decomposition of outlier features.

            No outliers are held back when 0.0. Defaults to 0.0.

    Returns:
        `Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]`: A tuple containing the quantized tensor and relevant statistics.
        - `torch.Tensor` with dtype `torch.int8`: The quantized data.
        - `torch.Tensor` with dtype `torch.float32`: The quantization scales.
        - `torch.Tensor` with dtype `torch.int32`, *optional*: A list of column indices which contain outlier features.
    )rA   r2  r3  int8_vectorwise_quantr5  )rp   r  s     r   r  r    s'    $ 99!!77??9MMr    r  c                    t        | t              s| j                  r| j                  t        j
                  k(  sJ d       t        | j                  d   | j                  d   | j                         | j                         d   j                         | j                         d   j                         | j                               } |Et	        j                  | j                  |j                  d   f|j                  |j                        }| j                  }| j                   j#                         |k(  sJ | j$                  j#                         |k(  sJ | j                  j#                         |k(  sJ | j&                  |j                  d   k(  sJ |j)                          }|j+                         |rdnd   }|j                  d   }t,        j/                         j0                  }t3        | j                         }t3        | j$                        }	t3        | j                        }
t3        |      }t3        |      }t5        j6                  | j                        }t5        j6                  | j                        }t5        j6                  | j&                        }t5        j6                  |j                  d         }t5        j6                  |      }t5        j6                  |      }t9        | j                   | j$                  | j                  ||g       t;        j<                  |||	|
||||||||t5        j>                  |             |S )Nz8Tensor must be `COOSparseTensor or a PyTorch COO tensor.r   r   )r  r  r  r  r  r   rH   rY   ) r  r  	is_sparselayoutrA   
sparse_coor[   _nnzindicesr   r   r  r  rH   rY   r  r  r   r  r  r  r  rK   r+   r=   rs   rE   ru   r   r   	cspmm_coor  )r  r   rn   r  r  r  r  r  	ptrRowidx	ptrColidx	ptrValuesptrBptrCcnnzcrowsAccolsAccolsBcldbcldcs                      r   spmm_coor*    s   
 dO,~~$++1A1A"A 	
F	
A
 AA		<<>!$((*<<>!$((*;;=
 {kk499aggaj1!((!''R
((C;;#%%%;;#%%%;;#%%%99
"""((L
((*<aQ
0C
''!*C

'
'
)
1
1C$I$I$I1:D3<D::dhhDZZ		"FZZ		"FZZ
#F::c?D::c?Dt{{DKKa=>MM
		,  Jr    c                 ^   |Ot        j                  | j                  |j                  d   f|j                  | j
                  j                        }| j                  }| j                  j                         |k(  sJ | j                  j                         |k(  sJ | j
                  j                         |k(  sJ | j                  |j                  d   k(  s J | j                   d|j                          t        j                  | j                  d      \  }}|j                  d      j                         }t        j                  |d      \  }}	|	j                         }	|j                         }|d   dk  sJ d	|d    d
       |j                  t         j                   t         j"                  fv sJ t%        |      }
t%        |      }t%        |	      }t%        | j                        }t%        | j                        }t%        | j
                        }t%        |      }t%        |      }t%        |      }t'        j(                  |j                               }t'        j(                  | j                        }t'        j(                  | j                        }t'        j(                  |j                  d         }t'        j(                  |j                  d         }t+        |      5  t-        | j                  | j                  | j
                  |||g       |j                  t         j                   k(  r#t/        j0                  |||
|||||||||||       n?|j                  t         j"                  k(  r"t/        j2                  |||
|||||||||||       d d d        |S # 1 sw Y   |S xY w)Nr   r  r   z vs Tr  )
descending    z)Current max count per row is 8 but found r  )rA   r  r  r[   rH   r   rY   r  r  r   r  r  r   cumsumr   r   r  int8rs   rE   ru   rT   r   r    cspmm_coo_very_sparse_naive_fp16 cspmm_coo_very_sparse_naive_int8)r  r   dequant_statsrn   r  _r  r   	max_countmax_idx	ptrOffsetptrMaxCount	ptrMaxIdxr  r   r!  r"  r#  ptrDequantStats	cnnz_rowsr$  r%  crowsBr'  s                           r   spmm_coo_very_sparser<  P  s   
{kk499aggaj1!(($++J[J[\
((C;;#%%%;;#%%%;;#%%%99
"?tyykaggY$??"T[[=IAv]]1!!#FFt<IwkkmGIQ<2Z!J9UV<.XYZZ77u}}ejj1111I)$K I$I$I$I1:D3<Dm,O

6<<>*I::dhhDZZ		"FZZ
#FZZ
#F		 #4;;T[[!S-PQ77emm#00  WW

"00)#L JM#L Js   )B/N""N,g     _@r3   )T)NTr"   )Tr   T)g+ew?T)TrB  r   r   )Tr@  r   )NNN   F)NNNNr=  F)NrJ  )NNNN)NNNNr   r  )	Nr   r   r   r   r   Nr   F)r   r   Nr   )r   r   F)rB  )NFFN)NFF)NNNr   )r   )NNNNr   )gcollections.abcr   ctypesrE   r   mathr   typingr   r   r   numpyrc   rA   r   typing_extensionsr	   bitsandbytes.utilsr
   r   
cextensionr   r   r1  cadam_static_8bit_grad_32cadam_static_8bit_grad_16cmomentum_static_8bit_grad_32cmomentum_static_8bit_grad_16crmsprop_static_8bit_grad_32crmsprop_static_8bit_grad_16clion_static_8bit_grad_32clion_static_8bit_grad_16r  r   r:   rK   rH   FIRST_CUDA_DEVICErB   device_countrT   rV   r|   ro   r/   r   r   r   r   r   r   r   r   rF   r   rs   r   r  r4  r   r=  rP  r   rY  r]  rU  re  rh  rc  FutureWarningrn  rq  rl  rp  r  r  r  r  r  r  r/  r  r  r  r  r  r  r  r  r  r  r  r  r  r  halfr  r  r  r  r*  r<  Cr8   r    r   <module>rS     sw  
 %    ' '    ( I ,	   	%%%%
 	))))
 	((((
 	%%%%
 	%%%%
 	))))+ 8' '0* *2 " !ELLq1  	::q '5<< ' (5<< ( "MM2C Ku|| K!6-&S&:'T43n!x 67 !HPv P"++ P%x %HR[[$9 %n
 n
f $(%)"&D||D
5<<
 D U\\"D 
%,,		D 5<<#$DR )-%)#'"&J||J*%J U\\"J 5<<
 	J
 
%,,	J J \\JZOh &*"&++
_||
_U\\"
_ 
%,,	
_ &*"&++
_||
_U\\"
_ 
%,,	
_ &*"&++N||NU\\"N 
%,,	N 5<<#$Nf )-%)"&#	J||	J*%	J U\\"	J 
%,,			J
 }	J \\	J )-%)"&#	J||	J*%	J U\\"	J 
%,,			J
 }	J \\	J )-%)"&#J||J*%J U\\"J 
%,,		J
 }J \\JZ R]jk $("&
5<<
  
%,,	 65(()	 l& R]jk .2%)#'"&E&&.)* U\\" 5<<
 	
 
%,,	  l( R]jk&  Xell5K W]  l< R]jkF & x7M Y_  lP &*(,#VVV V 	V
 V 
V V 	V U\\"V V V V V V %V  !V$ 
%Vr ;, (,)AAA A 	A
 U\\"A A A 
A A 	A A ELL!A A 5<<
 A A  u||$!A" #A$ %A& %'A( )A* 
+A
Aj '+++ + 	+
 U\\"+ + + + + 
+ + 	+ + ELL!+ +  ell#!+" #+$ %+( 
)+\ R]jk&2f &2 &2s &2PS &2 l&2R GLjj Qn #'
""" 
%,,	"P #'bbb 
%,,	bP #']]] 
%,,	]@ X\chcncn C%,, C5<< Chu||>T C< #'#'|||| || 
%,,		
 5<<
 8 R]jk )-(,,06.||6.%6. %6. ELL)	6. 5<<x'==>6. l6.r R]jk"ell " l"J & " "]	S .3ZZ D )-(,&*&*0T||0T%0T %0T ell#	0T
 ell#0TfLu|| LELL LNU\\ N0 #'@
-
.@||@ 
%,,	@FGT 
r    