
from bitsandbytes.optim.optimizer import Optimizer2State


class Adam(Optimizer2State):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0,
                 amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096,
                 percentile_clipping=100, block_wise=True, is_paged=False):
        """
        Base Adam optimizer.

        Arguments:
            params (`torch.tensor`):
                The input parameters to optimize.
            lr (`float`, defaults to 1e-3):
                The learning rate.
            betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
                The beta values are the decay rates of the first and second-order moment of the optimizer.
            eps (`float`, defaults to 1e-8):
                The epsilon value prevents division by zero in the optimizer.
            weight_decay (`float`, defaults to 0.0):
                The weight decay value for the optimizer.
            amsgrad (`bool`, defaults to `False`):
                Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
            optim_bits (`int`, defaults to 32):
                The number of bits of the optimizer state.
            args (`object`, defaults to `None`):
                An object with additional arguments.
            min_8bit_size (`int`, defaults to 4096):
                The minimum number of elements of the parameter tensors for 8-bit optimization.
            percentile_clipping (`int`, defaults to 100):
                Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
            block_wise (`bool`, defaults to `True`):
                Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
            is_paged (`bool`, defaults to `False`):
                Whether the optimizer is a paged optimizer or not.
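
        Example (a minimal usage sketch, assuming a CUDA device and a toy `torch.nn.Linear`
        model standing in for a real network):

        ```python
        import torch
        import bitsandbytes as bnb

        model = torch.nn.Linear(64, 64).cuda()
        optimizer = bnb.optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.999))

        loss = model(torch.randn(8, 64, device="cuda")).sum()
        loss.backward()
        optimizer.step()       # Adam update; optimizer state precision follows `optim_bits`
        optimizer.zero_grad()
        ```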
        """
        super().__init__(
            "adam", params, lr, betas, eps, weight_decay, optim_bits, args,
            min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged,
        )


class Adam8bit(Optimizer2State):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0,
                 amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096,
                 percentile_clipping=100, block_wise=True, is_paged=False):
        """
        8-bit Adam optimizer.

        Arguments:
            params (`torch.tensor`):
                The input parameters to optimize.
            lr (`float`, defaults to 1e-3):
                The learning rate.
            betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
                The beta values are the decay rates of the first and second-order moment of the optimizer.
            eps (`float`, defaults to 1e-8):
                The epsilon value prevents division by zero in the optimizer.
            weight_decay (`float`, defaults to 0.0):
                The weight decay value for the optimizer.
            amsgrad (`bool`, defaults to `False`):
                Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
                Note: This parameter is not supported in Adam8bit and must be False.
            optim_bits (`int`, defaults to 32):
                The number of bits of the optimizer state.
                Note: This parameter is not used in Adam8bit as it always uses 8-bit optimization.
            args (`object`, defaults to `None`):
                An object with additional arguments.
            min_8bit_size (`int`, defaults to 4096):
                The minimum number of elements of the parameter tensors for 8-bit optimization.
            percentile_clipping (`int`, defaults to 100):
                Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
            block_wise (`bool`, defaults to `True`):
                Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
            is_paged (`bool`, defaults to `False`):
                Whether the optimizer is a paged optimizer or not.
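
        Example (a minimal sketch; `model` is a placeholder for any `torch.nn.Module` on a CUDA device):

        ```python
        import bitsandbytes as bnb

        # Drop-in replacement for `torch.optim.Adam`: the first and second moments are
        # quantized to 8 bits, while tensors smaller than `min_8bit_size` keep 32-bit state.
        optimizer = bnb.optim.Adam8bit(model.parameters(), lr=1e-3, min_8bit_size=4096)
        ```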
        """
        if amsgrad:
            raise ValueError("Adam8bit does not support amsgrad=True")
        if optim_bits != 32:
            raise ValueError("Adam8bit only supports optim_bits=32 (default value for compatibility)")
        super().__init__(
            "adam", params, lr, betas, eps, weight_decay, 8, args,
            min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged,
        )


class Adam32bit(Optimizer2State):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0,
                 amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096,
                 percentile_clipping=100, block_wise=True, is_paged=False):
        """
        32-bit Adam optimizer.

        Arguments:
            params (`torch.tensor`):
                The input parameters to optimize.
            lr (`float`, defaults to 1e-3):
                The learning rate.
            betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
                The beta values are the decay rates of the first and second-order moment of the optimizer.
            eps (`float`, defaults to 1e-8):
                The epsilon value prevents division by zero in the optimizer.
            weight_decay (`float`, defaults to 0.0):
                The weight decay value for the optimizer.
            amsgrad (`bool`, defaults to `False`):
                Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
            optim_bits (`int`, defaults to 32):
                The number of bits of the optimizer state.
            args (`object`, defaults to `None`):
                An object with additional arguments.
            min_8bit_size (`int`, defaults to 4096):
                The minimum number of elements of the parameter tensors for 8-bit optimization.
            percentile_clipping (`int`, defaults to 100):
                Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
            block_wise (`bool`, defaults to `True`):
                Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
            is_paged (`bool`, defaults to `False`):
                Whether the optimizer is a paged optimizer or not.
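
        Example (a minimal sketch; `model` is a placeholder for any `torch.nn.Module` on a CUDA device):

        ```python
        import bitsandbytes as bnb

        # Behaves like `Adam(..., optim_bits=32)`: optimizer state stays in 32-bit precision.
        optimizer = bnb.optim.Adam32bit(model.parameters(), lr=1e-3, weight_decay=0.0)
        ```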
        """
        super().__init__(
            "adam", params, lr, betas, eps, weight_decay, 32, args,
            min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged,
        )


class PagedAdam(Optimizer2State):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0,
                 amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096,
                 percentile_clipping=100, block_wise=True, is_paged=False):
        """
        Paged Adam optimizer.

        Arguments:
            params (`torch.tensor`):
                The input parameters to optimize.
            lr (`float`, defaults to 1e-3):
                The learning rate.
            betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
                The beta values are the decay rates of the first and second-order moment of the optimizer.
            eps (`float`, defaults to 1e-8):
                The epsilon value prevents division by zero in the optimizer.
            weight_decay (`float`, defaults to 0.0):
                The weight decay value for the optimizer.
            amsgrad (`bool`, defaults to `False`):
                Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
            optim_bits (`int`, defaults to 32):
                The number of bits of the optimizer state.
            args (`object`, defaults to `None`):
                An object with additional arguments.
            min_8bit_size (`int`, defaults to 4096):
                The minimum number of elements of the parameter tensors for 8-bit optimization.
            percentile_clipping (`int`, defaults to 100):
                Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
            block_wise (`bool`, defaults to `True`):
                Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
            is_paged (`bool`, defaults to `False`):
                Whether the optimizer is a paged optimizer or not.
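
        Example (a minimal sketch; `model` is a placeholder for any `torch.nn.Module` on a CUDA device):

        ```python
        import bitsandbytes as bnb

        # Same update rule as `Adam`, but the optimizer state is allocated in paged memory,
        # so it can be evicted to CPU RAM when GPU memory runs low.
        optimizer = bnb.optim.PagedAdam(model.parameters(), lr=1e-3, optim_bits=32)
        ```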
        """
        super().__init__(
            "adam", params, lr, betas, eps, weight_decay, optim_bits, args,
            min_8bit_size, percentile_clipping, block_wise, is_paged=True,
        )


class PagedAdam8bit(Optimizer2State):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0,
                 amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096,
                 percentile_clipping=100, block_wise=True, is_paged=False):
        """
        8-bit paged Adam optimizer.

        Arguments:
            params (`torch.tensor`):
                The input parameters to optimize.
            lr (`float`, defaults to 1e-3):
                The learning rate.
            betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
                The beta values are the decay rates of the first and second-order moment of the optimizer.
            eps (`float`, defaults to 1e-8):
                The epsilon value prevents division by zero in the optimizer.
            weight_decay (`float`, defaults to 0.0):
                The weight decay value for the optimizer.
            amsgrad (`bool`, defaults to `False`):
                Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
                Note: This parameter is not supported in PagedAdam8bit and must be False.
            optim_bits (`int`, defaults to 32):
                The number of bits of the optimizer state.
                Note: This parameter is not used in PagedAdam8bit as it always uses 8-bit optimization.
            args (`object`, defaults to `None`):
                An object with additional arguments.
            min_8bit_size (`int`, defaults to 4096):
                The minimum number of elements of the parameter tensors for 8-bit optimization.
            percentile_clipping (`int`, defaults to 100):
                Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
            block_wise (`bool`, defaults to `True`):
                Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
            is_paged (`bool`, defaults to `False`):
                Whether the optimizer is a paged optimizer or not.
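
        Example (a minimal sketch; `model` is a placeholder for any `torch.nn.Module` on a CUDA device):

        ```python
        import bitsandbytes as bnb

        # 8-bit optimizer state combined with paged memory; `amsgrad` must remain False.
        optimizer = bnb.optim.PagedAdam8bit(model.parameters(), lr=1e-3)
        ```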
        """
        if amsgrad:
            raise ValueError("PagedAdam8bit does not support amsgrad=True")
        if optim_bits != 32:
            raise ValueError("PagedAdam8bit only supports optim_bits=32 (default value for compatibility)")
        super().__init__(
            "adam", params, lr, betas, eps, weight_decay, 8, args,
            min_8bit_size, percentile_clipping, block_wise, is_paged=True,
        )


class PagedAdam32bit(Optimizer2State):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0,
                 amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096,
                 percentile_clipping=100, block_wise=True, is_paged=False):
        """
        Paged 32-bit Adam optimizer.

        Arguments:
            params (`torch.tensor`):
                The input parameters to optimize.
            lr (`float`, defaults to 1e-3):
                The learning rate.
            betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
                The beta values are the decay rates of the first and second-order moment of the optimizer.
            eps (`float`, defaults to 1e-8):
                The epsilon value prevents division by zero in the optimizer.
            weight_decay (`float`, defaults to 0.0):
                The weight decay value for the optimizer.
            amsgrad (`bool`, defaults to `False`):
                Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
            optim_bits (`int`, defaults to 32):
                The number of bits of the optimizer state.
            args (`object`, defaults to `None`):
                An object with additional arguments.
            min_8bit_size (`int`, defaults to 4096):
                The minimum number of elements of the parameter tensors for 8-bit optimization.
            percentile_clipping (`int`, defaults to 100):
                Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
            block_wise (`bool`, defaults to `True`):
                Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
            is_paged (`bool`, defaults to `False`):
                Whether the optimizer is a paged optimizer or not.
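
        Example (a minimal sketch; `model` is a placeholder for any `torch.nn.Module` on a CUDA device):

        ```python
        import bitsandbytes as bnb

        # 32-bit optimizer state kept in paged memory.
        optimizer = bnb.optim.PagedAdam32bit(model.parameters(), lr=1e-3)
        ```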
        """
        super().__init__(
            "adam", params, lr, betas, eps, weight_decay, 32, args,
            min_8bit_size, percentile_clipping, block_wise, is_paged=True,
        )