from bitsandbytes.optim.optimizer import Optimizer2State


class LAMB(Optimizer2State):
    def __init__(
        self,
        params,
        lr=1e-3,
        bias_correction=True,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=0,
        amsgrad=False,
        adam_w_mode=True,
        optim_bits=32,
        args=None,
        min_8bit_size=4096,
        percentile_clipping=100,
        block_wise=False,
        max_unorm=1.0,
    ):
        """
        Base LAMB optimizer.

        Arguments:
            params (`torch.tensor`):
                The input parameters to optimize.
            lr (`float`, defaults to 1e-3):
                The learning rate.
            bias_correction (`bool`, defaults to `True`):
                Whether to apply bias correction to the first and second-order moments.
            betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
                The beta values are the decay rates of the first and second-order moment of the optimizer.
            eps (`float`, defaults to 1e-8):
                The epsilon value prevents division by zero in the optimizer.
            weight_decay (`float`, defaults to 1e-2):
                The weight decay value for the optimizer.
            amsgrad (`bool`, defaults to `False`):
                Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
            adam_w_mode (`bool`, defaults to `True`):
                Whether to use the AdamW variant.
            optim_bits (`int`, defaults to 32):
                The number of bits of the optimizer state.
            args (`object`, defaults to `None`):
                An object with additional arguments.
            min_8bit_size (`int`, defaults to 4096):
                The minimum number of elements of the parameter tensors for 8-bit optimization.
            percentile_clipping (`int`, defaults to 100):
                Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
            block_wise (`bool`, defaults to `True`):
                Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
            max_unorm (`float`, defaults to 1.0):
                The maximum gradient norm.
        """
        super().__init__(
            "lamb",
            params,
            lr,
            betas,
            eps,
            weight_decay,
            optim_bits,
            args,
            min_8bit_size,
            percentile_clipping,
            block_wise,
            max_unorm=1.0,
        )


class LAMB8bit(Optimizer2State):
    def __init__(
        self,
        params,
        lr=1e-3,
        bias_correction=True,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=0,
        amsgrad=False,
        adam_w_mode=True,
        args=None,
        min_8bit_size=4096,
        percentile_clipping=100,
        block_wise=False,
        max_unorm=1.0,
    ):
        """
        8-bit LAMB optimizer.

        Arguments:
            params (`torch.tensor`):
                The input parameters to optimize.
            lr (`float`, defaults to 1e-3):
                The learning rate.
            bias_correction (`bool`, defaults to `True`):
                Whether to apply bias correction to the first and second-order moments.
            betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
                The beta values are the decay rates of the first and second-order moment of the optimizer.
            eps (`float`, defaults to 1e-8):
                The epsilon value prevents division by zero in the optimizer.
            weight_decay (`float`, defaults to 1e-2):
                The weight decay value for the optimizer.
            amsgrad (`bool`, defaults to `False`):
                Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
            adam_w_mode (`bool`, defaults to `True`):
                Whether to use the AdamW variant.
            args (`object`, defaults to `None`):
                An object with additional arguments.
            min_8bit_size (`int`, defaults to 4096):
                The minimum number of elements of the parameter tensors for 8-bit optimization.
            percentile_clipping (`int`, defaults to 100):
                Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
            block_wise (`bool`, defaults to `True`):
                Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
            max_unorm (`float`, defaults to 1.0):
                The maximum gradient norm.
        """
        super().__init__(
            "lamb",
            params,
            lr,
            betas,
            eps,
            weight_decay,
            8,
            args,
            min_8bit_size,
            percentile_clipping,
            block_wise,
            max_unorm=1.0,
        )


class LAMB32bit(Optimizer2State):
    def __init__(
        self,
        params,
        lr=1e-3,
        bias_correction=True,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=0,
        amsgrad=False,
        adam_w_mode=True,
        args=None,
        min_8bit_size=4096,
        percentile_clipping=100,
        block_wise=False,
        max_unorm=1.0,
    ):
        """
        32-bit LAMB optimizer.

        Arguments:
            params (`torch.tensor`):
                The input parameters to optimize.
            lr (`float`, defaults to 1e-3):
                The learning rate.
            bias_correction (`bool`, defaults to `True`):
                Whether to apply bias correction to the first and second-order moments.
            betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
                The beta values are the decay rates of the first and second-order moment of the optimizer.
            eps (`float`, defaults to 1e-8):
                The epsilon value prevents division by zero in the optimizer.
            weight_decay (`float`, defaults to 1e-2):
                The weight decay value for the optimizer.
            amsgrad (`bool`, defaults to `False`):
                Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
            adam_w_mode (`bool`, defaults to `True`):
                Whether to use the AdamW variant.
            args (`object`, defaults to `None`):
                An object with additional arguments.
            min_8bit_size (`int`, defaults to 4096):
                The minimum number of elements of the parameter tensors for 8-bit optimization.
            percentile_clipping (`int`, defaults to 100):
                Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
            block_wise (`bool`, defaults to `True`):
                Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
            max_unorm (`float`, defaults to 1.0):
                The maximum gradient norm.
        """
        super().__init__(
            "lamb",
            params,
            lr,
            betas,
            eps,
            weight_decay,
            32,
            args,
            min_8bit_size,
            percentile_clipping,
            block_wise,
            max_unorm=1.0,
        )
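

# ---------------------------------------------------------------------------
# Usage sketch (not part of the library source): a minimal training step with
# the classes above, assuming `torch` and a CUDA device are available. The
# model and tensor shapes are illustrative only; 8-bit optimizer state is only
# used for parameter tensors with at least `min_8bit_size` (4096) elements,
# smaller tensors fall back to 32-bit state.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import torch

    model = torch.nn.Linear(1024, 1024).cuda()
    optimizer = LAMB8bit(model.parameters(), lr=1e-3, weight_decay=1e-2)

    loss = model(torch.randn(16, 1024, device="cuda")).sum()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()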