from bitsandbytes.optim.optimizer import Optimizer1State


class Adagrad(Optimizer1State):
    def __init__(
        self,
        params,
        lr=1e-2,
        lr_decay=0,
        weight_decay=0.0,
        initial_accumulator_value=0,
        eps=1e-10,
        optim_bits=32,
        args=None,
        min_8bit_size=4096,
        percentile_clipping=100,
        block_wise=True,
    ):
        """
        Base Adagrad optimizer.

        Arguments:
            params (`torch.tensor`):
                The input parameters to optimize.
            lr (`float`, defaults to 1e-2):
                The learning rate.
            lr_decay (`int`, defaults to 0):
                The learning rate decay.
            weight_decay (`float`, defaults to 0.0):
                The weight decay value for the optimizer.
            initial_accumulator_value (`int`, defaults to 0):
                The initial value of the gradient accumulator (only 0 is currently supported).
            eps (`float`, defaults to 1e-10):
                The epsilon value, which prevents division by zero in the optimizer.
            optim_bits (`int`, defaults to 32):
                The number of bits of the optimizer state.
            args (`object`, defaults to `None`):
                An object with additional arguments.
            min_8bit_size (`int`, defaults to 4096):
                The minimum number of elements of the parameter tensors for 8-bit optimization.
            percentile_clipping (`int`, defaults to 100):
                Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
            block_wise (`bool`, defaults to `True`):
                Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
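
        Example (a minimal usage sketch; the model, sizes, and device are illustrative and not part of this API):

        ```python
        import torch
        import bitsandbytes as bnb

        model = torch.nn.Linear(64, 64).cuda()
        optimizer = bnb.optim.Adagrad(model.parameters(), lr=1e-2, optim_bits=32)

        out = model(torch.randn(8, 64, device="cuda"))
        out.sum().backward()
        optimizer.step()
        optimizer.zero_grad()
        ```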
        """
        if not 0.0 <= lr:
            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 <= weight_decay:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")
        if not 0.0 <= eps:
            raise ValueError(f"Invalid epsilon value: {eps}")
        # The underlying kernels implement only the default Adagrad variant.
        if initial_accumulator_value != 0.0:
            raise ValueError("Initial accumulator value != 0.0 not supported!")
        if lr_decay != 0.0:
            raise ValueError("Lr Decay != 0.0 not supported!")
        super().__init__(
            "adagrad",
            params,
            lr,
            (0.0, 0.0),  # betas are unused by Adagrad but expected by Optimizer1State
            eps,
            weight_decay,
            optim_bits,
            args,
            min_8bit_size,
            percentile_clipping,
            block_wise,
        )


class Adagrad8bit(Optimizer1State):
    def __init__(
        self,
        params,
        lr=1e-2,
        lr_decay=0,
        weight_decay=0.0,
        initial_accumulator_value=0,
        eps=1e-10,
        optim_bits=8,
        args=None,
        min_8bit_size=4096,
        percentile_clipping=100,
        block_wise=True,
    ):
        """
        8-bit Adagrad optimizer.

        Arguments:
            params (`torch.tensor`):
                The input parameters to optimize.
            lr (`float`, defaults to 1e-2):
                The learning rate.
            lr_decay (`int`, defaults to 0):
                The learning rate decay.
            weight_decay (`float`, defaults to 0.0):
                The weight decay value for the optimizer.
            initial_accumulator_value (`int`, defaults to 0):
                The initial value of the gradient accumulator (only 0 is currently supported).
            eps (`float`, defaults to 1e-10):
                The epsilon value, which prevents division by zero in the optimizer.
            optim_bits (`int`, defaults to 8):
                The number of bits of the optimizer state; this class always uses 8.
            args (`object`, defaults to `None`):
                An object with additional arguments.
            min_8bit_size (`int`, defaults to 4096):
                The minimum number of elements of the parameter tensors for 8-bit optimization.
            percentile_clipping (`int`, defaults to 100):
                Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
            block_wise (`bool`, defaults to `True`):
                Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
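
        Example (a minimal usage sketch; the model and sizes are illustrative and not part of this API):

        ```python
        import torch
        import bitsandbytes as bnb

        # Use a layer whose tensors have at least `min_8bit_size` elements,
        # so the optimizer state is actually quantized to 8 bits.
        model = torch.nn.Linear(4096, 4096).cuda()
        optimizer = bnb.optim.Adagrad8bit(model.parameters(), lr=1e-2)

        out = model(torch.randn(4, 4096, device="cuda"))
        out.sum().backward()
        optimizer.step()
        optimizer.zero_grad()
        ```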
        """
        if not 0.0 <= lr:
            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 <= weight_decay:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")
        if not 0.0 <= eps:
            raise ValueError(f"Invalid epsilon value: {eps}")
        if initial_accumulator_value != 0.0:
            raise ValueError("Initial accumulator value != 0.0 not supported!")
        if lr_decay != 0.0:
            raise ValueError("Lr Decay != 0.0 not supported!")
        # 8-bit optimizer state is only supported with block-wise quantization.
        assert block_wise
        super().__init__(
            "adagrad",
            params,
            lr,
            (0.0, 0.0),
            eps,
            weight_decay,
            8,  # the state is always kept in 8 bits, regardless of `optim_bits`
            args,
            min_8bit_size,
            percentile_clipping,
            block_wise,
        )


class Adagrad32bit(Optimizer1State):
    def __init__(
        self,
        params,
        lr=1e-2,
        lr_decay=0,
        weight_decay=0.0,
        initial_accumulator_value=0,
        eps=1e-10,
        optim_bits=32,
        args=None,
        min_8bit_size=4096,
        percentile_clipping=100,
        block_wise=True,
    ):
        """
        32-bit Adagrad optimizer.

        Arguments:
            params (`torch.tensor`):
                The input parameters to optimize.
            lr (`float`, defaults to 1e-2):
                The learning rate.
            lr_decay (`int`, defaults to 0):
                The learning rate decay.
            weight_decay (`float`, defaults to 0.0):
                The weight decay value for the optimizer.
            initial_accumulator_value (`int`, defaults to 0):
                The initial value of the gradient accumulator (only 0 is currently supported).
            eps (`float`, defaults to 1e-10):
                The epsilon value, which prevents division by zero in the optimizer.
            optim_bits (`int`, defaults to 32):
                The number of bits of the optimizer state; this class always uses 32.
            args (`object`, defaults to `None`):
                An object with additional arguments.
            min_8bit_size (`int`, defaults to 4096):
                The minimum number of elements of the parameter tensors for 8-bit optimization.
            percentile_clipping (`int`, defaults to 100):
                Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
            block_wise (`bool`, defaults to `True`):
                Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
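
        Example (a minimal usage sketch; the model, sizes, and device are illustrative and not part of this API):

        ```python
        import torch
        import bitsandbytes as bnb

        model = torch.nn.Linear(64, 64).cuda()
        # 32-bit state with automatic clipping at the 95th gradient-norm percentile.
        optimizer = bnb.optim.Adagrad32bit(model.parameters(), lr=1e-2, percentile_clipping=95)

        out = model(torch.randn(8, 64, device="cuda"))
        out.sum().backward()
        optimizer.step()
        optimizer.zero_grad()
        ```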
        """
        if not 0.0 <= lr:
            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 <= weight_decay:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")
        if not 0.0 <= eps:
            raise ValueError(f"Invalid epsilon value: {eps}")
        if initial_accumulator_value != 0.0:
            raise ValueError("Initial accumulator value != 0.0 not supported!")
        if lr_decay != 0.0:
            raise ValueError("Lr Decay != 0.0 not supported!")
        super().__init__(
            "adagrad",
            params,
            lr,
            (0.0, 0.0),
            eps,
            weight_decay,
            32,  # the state is always kept in 32 bits, regardless of `optim_bits`
            args,
            min_8bit_size,
            percentile_clipping,
            block_wise,
        )