Ë
    §Ùbi³<  ã                   ó”   — d dl mZ  G d„ de«      Z G d„ de«      Z G d„ de«      Z G d„ d	e«      Z G d
„ de«      Z G d„ de«      Zy)é    )ÚOptimizer2Statec                   ó6   ‡ — e Zd Z	 	 	 	 	 	 	 	 	 	 	 dˆ fd„	Zˆ xZS )ÚAdamWc                 ó<   •— t         ‰|   d||||||||	|
||¬«       y)a³  
        Base AdamW optimizer.

        Arguments:
            params (`torch.Tensor`):
                The input parameters to optimize.
            lr (`float`, defaults to 1e-3):
                The learning rate.
            betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
                The beta values are the decay rates of the first and second-order moment of the optimizer.
            eps (`float`, defaults to 1e-8):
                The epsilon value prevents division by zero in the optimizer.
            weight_decay (`float`, defaults to 1e-2):
                The weight decay value for the optimizer.
            amsgrad (`bool`, defaults to `False`):
                Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
            optim_bits (`int`, defaults to 32):
                The number of bits of the optimizer state.
            args (`object`, defaults to `None`):
                An object with additional arguments.
            min_8bit_size (`int`, defaults to 4096):
                The minimum number of elements of the parameter tensors for 8-bit optimization.
            percentile_clipping (`int`, defaults to 100):
                Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
            block_wise (`bool`, defaults to `True`):
                Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
            is_paged (`bool`, defaults to `False`):
                Whether the optimizer is a paged optimizer or not.
        Úadam©Úis_pagedN©ÚsuperÚ__init__©ÚselfÚparamsÚlrÚbetasÚepsÚweight_decayÚamsgradÚ
optim_bitsÚargsÚmin_8bit_sizeÚpercentile_clippingÚ
block_wiser	   Ú	__class__s                €úS/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/bitsandbytes/optim/adamw.pyr   zAdamW.__init__
   s<   ø€ ôX 	‰ÑØØØØØØØØØØØØð 	õ 	
ó    ©çü©ñÒMbP?©gÍÌÌÌÌÌì?g+‡ÙÎ÷ï?ç:Œ0âŽyE>ç{®Gáz„?Fé    Né   éd   TF©Ú__name__Ú
__module__Ú__qualname__r   Ú__classcell__©r   s   @r   r   r   	   ó0   ø„ ð ØØØØØØØØØØ÷9
ñ 9
r   r   c                   ó6   ‡ — e Zd Z	 	 	 	 	 	 	 	 	 	 	 dˆ fd„	Zˆ xZS )Ú	AdamW8bitc                 óv   •— |rt        d«      ‚|dk7  rt        d«      ‚t        ‰| 	  d|||||d||	|
||¬«       y)an  
        8-bit AdamW optimizer.

        Arguments:
            params (`torch.Tensor`):
                The input parameters to optimize.
            lr (`float`, defaults to 1e-3):
                The learning rate.
            betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
                The beta values are the decay rates of the first and second-order moment of the optimizer.
            eps (`float`, defaults to 1e-8):
                The epsilon value prevents division by zero in the optimizer.
            weight_decay (`float`, defaults to 1e-2):
                The weight decay value for the optimizer.
            amsgrad (`bool`, defaults to `False`):
                Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
                Note: This parameter is not supported in AdamW8bit and must be False.
            optim_bits (`int`, defaults to 32):
                The number of bits of the optimizer state.
                Note: This parameter is not used in AdamW8bit as it always uses 8-bit optimization.
            args (`object`, defaults to `None`):
                An object with additional arguments.
            min_8bit_size (`int`, defaults to 4096):
                The minimum number of elements of the parameter tensors for 8-bit optimization.
            percentile_clipping (`int`, defaults to 100):
                Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
            block_wise (`bool`, defaults to `True`):
                Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
            is_paged (`bool`, defaults to `False`):
                Whether the optimizer is a paged optimizer or not.
        z'AdamW8bit does not support amsgrad=Truer"   zGAdamW8bit only supports optim_bits=32 (default value for compatibility)r   é   r   N©Ú
ValueErrorr   r   r   s                €r   r   zAdamW8bit.__init__G   sa   ø€ ñ^ ÜÐFÓGÐGà˜Òô ÐfÓgÐgä‰ÑØØØØØØØØØØØØð 	õ 	
r   r   r%   r*   s   @r   r-   r-   F   s2   ø„ ð ØØØØØØØØØØ÷D
ñ D
r   r-   c                   ó6   ‡ — e Zd Z	 	 	 	 	 	 	 	 	 	 	 dˆ fd„	Zˆ xZS )Ú
AdamW32bitc                 ó<   •— t         ‰|   d|||||d||	|
||¬«       y)aµ  
        32-bit AdamW optimizer.

        Arguments:
            params (`torch.Tensor`):
                The input parameters to optimize.
            lr (`float`, defaults to 1e-3):
                The learning rate.
            betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
                The beta values are the decay rates of the first and second-order moment of the optimizer.
            eps (`float`, defaults to 1e-8):
                The epsilon value prevents division by zero in the optimizer.
            weight_decay (`float`, defaults to 1e-2):
                The weight decay value for the optimizer.
            amsgrad (`bool`, defaults to `False`):
                Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
            optim_bits (`int`, defaults to 32):
                The number of bits of the optimizer state.
            args (`object`, defaults to `None`):
                An object with additional arguments.
            min_8bit_size (`int`, defaults to 4096):
                The minimum number of elements of the parameter tensors for 8-bit optimization.
            percentile_clipping (`int`, defaults to 100):
                Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
            block_wise (`bool`, defaults to `True`):
                Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
            is_paged (`bool`, defaults to `False`):
                Whether the optimizer is a paged optimizer or not.
        r   r"   r   Nr
   r   s                €r   r   zAdamW32bit.__init__   s<   ø€ ôX 	‰ÑØØØØØØØØØØØØð 	õ 	
r   r   r%   r*   s   @r   r3   r3   Ž   r+   r   r3   c                   ó4   ‡ — e Zd Z	 	 	 	 	 	 	 	 	 	 dˆ fd„	Zˆ xZS )Ú
PagedAdamWc                 ó<   •— t         ‰|   d||||||||	|
|d¬«       y)a=  
        Paged AdamW optimizer.

        Arguments:
            params (`torch.Tensor`):
                The input parameters to optimize.
            lr (`float`, defaults to 1e-3):
                The learning rate.
            betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
                The beta values are the decay rates of the first and second-order moment of the optimizer.
            eps (`float`, defaults to 1e-8):
                The epsilon value prevents division by zero in the optimizer.
            weight_decay (`float`, defaults to 1e-2):
                The weight decay value for the optimizer.
            amsgrad (`bool`, defaults to `False`):
                Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
            optim_bits (`int`, defaults to 32):
                The number of bits of the optimizer state.
            args (`object`, defaults to `None`):
                An object with additional arguments.
            min_8bit_size (`int`, defaults to 4096):
                The minimum number of elements of the parameter tensors for 8-bit optimization.
            percentile_clipping (`int`, defaults to 100):
                Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
            block_wise (`bool`, defaults to `True`):
                Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
        r   Tr   Nr
   ©r   r   r   r   r   r   r   r   r   r   r   r   r   s               €r   r   zPagedAdamW.__init__Ì   s<   ø€ ôR 	‰ÑØØØØØØØØØØØØð 	õ 	
r   ©
r   r   r    r!   Fr"   Nr#   r$   Tr%   r*   s   @r   r6   r6   Ë   ó-   ø„ ð ØØØØØØØØØ÷6
ñ 6
r   r6   c                   ó4   ‡ — e Zd Z	 	 	 	 	 	 	 	 	 	 dˆ fd„	Zˆ xZS )ÚPagedAdamW8bitc                 óv   •— |rt        d«      ‚|dk7  rt        d«      ‚t        ‰| 	  d|||||d||	|
|d¬«       y)	a  
        Paged 8-bit AdamW optimizer.

        Arguments:
            params (`torch.Tensor`):
                The input parameters to optimize.
            lr (`float`, defaults to 1e-3):
                The learning rate.
            betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
                The beta values are the decay rates of the first and second-order moment of the optimizer.
            eps (`float`, defaults to 1e-8):
                The epsilon value prevents division by zero in the optimizer.
            weight_decay (`float`, defaults to 1e-2):
                The weight decay value for the optimizer.
            amsgrad (`bool`, defaults to `False`):
                Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
                Note: This parameter is not supported in PagedAdamW8bit and must be False.
            optim_bits (`int`, defaults to 32):
                The number of bits of the optimizer state.
                Note: This parameter is not used in PagedAdamW8bit as it always uses 8-bit optimization.
            args (`object`, defaults to `None`):
                An object with additional arguments.
            min_8bit_size (`int`, defaults to 4096):
                The minimum number of elements of the parameter tensors for 8-bit optimization.
            percentile_clipping (`int`, defaults to 100):
                Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
            block_wise (`bool`, defaults to `True`):
                Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
        z,PagedAdamW8bit does not support amsgrad=Truer"   zLPagedAdamW8bit only supports optim_bits=32 (default value for compatibility)r   r/   Tr   Nr0   r8   s               €r   r   zPagedAdamW8bit.__init__  sa   ø€ ñX ÜÐKÓLÐLà˜Òô ÐkÓlÐlä‰ÑØØØØØØØØØØØØð 	õ 	
r   r9   r%   r*   s   @r   r<   r<     s/   ø„ ð ØØØØØØØØØ÷A
ñ A
r   r<   c                   ó4   ‡ — e Zd Z	 	 	 	 	 	 	 	 	 	 dˆ fd„	Zˆ xZS )ÚPagedAdamW32bitc                 ó<   •— t         ‰|   d|||||d||	|
|d¬«       y)aD  
        Paged 32-bit AdamW optimizer.

        Arguments:
            params (`torch.Tensor`):
                The input parameters to optimize.
            lr (`float`, defaults to 1e-3):
                The learning rate.
            betas (`tuple(float, float)`, defaults to (0.9, 0.999)):
                The beta values are the decay rates of the first and second-order moment of the optimizer.
            eps (`float`, defaults to 1e-8):
                The epsilon value prevents division by zero in the optimizer.
            weight_decay (`float`, defaults to 1e-2):
                The weight decay value for the optimizer.
            amsgrad (`bool`, defaults to `False`):
                Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
            optim_bits (`int`, defaults to 32):
                The number of bits of the optimizer state.
            args (`object`, defaults to `None`):
                An object with additional arguments.
            min_8bit_size (`int`, defaults to 4096):
                The minimum number of elements of the parameter tensors for 8-bit optimization.
            percentile_clipping (`int`, defaults to 100):
                Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
            block_wise (`bool`, defaults to `True`):
                Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
        r   r"   Tr   Nr
   r8   s               €r   r   zPagedAdamW32bit.__init__K  s<   ø€ ôR 	‰ÑØØØØØØØØØØØØð 	õ 	
r   r9   r%   r*   s   @r   r?   r?   J  r:   r   r?   N)Úbitsandbytes.optim.optimizerr   r   r-   r3   r6   r<   r?   © r   r   ú<module>rC      s[   ðõ 9ô:
ˆOô :
ôzE
ô E
ôP:
ô :
ôz7
ô 7
ôtB
_ô B
ôJ7
oõ 7
r   