import os
from dataclasses import dataclass, field
from typing import Literal, Optional

from ..trainer.utils import OnPolicyConfig


@dataclass
class PPOConfig(OnPolicyConfig):
    r"""
    Configuration class for the [`PPOTrainer`].

    This class includes only the parameters that are specific to PPO training. For a full list of training arguments,
    please refer to the [`~transformers.TrainingArguments`] and [`OnPolicyConfig`] documentation. Note that default
    values in this class may differ from those in [`~transformers.TrainingArguments`].

    Using [`~transformers.HfArgumentParser`] we can turn this class into
    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
    command line.
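
    For example, a minimal sketch (the `output_dir` value and the overridden defaults below are arbitrary
    placeholders, not recommendations):

    ```python
    from transformers import HfArgumentParser

    from trl import PPOConfig

    # Direct construction, overriding a couple of defaults:
    config = PPOConfig(output_dir="ppo_out", num_ppo_epochs=4)

    # Or expose every field as a command-line argument:
    parser = HfArgumentParser(PPOConfig)
    (config,) = parser.parse_args_into_dataclasses()
    ```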

    Parameters:
        exp_name (`str`, *optional*, defaults to `os.path.basename(__file__)[:-3]`):
            Name of this experiment.
        reward_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`):
            Path to the reward model.
        model_adapter_name (`str` or `None`, *optional*, defaults to `None`):
            Name of the train target PEFT adapter, when using LoRA with multiple adapters.
        ref_adapter_name (`str` or `None`, *optional*, defaults to `None`):
            Name of the reference PEFT adapter, when using LoRA with multiple adapters.
        num_ppo_epochs (`int`, *optional*, defaults to `4`):
            Number of epochs to train.
        whiten_rewards (`bool`, *optional*, defaults to `False`):
            Whether to whiten the rewards.
        kl_coef (`float`, *optional*, defaults to `0.05`):
            KL coefficient, scaling the per-token KL penalty against the reference model.
        kl_estimator (`Literal["k1", "k3"]`, *optional*, defaults to `"k1"`):
            Which estimator for KL-Divergence to use from [Approximating KL
            Divergence](http://joschu.net/blog/kl-approx.html). Defaults to "k1", a straightforward, unbiased
            estimator. Can be set to "k3", an unbiased estimator with lower variance which "appears to be a strictly
            better estimator". Cannot be set to "k2", as it is used for logging purposes. A sketch of both supported
            estimators follows this parameter list.
        cliprange (`float`, *optional*, defaults to `0.2`):
            Clip range for the PPO policy (surrogate) objective; see the loss sketch after this parameter list.
        vf_coef (`float`, *optional*, defaults to `0.1`):
            Value function coefficient, weighting the value loss in the total loss.
        cliprange_value (`float`, *optional*, defaults to `0.2`):
            Clip range for the value function.
        gamma (`float`, *optional*, defaults to `1.0`):
            Discount factor.
        lam (`float`, *optional*, defaults to `0.95`):
            Lambda value for GAE (generalized advantage estimation); see the GAE sketch after this parameter list.
        ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
            This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation,
            improving generation speed. However, disabling this option allows training models that exceed the VRAM
            capacity of a single GPU, albeit at the cost of slower generation.
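
    As a reference for `kl_estimator`, the two supported estimators from the blog post above can be written as
    follows (a sketch only; the log-probability tensors are placeholders, not names defined in this module):

    ```python
    import torch

    # Hypothetical per-token log-probabilities from the policy and the reference model.
    logprobs = torch.randn(2, 8)
    ref_logprobs = torch.randn(2, 8)

    logr = ref_logprobs - logprobs  # log ratio log(p_ref / p_policy) under the policy's samples
    k1 = -logr                      # unbiased, but higher variance
    k3 = logr.exp() - 1 - logr      # unbiased, lower variance, always non-negative
    ```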
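    `cliprange`, `vf_coef`, and `cliprange_value` enter the usual clipped PPO losses roughly as below (a schematic
    sketch of the standard objectives, not code lifted from the trainer; all tensors are placeholders):

    ```python
    import torch

    cliprange, vf_coef, cliprange_value = 0.2, 0.1, 0.2  # the defaults above

    # Placeholder tensors standing in for trainer state; `ratio` = exp(new_logprobs - old_logprobs).
    advantages, ratio = torch.randn(4), torch.rand(4) + 0.5
    values, old_values, returns = torch.randn(4), torch.randn(4), torch.randn(4)

    # Clipped policy (surrogate) loss.
    pg_loss = torch.max(
        -advantages * ratio,
        -advantages * torch.clamp(ratio, 1.0 - cliprange, 1.0 + cliprange),
    ).mean()

    # Clipped value loss, weighted into the total loss by `vf_coef`.
    values_clipped = torch.clamp(values, old_values - cliprange_value, old_values + cliprange_value)
    vf_loss = 0.5 * torch.max((values - returns) ** 2, (values_clipped - returns) ** 2).mean()

    loss = pg_loss + vf_coef * vf_loss
    ```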
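    `gamma` and `lam` parameterize generalized advantage estimation (GAE). A schematic version of the standard
    backward recursion, with placeholder `rewards` and `values` tensors of shape `(batch, seq_len)`:

    ```python
    import torch

    gamma, lam = 1.0, 0.95                                   # the defaults above
    rewards, values = torch.randn(2, 8), torch.randn(2, 8)   # placeholder tensors

    advantages = torch.zeros_like(rewards)
    last_adv = torch.zeros(rewards.size(0))
    for t in reversed(range(rewards.size(1))):
        next_values = values[:, t + 1] if t + 1 < rewards.size(1) else torch.zeros_like(values[:, t])
        delta = rewards[:, t] + gamma * next_values - values[:, t]  # TD residual
        last_adv = delta + gamma * lam * last_adv                   # GAE recursion
        advantages[:, t] = last_adv
    ```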
    NhelpzName of this experiment.)defaultmetadataexp_namezEleutherAI/pythia-160mzPath to the reward model.reward_model_pathzNName of the train target PEFT adapter, when using LoRA with multiple adapters.model_adapter_namezKName of the reference PEFT adapter, when using LoRA with multiple adapters.ref_adapter_name   zNumber of epochs to train.num_ppo_epochsFzWhether to whiten the rewards.whiten_rewardsg?zKL coefficient.kl_coefk1aW  Which estimator for KL-Divergence to use from Approximating KL Divergence (http://joschu.net/blog/kl-approx.html). Defaults to 'k1', a straightforward, unbiased estimator. Can be set to 'k3', an unbiased estimator with lower variance which 'appears to be a strictly better estimator'. Cannot be set to 'k2', as it is used for logging purposes.)r   k3kl_estimatorg?zClip range.	cliprangeg?zValue function coefficient.vf_coefz"Clip range for the value function.cliprange_valueg      ?zDiscount factor.gammagffffff?zLambda value for GAE.lamTa  This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, improving generation speed. However, disabling this option allows training models that exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation.ds3_gather_for_generation)__name__
__module____qualname____doc__r   ospathbasename__file__r   str__annotations__r   r   r   r   r   intr   boolr   floatr   r   r   r   r   r   r   r        Q/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/trl/trainer/ppo_config.pyr
   r
      s   -^   *3B/45Hc  #(56s  ).jk)  ',gh'hsm   67NC  !:;ND  +,GU  ). U
)L'*%  -(Iu  78GU  #>?OU  ,-E5  12C  ', a
't r.   r
   )
r$   dataclassesr   r   typingr   r   trainer.utilsr   r
   r-   r.   r/   <module>r3      s1    
 ( $ * p p pr.   