import os
from dataclasses import dataclass, field

from ..trainer.utils import OnPolicyConfig


@dataclass
class RLOOConfig(OnPolicyConfig):
    r"""
    Configuration class for the [`RLOOTrainer`].

    This class includes only the parameters that are specific to RLOO training. For a full list of training arguments,
    please refer to the [`~transformers.TrainingArguments`] and [`OnPolicyConfig`] documentation. Note that default
    values in this class may differ from those in [`~transformers.TrainingArguments`].

    Using [`~transformers.HfArgumentParser`] we can turn this class into
    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
    command line, as shown in the example below.

    Parameters:
        exp_name (`str`, *optional*, defaults to `os.path.basename(__file__)[: -len(".py")]`):
            Name of this experiment.
        reward_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`):
            Path to the reward model.
        num_ppo_epochs (`int`, *optional*, defaults to `4`):
            Number of epochs to train.
        whiten_rewards (`bool`, *optional*, defaults to `False`):
            Whether to whiten the rewards.
        kl_coef (`float`, *optional*, defaults to `0.05`):
            KL coefficient.
        cliprange (`float`, *optional*, defaults to `0.2`):
            Clip range.
        rloo_k (`int`, *optional*, defaults to `2`):
            REINFORCE Leave-One-Out (RLOO) number of online samples per prompt. The leave-one-out baseline this
            implies is sketched in a comment at the end of this file.
        normalize_reward (`bool`, *optional*, defaults to `False`):
            Whether to normalize rewards.
        reward_clip_range (`float`, *optional*, defaults to `10.0`):
            Clip range for rewards.
        normalize_advantage (`bool`, *optional*, defaults to `False`):
            Whether to normalize advantages.
        token_level_kl (`bool`, *optional*, defaults to `True`):
            Whether to use token-level KL penalty or sequence-level KL penalty.
        ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
            This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation,
            improving generation speed. However, disabling this option allows training models that exceed the VRAM
            capacity of a single GPU, albeit at the cost of slower generation.
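
    Example (a minimal usage sketch; the script name and flag values are illustrative, not defaults):

    ```python
    from transformers import HfArgumentParser

    from trl import RLOOConfig

    # e.g. run as: python train_rloo.py --output_dir models/rloo --rloo_k 4
    parser = HfArgumentParser(RLOOConfig)
    config = parser.parse_args_into_dataclasses()[0]
    ```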
    NhelpzName of this experiment.)defaultmetadataexp_namezEleutherAI/pythia-160mzPath to the reward model.reward_model_path   zNumber of epochs to train.num_ppo_epochsFzWhether to whiten the rewards.whiten_rewardsg?zKL coefficient.kl_coefg?zClip range.	clipranger   zCREINFORCE Leave-One-Out (RLOO) number of online samples per prompt.rloo_kzWhether to normalize rewardsnormalize_rewardg      $@zClip range for rewardsreward_clip_rangezWhether to normalize advantagesnormalize_advantagezBWhether to use token-level KL penalty or sequence-level KL penaltytoken_level_klTa  This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, improving generation speed. However, disabling this option allows training models that exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation.ds3_gather_for_generation)__name__
__module____qualname____doc__r   ospathbasename__file__r   str__annotations__r   r   intr   boolr   floatr   r   r   r   r   r   r        R/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/trl/trainer/rloo_config.pyr   r      s   &P   *3B/45Hc  #(56s   67NC  !:;ND  +,GU  -(Iu  _`FC  #89d   %23 u  !&;<!  !^_ND  ', a
't r(   r   )r   dataclassesr   r   trainer.utilsr   r   r'   r(   r)   <module>r,      s.    
 ( * \ \ \r(   