
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

from transformers import is_bitsandbytes_available

from ..core import flatten_dict


@dataclass
class DDPOConfig:
    """
    Configuration class for the [`DDPOTrainer`].

    Using [`~transformers.HfArgumentParser`] we can turn this class into
    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
    command line.
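
    A minimal sketch of the resulting command-line workflow (the script name
    `train_ddpo.py` below is hypothetical):

    ```python
    from transformers import HfArgumentParser

    from trl import DDPOConfig

    parser = HfArgumentParser(DDPOConfig)
    # e.g. `python train_ddpo.py --num_epochs 10 --log_with wandb`
    ddpo_config = parser.parse_args_into_dataclasses()[0]
    ```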

    Parameters:
        exp_name (`str`, *optional*, defaults to `os.path.basename(sys.argv[0])[: -len(".py")]`):
            Name of this experiment (defaults to the file name of the script without the `.py` extension).
        run_name (`str`, *optional*, defaults to `""`):
            Name of this run.
        seed (`int`, *optional*, defaults to `0`):
            Random seed.
        log_with (`Literal["wandb", "tensorboard"]` or `None`, *optional*, defaults to `None`):
            Log with either 'wandb' or 'tensorboard', check
            https://huggingface.co/docs/accelerate/usage_guides/tracking for more details.
        tracker_kwargs (`Dict`, *optional*, defaults to `{}`):
            Keyword arguments for the tracker (e.g. wandb_project).
        accelerator_kwargs (`Dict`, *optional*, defaults to `{}`):
            Keyword arguments for the accelerator.
        project_kwargs (`Dict`, *optional*, defaults to `{}`):
            Keyword arguments for the accelerator project config (e.g. `logging_dir`).
        tracker_project_name (`str`, *optional*, defaults to `"trl"`):
            Name of project to use for tracking.
        logdir (`str`, *optional*, defaults to `"logs"`):
            Top-level logging directory for checkpoint saving.
        num_epochs (`int`, *optional*, defaults to `100`):
            Number of epochs to train.
        save_freq (`int`, *optional*, defaults to `1`):
            Number of epochs between saving model checkpoints.
        num_checkpoint_limit (`int`, *optional*, defaults to `5`):
            Number of checkpoints to keep before overwriting old ones.
        mixed_precision (`str`, *optional*, defaults to `"fp16"`):
            Mixed precision training.
        allow_tf32 (`bool`, *optional*, defaults to `True`):
            Allow `tf32` on Ampere GPUs.
        resume_from (`str`, *optional*, defaults to `""`):
            Resume training from a checkpoint.
        sample_num_steps (`int`, *optional*, defaults to `50`):
            Number of sampler inference steps.
        sample_eta (`float`, *optional*, defaults to `1.0`):
            Eta parameter for the DDIM sampler.
        sample_guidance_scale (`float`, *optional*, defaults to `5.0`):
            Classifier-free guidance weight.
        sample_batch_size (`int`, *optional*, defaults to `1`):
            Batch size (per GPU) to use for sampling.
        sample_num_batches_per_epoch (`int`, *optional*, defaults to `2`):
            Number of batches to sample per epoch.
        train_batch_size (`int`, *optional*, defaults to `1`):
            Batch size (per GPU) to use for training.
        train_use_8bit_adam (`bool`, *optional*, defaults to `False`):
            Use 8bit Adam optimizer from bitsandbytes.
        train_learning_rate (`float`, *optional*, defaults to `3e-4`):
            Learning rate.
        train_adam_beta1 (`float`, *optional*, defaults to `0.9`):
            Adam beta1.
        train_adam_beta2 (`float`, *optional*, defaults to `0.999`):
            Adam beta2.
        train_adam_weight_decay (`float`, *optional*, defaults to `1e-4`):
            Adam weight decay.
        train_adam_epsilon (`float`, *optional*, defaults to `1e-8`):
            Adam epsilon.
        train_gradient_accumulation_steps (`int`, *optional*, defaults to `1`):
            Number of gradient accumulation steps.
        train_max_grad_norm (`float`, *optional*, defaults to `1.0`):
            Maximum gradient norm for gradient clipping.
        train_num_inner_epochs (`int`, *optional*, defaults to `1`):
            Number of inner epochs per outer epoch.
        train_cfg (`bool`, *optional*, defaults to `True`):
            Whether to use classifier-free guidance during training.
        train_adv_clip_max (`float`, *optional*, defaults to `5.0`):
            Clip advantages to the range `[-train_adv_clip_max, train_adv_clip_max]`.
        train_clip_range (`float`, *optional*, defaults to `1e-4`):
            PPO clip range.
        train_timestep_fraction (`float`, *optional*, defaults to `1.0`):
            Fraction of timesteps to train on.
        per_prompt_stat_tracking (`bool`, *optional*, defaults to `False`):
            Whether to track statistics for each prompt separately; see the sketch after this parameter list.
        per_prompt_stat_tracking_buffer_size (`int`, *optional*, defaults to `16`):
            Number of reward values to store in the buffer for each prompt.
        per_prompt_stat_tracking_min_count (`int`, *optional*, defaults to `16`):
            Minimum number of reward values to store in the buffer before the per-prompt statistics are used.
        async_reward_computation (`bool`, *optional*, defaults to `False`):
            Whether to compute rewards asynchronously.
        max_workers (`int`, *optional*, defaults to `2`):
            Maximum number of workers to use for async reward computation.
        negative_prompts (`str`, *optional*, defaults to `""`):
            Comma-separated list of prompts to use as negative examples.
        push_to_hub (`bool`, *optional*, defaults to `False`):
            Whether to push the final model checkpoint to the Hub.
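
    The `per_prompt_stat_tracking_*` parameters above control a per-prompt buffer
    of past rewards. A conceptual sketch of the normalization this enables
    (assumed behavior, not the trainer's literal code):

    ```python
    import numpy as np

    # rewards gathered so far for a single prompt
    rewards = np.array([0.1, 0.5, 0.9])
    # advantages standardize rewards against the prompt's own running statistics
    advantages = (rewards - rewards.mean()) / (rewards.std() + 1e-8)
    ```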
    """

    exp_name: str = field(
        default=os.path.basename(sys.argv[0])[: -len(".py")],
        metadata={"help": "Name of this experiment (defaults to the file name of the script without the `.py` extension)."},
    )
    run_name: str = field(default="", metadata={"help": "Name of this run."})
    seed: int = field(default=0, metadata={"help": "Random seed."})
    log_with: Optional[str] = field(
        default=None,
        metadata={
            "help": "Log with either 'wandb' or 'tensorboard'.",
            "choices": ["wandb", "tensorboard"],
        },
    )
    tracker_kwargs: dict = field(
        default_factory=dict, metadata={"help": "Keyword arguments for the tracker (e.g. wandb_project)."}
    )
    accelerator_kwargs: dict = field(default_factory=dict, metadata={"help": "Keyword arguments for the accelerator."})
    project_kwargs: dict = field(
        default_factory=dict,
        metadata={"help": "Keyword arguments for the accelerator project config (e.g. `logging_dir`)."},
    )
    tracker_project_name: str = field(default="trl", metadata={"help": "Name of project to use for tracking."})
    logdir: str = field(default="logs", metadata={"help": "Top-level logging directory for checkpoint saving."})
    num_epochs: int = field(default=100, metadata={"help": "Number of epochs to train."})
    save_freq: int = field(default=1, metadata={"help": "Number of epochs between saving model checkpoints."})
    num_checkpoint_limit: int = field(
        default=5, metadata={"help": "Number of checkpoints to keep before overwriting old ones."}
    )
    mixed_precision: str = field(default="fp16", metadata={"help": "Mixed precision training."})
    allow_tf32: bool = field(default=True, metadata={"help": "Allow `tf32` on Ampere GPUs."})
    resume_from: str = field(default="", metadata={"help": "Resume training from a checkpoint."})
    sample_num_steps: int = field(default=50, metadata={"help": "Number of sampler inference steps."})
    sample_eta: float = field(default=1.0, metadata={"help": "Eta parameter for the DDIM sampler."})
    sample_guidance_scale: float = field(default=5.0, metadata={"help": "Classifier-free guidance weight."})
    sample_batch_size: int = field(default=1, metadata={"help": "Batch size (per GPU) to use for sampling."})
    sample_num_batches_per_epoch: int = field(default=2, metadata={"help": "Number of batches to sample per epoch."})
    train_batch_size: int = field(default=1, metadata={"help": "Batch size (per GPU) to use for training."})
    train_use_8bit_adam: bool = field(default=False, metadata={"help": "Use 8bit Adam optimizer from bitsandbytes."})
    train_learning_rate: float = field(default=3e-4, metadata={"help": "Learning rate."})
    train_adam_beta1: float = field(default=0.9, metadata={"help": "Adam beta1."})
    train_adam_beta2: float = field(default=0.999, metadata={"help": "Adam beta2."})
    train_adam_weight_decay: float = field(default=1e-4, metadata={"help": "Adam weight decay."})
    train_adam_epsilon: float = field(default=1e-8, metadata={"help": "Adam epsilon."})
    train_gradient_accumulation_steps: int = field(
        default=1, metadata={"help": "Number of gradient accumulation steps."}
    )
    train_max_grad_norm: float = field(
        default=1.0, metadata={"help": "Maximum gradient norm for gradient clipping."}
    )
    train_num_inner_epochs: int = field(default=1, metadata={"help": "Number of inner epochs per outer epoch."})
    train_cfg: bool = field(
        default=True, metadata={"help": "Whether to use classifier-free guidance during training."}
    )
    train_adv_clip_max: float = field(
        default=5.0,
        metadata={"help": "Clip advantages to the range `[-train_adv_clip_max, train_adv_clip_max]`."},
    )
    train_clip_range: float = field(default=1e-4, metadata={"help": "PPO clip range."})
    train_timestep_fraction: float = field(default=1.0, metadata={"help": "Fraction of timesteps to train on."})
    per_prompt_stat_tracking: bool = field(
        default=False, metadata={"help": "Whether to track statistics for each prompt separately."}
    )
    per_prompt_stat_tracking_buffer_size: int = field(
        default=16, metadata={"help": "Number of reward values to store in the buffer for each prompt."}
    )
    per_prompt_stat_tracking_min_count: int = field(
        default=16,
        metadata={"help": "Minimum number of reward values to store in the buffer before the per-prompt statistics are used."},
    )
    async_reward_computation: bool = field(
        default=False, metadata={"help": "Whether to compute rewards asynchronously."}
    )
    max_workers: int = field(
        default=2, metadata={"help": "Maximum number of workers to use for async reward computation."}
    )
    negative_prompts: str = field(
        default="", metadata={"help": "Comma-separated list of prompts to use as negative examples."}
    )
    push_to_hub: bool = field(default=False, metadata={"help": "Whether to push the final model checkpoint to the Hub."})

    def to_dict(self):
        # Copy all config attributes into a plain dict, then flatten nested
        # entries (e.g. `tracker_kwargs`) for easy logging.
        output_dict = {}
        for key, value in self.__dict__.items():
            output_dict[key] = value
        return flatten_dict(output_dict)

    def __post_init__(self):
        if self.train_use_8bit_adam and not is_bitsandbytes_available():
            raise ImportError(
                "You need to install bitsandbytes to use 8bit Adam. You can install it with `pip install bitsandbytes`."
            )