
    bi-                     2   d dl Z d dlZd dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
 d dlmZmZmZ d dlmZmZmZmZmZmZ d dlmZ deiZe G d	 d
e             Zd Zdde j6                  fdZedk(  r& e       Zej?                         \  Z Z!Z" ee e!e"       yy)    N)	dataclassfield)Optional)load_dataset)AutoModelForCausalLM"AutoModelForSequenceClassificationAutoTokenizer)
GRPOConfigGRPOTrainerModelConfigScriptArguments	TrlParserget_peft_config)think_format_rewardr   c                   h    e Zd ZU dZ edddi      Zee   ed<    edddi      Z	ee
e      ed<   y)	GRPOScriptArgumentsal  
    Script arguments for the GRPO training script.

    Args:
        reward_model_name_or_path (`str` or `None`, *optional*, defaults to `None`):
            Reward model id of a pretrained model hosted inside a model repo on huggingface.co or local path to a
            directory containing model weights saved using [`~transformers.PreTrainedModel.save_pretrained`].
        reward_funcs (`list[str]` or `None`, *optional*, defaults to `None`):
            Reward functions to use. It can be either one of `"think_format_reward"`; or a dotted import path " (e.g.,
            `'my_lib.rewards.custom_reward'`).
    NhelpzReward model id of a pretrained model hosted inside a model repo on huggingface.co or local path to a directory containing model weights saved using `PreTrainedModel.save_pretrained`.)defaultmetadatareward_model_name_or_pathzReward functions to use. It can be either one of  'think_format_reward'; or a dotted import path. (e.g., 'my_lib.rewards.custom_reward').reward_funcs)__name__
__module____qualname____doc__r   r   r   str__annotations__r   list     K/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/trl/scripts/grpo.pyr   r   "   s]    
 05 p
0x}  ). C
)L(49% r    r   c                    t        j                  |j                  |j                        }t	        j                  |j                  |j                        }g }| j
                  r=t        j                  | j
                  |j                  d      }|j                  |       | j                  r| j                  D ]  }|t        v r|j                  t        |          $d|v rz|j                  dd      \  }}t        j                  j                  dt        j                                t!        j"                  |      }	t%        |	|      }
|j                  |
       t'        d| dt)        t        j+                                d       t-        | j.                  | j0                  	      }t3        |||| j4                     |j6                  d
k7  r|| j8                     nd |t;        |            }|j=                          |j?                  |j@                         |jB                  r|jC                  | j.                         y y )N)trust_remote_code   )r#   
num_labels.r   z Could not load reward function 'z'. Expected one of z or a valid import path.)nameno)modelr   argstrain_dataseteval_datasetprocessing_classpeft_config)dataset_name)"r   from_pretrainedmodel_name_or_pathr#   r	   r   r   appendr   reward_funcs_registryrsplitsyspathinsertosgetcwd	importlibimport_modulegetattr
ValueErrorr   keysr   r/   dataset_configr   dataset_train_spliteval_strategydataset_test_splitr   train
save_model
output_dirpush_to_hub)script_argstraining_args
model_argsr)   	tokenizerr   reward_model	func_namemodule_pathmodulereward_funcdatasettrainers                r!   mainrR   @   s    00%%9U9UE --%%9U9UI
 L,,9II11ZEaEano
 	L)$11 	I11##$9)$DE	!)2)9)9#q)A&Y299;/"00=%fi8##K0 6ykAT16689::RT 	  ;33+:T:TUG !k==>@M@[@[_c@cW[;;<im"#J/G MMO }//0  )A)AB !r    
subparsersc                 n    t         t        t        f}| | j                  dd|      }|S t	        |      }|S )NgrpozRun the GRPO training script)r   dataclass_types)r   r
   r   
add_parserr   )rS   rV   parsers      r!   make_parserrY   x   sE    *JDO&&v4Rds&t M ?+Mr    __main__)N)#argparser:   r8   r5   dataclassesr   r   typingr   datasetsr   transformersr   r   r	   trlr
   r   r   r   r   r   trl.rewardsr   r3   r   rR   _SubParsersActionrY   r   rX   parse_args_and_configrG   rH   rI   r   r    r!   <module>rd      s      	 
 (  ! ` ` a a + . 
 /  :5CpH66  z]F-3-I-I-K*K
mZ0 r    