
    biA             
          d dl Z d dlZd dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZmZmZ d dlZd dlZd dlZd dlZd dlmZmZmZmZmZ d d	lmZmZ d d
l m!Z! d dlm"Z" d dl#m$Z% d dlm&Z&m'Z' d dlm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5m6Z6 ddl7m8Z8m9Z9m:Z: ddl;m<Z<m=Z= ddl>m?Z? ddl@mAZAmBZB ddlCmDZDmEZEmFZF ddlGmHZH ddlImJZJ ddlKmLZL ddlMmNZNmOZOmPZPmQZQmRZRmSZS  e5       rd dlTmUZUmVZV  eA       rd dlWmXZX  eB       rd dlYmZZZm[Z[ d dl\m]Z]  e0       rd dl^Z^ee_e,ee`e`ge`ea   f   f   Zb G d  d!e'      Zcd"ej                  d#ej                  fd$Zed%efe_eej                     f   d&egd#e`efe_eej                     f      fd'Zhd%efe_eej                     f   d#efe_eej                     f   fd(Zid"ej                  d#ej                  fd)Zjd"ej                  d#ej                  fd*Zkd+ Zl G d, d-e.      Zmy).    N)defaultdictdeque)Sized)nullcontext)partial)Path)AnyCallableOptionalUnion)broadcast_object_listgathergather_objectis_peft_modelset_seed)DatasetIterableDataset)version)nn)FullyShardedDataParallel)
DataLoaderSampler)	AutoModelForCausalLM"AutoModelForSequenceClassificationAutoTokenizerGenerationConfigPreTrainedModelPreTrainedTokenizerBaseTrainerTrainerCallbackis_wandb_available)seed_worker)is_datasets_availableis_peft_availableis_rich_available   )apply_chat_templateis_conversationalmaybe_apply_chat_template)profiling_contextprofiling_decorator)
VLLMClient)is_liger_kernel_availableis_vllm_available)prepare_deepspeedprepare_fsdpunwrap_model_for_generation)_ForwardRedirection   )SyncRefModelCallback)
GRPOConfig)disable_dropout_in_modelgenerate_model_cardget_comet_experiment_urlpadprint_prompt_completions_sampleselective_log_softmax)
PeftConfigget_peft_model)LigerFusedLinearGRPOLoss)LLMSamplingParams)GuidedDecodingParamsc                   R    e Zd ZdZ	 	 	 	 ddedededededee   fd	Zd
 Z	defdZ
y)RepeatSamplera=  
    Sampler that repeats the indices of a dataset in a structured manner.

    Args:
        data_source (`Sized`):
            Dataset to sample from.
        mini_repeat_count (`int`):
            Number of times to repeat each index per batch.
        batch_size (`int`, *optional*, defaults to `1`):
            Number of unique indices per batch.
        repeat_count (`int`, *optional*, defaults to `1`):
            Number of times to repeat the full sampling process.
        shuffle (`bool`, *optional*, defaults to `True`):
            Whether to shuffle the dataset.
        seed (`int` or `None`, *optional*, defaults to `None`):
            Random seed for reproducibility (only affects this sampler).

    Example:
    ```python
    >>> sampler = RepeatRandomSampler(
    ...     ["a", "b", "c", "d", "e", "f", "g"], mini_repeat_count=2, batch_size=3, repeat_count=4
    ... )
    >>> list(sampler)
    [4, 4, 3, 3, 0, 0,
     4, 4, 3, 3, 0, 0,
     4, 4, 3, 3, 0, 0,
     4, 4, 3, 3, 0, 0,
     1, 1, 2, 2, 6, 6,
     1, 1, 2, 2, 6, 6,
     1, 1, 2, 2, 6, 6,
     1, 1, 2, 2, 6, 6]
    ```

    ```txt
    mini_repeat_count = 3
          -   -   -
         [0,  0,  0,  1,  1,  1,  2,  2,  2,  3,  3,  3,      |
          4,  4,  4,  5,  5,  5,  6,  6,  6,  7,  7,  7,      |
          8,  8,  8,  9,  9,  9, 10, 10, 10, 11, 11, 11,      |
                                                                repeat_count = 2
          0,  0,  0,  1,  1,  1,  2,  2,  2,  3,  3,  3,      |
          4,  4,  4,  5,  5,  5,  6,  6,  6,  7,  7,  7,      |
          8,  8,  8,  9,  9,  9, 10, 10, 10, 11, 11, 11, ...] |
          ---------   ---------   ---------   ---------
           ---------   ---------   ---------   ---------
            ---------   ---------   ---------   ---------
                         batch_size = 12
    ```
    Ndata_sourcemini_repeat_count
batch_sizerepeat_countshuffleseedc                     || _         || _        || _        || _        t	        |      | _        || _        || _        |r8t        j                         | _
        || j                  j                  |       y y y N)rD   rE   rF   rG   lennum_samplesrH   rI   torch	Generator	generatormanual_seed)selfrD   rE   rF   rG   rH   rI   s          S/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/trl/trainer/grpo_trainer.py__init__zRepeatSampler.__init__   sq     '!2$({+	"__.DN**40       c              #   >  K   | j                   r:t        j                  | j                  | j                        j                         }nt        t        | j                              }t        dt        |      | j                        D cg c]  }|||| j                  z     }}|D cg c]  }t        |      | j                  k(  s| }}|D ]A  }t        | j                        D ]'  }|D ]   }t        | j                        D ]  }|  " ) C y c c}w c c}w w)N)rP   r   )rH   rN   randpermrM   rP   tolistlistrangerL   rF   rG   rE   )rR   indexesichunk_indexs         rS   __iter__zRepeatSampler.__iter__   s    <<nnT%5%5PWWYG5!1!123G >C1c'lTXTcTc=de71q4??23ee '.OUUt1N5OO 	$E4,,- $" $E"4#9#9: $#$$$	$ f Ps%   BD	D"D(DD
ADreturnc                 N    | j                   | j                  z  | j                  z  S rK   )rM   rE   rG   rR   s    rS   __len__zRepeatSampler.__len__   s$    $"8"884;L;LLLrU   )r3   r3   TN)__name__
__module____qualname____doc__r   intboolr   rT   r`   rd    rU   rS   rC   rC   V   sl    0l "11 1 	1
 1 1 sm1,$*M MrU   rC   tensorra   c                     t        j                  | t        j                  | d      z
  dz        }t        j                  t        j                  |              }|||dz
  z  z  }t        j                  |      S )a%  
    Compute the standard deviation of a tensor, ignoring NaNs. This function only supports 1D tensors.

    Args:
        tensor (`torch.Tensor`):
            Input tensor of shape `(N,)`.

    Returns:
        `torch.Tensor`:
            Standard deviation of the tensor, ignoring NaNs.
    T)keepdimr&   r3   )rN   nanmeansumisnansqrt)rl   variancecounts      rS   nanstdru      sc     }}fu}}VT'JJqPQHIIu{{6**+E##H::hrU   tensor_dict
num_chunksc                    t        d | j                         D              }|j                  d   |z  }t        |      D cg c]6  }| j	                         D ci c]  \  }}|||||z  |dz   |z   nd c}}8 c}}}S c c}}w c c}}}w )a   
    Splits a dictionary of tensors along the first dimension into `num_chunks` equal parts.

    Example:
    ```python
    >>> x = torch.arange(12).reshape(6, 2)
    >>> y = torch.arange(6).reshape(6, 1)
    >>> tensor_dict = {"x": x, "y": y}
    >>> split_tensor_dict(tensor_dict, 3)
    [
        {"x": tensor([[0, 1], [2, 3]]), "y": tensor([[0], [1]])},
        {"x": tensor([[4, 5], [6, 7]]), "y": tensor([[2], [3]])},
        {"x": tensor([[ 8,  9], [10, 11]]), "y": tensor([[4], [5]])}
    ]
    ```
    c              3   &   K   | ]	  }||  y wrK   rk   .0rl   s     rS   	<genexpr>z$split_tensor_dict.<locals>.<genexpr>        X6VEWX   r   Nr3   )nextvaluesshaperZ   items)rv   rw   first_tensor
chunk_sizer\   keyrl   s          rS   split_tensor_dictr      s    & X[-?-?-AXXL##A&*4J z" 
   +002	
V &BTJ!a%:)=>Z^^	
 	
s   BB7BBc                     t        d | j                         D              }|j                  d   }t        j                  |      }| j                         D ci c]  \  }}||||   nd c}}S c c}}w )a  
    Shuffles a dictionary of tensors along the first dimension in unison.

    Example:
    ```python
    >>> x = torch.arange(6).reshape(3, 2)
    >>> y = torch.arange(3).reshape(3, 1)
    >>> tensor_dict = {"x": x, "y": y}
    >>> shuffle_tensor_dict(tensor_dict)
    {'x': tensor([[2, 3],
                    [0, 1],
                    [4, 5]]),
        'y': tensor([[1],
                    [0],
                    [2]])}
    ```
    c              3   &   K   | ]	  }||  y wrK   rk   rz   s     rS   r|   z&shuffle_tensor_dict.<locals>.<genexpr>   r}   r~   r   N)r   r   r   rN   rW   r   )rv   r   rF   permutationr   rl   s         rS   shuffle_tensor_dictr      sm    $ X[-?-?-AXXL##A&J..,KXcXiXiXklfC(:$Dllls   A0c                 
   t        j                  |       j                         r5t        j                  t	        d      | j
                  | j                        S t        j                  | t        j                  |                 S )a&  
    Compute the minimum value of a tensor, ignoring NaNs. This function only supports 1D tensors.

    Args:
        tensor (`torch.Tensor`): Input tensor of shape `(N,)`.

    Returns:
        `torch.Tensor`: Minimum value of the tensor, ignoring NaNs. Returns NaN if all values are NaN.
    nandtypedevice)rN   rq   allrl   floatr   r   minrl   s    rS   nanminr     X     {{6 ||E%LV]]SS99VU[[00122rU   c                 
   t        j                  |       j                         r5t        j                  t	        d      | j
                  | j                        S t        j                  | t        j                  |                 S )a&  
    Compute the maximum value of a tensor, ignoring NaNs. This function only supports 1D tensors.

    Args:
        tensor (`torch.Tensor`): Input tensor of shape `(N,)`.

    Returns:
        `torch.Tensor`: Maximum value of the tensor, ignoring NaNs. Returns NaN if all values are NaN.
    r   r   )rN   rq   r   rl   r   r   r   maxr   s    rS   nanmaxr     r   rU   c                     | S )z Do we really need docs for this?rk   )xs    rS   identityr     s    HrU   c                       e Zd ZdZddgZ	 	 	 	 	 	 	 	 d0deeef   deee	e   f   de
e   de
eeef      d	e
eeeeeeeef   f   f      d
e
e   de
eee	e   f      de
e	e      dee
ej&                  j(                     e
ej&                  j*                  j,                     f   de
d   f fdZd Zd Zd1de
e   defdZdefdZdededefdZed1d       Zed1dej@                  fd       Z!d2de"jF                  defdZ$ed        Z%edeeeej@                  e&f   f   deeeej@                  e&f   f   fd       Z'e fd        Z(d!e	eeeej@                  e&f   f      deeeej@                  e&f   f   f fd"Z)d# Z*ed3d$       Z+d% Z,d1d&e
e	e      fd'Z-d1d(eee.f   d)e
e.   ddf fd*Z/ fd+Z0	 	 	 d4d,e
e   d-e
e   d.eee	e   df   fd/Z1 xZ2S )5GRPOTrainera  
    Trainer for the Group Relative Policy Optimization (GRPO) method. This algorithm was initially proposed in the
    paper [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language
    Models](https://huggingface.co/papers/2402.03300).

    Example:

    ```python
    from datasets import load_dataset
    from trl import GRPOTrainer

    dataset = load_dataset("trl-lib/tldr", split="train")


    def reward_func(completions, **kwargs):
        # Dummy reward function that rewards completions with more unique letters.
        return [float(len(set(completion))) for completion in completions]


    trainer = GRPOTrainer(
        model="Qwen/Qwen2-0.5B-Instruct",
        reward_funcs=reward_func,
        train_dataset=dataset,
    )

    trainer.train()
    ```

    Args:
        model (`Union[str, PreTrainedModel]`):
            Model to be trained. Can be either:

            - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
              path to a *directory* containing model weights saved using
              [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
              using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in
              `args.model_init_kwargs`.
            - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
        reward_funcs (`Union[RewardFunc, list[RewardFunc]]`):
            Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward
            functions with the prompts and completions and sum the rewards. Can be either:

            - A single reward function, such as:
                - A string: The *model ID* of a pretrained model hosted inside a model repo on huggingface.co, or a
                path to a *directory* containing model weights saved using
                [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
                using [`~transformers.AutoModelForSequenceClassification.from_pretrained`] with `num_labels=1` and the
                keyword arguments in `args.model_init_kwargs`.
                - A [`~transformers.PreTrainedModel`] object: Only sequence classification models are supported.
                - A custom reward function: The function is provided with the prompts and the generated completions,
                  plus any additional columns in the dataset. It should return a list of rewards. Custom reward
                  functions can also return None when the reward is not applicable to those samples. This is useful for
                  multi-task training where different reward functions apply to different types of samples. When a
                  reward function returns None for a sample, that reward function is excluded from the reward
                  calculation for that sample. For more details, see [Using a custom reward
                  function](#using-a-custom-reward-function).
            - A list of reward functions, where each item can independently be any of the above types. Mixing different
            types within the list (e.g., a string model ID and a custom reward function) is allowed.
        args ([`GRPOConfig`], *optional*, defaults to `None`):
            Configuration for this trainer. If `None`, a default configuration is used.
        train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
            Dataset to use for training. It must include a column `"prompt"`. Any additional columns in the dataset is
            ignored. The format of the samples can be either:

            - [Standard](dataset_formats#standard): Each sample contains plain text.
            - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
              and content).
        eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
            Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
        processing_class ([`~transformers.PreTrainedTokenizerBase`], *optional*, defaults to `None`):
            Processing class used to process the data. The padding side must be set to "left". If `None`, the
            processing class is loaded from the model's name with [`~transformers.AutoTokenizer.from_pretrained`]. A
            padding token, `processing_class.pad_token`, must be set. If the processing class has not set a padding
            token, `processing_class.eos_token` will be used as the default.
        reward_processing_classes (`Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]`, *optional*, defaults to `None`):
            Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:

            - A single processing class: Used when `reward_funcs` contains only one reward function.
            - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`.
            If set to `None`, or if an element of the list corresponding to a [`~transformers.PreTrainedModel`] is
            `None`, the tokenizer for the model is automatically loaded using
            [`~transformers.AutoTokenizer.from_pretrained`]. For elements in `reward_funcs` that are custom reward
            functions (not [`~transformers.PreTrainedModel`]), the corresponding entries in `reward_processing_classes`
            are ignored.
        callbacks (list of [`~transformers.TrainerCallback`], *optional*, defaults to `None`):
            List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
            in [here](https://huggingface.co/docs/transformers/main_classes/callback).

            If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
            method.
        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
            A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your
            model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
        peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`):
            PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
    trlgrpoNmodelreward_funcsargstrain_dataseteval_datasetprocessing_classreward_processing_classes	callbacks
optimizerspeft_configr<   c                 b   |Jt        |t              r|n|j                  j                  }|j	                  d      d   }t        | d      }|j                  xs i }t        |t              r|}|j                  d      }t        |t        j                        s|dk(  s|n5t        |t              rt        t        |      }||d<   nt        d| d      |j                  rdn|j                  d	      |d	<   t        j                  |fi |}n-|j                  j                  }|j                  t        d
      |
!t               st!        d      t#        ||
      }|j                  r| j%                  ||      }|+t'        j                  |j                  j                  d      }|j(                  |j*                  |_        t        |t,              s|g}g | _        t1        |      D ]  \  }}t        |t              rt3        j                  |fddi|||<   t        ||   t4        j6                        rE| j.                  j9                  ||   j                  j                  j	                  d      d          | j.                  j9                  ||   j:                          || _        |j>                  tA        |j>                        tA        |      k7  r.t        dtA        |j>                         dtA        |       d      t        jB                  |j>                  t        jD                        | _        n3t        jF                  tA        |      t        jD                        | _        |d gtA        |      z  }n6t        |t,              s|g}n"tA        |      tA        |      k7  rt        d      t1        tI        ||            D ]  \  }\  }}t        |tJ              s|)t'        j                  |j                  j                        }|jL                  |j*                  |_        |jL                  |j                  _&        |||<    || _'        |jP                  | _(        |jR                  | _)        |jT                  | _*        |jV                  | _+        |jX                  | _,        |jZ                  | _-        |j\                  | _.        |j^                  | _/        |j`                  | _0        |jb                  | _1        |jd                  | _2        |jf                  | _3        |jh                  | _4        |jj                  | _5        |jl                  | _6        |jn                  | _7        |jp                  | _8        t        |tr              s@t        |tr              s0t        |tt              r+tw        d |jy                         D              rt{        d      |j|                  | _>        |j~                  | _@        |j                  |j                  n|j~                  | _A        d| _B        d | _C        d|j                  d<   t        |   ||t        |||||	       |j                  | _H        | j                  dk(  rd | _I        n.t        |      rd | _I        nt        j                  |fi || _I        |j                  r,t        |       | j                  t        | j                         | jh                  rt               st!        d      t               | _O        t        | j                  | j                  | j                  | jV                  | j                  dk7  | jj                  | jR                        | _Q        t        t,              t        t,              d| _S        d| _T        |j                  | _U        |j                  | _V        |j                  | _W        | j                  j                  |j                  z  |j                  z  t              t              t        fd       t              d!| _]        t        |j                  d"       | j`                  rt               st!        d#      | jb                  d$k(  r| j                  j                  rm|j                  |j                  }nd%|j                   d&|j                   }t        ||j                  '      | _g        | j                  j                          n| jb                  d(k(  r| j                  j                  | jf                  z  dk(  s0t        d)| jf                   d*| j                  j                   d+      | jf                  dkD  rt        j                  j                  t        | j                  j                  | jf                  z        D cg c]4  }t-        t        || jf                  z  |dz   | jf                  z              6 c}      \  | _l        }t        |j                  |jf                  | jd                  | j                  j                  | jf                  z  | j                  j                  z  | jP                  | jR                  z   d,| j                  j                  | jf                  z  d-.      | _r        |j                  | _t        d| _u        | j                  j                          n| jR                  d|jL                  |j                  |j                  | jV                  | jX                  | jZ                  | j\                  | j^                  |j                  d/}|j                  |j                  |j                         t        d3i || _}        d| _~        | j                  j                  | j                         | j                  | j                  r't        | j                  | j                        | _I        na| j                  r't        | j                  | j                        | _I        n-| j                  j                  | j                  d0      | _I        |j                  r2| j                  t        | j                  | j                  1             t1        | j<                        D ]t  \  }}t        |tJ              s| j                  r%t        || j                        | j<                  |<   I| j                  j                  |dd2      | j<                  |<   v y c c}w )4N/z-GRPOtorch_dtypeautozInvalid `torch_dtype` passed to `GRPOConfig`. Expected either 'auto' or a string representing a `torch.dtype` (e.g., 'float32'), but got .F	use_cachezYou passed `model_init_kwargs` to the `GRPOConfig`, but your model is already instantiated. This argument can only be used when the `model` argument is a string.z>PEFT is required to use `peft_config`. Run `pip install peft`.left)padding_side
num_labelsr3   zNumber of reward weights (z)) must match number of reward functions ())r   zRThe number of reward processing classes must match the number of reward functions.c              3   <   K   | ]  }t        |t                y wrK   )
isinstancer   )r{   dss     rS   r|   z'GRPOTrainer.__init__.<locals>.<genexpr>  s     6w[]z"o7V6ws   z^Iterable datasets are not yet supported in GRPOTrainer. Please use a standard dataset instead.r   Testimate_tokens)r   r   data_collatorr   r   r   r   r           zWLiger is required to use `liger_loss` as the GRPO loss. Run `pip install liger-kernel`.)betaepsilon_lowepsilon_hightemperatureuse_ref_model	loss_typemax_completion_length)trainevalmaxlenc                      t               S )Nr   )r   r   s   rS   <lambda>z&GRPOTrainer.__init__.<locals>.<lambda>f  s    5+? rU   )prompt
completionrewards
advantages)device_specificzkvLLM is not available and `use_vllm` is set to True. Please install vLLM with `pip install vllm` to use it.serverzhttp://:)base_urlconnection_timeoutcolocatezvllm_tensor_parallel_size (z) must divide world size (z	) evenly.external_launcheri   )r   tensor_parallel_sizegpu_memory_utilizationmax_num_seqsmax_model_lendistributed_executor_backendrI   max_num_batched_tokens)max_new_tokens	do_samplepad_token_idbos_token_ideos_token_idr   top_ptop_kmin_prepetition_penaltycache_implementation)evaluation_mode)	ref_modelaccelerator)r   device_placementrk   )r   strconfig_name_or_pathsplitr5   model_init_kwargsgetrN   r   getattr
ValueErrorgradient_checkpointingr   from_pretrainedr$   ImportErrorr=   _enable_gradient_checkpointingr   	pad_token	eos_tokenrY   reward_func_names	enumerater   r   Moduleappendre   r   reward_weightsrL   rl   float32oneszipr   r   r   max_prompt_lengthr   num_generationsr   r   r   r   r   use_vllm	vllm_modevllm_gpu_memory_utilizationvllm_tensor_parallel_sizeuse_liger_lossr   scale_rewardsmask_truncated_completionsshuffle_datasetr   dictanyr   NotImplementedErrornum_iterationsepsilonr   r   _step_buffered_inputswarnings_issuedsuperrT   r   r   r   r   disable_dropoutr6   r-   r2   _forward_redirectionr>   liger_grpo_lossr   _metrics_total_train_tokenslog_completionswandb_log_unique_promptsnum_completions_to_printr   num_processesper_device_train_batch_sizesteps_per_generationr   _textual_logsr   rI   r.   is_main_processvllm_server_base_urlvllm_server_hostvllm_server_portr,   vllm_server_timeoutvllm_clientinit_communicatordistributednew_subgroups_by_enumerationrZ   tp_groupr?   name_or_pathr   gradient_accumulation_stepsprocess_indexllmvllm_guided_decoding_regexguided_decoding_regex_last_loaded_stepwait_for_everyoner   r   r   generation_kwargsupdater   generation_configmodel_accepts_loss_kwargsr   add_model_tags
_tag_namesis_deepspeed_enabledr/   is_fsdp_enabledr0   prepare_modelsync_ref_modeladd_callbackr4   )rR   r   r   r   r   r   r   r   r   r   r   
model_namer   model_idr   r\   reward_funcreward_processing_classr   r^   r.  r   	__class__s                        @rS   rT   zGRPOTrainer.__init__  s    <",UC"8ell>X>XJ#))#.r2JE23D !228beS!H+//>K+u{{3{f7LP[PcK-%e[93>!-0 BBMaQ  44:K:O:OP[:\ k* )88TBSTE||11H%%1 \ 
 "$&!"bcc"5+6E &&77tDE #,<<U\\=W=Wflm%%-)9)C)C& ,-(>L!#'5 	HNA{+s+"D"T"T#,-#1B#Q ,q/2995&&--l1o.D.D.R.R.X.XY\.]^`.ab&&--l1o.F.FG	H ) *4&&'3|+<< 0T5H5H1I0J K""%l"3!4A7  #(,,t/B/B%--"XD"'**S->emm"TD %,)-\1B(B%5t<)B(C%,-\1BB !uvv9B3G`bnCo9p 		G5A5'+7*2.;.K.KKL^L^LlLl.m+*77?8O8Y8Y+5 3J2V2V""//F)!,		G *C& "&!7!7%)%?%?"#33++ZZ
ZZ
ZZ
"&"9"9+/+K+K()-)G)G&"11!//*.*I*I'  $33 }o6,8<.36wamatatav6w3w &p 
 #11<<151B1B1ND--TXT`T`
 !% 48/0"'%-! 	 		
 II	99!DN5! "DN 2AA(`N_`DN $U+~~)(8 ,.!m  )<(=D%#;YY ,,!.. ,,"ii3...&*&@&@$D  #.d"3[=NO#$ #33(,(E(E%(,(E(E% !!//$2R2RRUYUnUnn6*v."#?@v.	
 	D1==$&!4 
 ~~)d.>.>.N.N,,8#88H!()>)>(?qAVAV@WXH#-xTXTlTl#m   224:- ''558V8VVZ[[$5d6T6T5U V ,,::;9F 
 11A5 (-'8'8'U'U &+4+;+;+I+ITMkMk+k%l ! !q4+I+I'IAPQEUYUsUsKs!tu($DM1 ,,)-)G)G+/+K+K!%!F!F44"5ii;;"< #'"8"84;U;U"U1D))774;Y;YY+/  *.)H)HD&%'D"
 ..0 #'"<"<! 0 = = 0 = = 0 = =#//&*&=&=(,(A(A! %%1!(()?)?@%5%J8I%JD"
 */& 	

!!$//2>>%((!24>>4CSCS!T%%!-dnnd>N>N!O!%!1!1!?!?`d!?!e2T^^Y]YiYijk'(9(9: 	NA{+7,,+<[$JZJZ+[D%%a( ,0+;+;+I+I#TD ,J ,D%%a(	Gs   9t,c                 .    | j                   	dg| _         y y )Nr   )_signature_columnsrc   s    rS    _set_signature_columns_if_neededz,GRPOTrainer._set_signature_columns_if_needed  s    
 ""*'/jD# +rU   c                    | j                   t        d      | j                   }| j                  }t               r.t	        |t
        j                        r| j                  |d      }n| j                  |d      }| j                  | j                  j                  z  || j                  j                  | j                  j                  | j                  j                  d}t	        |t        j                   j"                  j$                        s| j'                         |d<   | j                  j(                  |d<   t+        j,                  t.        j0                        t+        j,                  d      k\  r>t3        t4        | j                  j                  | j                  j6                        |d	<   n	t4        |d	<   | j                  j8                  |d
<   | j:                  j=                  t?        |fi |      S )Nz+Trainer: training requires a train_dataset.training)description)rF   
collate_fnnum_workers
pin_memorypersistent_workerssampler	drop_lastz4.52.0)rE  rankworker_init_fnprefetch_factor) r   r   r   r#   r   datasetsr   _remove_unused_columns"_get_collator_with_removed_columns_train_batch_sizer   r  dataloader_num_workersdataloader_pin_memorydataloader_persistent_workersrN   utilsdatar   _get_train_samplerdataloader_drop_lastr   parsetransformers__version__r   r"   r(  dataloader_prefetch_factorr   preparer   )rR   r   r   dataloader_paramss       rS   get_train_dataloaderz GRPOTrainer.get_train_dataloader  s   %JKK**** "z-AQAQ'R 77S]7^M CCM_iCjM 004993Q3QQ'99;;))99"&))"I"I
 -)9)9)I)IJ+/+B+B+Di(-1YY-K-Kk*}}\556'--:QQ6=TYY-M-MTXT]T]TkTk7!"23 7B!"2337993W3W/0''
=(VDU(VWWrU   datasetra   c                    || j                   }t        || j                  | j                  j                  | j                  z  | j
                  | j                  j                  z  | j                  | j                  j                        S )N)rD   rE   rF   rG   rH   rI   )	r   rC   r   r   generation_batch_sizer
  r  r  rI   )rR   r_  s     rS   rV  zGRPOTrainer._get_train_sampler  sq    2 ?((G"22yy66$:N:NN,,tyy/M/MM((
 	
rU   c                 Z    t        || j                  | j                  j                        S )N)rD   rE   rI   )rC   r   r   rI   )rR   r   s     rS   _get_eval_samplerzGRPOTrainer._get_eval_sampler/  s&    $"22
 	
rU   c                     d|j                   _        t        |      r|j                  j	                          n|j	                          |j
                  xs i }d|vxs |d   }|r|j                          |S )z-Enables gradient checkpointing for the model.Fuse_reentrant)r   r   r   
base_modelgradient_checkpointing_enablegradient_checkpointing_kwargsenable_input_require_grads)rR   r   r   rh  re  s        rS   r   z*GRPOTrainer._enable_gradient_checkpointing7  s{     "' ::< //1(,(J(J(Pb%#@@rDabqDr 	 ,,.rU   c                     t        |      r|j                  j                  }|j                  ||      j                  }|d d d dd d f   }||d d | d d d f   }|S )N)	input_idsattention_maskr   )r   rf  r   last_hidden_state)rR   unwrapped_modelrk  rl  logits_to_keeprm  s         rS   _get_last_hidden_statez"GRPOTrainer._get_last_hidden_stateM  sp    )-88>>O+11IVd1eww-a"ai8% 1!n_5Eq2H I  rU   c                 |   |xs |j                  d      }g }t        d|j                  d      |      D ]p  }||||z    }||||z    }	 |||	|dz         j                  }
|
d d d dd d f   }
|d d | d f   }|
| j                  z  }
t	        |
|      }|j                  |       r t        j                  |d      S )Nr   r3   )rk  rl  ro  r   dim)sizerZ   logitsr   r;   r   rN   cat)rR   r   rk  rl  ro  rF   	all_logpsr\   input_ids_batchattention_mask_batchru  logpss               rS   _get_per_token_logpsz GRPOTrainer._get_per_token_logpsX  s    49>>!#4
	q)..+Z8 	$A'A
N;O#1!a*n#E  ):N_mpq_qf  AssAI&F-a.1A.ABO d...F)&/BEU#	$ yy**rU   moduleprefixc                    |
t               }|j                         D ]$  \  }}|r| d| n|}| j                  |||       & t        |t              r-t	        j
                  |dd      5  |j                         D ]  \  }}|r| d| n|}	dD ]  }
|	j                  |
d      }	 |	|v r/|j                  |	       | j                  dk(  r=| j                  j                  r'| j                  j                  |	|j                         | j                  d	k(  s| j                  j                   j"                  j$                  j&                  j(                  }|j+                  |	|j                  fg        	 ddd       yy# 1 sw Y   yxY w)
zdMemory-efficient post-order traversal of FSDP modules to extract full parameters and sync with vLLM.Nr   )r}  visitedF)recurse	writeback)z_fsdp_wrapped_module.z_checkpoint_wrapped_module. r   r   )setnamed_children_sync_fsdp_params_to_vllmr   FSDPsummon_full_paramsnamed_parametersreplaceaddr   r   r  r!  update_named_paramrU  r)  
llm_enginemodel_executordriver_workermodel_runnerr   load_weights)rR   r|  r}  r  
child_namechild_modulechild_prefix
param_nameparam	full_nameextra	llm_models               rS   r  z%GRPOTrainer._sync_fsdp_params_to_vllmm  s   ?eG(.(=(=(? 	$J7=fXQzl3:L**\7 + 	 fd#((%P J)/)@)@)B J%J<B6(!J< 8
I!Y A$-$5$5eR$@	A !G+ KK	*~~1d6F6F6V6V((;;IuzzR:5$(HH$7$7$F$F$T$T$a$a$g$g	!..EJJ0G/HIJJ J $J Js   -B,FAFFc                    | j                   j                  j                  }|d uxr |j                  dk(  }|rdd l}|j
                  j                  }nt        }t        | j                        r |t        | j                  j                                     5  | j                  j                          | j                  r| j                  | j                         n-| j                  j                         D ]  \  }}|j!                  d      j#                  dd      }| j                  j$                  |v rAd|v rF|j#                  dd      }| j&                  dk(  r=| j                   j(                  r'| j*                  j-                  ||j.                         | j&                  d	k(  s| j0                  j2                  j4                  j6                  j8                  j                  }|j;                  ||j.                  fg        | j                  j=                          d d d        n| j                  r| j                  | j                         n| j                  j                         D ]  \  }} ||g      5  | j&                  dk(  r=| j                   j(                  r'| j*                  j-                  ||j.                         nk| j&                  d	k(  r\| j0                  j2                  j4                  j6                  j8                  j                  }|j;                  ||j.                  fg       d d d         | j&                  dk(  r1| j                   j(                  r| j*                  j?                          y | j&                  d	k(  r| j0                  j?                          y y # 1 sw Y   txY w# 1 sw Y   RxY w)
N   r   zbase_model.model.z.base_layerr  original_modulezmodules_to_save.default.r   r   ) r   statedeepspeed_plugin
zero_stage	deepspeedzeroGatheredParametersr   r   r   rY   
parametersmerge_adapterr5  r  r  removeprefixr  r}  r   r  r!  r  rU  r)  r  r  r  r  r  unmerge_adapterreset_prefix_cache)rR   r  zero_stage_3r  gather_if_zero3namer  r  s           rS   _move_model_to_vllmzGRPOTrainer._move_model_to_vllm  s     ++11BB't3X8H8S8SWX8X'nn??O)O$ !djj&;&;&=!>? -

((* '' 224::> (,zz'B'B'D Ie#001DEMMm]_`::,,4$,4$#||,FK>>X5$:J:J:Z:Z ,,??ejjQ!^^z9(,(;(;(J(J(X(X(e(e(k(kI%22T5::4F3GHI  

**,5- -< ##..tzz:#'::#>#>#@ IKD%(%1 I>>X5$:J:J:Z:Z ,,??ejjQ!^^z9(,(;(;(J(J(X(X(e(e(k(kI%22T5::4F3GHI II >>X%$*:*:*J*J//1^^z)HH'') *Y- -DI Is!   DN+*A9N+>B8N7+N47O	generation_batchc                    | j                   j                  rdnd}|dk(  r| j                  j                  | j                  z  }| j
                  |z  dk(  s| j                  A| j                  |      }t        |      }t        || j                  j                        | _        | j                  | j
                  | j                  j                  z     }| xj
                  dz  c_        |S | j                  |      }|S )Nr   r   r   r3   )
r   rB  r   r  r
  r  r  _generate_and_score_completionsr   r   )rR   r  modegenerate_everyinputss        rS   _prepare_inputszGRPOTrainer._prepare_inputs  s    " **--w67?!YY;;d>Q>QQNzzN*a/43H3H3P#'#G#GHX#Y #67G#H (9:JDIILjLj(k%**4::		8V8V+VWFJJ!OJ
  99:JKFrU   c           
      R   | j                   j                  }t        j                  t	        |      t	        | j
                        |      }|d   D cg c]	  }|dvs| }}|D 	ci c]  }||D 	cg c]  }	|	|   	 c}	 }
}}	t        t        | j
                  | j                  | j                              D ]\  \  }\  }}}t        | |      5  t        |t        j                        rt        |d         r@t        ||      D cg c]  \  }}d||z   i }}}|D cg c]  }t        ||      d    }}n!t        ||      D cg c]
  \  }}||z    }}} ||dddd	
      }t         | E  |      }t        j$                         5   |di |j&                  d d df   |d d |f<   d d d        nZ |d|||d|
}|D cg c]  }||nt        j(                   }}t        j*                  |t        j,                  |      |d d |f<   d d d        _ t        j.                  |      j1                  d      j3                         rt        j.                  |      j1                  d      j5                  d      d   d   }|
j7                         D ci c]  \  }}|||    }}}||   |d<   ||   |d<   t9        j:                  d| d       t=        |      }|S c c}w c c}	w c c}	}w c c}}w c c}w c c}}w # 1 sw Y   xY wc c}w # 1 sw Y   fxY wc c}}w )Nr   r   )r   r   completion_idsmessagestextptTrightFr  return_tensorspaddingr   add_special_tokens)promptscompletionsr  r   r3   rr  )as_tupler   r   z=All reward functions returned None for the following kwargs: zI. Please ensure that at least one reward function returns a valid reward.rk   )r   r   rN   zerosrL   r   r   r   r   r   r*   r   r   r   r(   r'   r  r  inference_moderu  r   rl   r   rq   r   r  nonzeror   warningswarnr   )rR   r  r  r  completion_ids_listr   rewards_per_funcr   keysexamplereward_kwargsr\   r;  r<  reward_func_namepcr  r   textsreward_inputsoutput_reward_funcrewardnan_row_idxvaluerow_reward_kwargsr=  s                             rS   _calculate_rewardszGRPOTrainer._calculate_rewards  sa   !!(( ;;s7|S9J9J5KTZ[  &aybC7a,abbNRSs6Bws|BBSSKT!!4#A#A4CYCYZL
 	rGAG46F #4)9: rk2995(3DGQ\D]#^DAqZQ$7#^#^bj k]^!4Q8O!PQW!X k k36w3L M41aQ M M$;"4T[pu%M %*G$;M$JM--/ [1<1M}1M1T1TUVXYUY1Z(A.[ [ *5 * '[Qd*hu*& ew)wZ`F4F&EII*U)w&)w-2\\:LTYTaTajp-q$QT*)r r	r4 ;;'(,,,3779++&67;;;BJJTXJYZ[\]^_KKXK^K^K` aZS%eK&8!8 a a*1+*>h'.9+.Fl+MMOPaOb cZ Z ""23U cBS $_ k M
[ [ *x%r r2 !bs   	K$K$(
K.2K)>K.8L	K4
L!K:7L	K?
3L!L,LL /LL#)K.4LL
LL 	r  c                 T"   | j                   j                  }| j                  j                  rdnd}|D cg c]  }|d   	 }}|D cg c]  }t	        || j
                        d    }}| j                  |dddd      }t        F|   |      }|d	   |d
   }
}	| j                  |	d d | j                   d f   }	|
d d | j                   d f   }
| j
                  j                  |	dd      }|D cg c]D  }t        j                  dt        j                  | j
                  j                         dd|      F }}| j                  r1| j                  j                   | j"                  k7  r+| j%                          | j                  j                   | _        | j&                  dk(  r`t)        |      }| j                   j*                  r|d d | j,                     }t/        | d      5  | j0                  j3                  || j,                  | j4                  | j6                  | j8                  | j:                  dn| j:                  | j<                  dn| j<                  | j>                  | j@                  | jB                  jD                  
      }d d d        nd gtG        |      z  }tI        d      }tK        | j                   jL                  tG        |      z  | j                   jL                  dz   tG        |      z        }||   }n| j&                  dk(  r| j@                  rtO        d| j@                        }nd }d| j4                  | j6                  | j8                  | j:                  dn| j:                  | j<                  dn| j<                  | j>                  |d}| jB                  jD                  %|jQ                  | jB                  jD                         tS        d<i |}| jT                  dkD  rrtG        |      }tW        | jT                        D cg c]  }d  }}tX        jZ                  j]                  ||| j^                         |D cg c]  }|D ]  }|  }}}n|}t/        | d      5  | j`                  j3                  ||d      }d d d        D cg c]  }|jb                  D ]  }|jd                   ! }}}| jT                  dkD  rDtX        jZ                  jg                  | j^                        }tK        |z  |dz   |z        }||   }D cg c]  }tY        jh                  ||       }}tk        || j
                  jl                        }tY        jn                  |	|gd      }ntq        | jr                  | j                   | jB                  jt                         5 }| jv                  r!ty        jz                  | jr                  d!      n	t}               5  |j3                  |	|
| j~                  "      }d d d        d d d        |	j                  d      } d d d | f   }	|d d | d f   }|| j
                  j                  k(  }!tY        j                  |!j                  d      f|!j                  d      tX        j                  |#      }"|!j                         j                  d      |!j                  d         |"|!j                  d      <   tY        j                  |!j                  d      |      j                  |!j                  d      d      }#|#|"j                  d      k  j                         }$t        ||$      D %&'(cg c]5  \  }%}&t        |%|&      D '(cg c]  \  }'}(|(s	|'j                          c}(}'7 })}'}&}%}(|$j                  d      }*| j                  r6|!j                  d       }+|$|+ j                  d      j                         z  }$tY        jn                  |
|$gd      },|j                  d      }-|dk(  r| jB                  j                  n| jB                  j                  }.tY        j                         5  | j                  dkD  s-| jB                  j                  | jB                  j                  kD  r | j                  | j                  ||,|-|.      }/nd }/| j                  dk7  r| j                  | j                  | j                  ||,|-      }0n]| j                   j                  | j                        j                         5  | j                  | j                  ||,|-      }0d d d        nd }0d d d        | j
                  j                  |d$      }1t        |d         rOg }2t        ||1      D ]=  \  }3}4|3d   d%   d&k(  r|3j                         d'   nd}5|2j                  d&|5|4z   d(g       ? n|1}2| j                  |||2|)      }6|6| j                  j                  |      j                  d      z  j                  d      }7|7j                  d| j,                        j                  d      }8|7j                  d| j,                        j                  d      }9tY        j                  |9tY        j                  |9            }:|8j                  | j,                  d      }8|9j                  | j,                  d      }9|7|8z
  };| j                  r|;|9d)z   z  };tK        | j                   jL                  tG        |      z  | j                   jL                  dz   tG        |      z        }|;j                         }<|;|   };|dk(  rb| j                  xj                  | j                   j                  |,j                               j                         j                         z  c_h        | j                  j                  g| j                  |   d*<   | j                   j                  |*      }=| j                  |   d+   j                  |=j                         j                         j                                | j                  |   d,   j                  |=j                         j                         j                                | j                  |   d-   j                  |=j                         j                         j                                | j                   j                  |!j                  d            }>|=|>   }?dtG        |?      tG        |=      z  z
  }@| j                  |   d.   j                  |@       tG        |?      dk(  rtY        j                  d|      }?| j                  |   d/   j                  |?j                         j                         j                                | j                  |   d0   j                  |?j                         j                         j                                | j                  |   d1   j                  |?j                         j                         j                                t        | j                        D ]  \  }A}BtY        j                  |6d d |Af         j                         }C| j                  |   d2|B d3   j                  |C       t        |6d d |Af         j                         }D| j                  |   d2|B d4   j                  |D        | j                  |   d5   j                  |8j                         j                                | j                  |   d6   j                  |9j                         j                                | j                  |   d7   j                  |:j                         j                         j                                | j                  d   j                  t)        |             | j                  d8   j                  t)        |1             t        | j                        D ];  \  }A}E| j                  d9   |E   j                  |6d d |Af   j                                = | j                  d:   j                  |<j                                |	|
||$|;/0d;S c c}w c c}w c c}w # 1 sw Y   xY wc c}w c c}}w # 1 sw Y   <xY wc c}}w c c}w # 1 sw Y   
xY w# 1 sw Y   
xY wc c}(}'w c c}(}'}&}%w # 1 sw Y   xY w# 1 sw Y   xY w)=Nr   r   r   r  Tr   Fr  rk  rl  )skip_special_tokensclean_up_tokenization_spacesz^(z)+r  r   zvLLM.generater   r   )
r  nr   r   r   r   r   
max_tokensr+  r.  r   )from_processr3   r   outlines)backendregex)r  r   r   r   r   r   r  guided_decoding)group)sampling_paramsuse_tqdmr  )padding_valuerr  )gather_deepspeed3_params)r  )rl  r0  r   )r  role	assistantcontent)r  r  g-C6?
num_tokenszcompletions/mean_lengthzcompletions/min_lengthzcompletions/max_lengthzcompletions/clipped_ratioz"completions/mean_terminated_lengthz!completions/min_terminated_lengthz!completions/max_terminated_lengthzrewards/z/meanz/stdr  
reward_stdfrac_reward_zero_stdr   r   r   )
prompt_idsprompt_maskr  completion_maskr   old_per_token_logpsref_per_token_logpsrk   )vr   r   r   rB  r)   r   r  r  r   batch_decoderesubescaper   r   r  global_stepr,  r  r   r   r  r   r*   r!  generater   r   r   r   r   r   r+  r   r.  rL   r   slicer(  rA   r/  r@   r  rZ   rN   r#  all_gather_objectr%  r)  outputs	token_idsget_rankrl   r9   r   rv  r1   model_wrappedds3_gather_for_generationr5  r  r  r   r0  rt  r   fulllongri   argmaxr  arangeexpand	unsqueezer   itemrp   r  r  per_device_eval_batch_sizeno_gradr
  r  r'  r{  r   r   unwrap_modeldisable_adapterr(   popr   r  r   tonansumviewmeanstdisclose
zeros_likerepeat_interleaver  clonenum_input_tokens_seenr   r  r   r   r   r  r   r   ro   ru   r  extendrX   )GrR   r  r   r  r   r  r  prompts_textprompt_inputsr  r  r  all_prompts_textordered_set_of_promptsr  process_slicer  r.  r  	orig_sizer^   gathered_promptssublistr  all_outputsr  outputlocal_rank_in_grouptp_sliceidsprompt_completion_idsrn  prompt_lengthis_eoseos_idxsequence_indicesr  rowmask_rowidmr  completion_lengthstruncated_completionsrl  ro  rF   r  r  completions_textr  r   r   	bootstrapr  r   mean_grouped_rewardsstd_grouped_rewardsis_std_zeror   all_process_advantagesagg_completion_lengthsagg_terminated_with_eosterm_completion_lengthsclipped_completions_ratior\   r  mean_rewardsstd_rewardsr  r=  sG                                                                         rS   r  z+GRPOTrainer._generate_and_score_completions  s    !!((**--w6(./11X;//kqr`g1'4;P;PQRZ[rr--dDvjo . 
 />"/"<mL\>]K
!!- $A(>(>'>'@$@AJ%a$*@*@)@)B&BCK00==TY > L dp[_"RYYt'<'<'F'FGHKRQUVL 
 ==zz%%)?)??((*)-)?)?& ~~)#0#> ##33 .>>UAUAU>U-V**4A )-)9)9)B)B$:"22/3/F/F(,(8(8"&**(,

(:"

)-);#'+'A'A262L2L.2ii.I.I *C *  '+Vc2B.C%CN "7~TU!V %$$22S\A%%33a73w<G! "0!> :---&::UYUoUo&pO&*O *.*A*A#'#3#3!ZZ#'::#5R4::$(JJ$6SDJJ"&"<"<'6	%! 99..:%,,TYY-H-HI"0"E3D"E11A5 !$L 1I6;D<Z<Z6['\'\$'\%%778H,^b^k^k7l9I'[gSZ'[a'['[$'['3$&t_= w"&(("3"34DVepu"3"vKw CN!lw\c\k\k!lRX&"2"2!l"2!l!l11A5 +0*;*;*D*D4==*D*Y'$%89%DGZ]^G^bkFklH%3H%=N KYY3ell3v>YNY t?T?T?a?abN$)IIz>.JPQ$R! -""D$4$4tyyOrOr 
  ++ ++D,>,>N$
 -<,D,D";RVRhRh -E -)
 'OOA.M.q.=./@AJ21mn3DEN  4#8#8#E#EE**fkk!n.AejjY_`%+ZZ\%8%8Q%8%?

q
@Q%R

q
!" <<AvFMMfkkZ[n^`a+w/@/@/CCHHJ
 SVVdfuRv
 
ANhCX$6<52q!RWWY<
 

 -003 **%+ZZAZ%6$6!-2G1G0R0RST0U0Y0Y0[[O K#AqI',,Q/>BgoTYY::SWS\S\SwSw
]]_ 	+ ""Q&$))*H*H499KpKp*p&*&?&?JJ 5~~Wa'# '+# yyC>>-*.*C*C(=~~+' ))66tzzBRRT .2.G.G JJ(=~~/+ 
 '+#/	+4  00==nbf=gVAY'K&)'3C&D _"
7=bz&7I[7XFJJL3^`	""[YQ[E[$\#]^_ +K
  2267KQde $d&9&9&<&<V&D&N&Nq&QQYY^_Y`  '||B0D0DEJJqJQ%ll2t/C/CDHHQHOmm$79I9IJ]9^_  4EEdFZFZ`aEb1CCDDXDX^_C`33
#':T'ABJ **S\9++a/3w<?
 ",!1!1!3.
 7?JJ,,0@0@0G0GHZHZH\0]0a0a0c0h0h0jj,-1ZZ-M-M,NdL) "&!1!1!8!89K!Ld56==>T>Z>Z>\>a>a>c>h>h>jkd45<<=S=Y=Y=[=_=_=a=f=f=hid45<<=S=Y=Y=[=_=_=a=f=f=hi #'"2"2"9"9&***:K"L"89P"Q$%,C(DsKaGb(b$b!d78??@YZ&'1,&+kk!F&C#d@AHHI`IfIfIhImImIoItItIvwd?@GGH_HeHeHgHkHkHmHrHrHtud?@GGH_HeHeHgHkHkHmHrHrHtu $-T-C-C#D 	WA ==)9!Q$)?@EEGLMM$(+;*<E BCJJ<X !1!Q$!78==?KMM$(+;*<D ABII+V		W
 	dH%,,-A-F-F-H-M-M-OPdL)001D1I1I1K1P1P1RSd23::;;L;L;N;S;S;U;Z;Z;\] 	8$++M,,GH<(//>N0OP !7!78 	XGAty)$/667G17M7T7T7VW	X<(//0F0M0M0OP %&,.$#6#6
 	
 0r & b (]'[w w "m Z 
 
6 =
H %	+ 	+s   AB) AB.3A	AB3(B&AB8	ACAC
7AC#$AC!AC#28AC5*AC(	AC5AD
4
AD?ADAD
CAD*AD	ADB8ACCACC(AC2	C-AC5C5AC?DAD
DAD	DADDAD'c                 V   |d   |d   }}|d   |d   }}t        j                  ||gd      }t        j                  ||gd      }|j                  d      }	| j                  ||||	      }
| j	                  |
|j
                  j                  |||d   |j
                  j                  |d   |d	   
      \  }}| j                  dk7  r|d   nd }|d   }| j                  j                  rdnd}| j                  dk7  rV| j                  |   d   j                  | j                  j                  |      j                         j!                                | j                  |   d   j                  | j                  j                  |      j                         j!                                |S )Nr  r  r  r  r3   rr  r   r  r  )_input
lin_weightselected_token_idsrl  r   biasr  r  r   r   r   r   r   kl
clip_ratio)rN   rv  rt  rp  r  lm_headweightr;  r   r   rB  r  r   r   r   r  r  )rR   rn  r  r  r  r  r  rk  rl  ro  rm  lossmetricsmean_klr=  r  s                   rS   compute_liger_losszGRPOTrainer.compute_liger_loss*  s   "("6}8MK
*01A*BFK\D]IIz>:B	K#AqI',,Q/ !77Tbdrs ,,$&..55-*l+ ((-- &'< = &'< = - 	
g !%		S 0'!*dR[
**--w699MM$%,,T-=-=-D-DW-M-R-R-T-Y-Y-[\dL)001A1A1H1H1T1Y1Y1[1`1`1bcrU   c                     |rt        d      | j                  r:| j                  j                  |      }| j	                  ||| j
                  ||      S | j                  ||      S )Nz2The GRPOTrainer does not support returning outputs)r   r  r   r  r  rC  _compute_loss)rR   r   r  return_outputsnum_items_in_batchrn  s         rS   compute_losszGRPOTrainer.compute_lossK  sd    QRR"..;;EBO,,UOTE\E\^mouvv%%eV44rU   c                 
   |d   |d   }}|d   |d   }}t        j                  ||gd      }t        j                  ||gd      }|j                  d      }	| j                  ||||	      }
| j                  dk7  r&|d   }t        j
                  ||
z
        ||
z
  z
  dz
  }|d	   }|d
   |
j                         n|d
   }t        j
                  |
|z
        }t        j                  |d| j                  z
  d| j                  z         }| j                  j                  +t        j                  || j                  j                        }||j                  d      z  }||j                  d      z  }t        j                  ||       }| j                  dk7  r|| j                  z  z   }| j                  dk(  rE||z  j                  d      |j                  d      j                  d      z  j!                         }n| j                  dk(  r5||z  j                         |j                         j                  d      z  }nZ| j                  dk(  r3||z  j                         |j                  d      | j"                  z  z  }nt%        d| j                         | j&                  j(                  rdnd}| j                  dk7  rz|z  j                         |j                         z  }| j*                  |   d   j-                  | j.                  j1                  |      j3                         j5                                |d| j                  z
  k  |j                  d      dk  z  }|d| j                  z   kD  |j                  d      dkD  z  }||z  }||z  j                         |j                         z  }||z  j                         |j                         z  }||z  j                         |j                         z  }| j.                  j1                  |      }| j*                  |   d   j-                  |j3                         j5                                | j*                  |   d   j-                  t7        |      j5                                | j.                  j1                  |      }| j*                  |   d   j-                  |j3                         j5                                | j*                  |   d   j-                  t9        |      j5                                | j.                  j1                  |      }| j*                  |   d   j-                  |j3                         j5                                |S )Nr  r  r  r  r3   rr  r   r  r   r  )r   r   r   g      ?)r   bnpodr_grpor   zUnknown loss type: r   r   r<  zclip_ratio/low_meanzclip_ratio/low_minzclip_ratio/high_meanzclip_ratio/high_maxzclip_ratio/region_mean)rN   rv  rt  r{  r   expdetachclampr   r   r   deltar  r   r   rp   r  r   r   r   rB  r  r   r   r   ro   r  r   r   ) rR   r   r  r  r  r  r  rk  rl  ro  per_token_logpsr  per_token_klr   r  coef_1coef_2per_token_loss1per_token_loss2per_token_lossr@  r  rB  is_low_clippedis_high_clippedis_region_clippedlow_clip	high_clipr=  gathered_low_clipgathered_high_clipgathered_clip_ratios                                    rS   rE  zGRPOTrainer._compute_lossV  s   "("6}8MK
*01A*BFK\D]IIz>:B	K#AqI',,Q/33E9nVde 99"()>"?		-?@DWZiDijmnn 
 L)

 )//D(E(MO""$SYZoSp 	 ?-@@AVQ)9)9%91t?P?P;PQ 99??&[[TYY__=F :#7#7#:: :#7#7#::))O_EE99+dii,.FFN>>V##o5::2>ATATUWAXA^A^cfA^AggmmoD^^v%"_499;o>Q>Q>S>Y>Y^a>Y>bbD^^y("_499;~?R?RST?UX\XrXr?rsD24>>2BCDD **--w699#o5::<?R?R?TTGMM$%,,T-=-=-D-DW-M-U-U-W-\-\-^_ !1t'7'7#77J<P<PQR<SVW<WX!A(9(9$99j>R>RST>UXY>YZ*_<"_499;o>Q>Q>SS$6;;=@S@S@UU	'/9>>@?CVCVCXX
 ,,33H=d1299:K:S:S:U:Z:Z:\]d0188@Q9R9W9W9YZ!--44Y?d23::;M;U;U;W;\;\;^_d1299&AS:T:Y:Y:[\"..55jAd45<<=P=X=X=Z=_=_=abrU   ignore_keysc                 *   | j                  |      }t        j                         5  | j                         5  | j	                  ||      }d d d        j                         j                         }d d d        d d fS # 1 sw Y   4xY w# 1 sw Y   xY wrK   )r  rN   r  compute_loss_context_managerrH  r  rM  )rR   r   r  prediction_loss_onlyr_  r@  s         rS   prediction_stepzGRPOTrainer.prediction_step  s    %%f-]]_ 	(224 8((7899;%%'D	( T48 8	( 	(s"   B	A=
&B	=B	B		Blogs
start_timec           	         | j                   j                  rdnd}| j                  |   j                         D ci c]  \  }}|t	        |      t        |      z   }}}|dk(  r&|j                         D ci c]  \  }}d| | }}}i ||}t        
|   ||       | j                  |   j                          | j                  j                  r| j                  rt               rbt        | j                  d   | j                  d   | j                  d   | j                  d   | j                  j                   | j"                         | j$                  j&                  rd| j$                  j&                  v rt(        j*                  d	d l}t/        | j                  j                         gt        | j                  d         z  | j                  d   | j                  d   d
| j                  d   d| j                  d   i}|j1                  |      }	| j2                  r|	j5                  dg      }	t)        j                  dt)        j6                  |	      i       y y y y y y c c}}w c c}}w )Nr   r   eval_r   r   r   r   wandbr   )stepr   r   	advantage)subsetr  )	dataframe)r   rB  r  r   rp   rL   r  logclearr   r  r  r%   r:   r  r  r  r  r   	report_torh  runpandasr   	DataFramer  drop_duplicatesTable)rR   rd  re  r  r   valrA  pdtabledfr=  s             rS   rm  zGRPOTrainer.log  s9   **--w6<@MM$<O<U<U<WXS3C3s8++XX 6>:A--/Jhc3se}c)JGJ"$"'"D*%d!!#++0D0D "/&&x0&&|4&&y1&&|4JJ**11 yy""w$))2E2E'E%))J_# !!7!789C@R@RS[@\<]]"00:"&"4"4\"B ((3	
  !3!3L!A \\%(00++H:+>B		=%++*CDE K`'E" 1E+ Y
 Ks   "I*9I0c                    | j                   j                  *t        | j                   j                        j                  }n(| j                   j                  j                  d      d   }| j                  |       t        | !  ||       y )Nr   r   )r9  )	r   hub_model_idr   
output_dirr  r   create_model_cardr  _save_checkpoint)rR   r   trialr9  r=  s       rS   r}  zGRPOTrainer._save_checkpoint  sl    99!!)dii22388J//55c:2>J*5 .rU   r9  dataset_nametagsc                    | j                         syt        | j                  j                  d      r^t        j
                  j                  | j                  j                  j                        s!| j                  j                  j                  }nd}|t               }nt        |t              r|h}nt        |      }t        | j                  j                  d      r|j                  d       |j                  | j                         t        j                  d      }t!        ||| j"                  ||t%               r.t&        j(                  t&        j(                  j+                         ndt-               d|dd	      }|j/                  t        j
                  j1                  | j2                  j4                  d
             y)a  
        Creates a draft of a model card using the information available to the `Trainer`.

        Args:
            model_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the model.
            dataset_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the dataset used for training.
            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                Tags to be associated with the model card.
        Nr   unsloth_versionunslotha              @article{zhihong2024deepseekmath,
                title        = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
                author       = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
                year         = 2024,
                eprint       = {arXiv:2402.03300},
            }
            GRPOzRDeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Modelsz
2402.03300)rf  r9  rz  r  r  	wandb_url	comet_urltrainer_nametrainer_citationpaper_titlepaper_idz	README.md)is_world_process_zerohasattrr   r   ospathisdirr   r  r   r   r  r/  r3  textwrapdedentr7   rz  r!   rh  rp  get_urlr8   savejoinr   r{  )rR   r9  r  r  rf  citation
model_cards          rS   r|  zGRPOTrainer.create_model_card  sG   " ))+4::$$o6rww}}TZZM^M^MlMl?m**88JJ <5Dc"6Dt9D4::$$&78HHYDOO$??	
 )!!**%-?-AeiiF[eii'')ae.0%l!

 	TYY%9%9;GHrU   )NNNNNN)NNNrK   )r  N)FN)NNN)3re   rf   rg   rh   r3  r   r   r   
RewardFuncrY   r   r5   r   r   r  r   r    tuplerN   optim	Optimizerlr_schedulerLambdaLRrT   r@  r^  r   rV  rc  r   r+   rp  Tensorr{  r   r   r  r  r	   r  r  r  rC  rH  rE  rc  r   rm  r}  r|  __classcell__)r=  s   @rS   r   r   $  sk   _B J &*CGnr>Bmq59jv.2NS/)*N JZ(889N z"	N
  g&> ?@N uWotCwXgOgIhDh?i%ijkN ##:;N $,E2I4PgKh2h,i#jN D12N (5;;#8#898EKKD\D\DeDe;ffgN l+N`
1$XB"
(7*; "
w "
H
 
O : Zi , ! ! +iniuiu + +(J		 J3 J8 <* <*|  $S%c0A*B%B C	c5s*++	, @ /  / bM
4U5<<+<%= =>?M
	c5s*++	,M
^B 5 5GR PXY]^aYbPc  %FS%Z( %Fhuo %FQU %FP/ %)&*,0	?ISM?I sm?I CcD()	?IrU   r   )nr  r  r  r  collectionsr   r   collections.abcr   
contextlibr   	functoolsr   pathlibr   typingr	   r
   r   r   rM  rN   torch.utils.datarY  accelerate.utilsr   r   r   r   r   r   r   	packagingr   r   torch.distributed.fsdpr   r  r   r   r   r   r   r   r   r   r   r    r!   transformers.trainer_utilsr"   transformers.utilsr#   r$   r%   
data_utilsr'   r(   r)   extras.profilingr*   r+   extras.vllm_clientr,   import_utilsr-   r.   modelsr/   r0   r1   models.utilsr2   r   r4   grpo_configr5   rT  r6   r7   r8   r9   r:   r;   peftr<   r=   liger_kernel.chunked_lossr>   vllmr?   r@   vllm.sampling_paramsrA   rh  r   rY   r   r  rC   r  ru   r  ri   r   r   r   r   r   r   rk   rU   rS   <module>r     s   
 	   * ! "   1 1     b b -   C 0
 
 
 3 Z Z Z Z E + G Q Q . + #  /B(9 34,U2K)LLM
_MG _MF 5<<  ELL  $c8ELL112@C	$sHU\\**
+,<mT#x/E*E%F m4PSU]^c^j^jUkPkKl m035<< 3ELL 335<< 3ELL 3
rI' rIrU   