
    bi                        d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
mZ d dlZd dlZd dlZd dlmZ d dlmc mZ d dlZd dlmZ d dlmZ d dlmZmZ d dlmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z% d d	l&m'Z'm(Z( d d
l)m*Z* d dl+m,Z,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8 ddl9m:Z: ddl;m<Z< ddl=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZF  e,       rd dlGmHZHmIZI  e$       rd dlJmKZK  e-       r,d dlLmMZN  ej                  eN       ej                  d      k\  ZPndZP e4       rd dlQmRZRmSZS  e%       rd dlTZT e.j                  eV      ZW G d de"      ZXy)    N)wraps)Path)AnyCallableOptionalUnion)Dataset)version)
DataLoaderIterableDataset)BaseImageProcessorDataCollatorFeatureExtractionMixinGenerationConfigPreTrainedModelPreTrainedTokenizerBaseProcessorMixinTrainerTrainerCallbackis_apex_availableis_wandb_available)EvalPredictionseed_worker)OptimizerNames)is_peft_availableis_sagemaker_mp_enabledlogging   )apply_chat_templateis_conversationalmaybe_apply_chat_template)is_vllm_available)create_reference_model)unwrap_model_for_generation   )BasePairwiseJudge)OnlineDPOConfig)	SIMPLE_CHAT_TEMPLATEDPODataCollatorWithPaddingdisable_dropout_in_modelempty_cachegenerate_model_cardget_comet_experiment_url
get_rewardprepare_deepspeedtruncate_right)	PeftModelget_peft_model)amp)__version__z1.10F)LLMSamplingParamsc            "           e Zd ZdZddgZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 d)deeej                  f   deeej                  df   deeej                  df   de	e
   d	e	e   d
e	e   de	eeedf      de	eeeeef   df      de	eeeeef      de	e   de	e   de	eegef      de	ee      deej6                  j8                  ej6                  j:                  j<                  f   de	eej>                  ej>                  gej>                  f      ddf  fdZ e!d        Z"e#de$dedeee%f   fd       Z& e'e(jR                        de*fd       Z) e'e(jV                        d*de	eeef      de*fd       Z+d Z,d Z-d Z.	 d*dej                  d eeeej>                  e%f   f   d!e	e/   dej>                  fd"Z0	 d*d#Z1 fd$Z2	 	 	 d+d%e	e   d&e	e   d'eeee   df   fd(Z3 xZ4S ),OnlineDPOTrainera	  
    Initialize OnlineDPOTrainer.

    Args:
        model (`transformers.PreTrainedModel` or `torch.nn.Module`):
            The model to train, preferably an `AutoModelForCausalLM`.
        ref_model (`transformers.PreTrainedModel` or `torch.nn.Module` or `None`):
            The reference model to use for training. If None is specified, the reference model will be created from the
            model.
        reward_model (`transformers.PreTrainedModel` or `torch.nn.Module` or `None`):
            The reward model to score completions with, preferably an `AutoModelForSequenceClassification`.
        judge (`BasePairwiseJudge`):
            The judge to use for pairwise comparison of model completions.
        args (`OnlineDPOConfig`):
            The online DPO config arguments to use for training.
        data_collator (`transformers.DataCollator`):
            The data collator to use for training. If None is specified, the default data collator
            (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the
            sequences in the batch, given a dataset of paired sequences.
        train_dataset (`datasets.Dataset`):
            The dataset to use for training.
        eval_dataset (`datasets.Dataset`):
            The dataset to use for evaluation.
        processing_class (`PreTrainedTokenizerBase` or `BaseImageProcessor` or `FeatureExtractionMixin` or `ProcessorMixin`, *optional*):
            Processing class used to process the data. If provided, will be used to automatically process the inputs
            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
            reuse the fine-tuned model.
        peft_config (`dict`):
            The peft config to use for training.
        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
            The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
            metric values.
        callbacks (`list[transformers.TrainerCallback]`):
            The callbacks to use for training.
        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
            The optimizer and scheduler to use for training.
        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
            The function to use to preprocess the logits before computing the metrics.
    trlz
online-dpoNmodel	ref_modelreward_modeljudgeargsdata_collatortrain_datasetzdatasets.Dataseteval_datasetprocessing_classreward_processing_classpeft_configcompute_metrics	callbacks
optimizerspreprocess_logits_for_metricsreturnc                    ||u rt        d      || _        ||t        j                  dt               d }n||t        d      || _        |
| _        || _        |j                  j                  | _	        |j                  |t        d      |t        d      |	t        d      |At               st        d      t        |t              r|j                         }t!        ||      }|j"                  r,t%        |       | j                  t%        | j                         ||t'        |      | _        n)d | _        n!|| _        | j                  j)                          | j
                  | j
                  j)                          |t+        |	j,                        }|j.                  | _        g g g g g g g g g g g d	| _        | j
                  -g | j0                  d
<   g | j0                  d<   g | j0                  d<   |j2                  rut5               st        d      t7        d|j8                  |j:                  ddd      | _        t?        |j@                  |jB                  tD        jF                        | _$        n8tK        |j8                  |j:                  ddd|jL                  rdnd      | _        d|jN                  d<   tP        |   ||||||	||||
       tU        | jV                  d      r%| jV                  jY                  | jZ                         |j\                  | _/        | j`                  r| j
                  ;tc        | j
                  |jd                  |jf                  |jh                        | _        | j                  <tc        | j                  |jd                  |jf                  |jh                        | _        y y | j                  4| j                  jk                  | jl                  jn                        | _        | j
                  5| j
                  jk                  | jl                  jn                        | _        y y )Nz`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the same as `model`, either omit the `ref_model` argument or pass `None`.zBoth `reward_model` and `judge` are provided. Please choose provide only one of them. Ignoring `judge` and using `reward_model`.z2Either `reward_model` or `judge` must be provided.z@`missing_eos_penalty` is not supported when `judge` is provided.z`args` must be provided.z$`processing_class` must be provided.zfPEFT is not available and passed `peft_config`. Please install PEFT with `pip install peft` to use it.)pad_token_id)objective/klobjective/entropyobjective/non_score_rewardrewards/chosenrewards/rejectedrewards/accuraciesrewards/marginslogps/chosenlogps/rejectedval/contain_eos_tokenbetaobjective/rlhf_rewardobjective/scores_marginobjective/scoreszkvLLM is not available and `use_vllm` is set to True. Please install vLLM with `pip install vllm` to use it.r   2   g      ?F)n
max_tokenstemperaturetop_ktop_p
detokenize)r:   gpu_memory_utilizationdtypeT)max_new_tokensr]   r^   r_   	do_sample	use_cacheestimate_tokens)
r:   r>   r?   r@   rA   rB   rE   rF   rG   rH   add_model_tags)8
ValueErrorr;   warningswarnUserWarningr<   rC   r=   configis_encoder_decodermissing_eos_penaltyr   ImportError
isinstancer1   merge_and_unloadr2   disable_dropoutr*   r#   evalr)   rK   
max_lengthstatsuse_vllmr"   r6   rc   r]   generation_configr5   name_or_pathra   torchfloat32llmr   gradient_checkpointingwarnings_issuedsuper__init__hasattrr:   rg   
_tag_namesrV   _betais_deepspeed_enabledr/   per_device_train_batch_sizefp16bf16toacceleratordevice)selfr:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   	__class__s                   Y/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/trl/trainer/online_dpo_trainer.pyr   zOnlineDPOTrainer.__init__   s&   ( X 
 ##(9MM=
 E!emQRR('>$
"',,"A"A##/E4E_``<788 #CDD "$&!4  %+..0 #5+6E $U+~~)(8 "!7!>!%&DNNN! (""$  6DTDaDabM// !#*,  ""$! %'

 (24DJJ./46DJJ01-/DJJ)*==$&!4  &4.. ,, &D" (('+'B'BmmDH &6#22 ,,#'#>#>%D&D" 48/0''%-+!*G 	 	
 4::/0JJ%%doo6YY
 $$  ,$5%%t'G'GTXT]T]%! ~~)!2NND$D$DdiiQUQZQZ" *
 ~~)!%!2!243C3C3J3J!K  ,$($5$5$8$89I9I9P9P$Q! -    c                     t        | j                  t              rL| j                  j                  }|t        | j                        k  r| j                  |   S | j                  d   S | j                  S )N)rp   r   liststateepochlen)r   r   s     r   rV   zOnlineDPOTrainer.betaM  sU    djj$'JJ$$E(-DJJ(?4::e$STZZPR^S::r   rm   	tokenizerc                 D   |sd || d   d      }|j                   Xt        |d         }|dk(  s|j                   |d   d   k7  r0|j                   g|d   z   |d<   dg|d   z   |d<   n || d   d      }|j                         D ci c]  \  }}d	| | }}}|S c c}}w )
z2Tokenize a single row from a DPO specific dataset.promptF)add_special_tokens	input_idsr   r%   attention_maskTprompt_)bos_token_idr   items)featurerm   r   batchprompt_len_input_idskeyvalues          r   tokenize_rowzOnlineDPOTrainer.tokenize_rowU  s     "gh/EJE%%1'*5+='>$'1,	0F0F%P[J\]^J_0_*3*@*@)AE+DV)VE+&/0cE:J4K.KE*+gh/DIE:?++-HJC73%%'HH Is   Bc                 >   | j                   t        d      | j                   }| j                  }| j                  || j                  j
                  | j                  j                  | j                  j                  d}t        |t        j                  j                  j                        sN| j                         |d<   | j                  j                  |d<   t        |d<   | j                  j                   |d<   | j"                  j%                  t'        |fi |      S )Nz+Trainer: training requires a train_dataset.
batch_size
collate_fnnum_workers
pin_memorypersistent_workerssampler	drop_lastworker_init_fnprefetch_factor)r@   rh   r?   _train_batch_sizer>   dataloader_num_workersdataloader_pin_memorydataloader_persistent_workersrp   ry   utilsdatar   _get_train_samplerdataloader_drop_lastr   dataloader_prefetch_factorr   preparer   )r   r@   r?   dataloader_paramss       r   get_train_dataloaderz%OnlineDPOTrainer.get_train_dataloaderf  s    %JKK****00'99;;))99"&))"I"I
 -)9)9)I)IJ+/+B+B+Di(-1YY-K-Kk*2=./37993W3W/0''
=(VDU(VWWr   c                    || j                   t        d      t        |t              r|nd}t	        | d      rL|| j
                  v r>| j                  j                  r(| j                  j                  | j
                  |         S t        |t              r| j                   |   n||n| j                   }| j                  }| j                  j                  || j                  j                  | j                  j                  | j                  j                  d}t        |t        j                  j                   j"                        sF| j%                  |      |d<   | j                  j&                  |d<   | j                  j(                  |d<   t+        |fi |}| j                  j                  r%t	        | d      r|| j
                  |<   n	||i| _        | j                  j                  |      S )Nz-Trainer: evaluation requires an eval_dataset.rs   _eval_dataloadersr   r   r   r   )rA   rh   rp   strr   r   r>   r   r   r   r?   eval_batch_sizer   r   ry   r   r   r   _get_eval_samplerr   r   r   )r   rA   dataloader_keyr?   r   eval_dataloaders         r   get_eval_dataloaderz$OnlineDPOTrainer.get_eval_dataloader~  s   D$5$5$=LMM *4L#)FFD-.$"8"88		77##++D,B,B>,RSS ,, l+ ' "" 	 ** ))33'99;;))99"&))"I"I
 ,(8(8(H(HI+/+A+A,+Oi(-1YY-K-Kk*37993W3W/0 %\G5FG9922t019H&&~6*8/)J&''88r   c           	      t   | j                   j                  }| j                   j                  }| j                  j                  j
                  j                  j                  j                  }|j                  |j                         j                                t        d|d   i      r)| j                  j                  || j                  d      }n(| j                  j                  || j                  d      }t!        d      D cg c]+  }|D ]$  }t#        |j$                  |   j&                        & - }	}}t!        d      D 
cg c]  }
|D ]  }t#        |j(                           }}
}t+        d |D              }|D cg c]$  }dg|t-        |      z
  z  dgt-        |      z  z   & }}|D cg c]  }|g|t-        |      z
  z  |z    }}| j                  j.                  }|	D cg c]$  }dgt-        |      z  dg|t-        |      z
  z  z   & }}|	D cg c]   }|d   |k7  rt-        |      |k  r||gz   n|" }	}|	D cg c]  }||g|t-        |      z
  z  z    }	}t1        j2                  || j4                  j6                  	      }t1        j2                  || j4                  j6                  	      }t1        j2                  |	| j4                  j6                  	      }	t1        j2                  || j4                  j6                  	      }|||	|fS c c}}w c c}}
w c c}w c c}w c c}w c c}w c c}w )
Nr   r   F)use_tqdmr   c              3   2   K   | ]  }t        |        y wN)r   ).0idss     r   	<genexpr>z2OnlineDPOTrainer._generate_vllm.<locals>.<genexpr>  s     ?SC?s   r%   r   r   )rB   eos_token_idrK   r{   
llm_enginemodel_executordriver_workermodel_runnerr:   load_weights
state_dictr   r    chatrw   generateranger   outputs	token_idsprompt_token_idsmaxr   r\   ry   tensorr   r   )r   r:   promptsr   rK   	llm_modelr   ioutputcompletion_ids_
prompt_idsmax_prompt_lengthr   prompt_maskr\   completion_masks                    r   _generate_vllmzOnlineDPOTrainer._generate_vllm  s   ,,99,,99 HH''66DDQQWW	u//1779:h
34hhmmGT-C-CemTGhh''1G1GRW'XGEJ1Xd\cdRX$v~~a0::;d;dd=B1X\T[\&d6223\3\
\  ?J??XbcQTs/#c(:;qcCHnLccWabPS|n(9CH(DEKb
b++66
UcdcA3S>QC:C3H,IIdd &
 %(G|$;C:@UC<. [^^
 
 UccS#*s3x2G HHcc \\*T5E5E5L5LM
ll;t7G7G7N7NOnT=M=M=T=TU,,t?O?O?V?VW;GG+ e\ dbd
 ds*   0L#L)L!3L&+)L+%L0L5c                    | j                   j                  }| j                   j                  }|D cg c]  }d|i }}|D cg c]  }t        || j                          }}|D cg c])  }| j	                  || j
                  | j                         + }}| j                  |      }| j                  |      }|d   j                  dd      }|d   j                  dd      }	t        || j                  | j                  j                        5 }
|
j                  ||	| j                        }d d d        d d |j                  d      d f   }t!        |||      \  }}||	||fS c c}w c c}w c c}w # 1 sw Y   HxY w)Nr   prompt_input_idsr   r%   prompt_attention_mask)gather_deepspeed3_params)r   r   rw   )rB   r   rK   r!   r   rm   r?   _prepare_inputsrepeatr$   r   r>   ds3_gather_for_generationr   rw   sizer0   )r   r:   r   r   rK   r   inputsxr   r   unwrapped_modelr   r   r   s                 r   	_generatezOnlineDPOTrainer._generate  s   ,,99,,99 4;;8V$;;OUV!+At/D/DEVV`fg[\$##At'>'>@U@UVgg##F+ %%f-./66q!<
45<<QB(4##dii>a>a
 	$--$*"&"8"8 . F	  :??1#5#7 78*8Wc*d';GG+ <Vg	 	s   E$E)%.E.E33E<c                    t        |j                  d      |j                  d      z   | j                  z
  d      }|d d |d f   }|d d |d f   }t        j                  ||fd      }t        j                  ||fd      } |||      }	|	j
                  d d |j                  d      dz
  df   }
t        j                  |
j                  d      |j                  d      d      j                  d      }|S )Nr%   r   dim)r   r   r   )
r   r   rt   ry   catlogitstake_along_dimlog_softmax	unsqueezesqueeze)r   r:   r   r   r   r   num_tokens_to_truncateprompt_completion_idsprompt_completion_maskr   r   logprobss               r   _forwardzOnlineDPOTrainer._forward  s   !$Z__Q%7.:M:Ma:P%PSWSbSb%bde!f  #9#: :;
!!%;%<"<= !&		:~*FA N!&K+Iq!Q ,=ST q*//!"4q"82"==> ''(:(:r(:(BND\D\]_D`fghppqstr   r   num_items_in_batchc                 2   |j                          |d   }t        |      }| j                  j                  r| j	                  ||      \  }}}}	n| j                  ||      \  }}}}	t        j                  || j                  j                  k(  d      }
| j                  |||||	      }t        j                         5  | j                   | j                  | j                  ||||	      }nB| j                  j                         5  | j                  | j                  ||||	      }d d d        d d d        |j                  }| j                  j!                  |d      }t#        d|d   i      r|D cg c]  }d|dg
 }}| j$                  t#        d|d   i      rgt'        j(                         }|j+                  t,              }|D cg c]  }|j/                  |	       }}|D cg c]  }|j/                  |	       }}| j$                  j%                  |t1        t3        |d | ||d                    }t        j4                  |D cg c]  }|dk(  	 c}|
      }nd|z  }t#        d|d   i      rht3        ||      D cg c]
  \  }}||d }}}|D cg c]  }t7        || j8                         }}|D cg c]  }|d   	 }}|D cg c]  }|d   	 }}| j9                  |ddd      d   j;                  |      }|j<                  d   }| j9                  |ddd      d   j;                  |      }t        j>                  ||fd      }t        j@                         5  tC        | jD                  || j8                  jF                  |      \  }}}| j                  jH                  "||
 xx   | j                  jH                  z  cc<   d d d        jK                  |      \  } }!| |!k\  }t        jL                  ||
      }"|"| |z  z   }#|"||z  z   }$t        j>                  |#|$fd      }%||%   }&|%   }'|	jO                          }(|(|%   })|&|) z  jQ                  d      }*|'|) z  jQ                  d      }+t        jJ                  |*|      \  },}-t        jJ                  |+|      \  }.}/|,|-z
  }0|.|/z
  }1|0|1z
  }2| j                  jR                  dk(  r$tU        jV                  | jX                  |2z         }3nJ| j                  jR                  dk(  r|2dd| jX                  z  z  z
  dz  }3nt[        d| jR                         |3j]                         }4| jD                  |#   ||$   z
  }5| j^                  d   ja                  | jb                  je                  |5j]                               j]                         jg                                | j^                  d   ja                  | jb                  je                  |j]                               j]                         jg                                | j^                  d   ja                  |
ji                         j]                         jg                                | j^                  d   ja                  | jb                  je                  |,      j]                         jg                                | j^                  d   ja                  | jb                  je                  |-      j]                         jg                                ||z
  }6|6jQ                  d      j]                         }7| j^                  d   ja                  | jb                  je                  |7      j]                         jg                                | jX                   |6z  jQ                  d      }8|8j]                         }9| j^                  d   ja                  | jb                  je                  |9      j]                         jg                                | jD                  X|8z   }:| j^                  d   ja                  | jb                  je                  |:      j]                         jg                                |jQ                  d      j]                          };| j^                  d   ja                  | jb                  je                  |;      j]                         jg                                | jX                  |,|.z
  z  }<| jb                  je                  |<      }=| j^                  d    ja                  |=j]                         jg                                | jX                  |-|/z
  z  }>| jb                  je                  |>      }?| j^                  d!   ja                  |?j]                         jg                                |=|?z
  }@| j^                  d"   ja                  |@j]                         jg                                |@dkD  }A| j^                  d#   ja                  |Aji                         j]                         jg                                | j^                  d$   ja                  | jX                         | j                  jj                  :| jl                  jn                  | j                  jj                  z  dk(  r
tq                i }B| j                  jr                  tt        jv                  tt        jx                  fv r| j{                         Bd%<   | j                  j|                  dkD  r|4j]                         }4| j~                  r:t        j                  |4| j                        5 }C|Cj                          d d d        n | jb                  j                  |4fi B |4j                         | j                  j                  z  S # 1 sw Y   
xY w# 1 sw Y   
xY wc c}w c c}w c c}w c c}w c c}}w c c}w c c}w c c}w # 1 sw Y   0xY w# 1 sw Y   xY w)&Nr   r   r   T)skip_special_tokensr   	assistant)rolecontent)messagesr   r   )r   
completionr  ptleft)paddingreturn_tensorspadding_sider   r%   rightsigmoidipozinvalid loss type rX   rY   rU   rS   rT   rL   rN   rW   rM   rO   rP   rR   rQ   rV   learning_rate)Ftrainr   r>   rv   r   r   ry   anyrB   r   r   no_gradr;   r:   disable_adapterr   batch_decoder    r=   jinja2Environmentfrom_stringr(   renderr   zipr   r   rC   r   shaper   inference_moder.   r<   rK   rn   splitarangeboolsum	loss_typeF
logsigmoidrV   NotImplementedErrormeanru   appendr   gather_for_metricsitemfloattorch_empty_cache_stepsr   global_stepr+   optimr   LOMOADALOMO_get_learning_raten_gpuuse_apexr3   
scale_loss	optimizerbackwarddetachgradient_accumulation_steps)Dr   r:   r   r   r   r   r   r   r   r   contain_eos_tokenr   ref_logprobsr   completionsr  environmenttemplater   ranks_of_first_completionrankmaskpcexamplesexampleprompts_idscontext_lengthcompletions_idsr   r   scores
first_halfsecond_halfbatch_rangechosen_indicesrejected_indices
cr_indicescr_logprobscr_ref_logprobspadding_maskcr_padding_maskcr_logprobs_sumcr_ref_logprobs_sumchosen_logprobs_sumrejected_logprobs_sumchosen_ref_logprobs_sumrejected_ref_logprobs_sumpi_logratiosref_logratiosr   losseslossscores_marginklmean_klnon_score_rewardmean_non_score_rewardrlhf_rewardmean_entropychosen_rewardsgathered_chosen_rewardsrejected_rewardsgathered_rejected_rewardsmarginaccuracykwargsscaled_losssD                                                                       r   training_stepzOnlineDPOTrainer.training_step  s
    	"\
99GKGZGZ[`biGjDJ^_GK~~V[]dGeDJ^_!IIn8M8M8Z8Z&Z`bc==
KQ`a]]_ 	w~~)#}}T^^ZVdfuvZZ//1 w#'==ZVdfu#vLw		w ++88]a8bh
34\ghj[ZHIhKh ::!
 !(GAJ!78$002&223GHJQR8??F?;RRVab
x
Cbb(,

(8(8c+kz":K
<TUV)% <<7P Qt QZ`aD 'kG (GAJ!78GJ7T_G`atq!q:aafno[b/9U9UVoo<DE78,EEDLMw|4MM 66d 7 2f:  )..q1N #::T$W ; 2f: 
 %*II{O.LRS$T!%%' P)%%'<d>Z>Z>g>giw 61 9900<--.$))2O2OO.P '-ll:&>#J ,Dll:f=$
(:;&$*;< YY0@AqI
z*&z2 (,,..&z2&/)99>>qA./1AAFFqI 6;[[R\5]22=B[[I\^h=i:!:*-BB/2KK-99)+ll499v#566FYY  E)qA		M22q8F%(:4>>:J&KLL{{} (">2V<L5MMMJJ0188  33M4F4F4HINNPUUW JJ)*11$2B2B2U2UV\VaVaVc2d2i2i2k2p2p2rs

*+223D3J3J3L3Q3Q3S3X3X3Z[

>"))$*:*:*M*MNa*b*g*g*i*n*n*pq

#$++D,<,<,O,OPe,f,k,k,m,r,r,tu$&&).."

>"))$*:*:*M*Mg*V*[*[*]*b*b*de!YYJO003 0 5 5 7

/077//0EFKKMRRT	
 ( #33KJJ./66t7G7G7Z7Z[f7g7l7l7n7s7s7uv Q,,..

&'..t/?/?/R/RS_/`/e/e/g/l/l/no&9<S&ST"&"2"2"E"En"U

#$++,C,H,H,J,O,O,QR99(=@Y(YZ$($4$4$G$GHX$Y!

%&--.G.L.L.N.S.S.UV(+DD

$%,,V[[]-?-?-ABA:

'(//0@0E0E0G0L0L0NO

6!!$)), II--9

&&)J)JJaOM 99??~22N4J4JKK&*&=&=&?F?#99??Q99;D==dnn5 '$$&' ' &D%%d5f5{{}tyyDDDD}w w		w 	w i Sb !R boEMP PJ' 'sn   =Ao
 n=$o
1o	o(o!o&o+o1?o6o;)A)p p=o	o

o p
pc	                    | j                   j                  r| j                  j                  | j                  kD  roi }	| j                  |      j                         j                         }
||z  }t        |
| j                  j                  | j                  z
  z  d      |	d<   |=t        |t        j                        r|j                         j                         n||	d<   |||	d<   n| j                         |	d<   | j                  j                         D ]  \  }}t!        |      t#        |      z  |	|<   ! | j                  D ci c]  }|g  c}| _        | xj$                  |
z  c_        | j                  j                  | _        | j'                          | j)                  |	|       d }| j                   j*                  rO| j-                  ||      }| j/                  ||      }| j0                  j2                  dk(  r|| j                   _        | j                   j4                  rS| j7                  ||       | j8                  j;                  | j0                  | j                  | j                         | _         y y c c}w )N   rX  	grad_normr  )metricstrialbest)control
should_logr   r)  _globalstep_last_logged_nested_gatherr#  r&  roundrp   ry   Tensorr3  r-  ru   r   r  r   _total_loss_scalar
store_floslogshould_evaluate	_evaluate_determine_best_metricr>   save_strategyshould_save_save_checkpointcallback_handleron_save)r   tr_lossrk  r:   rm  r   ignore_keys_for_eval
start_timer  logstr_loss_scalarr   valrl  is_new_best_metrics                  r   _maybe_log_save_evaluatez)OnlineDPOTrainer._maybe_log_save_evaluate  s    <<""tzz'='=@\@\'\%'D "009>>@EEGN wG 4::3I3IDLhLh3h!iklmDL$AKIW\WcWcAdI$4$4$6$;$;$=js[!((5_%(,(?(?(A_% !JJ,,. 0SHs3x/S	0-1ZZ8c#r'8DJ##~5#+/::+A+AD(OOHHT:&<<''nnU,@AG!%!<!<WTY!<!Zyy&&&0+=(<<##!!%/0088DJJPTP\P\]DL $ 9s   
I<c                    | j                   j                  *t        | j                   j                        j                  }n(| j                   j                  j                  d      d   }| j                  |       t        | !  ||       y )N/r   )
model_name)	r>   hub_model_idr   
output_dirnamer  create_model_cardr~   r}  )r   r:   rm  r  r   s       r   r}  z!OnlineDPOTrainer._save_checkpoint  sl    99!!)dii22388J//55c:2>J*5 .r   r  dataset_nametagsc                    | j                         syt        | j                  j                  d      r^t        j
                  j                  | j                  j                  j                        s!| j                  j                  j                  }nd}|t               }nt        |t              r|h}nt        |      }t        | j                  j                  d      r|j                  d       |j                  | j                         t        j                  d      }t!        ||| j"                  ||t%               r.t&        j(                  t&        j(                  j+                         ndt-               d|dd	      }|j/                  t        j
                  j1                  | j2                  j4                  d
             y)a  
        Creates a draft of a model card using the information available to the `Trainer`.

        Args:
            model_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the model.
            dataset_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the dataset used for training.
            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                Tags to be associated with the model card.
        N_name_or_pathunsloth_versionunslotha          @article{guo2024direct,
            title        = {{Direct Language Model Alignment from Online AI Feedback}},
            author       = {Shangmin Guo and Biao Zhang and Tianlin Liu and Tianqi Liu and Misha Khalman and Felipe Llinares and Alexandre Ram{'{e}} and Thomas Mesnard and Yao Zhao and Bilal Piot and Johan Ferret and Mathieu Blondel},
            year         = 2024,
            eprint       = {arXiv:2402.04792}
        }z
Online DPOz7Direct Language Model Alignment from Online AI Feedbackz
2402.04792)
base_modelr  r  r  r  	wandb_url	comet_urltrainer_nametrainer_citationpaper_titlepaper_idz	README.md)is_world_process_zeror   r:   rl   ospathisdirr  setrp   r   addupdater   textwrapdedentr,   r  r   wandbrunget_urlr-   savejoinr>   r  )r   r  r  r  r  citation
model_cards          r   r  z"OnlineDPOTrainer.create_model_card  sG   " ))+4::$$o6rww}}TZZM^M^MlMl?m**88JJ <5Dc"6Dt9D4::$$&78HHYDOO$?? $  )!!**%-?-AeiiF[eii'')ae.0%%Q!

 	TYY%9%9;GHr   )NNNNNNNNNNNN)NNNr   )NNN)5__name__
__module____qualname____doc__r   r   r   nnModuler   r&   r'   r   r	   r   dictr   r   r   r   r   r   r   r   r   tuplery   r*  	Optimizerlr_schedulerLambdaLRrt  r   propertyrV   staticmethodr  r   r   r   r   r   r   r   r   r   r   intrh  r  r}  r  __classcell__)r   s   @r   r8   r8   \   sw   &P &J
 >B@D-1*.04W[Y] EI&*FJ59Vbhl%DR_bii/0DR "))T9:DR ORYY<=	DR
 )*DR 'DR  -DR  g@R&R STDR uWd3<.@BT%TUVDR #)+=?UWeef
DR "**A!BDR d^DR "(N+;T+A"BCDR  D12!DR" %++//1I1I1R1RRS#DR$ (0%,,9UW\WcWc9c0d'e%DR& 
'DRL   $ CZ _cdgildl_m    7''(Xj X )X. 7&&'-9sG|9L0M -9Yc -9 (-9^"HHH:. rvrEYYrE(,S%c8I2J-J(KrEaijmanrE	rEl hl(^V/ %)&*,0	;ISM;I sm;I CcD()	;Ir   r8   )Yr  r  ri   	functoolsr   pathlibr   typingr   r   r   r   datasetsr  ry   torch.nnr  torch.nn.functional
functionalr   torch.utils.datar	   	packagingr
   r   r   transformersr   r   r   r   r   r   r   r   r   r   r   transformers.trainer_utilsr   r   transformers.training_argsr   transformers.utilsr   r   r   
data_utilsr   r    r!   import_utilsr"   modelsr#   models.utilsr$   judgesr&   online_dpo_configr'   r   r(   r)   r*   r+   r,   r-   r.   r/   r0   peftr1   r2   apexr3   smdistributed.modelparallelr4   SMP_VERSIONparseIS_SAGEMAKER_MP_POST_1_10vllmr5   r6   r  
get_loggerr  loggerr8    r   r   <module>r     s    
     1 1          8    C 5 R R Z Z , + 6 % .
 
 
 . F -k :mgmmF>S S !& (			H	%LIw LIr   