
    biog                     z   d dl Z d dlZd dlmZmZmZmZ d dlZd dlZd dl	m
Z
 d dlm
c mZ d dlmZmZ d dlmZmZmZmZmZmZmZmZ d dlmZ d dlmZ d dlmZ dd	l m!Z!m"Z" dd
l#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2  e       rd dl3m4Z4  e       rd dl5Z5 e       rd dl6m7Z7  G d de(      Z8y)    N)AnyCallableOptionalUnion)DatasetIterableDataset)BaseImageProcessorFeatureExtractionMixinPreTrainedModelPreTrainedTokenizerBaseProcessorMixinTrainerCallbackis_apex_availableis_wandb_available)EvalPrediction)OptimizerNames)is_peft_available   )is_conversationalmaybe_apply_chat_template)unwrap_model_for_generation   )BasePairwiseJudge)OnlineDPOTrainer)SIMPLE_CHAT_TEMPLATEempty_cachegenerate_model_cardget_comet_experiment_url
get_rewardselective_log_softmaxtruncate_right)	XPOConfig)amp)	PeftModelc                        e Zd ZdZddgZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 d%deeej                  f   deeej                  f   de	ej                     d	e	e
   d
e	e   de	e   de	eeef      de	eeeeef   f      de	eeeeef      de	e   de	eegef      de	ee      deej4                  j6                  ej4                  j8                  j:                  f   de	eej<                  ej<                  gej<                  f      ddf fdZe d        Z!d Z"d Z#d Z$d Z%d Z&d Z'	 	 ddZ(	 d&dej                  deeeej<                  e)f   f   de	e*   dej<                  fd Z+	 	 	 d'd!e	e   d"e	e   d#eeee   df   fd$Z, xZ-S )(
XPOTrainera	  
    Initialize XPOTrainer as a subclass of [`OnlineDPOConfig`].

    Args:
        model (`transformers.PreTrainedModel`):
            The model to train, preferably an `AutoModelForCausalLM`.
        ref_model (`PreTrainedModelWrapper`):
            Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation
            and loss. If no reference model is provided, the trainer will create a reference model with the same
            architecture as the model to be optimized.
        reward_model (`transformers.PreTrainedModel`):
            The reward model to score completions with, preferably an `AutoModelForSequenceClassification`.
        judge (`BasePairwiseJudge`):
            The judge to use for pairwise comparison of model completions.
        args (`XPOConfig`):
            The XPO config arguments to use for training.
        data_collator (`transformers.DataCollator`):
            The data collator to use for training. If None is specified, the default data collator
            (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the
            sequences in the batch, given a dataset of paired sequences.
        train_dataset (`datasets.Dataset`):
            The dataset to use for training.
        eval_dataset (`datasets.Dataset`):
            The dataset to use for evaluation.
        processing_class (`PreTrainedTokenizerBase` or `BaseImageProcessor` or `FeatureExtractionMixin` or `ProcessorMixin`, *optional*):
            Processing class used to process the data. If provided, will be used to automatically process the inputs
            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
            reuse the fine-tuned model.
        peft_config (`dict`):
            The peft config to use for training.
        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
            The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
            metric values.
        callbacks (`list[transformers.TrainerCallback]`):
            The callbacks to use for training.
        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
            The optimizer and scheduler to use for training.
        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
            The function to use to preprocess the logits before computing the metrics.
    trlxpoNNNmodel	ref_modelreward_modeljudgeargsdata_collatortrain_dataseteval_datasetprocessing_classpeft_configcompute_metrics	callbacks
optimizerspreprocess_logits_for_metricsreturnc                    t         |   |||||||||	|	|
||||       | j                  j                  | _        g g g g g g g g g g g g g g d| _        | j                  .g | j
                  d<   g | j
                  d<   g | j
                  d<   y y )N)r*   r+   r-   r,   r.   r/   r0   r1   r2   reward_processing_classr3   r4   r5   r6   r7   )loss/dpoloss/xpoobjective/klobjective/entropyrewards/chosenrewards/rejectedrewards/accuraciesrewards/marginslogps/chosenlogps/rejectedval/model_contain_eos_tokenval/ref_contain_eos_tokenalphabetaobjective/model_scoresobjective/ref_scoresobjective/scores_margin)super__init__r.   rG   _alphastatsr,   )selfr*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   	__class__s                  R/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/trl/trainer/xpo_trainer.pyrM   zXPOTrainer.__init__n   s    & 	%''%-$4#+!*G 	 	
$ iioo !#  ""$! +-)+#

& (35DJJ/013DJJ-.46DJJ01	 )    c                     t        | j                  t              rL| j                  j                  }|t        | j                        k  r| j                  |   S | j                  d   S | j                  S )N)
isinstancerN   liststateepochlen)rP   rY   s     rR   rG   zXPOTrainer.alpha   sV    dkk4(JJ$$E).T[[1A)A4;;u%Vt{{SUV;;rS   c                 @   t        || j                        5 }|j                  |d   |d   | j                        }d d d        | j                  I| j                  j                  |      }t               r!t        |t              r|j                         }n(|}n%| j                  j                  | j                        }t        || j                        5 }|j                  |d   |d   | j                        }d d d        |fS # 1 sw Y   xY w# 1 sw Y   fS xY w)N	input_idsattention_mask)r\   r]   generation_config)
r   acceleratorgenerater^   r+   unwrap_modelr   rV   r$   get_base_model)	rP   promptsr*   unwrapped_policy_model_for_genmodel_output"unwrapped_main_model_for_ref_logicactual_model_for_ref_generationfinal_ref_model_for_gen
ref_outputs	            rR   _generate_completionsz XPOTrainer._generate_completions   s+   (0@0@A 	Ec9BB!+.&'78"&"8"8 C L	 >>!151A1A1N1Nu1U. "z2TV_'`2T2c2c2e/2T/.2.>.>.K.KDNN.[+()H$JZJZ[ 	_v099!+.&'78"&"8"8 : J	 Z''3	 	$	 Z''s   %D%DDDc                 .   |d   j                   d   }|d d |d f   }t        || j                  j                  | j                  j                        \  }}t        j                  |d   |fd      t        j                  |d   |fd      |d   d}|d d |d f   }t        || j                  j                  | j                  j                        \  }}	t        j                  |d   |fd      t        j                  |d   |	fd      |d   d}
||
fS )Nr\   r   dimr]   rawr\   r]   rn   )shaper!   r2   eos_token_idpad_token_idtorchcat)rP   re   ri   rc   context_lengthmodel_completion_idsmodel_completion_mask
model_dataref_completion_idsref_completion_maskref_datas              rR   _process_completionszXPOTrainer._process_completions   s6    -33A6  ,A~,>?6D $"7"7"D"DdF[F[FhFh7
33 GK$8:N#OUVW#ii1A)BDY(Z`ab5>

 (>?(:;2@ 5 5 B BDDYDYDfDf3
// GK$8:L#MSTU#ii1A)BDW(X^_`5>
 8##rS   c                    t        j                         5  t        | j                  |d   | j                  j
                  |      \  }}}t        | j                  |d   | j                  j
                  |      \  }}}d d d        | j                  j                  t        j                  |d   | j                  j                  k(  d      }t        j                  |d   | j                  j                  k(  d      }| xx   | j                  j                  z  cc<   | xx   | j                  j                  z  cc<   fS # 1 sw Y   xY w)Nr\   rU   rl   )
rs   no_gradr   r,   r2   rr   r.   missing_eos_penaltyanyrq   )	rP   rx   r{   ru   _model_scores
ref_scoresmodel_contain_eosref_contain_eoss	            rR   _compute_rewardszXPOTrainer._compute_rewards   s-   ]]_ 	!+!!:k#:D<Q<Q<^<^`n"A|Q  *!!8K#8$:O:O:\:\^l Az1		 99((4 %		*[*ATEZEZEgEg*gmo p#ii(=AVAVAcAc(ciklO++,		0M0MM,'(DII,I,II(Z''	 	s   A'EEc           	         |d   }| j                   j                  |d   d d |d f   d      }|D cg c]  }|j                          }}| j                   j                  |d   d d |d f   d      }|D cg c]  }|j                          }}t        d|d   i      r|D cg c]  }d|dg
 }}t	        j
                         }|j                  t              }	|D 
cg c]  }
|	j                  |
	       }}
|D cg c]  }|	j                  |	       }}|D cg c]  }d|dg
 }}|D cg c]  }|	j                  |	       }}| j                  j                  |t        t        ||                  }t        j                  |D cg c]  }|dk(  	 c}|d   j                  
      S c c}w c c}w c c}w c c}
w c c}w c c}w c c}w c c}w )Nrn   r\   T)skip_special_tokenspromptr   	assistant)rolecontent)messages)device)r2   batch_decodestripr   jinja2Environmentfrom_stringr   renderr-   rW   ziprs   tensorr   )rP   rx   r{   ru   rc   model_data_completions
completionref_data_completionsenvironmenttemplatemessageranks_of_first_completionranks                rR   _compute_judgezXPOTrainer._compute_judge   s   U#!%!6!6!C!C{#A~$67T "D "
 H^!^*"2"2"4!^!^#44AA[!!^_"454  B  
 FZZz
 0 0 2ZZh
34Qg&CM+*=>&" & !,,.K"../CDHHOPWx8PGP]s%tzhoozo&J%t"%t Rf$CM+*=>$  $ \p#pZHOOZO$H#p #p$(JJ$4$4+-ABC%
! ||3LM4TQYMV`alVmVtVtuu9 "_
  [&
 Q%t$ $q Ns/   F29F7&F<"GG G3GGc                 8   fd} |||      } |||      }t        j                         5  | j                  ,|j                         5   |||      } |||      }	d d d        n& || j                  |      } || j                  |      }	d d d        |d   d d d f   dk(  }
|d   d d d f   dk(  }|j	                  |
d      }|j	                  |d      }	j	                  |d      }	j	                  |
d      }|||	|fS # 1 sw Y   xY w# 1 sw Y   xY w)Nc                      | |d   |d         }|j                   d d dz
  df   }t        ||d   d d d f         }|S )Nr\   r]   )r]   r   rU   )logitsr    )mdataoutputr   token_logprobsru   s        rR   compute_logprobs_for_dataz?XPOTrainer._compute_logprobs.<locals>.compute_logprobs_for_data$  s\    tK(>N9OPF]]1nq&82&=#=>F264;LQP^P_M_;`aN!!rS   r]   r   g        )rs   r~   r+   disable_adaptermasked_fill)rP   r*   rx   r{   ru   r   model_logprobs_model_datamodel_logprobs_ref_dataref_logprobs_model_dataref_logprobs_ref_datamodel_padding_maskref_padding_masks       `       rR   _compute_logprobszXPOTrainer._compute_logprobs#  sa   	" %>eZ$P!";E8"L ]]_ 	\~~%**, W.Gz.Z+,EeX,V)W W +DDNNT^*_'(A$..RZ([%	\ ((89!^_:LMQRR#$45a6HIQN$=$I$IJ\^a$b!"9"E"EFVX["\ 5 A ABRTW X"9"E"EFXZ]"^(*ACXZqqqW W	\ 	\s#   D
D/DD		DDc                    |j                  d      }|j                  d      }|j                  d      }|j                  d      }	t        j                  |||      }
t        j                  ||	|      }|
|z
  }t        j                  | ||      }t        j                  | |	|      }||z
  }||z
  }| j                  j                  dk(  r$t        j                  | j                  |z         }nT| j                  j                  dk(  r|dd| j                  z  z  z
  dz  }n"t        d| j                  j                         | j                  |z  }||z   j                         }|||fS )Nr   sigmoidipor   zinvalid loss type )sumrs   wherer.   	loss_typeF
logsigmoidrH   NotImplementedErrorrG   mean)rP   r   r   r   r   chosen_maskmodel_logprobs_model_data_summodel_logprobs_ref_data_sumref_logprobs_ref_data_sumref_logprobs_model_data_sumchosen_model_logprobschosen_ref_logprobschosen_log_ratiosrejected_model_logprobsrejected_ref_logprobsrejected_log_ratiosr   
dpo_losses
xpo_losseslosss                       rR   _compute_losseszXPOTrainer._compute_lossesC  sq    )B(E(Ea(H%&=&A&A!&D#$9$=$=a$@!&=&A&A!&D# %K9VXs t#kk+7RTmn14GG"'++{l<Y[v"w %[L:UWp q58MM #%8899)+,,tyy6'9::JYY  E) 1DII#661<J%(:499;N;N:O&PQQ ZZ"==
 Z'--/Z++rS   c                 	     fd} j                   d   j                   ||              j                   d   j                   ||	              j                  o j                   d   j                   ||              j                   d   j                   ||              j                   d   j                   |||z
               |j                  d      }|j                  d      }|j                  d      }|j                  d      }t	        j
                  |||      }t	        j
                  |||      }||z
  }t	        j
                  | ||      }t	        j
                  | ||      }||z
  } j                   d   j                   ||j                         |j                         z                 j                   d	   j                   ||j                         |j                         z                | j                  z  }| j                  z  } j                   d
   j                   ||j                                       j                   d   j                   ||j                                      ||z
  }||z
  }|j                  d      |j                  d      z   j                         dz  } j                   d   j                   ||             |j                  d       }|j                  d       }|j                         |j                         z   dz  } j                   d   j                   ||             ||z
  }  j                   d   j                   || j                                      | dkD  j                         }! j                   d   j                   ||!j                                      |d   d d |
d f    j                  j                  k(  j                  d      }"|d   d d |
d f    j                  j                  k(  j                  d      }# j                   d   j                   ||"j                                       j                   d   j                   ||#j                                       j                   d   j                   j                          j                   d   j                   j                         y )Nc                 r    j                   j                  |       j                         j                         S N)r_   gather_for_metricsr   item)r   rP   s    rR   gather_meanz/XPOTrainer._log_statistics.<locals>.gather_mean{  s,    ##66v>CCEJJLLrS   r;   r<   rI   rJ   rK   r   rC   rD   r?   r@   r   r=   r>   rB   r   rA   r\   rl   rE   rF   rG   rH   )rO   appendr,   r   rs   r   r   rH   floatr2   rq   r   rG   )$rP   rx   r{   r   r   r   r   r   r   r   ru   r   r   r   r   r   r   r   r   r   r   r   r   r   chosen_rewardsrejected_rewardskl_model_datakl_ref_datamean_klentropy_model_dataentropy_ref_datamean_entropymarginaccuracy	model_eosref_eoss$   `                                   rR   _log_statisticszXPOTrainer._log_statisticsk  sY    	M 	

:%%k*&=>

:%%k*&=> (JJ/077L8QRJJ-.55k*6MNJJ0188\T^E^9_` )B(E(Ea(H%&=&A&A!&D#$9$=$=a$@!&=&A&A!&D# %K9VXs t#kk+7RTmn14GG"'++{l<Y[v"w %[L:UWp q58MM

>"))+6K6P6P6RUhUmUmUo6o*pq

#$++K8O8T8T8VYnYsYsYu8u,vw +TYY6.:

#$++K8K8K8M,NO

%&--k:J:O:O:Q.RS 24KK-0EE $$Q'+//!*<<BBDqH

>"))+g*>? 8;;A>>377::*//14D4I4I4KKqP

&'..{</HI  "22

$%,,[-GH QJ%%'

'(//HMMO0LM  ,Q-?@DDYDYDfDffkkpqkr	K(NO);<@U@U@b@bbgglmgn

0188Y__EV9WX

./66{7==?7ST 	

7""4::.

6!!$)),rS   inputsnum_items_in_batchc                     |j                          t        t        t        |j	                                           }|d   }t        |      D cg c])  }|j                         D ci c]  \  }}|||    c}}+ }}}}|D 	cg c]  }	t        |	| j                         }}	|D 	cg c]=  }	| j                  |	| j                  j                  j                  | j                        ? }}	| j                  |      }| j                  |      }|d   j                  d   }
|d   |d   |d}~| j!                  ||      \  }}| j#                  |||      \  }}| j$                  | j'                  |||
      \  }}||k\  }nd\  }}| j)                  |||
      }| j+                  ||||
      \  }}}}| j-                  |||||      \  }}}| j/                  |||j1                         |j1                         ||||j1                         |j1                         |
||       | j2                  j4                  :| j6                  j8                  | j2                  j4                  z  dk(  r
t;                i }| j2                  j<                  t>        j@                  t>        jB                  fv r| jE                         |d<   | j2                  jF                  dkD  r|jI                         }| jJ                  r:tM        jN                  || jP                        5 }|jS                          d d d        n | jT                  jR                  |fi | |j1                         | j2                  jV                  z  S c c}}w c c}}}w c c}	w c c}	w # 1 sw Y   GxY w)	Nr   prompt_input_idsr   prompt_attention_maskro   r)   r   learning_rate),trainrZ   nextitervaluesrangeitemsr   r2   tokenize_rowr*   configis_encoder_decoderr/   _prepare_inputsrp   rj   r|   r,   r   r   r   r   r   detachr.   torch_empty_cache_stepsrX   global_stepr   optimr   LOMOADALOMO_get_learning_raten_gpur   use_apexr#   
scale_loss	optimizerbackwardr_   gradient_accumulation_steps)rP   r*   r   r   
batch_sizerc   ikvxru   re   ri   rx   r{   r   r   r   r   r   r   r   r   r   r   kwargsscaled_losss                              rR   training_stepzXPOTrainer.training_step  s`    	 d6==?345
"@Ej@QRR1v||~6tq!1ad76RROUV!+At/D/DEVVmsthi$##Atzz'8'8'K'KTMbMbctt##F+ %%f- 2399!< 23$%<=

  $(#=#=gu#M j  $88zSZ[
H ('+'<'<ZSa'b$L*&*4K'1$L*--j(NSK ""5*hO 	k!#:<QSj
 (,';';%#!#(
$j* 	%,,.#**,!#	
  II--9

&&)J)JJaOM99??~22N4J4JKK&*&=&=&?F?#99??Q99;D==dnn5 '$$&' ' &D%%d5f5{{}tyyDDDD_ 7RVtP' 's1   M3(M-8M3M:)AM?N-M3N
model_namedataset_nametagsc                    | j                         syt        | j                  j                  d      r^t        j
                  j                  | j                  j                  j                        s!| j                  j                  j                  }nd}|t               }nt        |t              r|h}nt        |      }t        | j                  j                  d      r|j                  d       |j                  | j                         t        j                  d      }t!        ||| j"                  ||t%               r.t&        j(                  t&        j(                  j+                         ndt-               d|dd	      }|j/                  t        j
                  j1                  | j2                  j4                  d
             y)a  
        Creates a draft of a model card using the information available to the `Trainer`.

        Args:
            model_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the model.
            dataset_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the dataset used for training.
            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                Tags to be associated with the model card.
        N_name_or_pathunsloth_versionunslotha          @article{jung2024binary,
            title        = {{Exploratory Preference Optimization: Harnessing Implicit Q*-Approximation for Sample-Efficient RLHF}},
            author       = {Tengyang Xie and Dylan J. Foster and Akshay Krishnamurthy and Corby Rosset and Ahmed Awadallah and Alexander Rakhlin},
            year         = 2024,
            eprint       = {arXiv:2405.21046}
        }XPOzcExploratory Preference Optimization: Harnessing Implicit Q*-Approximation for Sample-Efficient RLHFz
2405.21046)
base_modelr  hub_model_idr  r  	wandb_url	comet_urltrainer_nametrainer_citationpaper_titlepaper_idz	README.md)is_world_process_zerohasattrr*   r   ospathisdirr  setrV   straddupdate
_tag_namestextwrapdedentr   r  r   wandbrunget_urlr   savejoinr.   
output_dir)rP   r  r  r  r  citation
model_cards          rR   create_model_cardzXPOTrainer.create_model_card  sG   " ))+4::$$o6rww}}TZZM^M^MlMl?m**88JJ <5Dc"6Dt9D4::$$&78HHYDOO$?? $  )!!**%-?-AeiiF[eii'')ae.0%}!

 	TYY%9%9;GHrS   )NNNNNNNNNNNNr)   Nr   )NNN).__name__
__module____qualname____doc__r  r   r   nnModuler   r   r"   r   r   r   dictr  r   r	   r
   r   r   rW   r   tuplers   r   	Optimizerlr_schedulerLambdaLRTensorrM   propertyrG   rj   r|   r   r   r   r   r   r   intr  r(  __classcell__)rQ   s   @rR   r&   r&   B   s   'R J 487;,0-1$(,0CGEI &*FJ59Vbhl#?7_bii/0?7 "))34?7 ryy)	?7
 )*?7 y!?7  )?7  g&> ?@?7 uWd3<.@%@AB?7 #)+=?UWeef
?7 d^?7 "(N+;T+A"BC?7 D12?7  %++//1I1I1R1RRS!?7" (0%,,9UW\WcWc9c0d'e#?7$ 
%?7B  (8$6($!vFr@&,h Q-h rvWEYYWE(,S%c8I2J-J(KWEaijmanWE	WEv %)&*,0	<ISM<I sm<I CcD()	<IrS   r&   )9r  r  typingr   r   r   r   r   rs   torch.nnr-  torch.nn.functional
functionalr   datasetsr   r   transformersr	   r
   r   r   r   r   r   r   transformers.trainer_utilsr   transformers.training_argsr   transformers.utilsr   
data_utilsr   r   models.utilsr   judgesr   online_dpo_trainerr   utilsr   r   r   r   r   r    r!   
xpo_configr"   apexr#   r   peftr$   r&    rS   rR   <module>rJ     s    
  1 1      -	 	 	 6 5 0 E 6 % 0   "   QI! QIrS   