
    bi^                        d dl Z d dlZd dlmZmZmZmZ d dlZd dlZd dl	m
Z
 d dlm
c mZ d dlmZmZ d dlmZmZmZmZmZmZmZ d dlmZ d dlmZ d dlmZmZ dd	l m!Z!m"Z" dd
l#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4  e       rd dl5m6Z6  e       rd dl7Z7 e       rd dl8m9Z9  G d de,      Z:y)    N)AnyCallableOptionalUnion)DatasetIterableDataset)BaseImageProcessorFeatureExtractionMixinPreTrainedModelPreTrainedTokenizerBaseProcessorMixinTrainerCallbackis_wandb_available)EvalPrediction)OptimizerNames)is_apex_availableis_peft_available   )is_conversationalmaybe_apply_chat_template)GeometricMixtureWrapper)unwrap_model_for_generation   )BasePairwiseJudge)NashMDConfig)OnlineDPOTrainer)SIMPLE_CHAT_TEMPLATEempty_cachegenerate_model_cardget_comet_experiment_url
get_rewardselective_log_softmaxtruncate_right)amp)	PeftModelc                         e Zd ZdZddgZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 d%deeej                  f   deeej                  f   deeej                  df   d	e	e
   d
e	e   de	e   de	eeef      de	eeeeef   f      de	eeeeef      de	e   de	eegef      de	ee      deej4                  j6                  ej4                  j8                  j:                  f   de	eej<                  ej<                  gej<                  f      ddf fdZe d        Z!d Z"d Z#d Z$d Z%d Z&d Z'	 	 ddZ(	 d&dej                  deeeej<                  e)f   f   de	e*   dej<                  fd Z+	 	 	 d'd!e	e   d"e	e   d#eeee   df   fd$Z, xZ-S )(NashMDTrainera
  
    Initialize NashMDTrainer as a subclass of [`OnlineDPOConfig`].

    Args:
        model (`transformers.PreTrainedModel`):
            The model to train, preferably an `AutoModelForCausalLM`.
        ref_model (`PreTrainedModelWrapper`):
            Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation
            and loss. If no reference model is provided, the trainer will create a reference model with the same
            architecture as the model to be optimized.
        reward_model (`transformers.PreTrainedModel`):
            The reward model to score completions with, preferably an `AutoModelForSequenceClassification`.
        judge (`BasePairwiseJudge`):
            The judge to use for pairwise comparison of model completions.
        args (`NashMDConfig`):
            The NashMD config arguments to use for training.
        data_collator (`transformers.DataCollator`):
            The data collator to use for training. If None is specified, the default data collator
            (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the
            sequences in the batch, given a dataset of paired sequences.
        train_dataset (`datasets.Dataset`):
            The dataset to use for training.
        eval_dataset (`datasets.Dataset`):
            The dataset to use for evaluation.
        processing_class (`PreTrainedTokenizerBase` or `BaseImageProcessor` or `FeatureExtractionMixin` or `ProcessorMixin`, *optional*):
            Processing class used to process the data. If provided, will be used to automatically process the inputs
            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
            reuse the fine-tuned model.
        peft_config (`dict`):
            The peft config to use for training.
        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
            The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
            metric values.
        callbacks (`list[transformers.TrainerCallback]`):
            The callbacks to use for training.
        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
            The optimizer and scheduler to use for training.
        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
            The function to use to preprocess the logits before computing the metrics.
    trlznash-mdNNNmodel	ref_modelreward_modeljudgeargsdata_collatortrain_dataseteval_datasetprocessing_classpeft_configcompute_metrics	callbacks
optimizerspreprocess_logits_for_metricsreturnc                     t         |   |||||||||	|	|
||||       | j                  j                  | _        g g g g g g g g g g g g d| _        | j                  g | j
                  d<   g | j
                  d<   y y )N)r*   r+   r,   r-   r.   r/   r0   r1   r2   reward_processing_classr3   r4   r5   r6   r7   )loss/klobjective/entropy
loss/scorerewards/probabilitiesrewards/accuraciesrewards/marginslogps/chosenlogps/rejectedval/model_contain_eos_tokenval/ref_contain_eos_tokenbetamixture_coefrewards/chosenrewards/rejected)super__init__r.   rF   _mixture_coefstatsr,   )selfr*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   	__class__s                  V/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/trl/trainer/nash_md_trainer.pyrJ   zNashMDTrainer.__init__n   s    & 	%''%-$4#+!*G 	 	
$ "YY33 !#%'"$! +-)+

  (+-DJJ'(-/DJJ)* )    c                     t        | j                  t              rL| j                  j                  }|t        | j                        k  r| j                  |   S | j                  d   S | j                  S )N)
isinstancerK   liststateepochlen)rM   rV   s     rO   rF   zNashMDTrainer.mixture_coef   sb    d(($/JJ$$E05D<N<N8O0O4%%e,kUYUgUghjUkk%%%rP   c                    t        || j                        5 }|j                  |d   |d   | j                        }d d d        | j                  j	                  |      }| j
                  .t               r!t        |t              r|j                         }n(|}n%| j                  j	                  | j
                        }t        j                         5  t        ||| j                  | j                  | j                  j                        }|j                  |d   |d   | j                        }d d d        |fS # 1 sw Y   xY w# 1 sw Y   fS xY w)N	input_idsattention_mask)rY   rZ   generation_config)r*   r+   r[   rF   device)r   acceleratorgenerater[   unwrap_modelr+   r   rS   r%   get_base_modeltorchno_gradr   rF   r\   )	rM   r*   promptsunwrapped_policy_for_gen_ctxmodel_outputpolicy_model_for_gmwref_model_for_gmwmixture_modelmixture_outputs	            rO   _generate_completionsz#NashMDTrainer._generate_completions   s]   (0@0@A 	Ea7@@!+.&'78"&"8"8 A L	  $//<<UC
 >>! !"z2F	'R$8$G$G$I! %9! !% 0 0 = =dnn M ]]_ 	3*+"&"8"8!..''..M +33!+.&'78"&"8"8 4 N	 ^++Y	 	:	 ^++s   %D;AE;EEc                 .   |d   j                   d   }|d d |d f   }t        || j                  j                  | j                  j                        \  }}t        j                  |d   |fd      t        j                  |d   |fd      |d   d}|d d |d f   }t        || j                  j                  | j                  j                        \  }}	t        j                  |d   |fd      t        j                  |d   |	fd      |d   d}
||
fS )NrY   r   dimrZ   rawrY   rZ   rn   )shaper#   r2   eos_token_idpad_token_idra   cat)rM   re   ri   rc   context_lengthmodel_completion_idsmodel_completion_mask
model_datamixture_completion_idsmixture_completion_maskmixture_datas              rO   _process_completionsz"NashMDTrainer._process_completions   s6    -33A6  ,A~,>?6D $"7"7"D"DdF[F[FhFh7
33 GK$8:N#OUVW#ii1A)BDY(Z`ab5>

 "0>?0B!C:H"D$9$9$F$FH]H]HjHj;
7 7 GK$8:P#QWXY#ii1A)BD[(\bcd5>
 <''rP   c                    t        j                         5  t        | j                  |d   | j                  j
                  |      \  }}}t        | j                  |d   | j                  j
                  |      \  }}}d d d        | j                  j                  t        j                  |d   | j                  j                  k(  d      }t        j                  |d   | j                  j                  k(  d      }| xx   | j                  j                  z  cc<   | xx   | j                  j                  z  cc<   fS # 1 sw Y   xY w)NrY   rR   rl   )
ra   rb   r!   r,   r2   rr   r.   missing_eos_penaltyanyrq   )	rM   rw   rz   rt   _model_scoresmixture_scoresmodel_contain_eosmixture_contain_eoss	            rO   _compute_rewardszNashMDTrainer._compute_rewards   s/   ]]_ 	!+!!:k#:D<Q<Q<^<^`n"A|Q $.!!<#<d>S>S>`>`bp$ A~q		 99((4 %		*[*ATEZEZEgEg*gmo p"'))L,EI^I^IkIk,kqs"t++,		0M0MM,//0DII4Q4QQ0^++	 	s   A'EEc           	         |d   }| j                   j                  |d   d d |d f   d      }|D cg c]  }|j                          }}| j                   j                  |d   d d |d f   d      }|D cg c]  }|j                          }}t        d|d   i      r|D cg c]  }d|dg
 }}t	        j
                         }|j                  t              }	|D 
cg c]  }
|	j                  |
	       }}
|D cg c]  }|	j                  |	       }}|D cg c]  }d|dg
 }}|D cg c]  }|	j                  |	       }}| j                  j                  |t        t        ||            d
      }t        j                  ||d   j                        S c c}w c c}w c c}w c c}
w c c}w c c}w c c}w )Nrn   rY   T)skip_special_tokenspromptr   	assistant)rolecontent)messages)return_scores)r\   )r2   batch_decodestripr   jinja2Environmentfrom_stringr   renderr-   rT   zipra   tensorr\   )rM   rw   rz   rt   rc   model_data_completions
completionmixture_data_completionsenvironmenttemplatemessageprobabilitys               rO   _compute_judgezNashMDTrainer._compute_judge  s   U#!%!6!6!C!C{#A~$67T "D "
 H^!^*"2"2"4!^!^#'#8#8#E#E%a&89t $F $
  Jb#b:J$4$4$6#b #bh
34Qg&CM+*=>&" & !,,.K"../CDHHOPWx8PGP]s%tzhoozo&J%t"%t Rj(CM+*=>($ ( H`(9C4($ ( jj&&+-EFG ' 

 ||K
;0G0N0NOO7 "_
 $c&
 Q%t((s)   F#9F(&F-"F2F7 F<3Gc                    fd} |||      }t        j                         5  | j                  #|j                         5   |||      }d d d        n || j                  |      }d d d        |d   d d d f   dk(  }|j	                  |d      }j	                  |d      }||fS # 1 sw Y   JxY w# 1 sw Y   NxY w)Nc                      | |d   |d         }|j                   d d dz
  df   }t        ||d   d d d f         }|S )NrY   rZ   )rZ   r   rR   )logitsr"   )mdataoutputr   token_logprobsrt   s        rO   compute_logprobs_for_datazBNashMDTrainer._compute_logprobs.<locals>.compute_logprobs_for_data2  s\    tK(>N9OPF]]1nq&82&=#=>F264;LQP^P_M_;`aN!!rP   rZ   r   g        )ra   rb   r+   disable_adaptermasked_fill)rM   r*   rw   rt   r   model_logprobs_model_dataref_logprobs_model_datamodel_padding_masks      `    rO   _compute_logprobszNashMDTrainer._compute_logprobs1  s    	" %>eZ$P! ]]_ 	`~~%**, [.Gz.Z+[ [ +DDNNT^*_'	` ((89!^_:LMQRR$=$I$IJ\^a$b!"9"E"EFXZ]"^)+BCC[ [	` 	`s#   B4
B(B4(B1	-B44B=c                     |dz
  |j                  d      z  }t        j                         5  ||z
  }|j                  d      }d d d        |z  j                  d      }| j                  |z  |z
  }|j	                         |fS # 1 sw Y   BxY w)Ng      ?r   )sumra   rb   rE   mean)	rM   r   r   r   score	log_ratio
kl_div_logkl_div_losslosss	            rO   _compute_losseszNashMDTrainer._compute_lossesJ  s     s"&?&C&CA&FF ]]_ 	*14KKI"q)J	* !#<<AA!D yy;&.yy{E:--	* 	*s   BBc                      fd} j                   d   j                   ||              j                   d   j                   ||             |j                  d      }|j                  d      } j                   d   j                   ||              j                   d   j                   ||              j                  H j                   d   j                   ||	              j                   d   j                   ||
              j                   d	   j                   ||             |j                  d       } j                   d
   j                   ||             ||z
  } j                   d   j                   ||             |dkD  j	                         } j                   d   j                   ||             |d   d d |d f    j
                  j                  k(  j                  d      }|d   d d |d f    j
                  j                  k(  j                  d      } j                   d   j                   ||j	                                       j                   d   j                   ||j	                                       j                   d   j                   j                          j                   d   j                   j                         y )Nc                 r    j                   j                  |       j                         j                         S N)r]   gather_for_metricsr   item)r   rM   s    rO   gather_meanz2NashMDTrainer._log_statistics.<locals>.gather_meanl  s,    ##66v>CCEJJLLrP   r=   r;   r   rA   rB   rG   rH   r>   r<   r@   r   r?   rY   rl   rC   rD   rE   rF   )
rL   appendr   r,   floatr2   rq   r~   rE   rF   )rM   rw   rz   r   r   r   r   kl_divrt   r   r   r   model_logprobs_model_data_sumref_logprobs_model_data_sumentropy_model_datamarginaccuracy	model_eosmixture_eoss   `                  rO   _log_statisticszNashMDTrainer._log_statistics^  s   	M 	

< ''E(:;

9$$[%89 )B(E(Ea(H%&=&A&A!&D#

>"))+6S*TU

#$++K8S,TU (JJ'(//L0IJJJ)*11+n2MN 	

*+22;{3KL 8;;A>>

&'..{;M/NO /1LL

$%,,[-@A QJ%%'

'(//H0EF  ,Q-?@DDYDYDfDffkkpqkr	#K0NO1CDH]H]HjHjjootuov

0188Y__EV9WX

./66{;CTCTCV7WX 	

6!!$)),

>"))$*;*;<rP   inputsnum_items_in_batchc                    |j                          t        t        t        |j	                                           }|d   }t        |      D cg c])  }|j                         D ci c]  \  }}|||    c}}+ }}}}|D 	cg c]  }	t        |	| j                         }}	|D 	cg c]=  }	| j                  |	| j                  j                  j                  | j                        ? }}	| j                  |      }| j                  |      }|d   j                  d   }
|d   |d   |d}~| j!                  ||      \  }}| j#                  |||      \  }}| j$                  /| j'                  |||
      \  }}t)        j*                  ||z
        }nd\  }}| j-                  |||
      }| j/                  |||
      \  }}| j1                  |||      \  }}}| j3                  |||j5                         |||j5                         |j5                         |
||
       | j6                  j8                  :| j:                  j<                  | j6                  j8                  z  dk(  r
t?                i }| j6                  j@                  tB        jD                  tB        jF                  fv r| jI                         |d<   | j6                  jJ                  dkD  r|jM                         }| jN                  r:tQ        jR                  || jT                        5 }|jW                          d d d        n | jX                  jV                  |fi | |j5                         | j6                  jZ                  z  S c c}}w c c}}}w c c}	w c c}	w # 1 sw Y   GxY w)	Nr   prompt_input_idsr   prompt_attention_maskro   r)   r   learning_rate).trainrW   nextitervaluesrangeitemsr   r2   tokenize_rowr*   configis_encoder_decoderr/   _prepare_inputsrp   rj   r{   r,   r   Fsigmoidr   r   r   r   detachr.   torch_empty_cache_stepsrU   global_stepr   optimr   LOMOADALOMO_get_learning_raten_gpur   use_apexr$   
scale_loss	optimizerbackwardr]   gradient_accumulation_steps)rM   r*   r   r   
batch_sizerc   ikvxrt   re   ri   rw   rz   r   r   r   r   r   r   r   r   kwargsscaled_losss                            rO   training_stepzNashMDTrainer.training_step  sE    	 d6==?345
"@Ej@QRR1v||~6tq!1ad76RROUV!+At/D/DEVVmsthi$##Atzz'8'8'K'KTMbMbctt##F+ %%f- 2399!< 23$%<=

  (,'A'A%'Q$n $(#<#<\>[b#c 
L (+/+@+@\[i+j(L.))L>$ABK+5(L.--j,WK >B=S=STY[egu=v:!#: #223LNegrseV 	%,,.#LLNMMO	
 II--9

&&)J)JJaOM99??~22N4J4JKK&*&=&=&?F?#99??Q99;D==dnn5 '$$&' ' &D%%d5f5{{}tyyDDDDM 7RVt~' 's1   M1(M+8M1M8)AM=N+M1N
model_namedataset_nametagsc                    | j                         syt        | j                  j                  d      r^t        j
                  j                  | j                  j                  j                        s!| j                  j                  j                  }nd}|t               }nt        |t              r|h}nt        |      }t        | j                  j                  d      r|j                  d       |j                  | j                         t        j                  d      }t!        ||| j"                  ||t%               r.t&        j(                  t&        j(                  j+                         ndt-               d|dd	      }|j/                  t        j
                  j1                  | j2                  j4                  d
             y)a  
        Creates a draft of a model card using the information available to the `Trainer`.

        Args:
            model_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the model.
            dataset_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the dataset used for training.
            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                Tags to be associated with the model card.
        N_name_or_pathunsloth_versionunslotha          @inproceedings{munos2024nash,
            title        = {{Nash Learning from Human Feedback}},
            author       = {R{'{e}}mi Munos and Michal Valko and Daniele Calandriello and Mohammad Gheshlaghi Azar and Mark Rowland and Zhaohan Daniel Guo and Yunhao Tang and Matthieu Geist and Thomas Mesnard and C{\^{o}}me Fiegel and Andrea Michi and Marco Selvi and Sertan Girgin and Nikola Momchev and Olivier Bachem and Daniel J. Mankowitz and Doina Precup and Bilal Piot},
            year         = 2024,
            booktitle    = {Forty-first International Conference on Machine Learning, {ICML} 2024, Vienna, Austria, July 21-27, 2024},
            publisher    = {OpenReview.net},
            url          = {https://openreview.net/forum?id=Y5AmNYiyCQ}
        }zNash-MDz!Nash Learning from Human Feedbackz
2312.00886)
base_modelr   hub_model_idr   r   	wandb_url	comet_urltrainer_nametrainer_citationpaper_titlepaper_idz	README.md)is_world_process_zerohasattrr*   r   ospathisdirr   setrS   straddupdate
_tag_namestextwrapdedentr   r   r   wandbrunget_urlr    savejoinr.   
output_dir)rM   r   r   r   r   citation
model_cards          rO   create_model_cardzNashMDTrainer.create_model_card  sG   " ))+4::$$o6rww}}TZZM^M^MlMl?m**88JJ <5Dc"6Dt9D4::$$&78HHYDOO$?? $  )!!**%-?-AeiiF[eii'')ae.0"%;!

 	TYY%9%9;GHrP   )NNNNNNNNNNNNr)   Nr   )NNN).__name__
__module____qualname____doc__r  r   r   nnModuler   r   r   r   r   r   dictr  r   r	   r
   r   r   rT   r   tuplera   r   	Optimizerlr_schedulerLambdaLRTensorrJ   propertyrF   rj   r{   r   r   r   r   r   r   intr   r  __classcell__)rN   s   @rO   r'   r'   B   s   'R #J 487;@D-1'+,0CGEI &*FJ59Vbhl#:0_bii/0:0 "))34:0 ORYY<=	:0
 )*:0 |$:0  ):0  g&> ?@:0 uWd3<.@%@AB:0 #)+=?UWeef
:0 d^:0 "(N+;T+A"BC:0 D12:0  %++//1I1I1R1RRS!:0" (0%,,9UW\WcWc9c0d'e#:0$ 
%:0x & &.,`(6,$ PDD2.< 9=x rvNEYYNE(,S%c8I2J-J(KNEaijmanNE	NEd %)&*,0	>ISM>I sm>I CcD()	>IrP   r'   );r   r  typingr   r   r   r   r   ra   torch.nnr  torch.nn.functional
functionalr   datasetsr   r   transformersr	   r
   r   r   r   r   r   transformers.trainer_utilsr   transformers.training_argsr   transformers.utilsr   r   
data_utilsr   r   models.modeling_baser   models.utilsr   judgesr   nash_md_configr   online_dpo_trainerr   utilsr   r   r   r    r!   r"   r#   apexr$   r  peftr%   r'    rP   rO   <module>r2     s    
  1 1      -   6 5 C E : 6 % ( 0     eI$ eIrP   