
    bi                        d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlmc mZ d dlmZ d dlmZ d dlmZ d d	l m!Z! d d
l"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2m3Z3 ddl4m5Z5m6Z6 ddl7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZC  e2       r
d dlDmEZEmFZFmGZG  e,       rd dlHZH G d de*      ZIy)    N)defaultdict)nullcontext)Path)AnyCallableLiteralOptionalUnion)PartialState)Dataset)autocast)
DataLoader)
AutoModelForCausalLMBaseImageProcessorDataCollatorFeatureExtractionMixinPreTrainedModelPreTrainedTokenizerBaseProcessorMixinTraineris_comet_availableis_wandb_available)TrainerCallback)EvalLoopOutput)is_peft_availableis_torch_fx_proxy   )maybe_apply_chat_templatemaybe_extract_prompt   )	CPOConfig)
DPODataCollatorWithPaddingadd_bos_token_if_neededadd_eos_token_if_neededdisable_dropout_in_modelgenerate_model_cardget_comet_experiment_urllog_table_to_comet_experimentpad_to_lengthpeft_module_casting_to_bf16selective_log_softmax)	PeftModelget_peft_modelprepare_model_for_kbit_trainingc                       e Zd ZdZddgZ	 	 	 	 	 	 	 	 	 	 	 	 d;deeeej                  e
f      dee   dee   dee   d	eeeee
ef   f      d
eeeeeef      deeg ef      deee      deej0                  j2                  ej0                  j4                  j6                  f   deeej8                  ej8                  gej8                  f      dee   deeegef      f fdZd Zd<deeeej                  f      defdZ e!	 	 	 	 d=dee
eeejD                  f   f   de#de$de$deejJ                     dee
ejD                  f   fd       Z&dejN                  dejN                  deejN                  ejN                  ejN                  f   fdZ(e!	 	 	 d>dejN                  dejD                  d e#de$de#dejN                  fd!       Z)dej                  dee
eeejD                  f   f   deejN                  ejN                  ejN                  ejN                  f   fd"Z*	 d?dee
eeejD                  f   f   d#e+d$   fd%Z,	 	 d@deeej                  f   d&ee
eej8                  e-f   f   deej8                  eej8                  ee
ej8                  f   f   f   fd'Z.dee
ejD                  f   de
fd(Z/	 d<deeej                  f   d&ee
eej8                  e-f   f   d)e#d*eee
      fd+Z0d?d,ee
e1f   d#e+d$   ddfd-Z2	 	 	 dAd.e3d/e
d)ee#   d*eee
      d0e
def fd1Z4d<d2ee
e1f   d3ee1   ddf fd4Z5d5 Z6 fd6Z7	 	 	 dBd7ee
   d8ee
   d9ee
ee
   df   fd:Z8 xZ9S )C
CPOTrainera  
    Initialize CPOTrainer.

    Args:
        model (`transformers.PreTrainedModel`):
            The model to train, preferably an `AutoModelForSequenceClassification`.
        args (`CPOConfig`):
            The CPO config arguments to use for training.
        data_collator (`transformers.DataCollator`):
            The data collator to use for training. If None is specified, the default data collator
            (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the
            sequences in the batch, given a dataset of paired sequences.
        train_dataset (`datasets.Dataset`):
            The dataset to use for training.
        eval_dataset (`datasets.Dataset`):
            The dataset to use for evaluation.
        processing_class (`PreTrainedTokenizerBase` or `BaseImageProcessor` or `FeatureExtractionMixin` or `ProcessorMixin`, *optional*):
            Processing class used to process the data. If provided, will be used to automatically process the inputs
            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
            reuse the fine-tuned model.
        model_init (`Callable[[], transformers.PreTrainedModel]`):
            The model initializer to use for training. If None is specified, the default model initializer will be
            used.
        callbacks (`list[transformers.TrainerCallback]`):
            The callbacks to use for training.
        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
            The optimizer and scheduler to use for training.
        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
            The function to use to preprocess the logits before computing the metrics.
        peft_config (`dict`, defaults to `None`):
            The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in
            a PEFT model.
        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
            The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
            metric values.
    trlcpoNmodelargsdata_collatortrain_dataseteval_datasetprocessing_class
model_init	callbacks
optimizerspreprocess_logits_for_metricspeft_configcompute_metricsc                 \   |j                   i }nt        |t              st        d      |j                   }|j	                  d      }|Xt        |t              r|dk7  rt        t        |      }|dk7  r)t        |t        j                        st        d| d      ||d<   t        |t              rt        j                  |fi |}d| _
        t               s|t        d      t               r(|%t        |t              r|j                         }t        |dd      st        |d	d      rht        |d
      xr. d
t        t!        j"                  t$              j&                        v }d|j(                  i}|r|j*                  |d
<   t%        |fi |}nK|j(                  r?t        |d      r|j-                          n"d }|j/                         j1                  |       t3        ||      }|j4                  rkt        |d	d      r^t7        |       d| _
        nK|j(                  r?t        |d      r|j-                          n"d }|j/                         j1                  |       |j8                  rt;               st=               st        d      ||j>                  j@                  | _         n(|j@                  t        d      |j@                  | _         | j@                  r6|j>                  jB                  | _!        |j>                  jD                  | _"        |t        d      |jF                  tI        jJ                  dtL               d}n|jF                  }|jN                  tI        jJ                  dtL               d}n|jN                  }||k  st        d| d| d      |jP                  )| j@                  rtI        jJ                  dtL               d}n|jP                  }|atS        |jD                  |jT                  | j@                        }|jV                  r!d|_+        tI        jJ                  dtL               d| _,        nd| _,        |jZ                  rt]        |       || _#        |j8                  | _        |jT                  | _*        |j^                  |j^                  n|jD                  | _/        || _'        |j`                  | _0        || _(        || _1        |jd                  dv r7|jf                  dkD  r(tI        jJ                  d|jd                   d tL               |jd                  d!k(  rt        d"      |jh                  | _4        |jf                  | _3        |jd                  | _2        |jj                  | _5        t        |j>                  d#d      | _6        t        |j>                  d$d%      | _7        | jl                  r)| jn                  d%k(  rtI        jJ                  d&tL               |jd                  d'k(  r|jp                  | _8        ts        d(       | _:        d|jv                  d)<   ty               j{                         5  |j}                  t~        |j                  *      }|j}                  t        d+|i|j                  ,      }|E|j}                  t~        |j                  *      }|j}                  t        d+|i|j                  ,      }|j}                  | j                  |j                  *      }|'|j}                  | j                  |j                  *      }d d d        t        |   ||||||||||	|
-       d| _E        t        | j                  d.      r%| j                  j                  | j                         t        | d/      st        d0      y # 1 sw Y   ~xY w)1NzRYou passed model_kwargs to the CPOTrainer. But your model is already instantiated.torch_dtypeautoznInvalid `torch_dtype` passed to the CPOConfig. Expected a string with either `torch.dtype` or 'auto', but got .FzvPEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT modelsis_loaded_in_8bitis_loaded_in_4bitgradient_checkpointing_kwargsuse_gradient_checkpointingenable_input_require_gradsc                 &    |j                  d       y NTrequires_grad_moduleinputoutputs      R/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/trl/trainer/cpo_trainer.pymake_inputs_require_gradz5CPOTrainer.__init__.<locals>.make_inputs_require_grad   s    --d3    Tc                 &    |j                  d       y rI   rJ   rL   s      rP   rQ   z5CPOTrainer.__init__.<locals>.make_inputs_require_grad   s    ))$/rR   z`generate_during_eval=True` requires Weights and Biases or Comet to be installed. Please install `wandb` or `comet-ml` to resolve.zMWhen no model is provided, you need to pass the parameter is_encoder_decoder.z=processing_class must be specified to tokenize a CPO dataset.z`max_length` is not set in the CPOConfig's init it will default to `512` by default, but you should do it yourself in the future.i   z`max_prompt_length` is not set in the CPOConfig's init it will default to `128` by default, but you should do it yourself in the future.   zmax_prompt_length (z+) should be strictly less than max_length (z).zWhen using an encoder decoder architecture, you should set `max_completion_length` in the CPOConfig's init it will default to `128` by default, but you should do it yourself in the future.)pad_token_idlabel_pad_token_idis_encoder_decoderzWhen using DPODataCollatorWithPadding, you should set `remove_unused_columns=False` in your TrainingArguments we have set it for you, but you should do it yourself in the future.)hingeipor   zYou are using the z loss type that does not support label smoothing. The `label_smoothing` parameter will be ignored. Set `label_smoothing` to `0.0` to remove this warning.kto_pairzKSupport for kto_pair has been removed in CPOTrainer. Please use KTOTrainer.output_router_logitsrouter_aux_loss_coef        a-  You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to `0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary loss.simpoc                       t        t              S N)r   list rR   rP   <lambda>z%CPOTrainer.__init__.<locals>.<lambda>B  s    ;t3D rR   estimate_tokens)num_proc	tokenizer)	fn_kwargsre   )r3   r4   r5   r6   r7   r8   r9   r>   r:   r;   r<   add_model_tagsacceleratorzXYour `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`.)Jmodel_init_kwargs
isinstancestr
ValueErrorgetgetattrtorchdtyper   from_pretrained_peft_has_been_casted_to_bf16r   r,   merge_and_unloadhasattrra   inspect	signaturer.   
parametersgradient_checkpointingrE   rG   get_input_embeddingsregister_forward_hookr-   bf16r*   generate_during_evalr   r   configrW   decoder_start_token_idrU   
max_lengthwarningswarnUserWarningmax_prompt_lengthmax_completion_lengthr"   rV   remove_unused_columnsuse_dpo_data_collatordisable_dropoutr%   padding_valuetruncation_moder8   	loss_typelabel_smoothingbeta	cpo_alphaaux_loss_enabledaux_loss_coefsimpo_gammar   _stored_metricswarnings_issuedr   main_process_firstmapr   dataset_num_procr   tokenize_rowsuper__init__model_accepts_loss_kwargsr3   rh   
_tag_namesAttributeError)selfr3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   rj   r@   _support_gc_kwargsprepare_model_kwargsrQ   r   r   r   	__class__s                        rP   r   zCPOTrainer.__init__r   sn   " !!) "E3'qrr $ 6 6+//>K&k3/K64I")%"=K&(K1U$ I  JU  IV  VW  X  4?!-0eS!(88TBSTE .3* "{'> I   [%<%+..0u159WUL_af=g%,9& &5%%&EFQQ:  # )EdFaFa'b$%LPLnLn()HI7VAUV,,5">?4464 ..0FFG_` #5+6EyyWU,?G+E2592
 ((u:;0020 **,BBC[\$$.@.BFXFZD 
 &+ll&E&ED#$$,lmm&*&=&=D#""*/,,*M*MD' % 9 9D#\]]??"MMe
 JJ!!)MMe
 !$ $ 6 6 :-%&7%88cdncooqr  %%-$2I2IMMe
 %(!$($>$>! 6-::#'#:#:#'#:#:M ))-2*\ *.D&).D& $U+$$($=$=!"&"9"9373E3E3QT//WgWtWt!2#33%:" 0>>--$2F2F2JMM$T^^$4 5v v
 >>Z'jkkII	#33 '6Le T$U\\3I3O  T%7%73%>MM  >>W$#//D*+DE 48/0 ^..0 	c)--.BTMbMb-cM)--)kCS5T_c_t_t . M '+//0DtOdOd/e+//-*,<=!22  0   *--d.?.?$J_J_-`M'+//0A0ADLaLa/b#	c& 	''%-!+!*G 	 	
" */& 4::/0JJ%%doo6t]+ j  ,U	c 	cs   C^""^+c                    | j                  ||z   d      }| j                  |d      d   }|d   t        |      d }|d   t        |      d }t        j                  ||g      }t        j                  |d         }t        |      t        |      k7  rt        d      t        |      }	||d   d|	 k7  r|	dz  }	|d   d|	 }|d   d|	 }
t        |      t        |
      k7  rt        d      |d   |	d }|d   |	d }t        ||
||	      S )
a  
        Llama tokenizer does satisfy `enc(a + b) = enc(a) + enc(b)`. It does ensure `enc(a + b) = enc(a) + enc(a +
        b)[len(enc(a)):]`. Reference:
            https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257
        Fadd_special_tokens	input_idsNattention_maskzBPrompt input ids and answer input ids should have the same length.r    z@Prompt input ids and attention mask should have the same length.)prompt_input_idsprompt_attention_maskr   r   )r8   lennpconcatenatearrayrm   dict)r   promptanswerfull_tokenizedr   answer_input_idsanswer_attention_maskfull_concat_input_idsfull_input_idsresponse_token_ids_start_idxr   s              rP   build_tokenized_answerz!CPOTrainer.build_tokenized_answer~  su    ..vSX.Y00E0RS^_)+6s;K7L7NO ./? @EUAVAX Y !#0@BR/S T ."=>~#&;"<<abb (++;'<$ ~k:;X<XYY(A-()+67T8TU ./? @A^B^ _ C(=$>>_``)+67S7TU ./? @A]A^ _-"7&0	
 	
rR   returnc           	      	   i }|d   }|d   }|d   }| j                   st        |t              st        dt	        |             | j                  |d      }|j                         D 	ci c]  \  }}	d| |	 }}}	t        |t              st        dt	        |             | j                  ||      }
t        |t              st        d	t	        |             | j                  ||      }t        |d
         }t        |
d
         }t        |d
         }t        ||      }|j                         D ]  \  }}	|	d| ||<    t        t        |
d
   |d
         D cg c]
  \  }}||k7   c}}      }t        ||z
        }|dkD  s|dkD  rt        d      t        | j
                  j                  ||||
||      \  }}
}t        | j
                  j                   |
|      \  }
}t#        t        |
d         t        |d               }|
||fD ]  }t        |d
         |z   | j$                  kD  s"| j&                  dk(  rdD ]  }||   d| j(                   ||<    N| j&                  dk(  rdD ]  }||   | j(                   d ||<    {t        d| j&                          |
|fD ]J  }t        |d
         |z   | j$                  kD  s"dD ]$  }||   d| j$                  | j(                  z
   ||<   & L dD ci c]  }||
d|    |
|   z    }}dD ci c]  }||d|    ||   z    }}|d   dd |d<   | j*                  gt        |
d
         z  |d   dt        |
d
          |d   dd |d<   | j*                  gt        |d
         z  |d   dt        |d
          |||dj                         D ],  \  }}|j                         D ]  \  }}|dk(  r||| | <    . |S | j                  |d| j,                  d      }
| j                  |d| j,                  d      }| j                  |d| j(                  d      }|
d   |d<   |d   |d<   |d   |d
<   |d   |d<   |bt/        |d      rV|j1                  t3        j4                  |d               |d<   |j1                  t3        j4                  |d               |d <   |S c c}	}w c c}}w c c}w c c}w )!a.  Tokenize a single row from a CPO specific dataset.

        At this stage, we don't convert to PyTorch tensors yet; we just handle the truncation in case the prompt +
        chosen or prompt + rejected responses is/are too long. First we truncate the prompt; if we're still too long,
        we truncate the chosen/rejected.

        We also create the labels for the chosen/rejected responses, which are of length equal to the sum of the length
        of the prompt and the chosen/rejected response, with label_pad_token_id for the prompt tokens.
        r   chosenrejectedz prompt should be an str but got Fr   prompt_z chosen should be an str but got z"rejected should be an str but got r   Nr    zdChosen and rejected prompt_input_ids might only differ on the last token due to tokenizer merge ops.r   
keep_start)r   r   keep_endzUnknown truncation mode: )r   r   labels)chosen_	rejected_ token_type_idsT)
truncationr   r   chosen_labelsrejected_labelsr   r   %prepare_decoder_input_ids_from_labels)r   rejected_decoder_input_idschosen_decoder_input_ids)rW   rk   rl   rm   typer8   itemsr   r   minsumzipabsr#   bos_token_idr$   eos_token_idmaxr   r   r   rV   r   ru   r   rp   tensor)r   featurer3   batchr   r   r   prompt_tokenskvchosen_tokensrejected_tokensprompt_len_input_idschosen_prompt_len_input_idsrejected_prompt_len_input_idsabnum_diff_tokensnum_diff_lenlonger_response_lengthanswer_tokenschosen_sequence_tokensrejected_sequence_tokenstokstype_keytokenss                             rP   r   zCPOTrainer.tokenize_row  sd    "":&&& fc* #CDL>!RSS 11&U1SM:G:M:M:OP$!Qwqc]A-PMPfc* #CDL>!RSS 77GMh, #Ed8nEU!VWW"99&(KO $'}5G'H#I *-m<N.O*P',/@R0S,T)#&'BDa#b %++- <1#$%:&:#;a <
 "$'6H(I?[mKn$opDAqapO :=ZZ[L"lQ&6 =  =T%%22$+-=9M=/ .E%%22M?.*M? &)];-G)H#o^iNjJk%l" #0-!P 	]}%789<RRUYUdUdd++|;!N ZA/<Q/?@X$BXBX/YM!,Z--;!N [A/<Q/?AWAW@W@Y/ZM!,[ )+DTEYEYDZ)[\\	] #0!A h}%789<RRUYUdUdd< h+8+;<fdooPTPfPf>f+ga(hh Ml&GH=71#/-2BBB&" & Qp(KL?WQC=1OA4FFF($ ( 0Fk/RST/U"8,''ZM"456Z7"8,-Us=AS3T/UV 2J+1VWX1Y$X.''^O$678^9$X./Y_EW5X1YZ
 25! eg	54
 )-

 5$Hf#33 .4EQCz*+55D / !114D4N4Ncg 2 M #33Td6P6Pei 4 O !114D4J4J_c 2 M &3;%?E/"'6{'CE#$(5k(BE$%-:;K-LE)* WU4[%\6;6a6a <<.?(@A 7b 723 5:4_4_ <<o(>? 5` 501 o Q0 q\&(s   ,S!S
:SSr   rW   rV   r   devicec           	         i }|r-t        | d   j                  d   | d   j                  d         }n,t        | d   j                  d   | d   j                  d         }| D ]  }|j                  d      st        | |   t        j
                        s3d|v s|r|}n'|j                  d      r|}n|j                  d	      rd
}|j                  dd      }	t        | |   |      ||	<    | D ]  }|j                  d      st        | |   t        j
                        s3d|v s|r|}n'|j                  d      r|}n|j                  d	      rd
}|j                  dd      }	t	        j                  ||	   t        | |   |      fd
      j                  |      ||	<    |rP| d   j                  dd      j                  |      |d<   | d   j                  dd      j                  |      |d<   |S )a  Concatenate the chosen and rejected inputs into a single tensor.

        Args:
            batch:
                A batch of data. Must contain the keys 'chosen_input_ids' and 'rejected_input_ids', which are tensors
                of shape (batch_size, sequence_length).
            is_encoder_decoder:
                Whether the model is an encoder-decoder model.
            label_pad_token_id:
                The label pad token id.
            padding_value:
                The padding value to use for the concatenated inputs_ids.
            device:
                The device for the concatenated inputs.

        Returns:
            A dictionary containing the concatenated inputs under the key 'concatenated_input_ids'.
        r   r    r   chosen_input_idsrejected_input_idsr   r   
_input_ids_attention_maskr   concatenated)	pad_valuer   dimr   r   r   concatenated_input_idsr   concatenated_attention_mask)r   shape
startswithrk   rp   Tensorendswithreplacer)   cattorepeat)
r   rW   rV   r   r   concatenated_batchr   r   r   concatenated_keys
             rP   concatenated_inputszCPOTrainer.concatenated_inputs@  s   4  U?399!<eDU>V>\>\]^>_`JU#56<<Q?G[A\AbAbcdAefJ 		pA||H%*U1Xu||*Lq=$6 2IZZ- -IZZ 12 !I#$99X~#F 7DU1Xzen7o"#34		p  	$A||J'JuQx,Nq=$6 2IZZ- -IZZ 12 !I#$99Z#H 7<yy*+;<%eAh
iP 8 "F"# ##34	$" ;@AS;T;[;[\]_`;a;d;dlr;d;s78-.55a;>>f>M <= "!rR   policy_chosen_logpspolicy_rejected_logpsc                 T   ||z
  j                  | j                  j                        }| j                  dk(  r| j                  | j
                  z  }||z
  }t        j                  | j
                  |z         d| j                  z
  z  t        j                  | j
                   |z        | j                  z  z
  }n| j                  dk(  ret        j                  | j
                  |z         d| j                  z
  z  t        j                  | j
                   |z        | j                  z  z
  }nv| j                  dk(  r&t        j                  d| j
                  |z  z
        }nA| j                  dk(  r|dd| j
                  z  z  z
  dz  }nt        d| j                   d      | j
                  |j                  | j                  j                        j                         z  }| j
                  |j                  | j                  j                        j                         z  }|||fS )	a  Compute the CPO loss for a batch of policy and reference model log probabilities.

        Args:
            policy_chosen_logps:
                Log probabilities of the policy model for the chosen responses. Shape: (batch_size,)
            policy_rejected_logps:
                Log probabilities of the policy model for the rejected responses. Shape: (batch_size,)

        Returns:
            A tuple of three tensors: (losses, chosen_rewards, rejected_rewards). The losses tensor contains the CPO
            loss for each example in the batch. The chosen_rewards and rejected_rewards tensors contain the rewards for
            the chosen and rejected responses, respectively.
        r^   r    sigmoidrX   rY   r   zUnknown loss type: z7. Should be one of ['sigmoid', 'hinge', 'ipo', 'simpo'])r   ri   r   r   r   r   F
logsigmoidr   rp   relurm   detach)r   r   r   logitsgamma_logratioslosseschosen_rewardsrejected_rewardss           rP   cpo_losszCPOTrainer.cpo_loss  s   $ &(==AA$BRBRBYBYZ >>W$"..:Oo-F dii&011Q9M9M5MN,,		zF23d6J6JJK  ^^y( dii&011Q9M9M5MN,,		zF23d6J6JJK  ^^w&ZZDII$6 67F^^u$qA		M22q8F%dnn%55lm  &9&<&<T=M=M=T=T&U%]%]%__99(=(@(@AQAQAXAX(Y'a'a'cc~'777rR   r  r   average_log_probc                 L   | j                   dd |j                   k7  rt        d      |s'|ddddf   j                         }| ddddddf   } ||k7  }d|||k(  <   t        | |      }|r&||z  j	                  d      |j	                  d      z  S ||z  j	                  d      S )a  Compute the log probabilities of the given labels under the given logits.

        Args:
            logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size)
            labels:
                Labels for which to compute the log probabilities. Label tokens with a value of label_pad_token_id are
                ignored. Shape: (batch_size, sequence_length)
            average_log_prob:
                If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the
                log probabilities of the (non-masked) tokens.
            label_pad_token_id: The label pad token id.
            is_encoder_decoder: Whether the model is an encoder-decoder model.

        Returns:
            A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the
            given logits.
        NzKLogits (batch and sequence length dim) and labels must have the same shape.r    r   )r   rm   cloner+   r   )r  r   r  rV   rW   	loss_maskper_token_logpss          rP   get_batch_logpszCPOTrainer.get_batch_logps  s    2 <<,jkk!AqrE]((*FAssAI&F00	 01v++,/?#i/44R89==;LLL#i/44R88rR   c                       j                  | j                   j                   j                   j                  j
                        }|d   j                  d   } j                  rd j                  |d         ini } j                  rd|d<    ||d   f|d	   d
d|}|j                  } fd}|d   j                         }	 j                  dk(  r9t        j                  d      j                   j                  j
                        }
n ||d| |	d|       }
 j                  ||d    j                   dv  j                   j                        }|d| }||d }|d| }||d } j                  r|||||
|j"                  fS |||||
fS )zRun the given model on the given batch of inputs, concatenating the chosen and rejected inputs together.

        We do this to avoid doing two forward passes, because it's faster for FSDP.
        )rW   rV   r   r   r   r   decoder_input_idsconcatenated_labelsTr[   r   r   F)r   	use_cachec                 R   j                   s1| dd dd d f   j                         } |ddd f   j                         }t        j                         }| j	                  d| j
                  d         } |j	                  d      }|j                  | j                        } || |      }|S )N.r  r    )rW   
contiguousnnCrossEntropyLossviewr   r   r   )r  r   loss_fctlossr   s       rP   cross_entropy_lossz;CPOTrainer.concatenated_forward.<locals>.cross_entropy_loss  s    **SbS!,779QR335**,H[[V\\"%56F[[_FYYv}}-FFF+DKrR   r]   N)rY   r^   )r  rW   rV   )r   rW   rV   r   ri   r   r   _shift_rightr   r  r  r   rp   r   r   r  r   aux_loss)r   r3   r   r   
len_chosenmodel_kwargsoutputs
all_logitsr  r   nll_loss	all_logpschosen_logpsrejected_logpschosen_logitsrejected_logitss   `               rP   concatenated_forwardzCPOTrainer.concatenated_forward  s    "55#66#66,,##** 6 
 ?+11!4
 && $T%6%67IJ_7`%a  	   37L/078
-.KL
 	
 ^^
	 $$9:@@B>>Q||C(++D,<,<,C,CDH)*[j*A6+:CVWH((45!^^/??#66#66 ) 
	 !*-":;/";J/$Z[1   .-RZ\c\l\lmmnm_hWWrR   
train_eval)trainevalc                 ^   i }| j                  ||      }|dd \  }}}}	}
| j                  r|d   }| j                  ||      \  }}}|j                         | j                  |
z  z   }||kD  j                         }|dk(  rdnd}| j                  j                  |      j                         j                         || d<   | j                  j                  |      j                         j                         || d<   | j                  j                  |      j                         j                         || d<   | j                  j                  ||z
        j                         j                         || d	<   | j                  j                  |      j                         j                         j                         || d
<   | j                  j                  |      j                         j                         j                         || d<   | j                  j                  |	j                         j                               j                         j                         || d<   | j                  j                  |j                         j                               j                         j                         || d<   | j                  j                  |
      j                         j                         j                         || d<   | j                  r|| j                  z  z  }||fS )zWCompute the CPO loss and other metrics for the given batch of inputs for train or test.N   r/  eval_r   zrewards/chosenzrewards/rejectedzrewards/accuracieszrewards/marginszlogps/rejectedzlogps/chosenzlogits/rejectedzlogits/chosenr&  )r,  r   r  meanr   floatri   gather_for_metricsitemr  r   )r   r3   r   r-  metricsforward_outputr   r   policy_chosen_logitspolicy_rejected_logitspolicy_nll_lossr!  r	  r
  r  r  reward_accuraciesprefixs                     rP   get_batch_loss_metricsz!CPOTrainer.get_batch_loss_metrics/  s    225%@ 2A	
! "  %a(H37==!4
0 0
 {{}t~~??+.>>EEG&&0b-1-=-=-P-PQ_-`-e-e-g-l-l-n6(.)*/3/?/?/R/RSc/d/i/i/k/p/p/r6(*+,151A1A1T1TUf1g1l1l1n1s1s1u6(,-.//AQ0QRWWY^^` 	6(/*+ //0EFMMOTTV[[] 	6(.)* //0CDKKMRRTYY[ 	6(,'( //0F0M0M0O0T0T0VW\\^cce 	6(/*+ //0D0K0K0M0R0R0TUZZ\aac 	6(-() (,'7'7'J'J?'['b'b'd'i'i'k'p'p'r6((#$  D&&11DW}rR   inputsc                    | j                   r)t        | j                  j                  j                        n	t               }|5  | j                  ||d      \  }}d d d        | j                  d       |r|fS S # 1 sw Y   $xY w)Nr.  r-  )rs   r   ri   r   r   r   r>  store_metrics)r   r3   r?  return_outputsnum_items_in_batchcompute_loss_context_managerr  r7  s           rP   compute_losszCPOTrainer.compute_losse  s     7;6X6XHT%%,,112^i^k 	% * 	[ 77vRY7ZMD'	[ 	7w7'?"	[ 	[s   A==Bc                    | j                   r)t        | j                  j                  j                        n	t               }|5  |j                  |d   |d   | j                  d| j                  j                        }ddd       t        | j                  | j                  j                        }| j                  j                  |d      }|S # 1 sw Y   SxY w)zRGenerate samples from the model and reference model for the given batch of inputs.r   r   T)r   r   r   	do_samplerU   N)skip_special_tokens)rs   r   ri   r   r   r   generater   r8   rU   r)   batch_decode)r   r3   r   generate_context_managerpolicy_outputpolicy_output_decodeds         rP   generate_from_modelzCPOTrainer.generate_from_modelz  s     7;6X6XHT%%,,112^i^k 	! & 	!NN 23$%<=??!22?? + M	 &mT__dF[F[FhFhi $ 5 5 B B=fj B k$$	 	s   ;CCprediction_loss_onlyignore_keysc                 *   |&t        |d      rt        |j                  dg       }ng }| j                  r)t	        | j
                  j                  j                        n	t               }t        j                         5  |5  | j                  ||d      \  }}d d d        d d d        | j                  d       |rj                         d d fS |d   |d   d}|j                         D 	
cg c]  \  }	}
|	|vs|
 }}	}
t        j                  || j
                  j                        }t        j                   |j"                  d	   | j
                  j                        }j                         ||fS # 1 sw Y   xY w# 1 sw Y   xY wc c}
}	w )
Nr~   keys_to_ignore_at_inferencer/  rA  eval_logits/choseneval_logits/rejected)rT  rU  r   r   )ru   ro   r~   rs   r   ri   r   r   r   rp   no_gradr>  rB  r  r   r   zerosr   )r   r3   r?  rP  rQ  prediction_context_managerr  r7  logits_dictr   r   r  r   s                rP   prediction_stepzCPOTrainer.prediction_step  s    uh'%ell4QSUV  7;6X6XHT%%,,112^i^k 	# ]]_ 	Z8 	Z 77vRX7YMD'	Z 	Z 	7v6KKM4.. #**>"?$+,B$C
 !, 1 1 3L1q7K!LLfT-=-=-D-DEV\\!_T5E5E5L5LMvv..%	Z 	Z 	Z 	Z Ms0   <F?E7F-F:F7F 	<FFr7  c                 v    |j                         D ]&  \  }}| j                  |   |   j                  |       ( y r`   )r   r   append)r   r7  r-  keyvalues        rP   rB  zCPOTrainer.store_metrics  s;    !--/ 	@JC  ,S188?	@rR   
dataloaderdescriptionmetric_key_prefixc                    | j                   rQt        |j                        }t        j                  t        |      | j                  j                        }|j                  j                  |      }| j                  |      }	| j                  |	      }	| j                  | j                  |	      }
t        j                  ddgt        |	d   |
      D cg c]  \  }}||t        |      d g c}}      }d| j                  j                   v r+t#        j$                  dt#        j&                  |	      i       d
| j                  j                   v rt)        d|       t*        | Y  |||||      }|S c c}}w )z
        Overriding built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared by
        `Trainer.evaluate()` and `Trainer.predict()`.

        Works both with or without labels.
        )r   PromptPolicyr   N)columnsdatawandbgame_log)rf  comet_mlzgame_log.csv)nametable)r}   r   datasetrandomsampleranger4   eval_batch_sizeselectr5   _prepare_inputsrO  r3   pd	DataFramer   	report_torg  logTabler(   r   evaluation_loop)r   r_  r`  rP  rQ  ra  num_samplesrandom_indicesrandom_batch_datasetrandom_batchrN  r   polrk  initial_outputr   s                  rP   rx  zCPOTrainer.evaluation_loop  sR     $$j001K#]]5+=AZAZ[N $.#5#5#<#<^#L --.BCL//=L$($<$<TZZ$V!LL!8,DGU]H^`uDv5@VSVSV/0E $))---		:u{{'>?@TYY000-' 0%9;HY
 %s   E5logs
start_timec                     d|v rdnd}| j                   |   j                         D ]9  \  }}t        j                  |      j	                         j                         ||<   ; | j                   |= t        |   ||      S )a1  
        Log `logs` on the various objects watching training, including stored metrics.

        Args:
            logs (`dict[str, float]`):
                The values to log.
            start_time (`float` or `None`, *optional*, defaults to `None`):
                Start time of the training.
        r  r.  r/  )r   r   rp   r   r3  r6  r   rv  )r   r  r  r-  r]  r7  r   s         rP   rv  zCPOTrainer.log  s|     !'$WF
 00<BBD 	<LCW-22499;DI	<  ,w{4,,rR   c                    | j                   t        d      t        |      rQt        j                  |j
                  d d dz   | j                         }t        j                  ||dd df   gd      }nH|j                  |j
                        }|dd df   j                         |ddd f<   | j                   |d<   | j                  t        d      |j                  |d	k(  | j                         |S )
Nz]model.config.decoder_start_token_id has to be defined. It is usually set to the pad_token_id.r  )r    .r   r    ).r   z,model.config.pad_token_id has to be defined.)r   rm   r   rp   fullr   r   	new_zerosr  rU   masked_fill_)r   r   shifted_input_idss      rP   r   zCPOTrainer._shift_right  s    &&.o 
 Y' %

9??3B+?$+FHcHc d %		+<iSbS>Q*RXZ [ ) 3 3IOO D)238)<)B)B)Dc12g&(,(C(Cf%$KLL&&'8D'@$BSBST  rR   c                    | j                   j                  *t        | j                   j                        j                  }n(| j                   j                  j                  d      d   }| j                  |       t        | !  ||       y )N/r  )
model_name)	r4   hub_model_idr   
output_dirrj  splitcreate_model_cardr   _save_checkpoint)r   r3   trialr  r   s       rP   r  zCPOTrainer._save_checkpoint  sl    99!!)dii22388J//55c:2>J*5 .rR   r  dataset_nametagsc                    | j                         syt        | j                  j                  d      r^t        j
                  j                  | j                  j                  j                        s!| j                  j                  j                  }nd}|t               }nt        |t              r|h}nt        |      }t        | j                  j                  d      r|j                  d       |j                  | j                         t        j                  d      }t!        ||| j"                  ||t%               r.t&        j(                  t&        j(                  j+                         ndt-               d|dd	      }|j/                  t        j
                  j1                  | j2                  j4                  d
             y)a  
        Creates a draft of a model card using the information available to the `Trainer`.

        Args:
            model_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the model.
            dataset_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the dataset used for training.
            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                Tags to be associated with the model card.
        N_name_or_pathunsloth_versionunslothay          @inproceedings{xu2024contrastive,
            title        = {{Contrastive Preference Optimization: Pushing the Boundaries of LLM Performance in Machine Translation}},
            author       = {Haoran Xu and Amr Sharaf and Yunmo Chen and Weiting Tan and Lingfeng Shen and Benjamin Van Durme and Kenton Murray and Young Jin Kim},
            year         = 2024,
            booktitle    = {Forty-first International Conference on Machine Learning, {ICML} 2024, Vienna, Austria, July 21-27, 2024},
            publisher    = {OpenReview.net},
            url          = {https://openreview.net/forum?id=51iwkioZpn}
        }CPOzeContrastive Preference Optimization: Pushing the Boundaries of LLM Performance in Machine Translationz
2401.08417)
base_modelr  r  r  r  	wandb_url	comet_urltrainer_nametrainer_citationpaper_titlepaper_idz	README.md)is_world_process_zeroru   r3   r~   ospathisdirr  setrk   rl   addupdater   textwrapdedentr&   r  r   rg  runget_urlr'   savejoinr4   r  )r   r  r  r  r  citation
model_cards          rP   r  zCPOTrainer.create_model_card  sG   " ))+4::$$o6rww}}TZZM^M^MlMl?m**88JJ <5Dc"6Dt9D4::$$&78HHYDOO$?? $  )!!**%-?-AeiiF[eii'')ae.0%!

 	TYY%9%9;GHrR   )NNNNNNNN)NNNNNr`   )Fr  r   N)Fr  F)r.  )FN)NNr/  )NNN):__name__
__module____qualname____doc__r   r	   r
   r   r  Modulerl   r!   r   r   r   r   r   r   r   r   ra   r   tuplerp   optim	Optimizerlr_schedulerLambdaLRr   r   r   r   r   staticmethod
LongTensorboolintr   r   FloatTensorr  r  r,  r   r>  r   rF  rO  rZ  r4  rB  r   rx  rv  r   r  r  __classcell__)r   s   @rP   r0   r0   J   s   #J J CG$(04+/EI >B59Vbhl&*FJJoryy#=>?J y!J  -	J
  (J uWd3<.@%@ABJ #)+=?UWeef
J Xb/&9:;J D12J %++//1I1I1R1RRSJ (0%,,9UW\WcWc9c0d'eJ d^J "(N+;T+A"BCJX/
bO8E/299:T4U+V Obf Ob  $)"&)-A"CtU%5%55667A" A"  A" 	A"
 &A" 
c5###	$A" A"F38"..38  %0038 
u  %"3"3U5F5FF	G	38j  "'"&#((9!!(9  (9 (9  	(9
 !(9 
		(9 (9TIXYYIX'+CtU=M=M7M1N,N'OIX	u  %"3"3U5F5FHYHYY	ZIX^ 07	4 CtU%5%556674 O,	4t _bii/0 S%c 1223 
u||U5<<c5<<6G1H#HII	J*%S%:J:J5J0K %PS %8 ,0#/_bii/0#/ S%c 1223#/ #	#/
 d3i(#/J@T#u*%5 @7?C[ @jn @ 04+/!'00 0 'tn	0
 d3i(0 0 
0d-S%Z( -huo -QU -$!0/ %)&*,0	=ISM=I sm=I CcD()	=IrR   r0   )Jrv   r  rm  r  r   collectionsr   
contextlibr   pathlibr   typingr   r   r   r	   r
   numpyr   pandasrs  rp   torch.nnr  torch.nn.functional
functionalr  
accelerater   datasetsr   r   torch.utils.datar   transformersr   r   r   r   r   r   r   r   r   r   transformers.trainer_callbackr   transformers.trainer_utilsr   transformers.utilsr   r   
data_utilsr   r   
cpo_configr!   utilsr"   r#   r$   r%   r&   r'   r(   r)   r*   r+   peftr,   r-   r.   rg  r0   rb   rR   rP   <module>r     s     	    # "  : :       #   '   : 5 C H !   OO QI QIrR   