
    bi                        d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlmc mZ d dlmZ d dlmZ d dlmZ d d	l m!Z! d d
l"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3m4Z4 ddl5m6Z6m7Z7 ddl8m9Z9 ddl:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZD  e3       r
d dlEmFZFmGZGmHZH  e-       rd dlIZI e,       r	d dlJmKc mLZM  G d de*      ZNy)    N)defaultdict)nullcontext)Path)AnyCallableLiteralOptionalUnion)PartialState)Dataset)autocast)
DataLoader)AutoModelForCausalLMBaseImageProcessorDataCollatorFeatureExtractionMixinPreTrainedModelPreTrainedTokenizerBaseProcessorMixinTraineris_comet_availableis_torch_xla_availableis_wandb_available)TrainerCallback)EvalLoopOutput)is_peft_availableis_torch_fx_proxy   )maybe_apply_chat_templatemaybe_extract_prompt   )
ORPOConfig)
DPODataCollatorWithPaddingadd_bos_token_if_neededadd_eos_token_if_neededdisable_dropout_in_modelgenerate_model_cardget_comet_experiment_urllog_table_to_comet_experimentpad_to_lengthpeft_module_casting_to_bf16selective_log_softmax)	PeftModelget_peft_modelprepare_model_for_kbit_trainingc                       e Zd ZdZddgZ	 	 	 	 	 	 	 	 	 	 	 	 d;deeeej                  e
f      dee   dee   dee   d	eeeee
ef   f      d
eeeeeef      deeg ef      deee      deej0                  j2                  ej0                  j4                  j6                  f   deeej8                  ej8                  gej8                  f      dee   deeegef      f fdZd Zd<deeeej                  f      defdZ e!	 	 	 	 d=dee
eeejD                  f   f   de#de$de$deejJ                     dee
ejD                  f   fd       Z&dejN                  dejN                  deejN                  ejN                  ejN                  ejN                  ejN                  f   fdZ(e!	 	 	 d>dejN                  dejD                  d e#de$de#dejN                  fd!       Z)dej                  dee
eeejD                  f   f   deejN                  ejN                  ejN                  ejN                  f   fd"Z*	 d?dee
eeejD                  f   f   d#e+d$   fd%Z,	 	 d@deeej                  f   d&ee
eej8                  e-f   f   deej8                  eej8                  ee
ej8                  f   f   f   fd'Z.dee
ejD                  f   de
fd(Z/	 d<deeej                  f   d&ee
eej8                  e-f   f   d)e#d*eee
      fd+Z0d?d,ee
e1f   d#e+d$   ddfd-Z2	 	 	 dAd.e3d/e
d)ee#   d*eee
      d0e
def fd1Z4d<d2ee
e1f   d3ee1   ddf fd4Z5d5 Z6 fd6Z7	 	 	 dBd7ee
   d8ee
   d9ee
ee
   df   fd:Z8 xZ9S )CORPOTrainera  
    Initialize ORPOTrainer.

    Args:
        model (`transformers.PreTrainedModel`):
            The model to train, preferably an `AutoModelForSequenceClassification`.
        args (`ORPOConfig`):
            The ORPO config arguments to use for training.
        data_collator (`transformers.DataCollator`):
            The data collator to use for training. If None is specified, the default data collator
            (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the
            sequences in the batch, given a dataset of paired sequences.
        train_dataset (`datasets.Dataset`):
            The dataset to use for training.
        eval_dataset (`datasets.Dataset`):
            The dataset to use for evaluation.
        processing_class (`PreTrainedTokenizerBase` or `BaseImageProcessor` or `FeatureExtractionMixin` or `ProcessorMixin`, *optional*):
            Processing class used to process the data. If provided, will be used to automatically process the inputs
            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
            reuse the fine-tuned model.
        model_init (`Callable[[], transformers.PreTrainedModel]`):
            The model initializer to use for training. If None is specified, the default model initializer will be
            used.
        callbacks (`list[transformers.TrainerCallback]`):
            The callbacks to use for training.
        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
            The optimizer and scheduler to use for training.
        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
            The function to use to preprocess the logits before computing the metrics.
        peft_config (`dict`, defaults to `None`):
            The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in
            a PEFT model.
        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
            The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
            metric values.
    trlorpoNmodelargsdata_collatortrain_dataseteval_datasetprocessing_class
model_init	callbacks
optimizerspreprocess_logits_for_metricspeft_configcompute_metricsc                    |j                   i }nt        |t              st        d      |j                   }|j	                  d      }|Xt        |t              r|dk7  rt        t        |      }|dk7  r)t        |t        j                        st        d| d      ||d<   t        |t              rt        j                  |fi |}d| _
        t               s|t        d      t               r(|%t        |t              r|j                         }t        |dd      st        |d	d      rht        |d
      xr. d
t        t!        j"                  t$              j&                        v }d|j(                  i}|r|j*                  |d
<   t%        |fi |}nK|j(                  r?t        |d      r|j-                          n"d }|j/                         j1                  |       t3        ||      }|j4                  rkt        |d	d      r^t7        |       d| _
        nK|j(                  r?t        |d      r|j-                          n"d }|j/                         j1                  |       |j8                  rt;               st=               st        d      ||j>                  j@                  | _         n(|j@                  t        d      |j@                  | _         | j@                  r6|j>                  jB                  | _!        |j>                  jD                  | _"        |t        d      |jF                  tI        jJ                  dtL               d}n|jF                  }|jN                  tI        jJ                  dtL               d}n|jN                  }|jP                  .| j@                  r"tI        jJ                  dtL               d| _(        n|jP                  | _(        |atS        |jD                  |jT                  | j@                        }|jV                  r!d|_+        tI        jJ                  dtL               d| _,        nd| _,        |jZ                  rt]        |       || _#        |j8                  | _        |jT                  | _*        |j^                  |j^                  n|jD                  | _/        || _'        |j`                  | _0        || _1        |jd                  | _2        t        |j>                  dd      | _3        t        |j>                  dd      | _4        | jf                  r)| jh                  dk(  rtI        jJ                  dtL               tk        d       | _6        d|jn                  d<   tq               js                         5  |ju                  tv        |jx                         }|ju                  tz        d!|i|jx                  "      }|ju                  | j|                  |jx                         }|l|ju                  tv        |jx                         }|ju                  tz        d!|i|jx                  "      }|ju                  | j|                  |jx                         }d d d        t~        |   ||||||||||	|
#       t        | j                  d$      r%| j                  j                  | j                         t        | d%      st        d&      y # 1 sw Y   wxY w)'NzSYou passed model_kwargs to the ORPOTrainer. But your model is already instantiated.torch_dtypeautozoInvalid `torch_dtype` passed to the ORPOConfig. Expected a string with either `torch.dtype` or 'auto', but got .FzvPEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT modelsis_loaded_in_8bitis_loaded_in_4bitgradient_checkpointing_kwargsuse_gradient_checkpointingenable_input_require_gradsc                 &    |j                  d       y NTrequires_grad_moduleinputoutputs      S/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/trl/trainer/orpo_trainer.pymake_inputs_require_gradz6ORPOTrainer.__init__.<locals>.make_inputs_require_grad   s    --d3    Tc                 &    |j                  d       y rJ   rK   rM   s      rQ   rR   z6ORPOTrainer.__init__.<locals>.make_inputs_require_grad   s    ))$/rS   z`generate_during_eval=True` requires Weights and Biases or Comet to be installed. Please install `wandb` or `comet-ml` to resolve.zMWhen no model is provided, you need to pass the parameter is_encoder_decoder.z>processing_class must be specified to tokenize a ORPO dataset.z`max_length` is not set in the ORPOConfig's init it will default to `512` by default, but you should do it yourself in the future.i   z`max_prompt_length` is not set in the ORPOConfig's init it will default to `128` by default, but you should do it yourself in the future.   zWhen using an encoder decoder architecture, you should set `max_completion_length` in the ORPOConfig's init it will default to `128` by default, but you should do it yourself in the future.)pad_token_idlabel_pad_token_idis_encoder_decoderzWhen using DPODataCollatorWithPadding, you should set `remove_unused_columns=False` in your TrainingArguments we have set it for you, but you should do it yourself in the future.output_router_logitsrouter_aux_loss_coefg        a-  You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to `0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary loss.c                       t        t              S N)r   list rS   rQ   <lambda>z&ORPOTrainer.__init__.<locals>.<lambda>1  s    ;t3D rS   estimate_tokens)num_proc	tokenizer)	fn_kwargsra   )r4   r5   r6   r7   r8   r9   r:   r?   r;   r<   r=   add_model_tagsacceleratorzXYour `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`.)Emodel_init_kwargs
isinstancestr
ValueErrorgetgetattrtorchdtyper   from_pretrained_peft_has_been_casted_to_bf16r   r-   merge_and_unloadhasattrr]   inspect	signaturer/   
parametersgradient_checkpointingrF   rH   get_input_embeddingsregister_forward_hookr.   bf16r+   generate_during_evalr   r   configrX   decoder_start_token_idrV   
max_lengthwarningswarnUserWarningmax_prompt_lengthmax_completion_lengthr#   rW   remove_unused_columnsuse_dpo_data_collatordisable_dropoutr&   padding_valuetruncation_moder9   betaaux_loss_enabledaux_loss_coefr   _stored_metricswarnings_issuedr   main_process_firstmapr    dataset_num_procr   tokenize_rowsuper__init__r4   rd   
_tag_namesAttributeError)selfr4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   rf   rA   _support_gc_kwargsprepare_model_kwargsrR   r|   r   	__class__s                       rQ   r   zORPOTrainer.__init__v   s   " !!) "E3'rss $ 6 6+//>K&k3/K64I")%"=K&(K1U$ J  KV  JW  WX  Y  4?!-0eS!(88TBSTE .3* "{'> I   [%<%+..0u159WUL_af=g%,9& &5%%&EFQQ:  # )EdFaFa'b$%LPLnLn()HI7VAUV,,5">?4464 ..0FFG_` #5+6EyyWU,?G+E2592
 ((u:;0020 **,BBC[\$$.@.BFXFZD 
 &+ll&E&ED#$$,lmm&*&=&=D#""*/,,*M*MD' % 9 9D#]^^??"MMe
 JJ!!)MMe
 !$ $ 6 6%%-$2I2IMMe
 *-D&)-)C)CD& 6-::#'#:#:#'#:#:M ))-2*\ *.D&).D& $U+$$($=$=!"&"9"9373E3E3QT//WgWtWt!2#33 0II	 '6Le T$U\\3I3O  T%7%73%>MM   ++DE 48/0 ^..0 	c)--.BTMbMb-cM)--)kCS5T_c_t_t . M *--d.?.?$J_J_-`M'+//0DtOdOd/e+//-*,<=!22  0  
  ,//0A0ADLaLa/b	c  	''%-!+!*G 	 	
 4::/0JJ%%doo6t]+ j  ,E	c 	cs   C[[c                    | j                  ||z   d      }| j                  |d      d   }|d   t        |      d }|d   t        |      d }t        j                  ||g      }t        j                  |d         }t        |      t        |      k7  rt        d      t        |      }	||d   d|	 k7  r|	dz  }	|d   d|	 }|d   d|	 }
t        |      t        |
      k7  rt        d      |d   |	d }|d   |	d }t        ||
||	      S )
a  
        Llama tokenizer does satisfy `enc(a + b) = enc(a) + enc(b)`. It does ensure `enc(a + b) = enc(a) + enc(a +
        b)[len(enc(a)):]`. Reference:
            https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257
        Fadd_special_tokens	input_idsNattention_maskzBPrompt input ids and answer input ids should have the same length.r!   z@Prompt input ids and attention mask should have the same length.)prompt_input_idsprompt_attention_maskr   r   )r9   lennpconcatenatearrayri   dict)r   promptanswerfull_tokenizedr   answer_input_idsanswer_attention_maskfull_concat_input_idsfull_input_idsresponse_token_ids_start_idxr   s              rQ   build_tokenized_answerz"ORPOTrainer.build_tokenized_answere  su    ..vSX.Y00E0RS^_)+6s;K7L7NO ./? @EUAVAX Y !#0@BR/S T ."=>~#&;"<<abb (++;'<$ ~k:;X<XYY(A-()+67T8TU ./? @A^B^ _ C(=$>>_``)+67S7TU ./? @A]A^ _-"7&0	
 	
rS   returnc           	      
   i }|d   }|d   }|d   }| j                   st        |t              st        dt	        |             | j                  |d      }|j                         D 	ci c]  \  }}	d| |	 }}}	t        |t              st        dt	        |             | j                  ||      }
t        |t              st        d	t	        |             | j                  ||      }t        |d
         }t        |
d
         }t        |d
         }t        ||      }|j                         D ]  \  }}	|	d| ||<    t        t        |
d
   |d
         D cg c]
  \  }}||k7   c}}      }t        ||z
        }|dkD  s|dkD  rt        d      t        | j
                  j                  ||||
||      \  }}
}t        | j
                  j                   |
|      \  }
}t#        t        |
d         t        |d               }|
||fD ]  }t        |d
         |z   | j$                  kD  s"| j&                  dk(  rdD ]  }||   d| j(                   ||<    N| j&                  dk(  rdD ]  }||   | j(                   d ||<    {t        d| j&                          |
|fD ]J  }t        |d
         |z   | j$                  kD  s"dD ]$  }||   d| j$                  | j(                  z
   ||<   & L dD ci c]  }||
d|    |
|   z    }}dD ci c]  }||d|    ||   z    }}|d   dd |d<   | j*                  gt        |
d
         z  |d   dt        |
d
          |d   dd |d<   | j*                  gt        |d
         z  |d   dt        |d
          |||dj                         D ],  \  }}|j                         D ]  \  }}|dk(  r||| | <    . n| j                  |d| j,                  d      }
| j                  |d| j,                  d      }| j                  |d| j(                  d      }|
d   |d<   |d   |d<   |d   |d
<   |d   |d<   |bt/        |d      rV|j1                  t3        j4                  |d               |d<   |j1                  t3        j4                  |d               |d <   t7               r}|D ]x  }d|v s| j                   r| j*                  }n1|j9                  d!      r| j:                  }n|j9                  d"      rd#}||   g| j$                  t        ||         z
  z  z   ||<   z |S c c}	}w c c}}w c c}w c c}w )$a/  Tokenize a single row from a ORPO specific dataset.

        At this stage, we don't convert to PyTorch tensors yet; we just handle the truncation in case the prompt +
        chosen or prompt + rejected responses is/are too long. First we truncate the prompt; if we're still too long,
        we truncate the chosen/rejected.

        We also create the labels for the chosen/rejected responses, which are of length equal to the sum of the length
        of the prompt and the chosen/rejected response, with label_pad_token_id for the prompt tokens.
        r   chosenrejectedz prompt should be an str but got Fr   prompt_z chosen should be an str but got z"rejected should be an str but got r   Nr!   zdChosen and rejected prompt_input_ids might only differ on the last token due to tokenizer merge ops.r   
keep_start)r   r   keep_endzUnknown truncation mode: )r   r   labels)chosen_	rejected_ token_type_idsT)
truncationr|   r   chosen_labelsrejected_labelsr   r   %prepare_decoder_input_ids_from_labels)r   rejected_decoder_input_idschosen_decoder_input_ids
_input_ids_attention_maskr   )rX   rg   rh   ri   typer9   itemsr   r   minsumzipabsr$   bos_token_idr%   eos_token_idmaxr|   r   r   rW   r   rq   r   rl   tensorr   endswithr   )r   featurer4   batchr   r   r   prompt_tokenskvchosen_tokensrejected_tokensprompt_len_input_idschosen_prompt_len_input_idsrejected_prompt_len_input_idsabnum_diff_tokensnum_diff_lenlonger_response_lengthanswer_tokenschosen_sequence_tokensrejected_sequence_tokenstokstype_keytokens	pad_values                              rQ   r   zORPOTrainer.tokenize_row  s    "":&&& fc* #CDL>!RSS 11&U1SM:G:M:M:OP$!Qwqc]A-PMPfc* #CDL>!RSS 77GMh, #Ed8nEU!VWW"99&(KO $'}5G'H#I *-m<N.O*P',/@R0S,T)#&'BDa#b %++- <1#$%:&:#;a <
 "$'6H(I?[mKn$opDAqapO :=ZZ[L"lQ&6 =  =T%%22$+-=9M=/ .E%%22M?.*M? &)];-G)H#o^iNjJk%l" #0-!P 	]}%789<RRUYUdUdd++|;!N ZA/<Q/?@X$BXBX/YM!,Z--;!N [A/<Q/?AWAW@W@Y/ZM!,[ )+DTEYEYDZ)[\\	] #0!A h}%789<RRUYUdUdd< h+8+;<fdooPTPfPf>f+ga(hh Ml&GH=71#/-2BBB&" & Qp(KL?WQC=1OA4FFF($ ( 0Fk/RST/U"8,''ZM"456Z7"8,-Us=AS3T/UV 2J+1VWX1Y$X.''^O$678^9$X./Y_EW5X1YZ
 25! eg	54
 )-

 5$Hf#33 .4EQCz*+55 !114D4N4Ncg 2 M #33Td6P6Pei 4 O !114D4J4J_c 2 M &3;%?E/"'6{'CE#$(5k(BE$%-:;K-LE)* WU4[%\6;6a6a <<.?(@A 7b 723 5:4_4_ <<o(>? 5` 501 "# Vq=D$;$; $ 7 7IZZ- $ 2 2IZZ 12 !I 8ykT__s5QR8}5T&UUaV C Q0 q\&(s   ,U!U
:UUr   rX   rW   r   devicec           	         i }|r-t        | d   j                  d   | d   j                  d         }n,t        | d   j                  d   | d   j                  d         }| D ]  }|j                  d      st        | |   t        j
                        s3d|v s|r|}n'|j                  d      r|}n|j                  d	      rd
}|j                  dd      }	t        | |   |      ||	<    | D ]  }|j                  d      st        | |   t        j
                        s3d|v s|r|}n'|j                  d      r|}n|j                  d	      rd
}|j                  dd      }	t	        j                  ||	   t        | |   |      fd
      j                  |      ||	<    |rP| d   j                  dd      j                  |      |d<   | d   j                  dd      j                  |      |d<   |S )a  Concatenate the chosen and rejected inputs into a single tensor.

        Args:
            batch:
                A batch of data. Must contain the keys 'chosen_input_ids' and 'rejected_input_ids', which are tensors
                of shape (batch_size, sequence_length).
            is_encoder_decoder:
                Whether the model is an encoder-decoder model.
            label_pad_token_id:
                The label pad token id.
            padding_value:
                The padding value to use for the concatenated inputs_ids.
            device:
                The device for the concatenated inputs.

        Returns:
            A dictionary containing the concatenated inputs under the key 'concatenated_input_ids'.
        r   r!   r   chosen_input_idsrejected_input_idsr   r   r   r   r   concatenated)r   r   dimr   r   r   concatenated_input_idsr   concatenated_attention_mask)r   shape
startswithrg   rl   Tensorr   replacer*   cattorepeat)
r   rX   rW   r   r   concatenated_batchr|   r   r   concatenated_keys
             rQ   concatenated_inputszORPOTrainer.concatenated_inputs1  s   4  U?399!<eDU>V>\>\]^>_`JU#56<<Q?G[A\AbAbcdAefJ 		pA||H%*U1Xu||*Lq=$6 2IZZ- -IZZ 12 !I#$99X~#F 7DU1Xzen7o"#34		p  	$A||J'JuQx,Nq=$6 2IZZ- -IZZ 12 !I#$99Z#H 7<yy*+;<%eAh
iP 8 "F"# ##34	$" ;@AS;T;[;[\]_`;a;d;dlr;d;s78-.55a;>>f>M <= "!rS   policy_chosen_logpspolicy_rejected_logpsc                 V   ||z
  t        j                  t        j                  |             t        j                  t        j                  |             z
  z
  }t        j                  |      }| j
                  |z  }| j
                  |j                  | j                  j                        j                         z  }| j
                  |j                  | j                  j                        j                         z  }|||t        j                  |      t        j                  |      fS )an  Compute ORPO's odds ratio (OR) loss for a batch of policy and reference model log probabilities.

        Args:
            policy_chosen_logps:
                Log probabilities of the policy model for the chosen responses. Shape: (batch_size,)
            policy_rejected_logps:
                Log probabilities of the policy model for the rejected responses. Shape: (batch_size,)

        Returns:
            A tuple of three tensors: (losses, chosen_rewards, rejected_rewards). The losses tensor contains the ORPO
            loss for each example in the batch. The chosen_rewards and rejected_rewards tensors contain the rewards for
            the chosen and rejected responses, respectively. The log odds ratio of the chosen responses over the
            rejected responses ratio for logging purposes. The `log(sigmoid(log_odds_chosen))` for logging purposes.
        )rl   log1pexpF
logsigmoidr   r   re   r   detachmean)r   r   r   log_oddsratiolosseschosen_rewardsrejected_rewardss           rQ   odds_ratio_losszORPOTrainer.odds_ratio_lossu  s    * (*??KK#67785;;		RgHhGh;ii
 X&U"&9&<&<T=M=M=T=T&U%]%]%__99(=(@(@AQAQAXAX(Y'a'a'cc~'7E9JEJJW_L```rS   logitsr   average_log_probc                 p   | j                   dd |j                   k7  rt        d      |s'|ddddf   j                         }| ddddddf   } ||k7  }t        j                  ||k(  d|      }t        | |      }|r&||z  j                  d      |j                  d      z  S ||z  j                  d      S )a  Compute the log probabilities of the given labels under the given logits.

        Args:
            logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size)
            labels:
                Labels for which to compute the log probabilities. Label tokens with a value of label_pad_token_id are
                ignored. Shape: (batch_size, sequence_length)
            average_log_prob:
                If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the
                log probabilities of the (non-masked) tokens.
            label_pad_token_id: The label pad token id.
            is_encoder_decoder: Whether the model is an encoder-decoder model.

        Returns:
            A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the
            given logits.
        NzKLogits (batch and sequence length dim) and labels must have the same shape.r!   r   )r   ri   clonerl   wherer,   r   )r  r   r  rW   rX   	loss_maskper_token_logpss          rQ   get_batch_logpszORPOTrainer.get_batch_logps  s    2 <<,jkk!AqrE]((*FAssAI&F00	 V'991fE/?#i/44R89==;LLL#i/44R88rS   c                 \     j                  | j                   j                   j                   j                  j
                        }|d   j                  d   } j                  rd j                  |d         ini } j                  rd|d<    ||d   f|d	   d
d|}|j                  } fd} j                  r|d   j                         }	n<|d   j                         }	|d	   }
t        j                  |
dk(  |	 j                        }	 ||d| |	d|       } j                  ||d   d j                   j                        }|d| }||d } j                  s|d|ddddf   }||dddddf   }n
|d| }||d } j                  r||||||j                  fS |||||fS )zRun the given model on the given batch of inputs, concatenating the chosen and rejected inputs together.

        We do this to avoid doing two forward passes, because it's faster for FSDP.
        )rX   rW   r   r   r   r   decoder_input_idsconcatenated_labelsTrY   r   r   F)r   	use_cachec                 R   j                   s1| dd dd d f   j                         } |ddd f   j                         }t        j                         }| j	                  d| j
                  d         } |j	                  d      }|j                  | j                        } || |      }|S )N.r  r!   )rX   
contiguousnnCrossEntropyLossviewr   r   r   )r  r   loss_fctlossr   s       rQ   cross_entropy_lossz<ORPOTrainer.concatenated_forward.<locals>.cross_entropy_loss  s    **SbS!,779QR335**,H[[V\\"%56F[[_FYYv}}-FFF+DKrS   r!   N)r  rX   rW   r  )r   rX   rW   r   re   r   r   _shift_rightr   r  r	  rl   r
  r  aux_loss)r   r4   r   r   
len_chosenmodel_kwargsoutputs
all_logitsr  r   r   chosen_nll_loss	all_logpschosen_logpsrejected_logpschosen_logitsrejected_logitss   `                rQ   concatenated_forwardz ORPOTrainer.concatenated_forward  s1    "55#66#66,,##** 6 
 ?+11!4
 && $T%6%67IJ_7`%a  	   37L/078
-.KL
 	
 ^^
	 ""'(=>DDFF'(@AGGIF/0MNN[[1!4fd>U>UVF,Z-Df[jFYZ((45!#66#66 ) 
	 !*-":;/&&&{
{CRC':;M(crc1)<=O&{
3M(5O   .-Racjcscsttnm_o^^rS   
train_eval)trainevalc                    i }| j                  ||      }|dd \  }}}}	}
| j                  r|d   }| j                  ||      \  }}}}}|
|j                         z
  }||kD  j	                         }|dk(  rdnd}| j
                  j                  |      j                         || d<   | j
                  j                  |      j                         || d<   | j
                  j                  |      j                         || d<   | j
                  j                  ||z
        j                         || d	<   | j
                  j                  |      j                         j                         || d
<   | j
                  j                  |      j                         j                         || d<   | j
                  j                  |	j                         j                               j                         || d<   | j
                  j                  |j                         j                               j                         || d<   | j
                  j                  |
      j                         j                         || d<   | j
                  j                  |      j                         j                         || d<   | j
                  j                  |      j                         j                         || d<   t               rt        j                          |j                         D ]  \  }}|j                         ||<    | j                  r|| j                  z  z  }||fS )zXCompute the ORPO loss and other metrics for the given batch of inputs for train or test.N   r)  eval_r   zrewards/chosenzrewards/rejectedzrewards/accuracieszrewards/marginszlogps/rejectedzlogps/chosenzlogits/rejectedzlogits/chosennll_losslog_odds_ratiolog_odds_chosen)r&  r   r  r   floatre   gather_for_metricsr   r   xm	mark_stepr   itemr   )r   r4   r   r'  metricsforward_outputr   r   policy_chosen_logitspolicy_rejected_logitspolicy_nll_lossr  r  r  r  r.  r/  r  reward_accuraciesprefixr   r   s                         rQ   get_batch_loss_metricsz"ORPOTrainer.get_batch_loss_metrics  sP    225%@ 2A	
! "  %a(HTXThTh!6U
Q 0./ .+.>>EEG&&0b-1-=-=-P-PQ_-`-e-e-g6(.)*/3/?/?/R/RSc/d/i/i/k6(*+,151A1A1T1TUf1g1l1l1n6(,-..2.>.>.Q.Q--/

$& 	6(/*+ .2-=-=-P-PQf-g-n-n-p-u-u-w6(.)*+/+;+;+N+NOb+c+j+j+l+q+q+s6(,'(.2.>.>.Q.Q"))+002/

$& 	6(/*+ -1,<,<,O,O '')..0-

$& 	6(-() (,'7'7'J'J?'['b'b'd'i'i'k6((#$-1-=-=-P-PQ_-`-g-g-i-n-n-p6(.)*.2.>.>.Q.QRa.b.i.i.k.p.p.r6(/*+!#LLNMMO 	"DAqGAJ	"  D&&11DW}rS   inputsc                 \   | j                   r)t        | j                  j                  j                        n	t               }|5  | j                  ||d      \  }}d d d        j                  | j                  j                        }| j                  d       |r||fS |S # 1 sw Y   IxY w)Nr(  r'  )
ro   r   re   r   r   r   r<  r   r5   store_metrics)r   r4   r=  return_outputsnum_items_in_batchcompute_loss_context_managerr  r5  s           rQ   compute_losszORPOTrainer.compute_lossH  s     7;6X6XHT%%,,112^i^k 	% * 	[ 77vRY7ZMD'	[ wwtyy''( 	7w7'?"	[ 	[s   B""B+c                    | j                   r)t        | j                  j                  j                        n	t               }|5  |j                  |d   |d   | j                  d| j                  j                        }ddd       t        | j                  | j                  j                        }| j                  j                  |d      }|S # 1 sw Y   SxY w)zRGenerate samples from the model and reference model for the given batch of inputs.r   r   T)r   r   r|   	do_samplerV   N)skip_special_tokens)ro   r   re   r   r   r   generater|   r9   rV   r*   batch_decode)r   r4   r   generate_context_managerpolicy_outputpolicy_output_decodeds         rQ   generate_from_modelzORPOTrainer.generate_from_model`  s     7;6X6XHT%%,,112^i^k 	! & 	!NN 23$%<=??!22?? + M	 &mT__dF[F[FhFhi $ 5 5 B B=fj B k$$	 	s   ;CCprediction_loss_onlyignore_keysc                 l   | j                   st        j                  d       |&t        |d      rt	        |j
                  dg       }ng }| j                  r)t        | j                  j                  j                        n	t               }t        j                         5  |5  | j                  ||d      \  }}d d d        d d d        | j                  d       |rj!                         d d fS |d   |d   d}|j#                         D 	
cg c]  \  }	}
|	|vs|
 }}	}
t        j$                  || j                  j                  	      }t        j&                  |j(                  d
   | j                  j                  	      }j!                         ||fS # 1 sw Y   xY w# 1 sw Y   xY wc c}
}	w )Na!  prediction_step is only implemented for DPODataCollatorWithPadding, and you passed a datacollator that is different than DPODataCollatorWithPadding - you might see unexpected behavior. Alternatively, you can implement your own prediction_step method if you are using a custom data collatorrz   keys_to_ignore_at_inferencer)  r?  eval_logits/choseneval_logits/rejected)rR  rS  r   r   )r   r}   r~   rq   rk   rz   ro   r   re   r   r   r   rl   no_gradr<  r@  r   r   r   zerosr   )r   r4   r=  rN  rO  prediction_context_managerr  r5  logits_dictr   r   r  r   s                rQ   prediction_stepzORPOTrainer.prediction_stepw  s    ))MM{ uh'%ell4QSUV  7;6X6XHT%%,,112^i^k 	# ]]_ 	Z8 	Z 77vRX7YMD'	Z 	Z 	7v6KKM4.. #**>"?$+,B$C
 !, 1 1 3L1q7K!LLfT-=-=-D-DEV\\!_T5E5E5L5LMvv..%	Z 	Z 	Z 	Z Ms0   F$ F8F$F0F0F!	F$$F-r5  c                 v    |j                         D ]&  \  }}| j                  |   |   j                  |       ( y r\   )r   r   append)r   r5  r'  keyvalues        rQ   r@  zORPOTrainer.store_metrics  s;    !--/ 	@JC  ,S188?	@rS   
dataloaderdescriptionmetric_key_prefixc                    | j                   rQt        |j                        }t        j                  t        |      | j                  j                        }|j                  j                  |      }| j                  |      }	| j                  |	      }	| j                  | j                  |	      }
t        j                  ddgt        |	d   |
      D cg c]  \  }}||t        |      d g c}}      }d| j                  j                   v r+t#        j$                  dt#        j&                  |	      i       d
| j                  j                   v rt)        d|       t*        | Y  |||||      }|S c c}}w )z
        Overriding built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared by
        `Trainer.evaluate()` and `Trainer.predict()`.

        Works both with or without labels.
        )r   PromptPolicyr   N)columnsdatawandbgame_log)rd  comet_mlzgame_log.csv)nametable)ry   r   datasetrandomsampleranger5   eval_batch_sizeselectr6   _prepare_inputsrM  r4   pd	DataFramer   	report_tore  logTabler)   r   evaluation_loop)r   r]  r^  rN  rO  r_  num_samplesrandom_indicesrandom_batch_datasetrandom_batchrL  r   polri  initial_outputr   s                  rQ   rv  zORPOTrainer.evaluation_loop  sR     $$j001K#]]5+=AZAZ[N $.#5#5#<#<^#L --.BCL//=L$($<$<TZZ$V!LL!8,DGU]H^`uDv5@VSVSV/0E $))---		:u{{'>?@TYY000-' 0%9;HY
 %s   E5logs
start_timec                     d|v rdnd}| j                   |   j                         D ]9  \  }}t        j                  |      j	                         j                         ||<   ; | j                   |= t        |   ||      S )a1  
        Log `logs` on the various objects watching training, including stored metrics.

        Args:
            logs (`dict[str, float]`):
                The values to log.
            start_time (`float` or `None`, *optional*, defaults to `None`):
                Start time of the training.
        r  r(  r)  )r   r   rl   r   r   r4  r   rt  )r   r}  r~  r'  r[  r5  r   s         rQ   rt  zORPOTrainer.log  s|     !'$WF
 00<BBD 	<LCW-22499;DI	<  ,w{4,,rS   c                    | j                   t        d      t        |      rQt        j                  |j
                  d d dz   | j                         }t        j                  ||dd df   gd      }nH|j                  |j
                        }|dd df   j                         |ddd f<   | j                   |d<   | j                  t        d      |j                  |d	k(  | j                         |S )
Nz]model.config.decoder_start_token_id has to be defined. It is usually set to the pad_token_id.r  )r!   .r   r!   ).r   z,model.config.pad_token_id has to be defined.)r{   ri   r   rl   fullr   r   	new_zerosr	  rV   masked_fill_)r   r   shifted_input_idss      rQ   r  zORPOTrainer._shift_right  s    &&.o 
 Y' %

9??3B+?$+FHcHc d %		+<iSbS>Q*RXZ [ ) 3 3IOO D)238)<)B)B)Dc12g&(,(C(Cf%$KLL&&'8D'@$BSBST  rS   c                    | j                   j                  *t        | j                   j                        j                  }n(| j                   j                  j                  d      d   }| j                  |       t        | !  ||       y )N/r  )
model_name)	r5   hub_model_idr   
output_dirrh  splitcreate_model_cardr   _save_checkpoint)r   r4   trialr  r   s       rQ   r  zORPOTrainer._save_checkpoint  sl    99!!)dii22388J//55c:2>J*5 .rS   r  dataset_nametagsc                    | j                         syt        | j                  j                  d      r^t        j
                  j                  | j                  j                  j                        s!| j                  j                  j                  }nd}|t               }nt        |t              r|h}nt        |      }t        | j                  j                  d      r|j                  d       |j                  | j                         t        j                  d      }t!        ||| j"                  ||t%               r.t&        j(                  t&        j(                  j+                         ndt-               d|dd	      }|j/                  t        j
                  j1                  | j2                  j4                  d
             y)a  
        Creates a draft of a model card using the information available to the `Trainer`.

        Args:
            model_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the model.
            dataset_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the dataset used for training.
            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                Tags to be associated with the model card.
        N_name_or_pathunsloth_versionunslotha          @article{hong2024orpo,
            title        = {{ORPO: Monolithic Preference Optimization without Reference Model}},
            author       = {Jiwoo Hong and Noah Lee and James Thorne},
            year         = 2024,
            eprint       = {arXiv:2403.07691}
        }ORPOz@ORPO: Monolithic Preference Optimization without Reference Modelz
2403.07691)
base_modelr  r  r  r  	wandb_url	comet_urltrainer_nametrainer_citationpaper_titlepaper_idz	README.md)is_world_process_zerorq   r4   rz   ospathisdirr  setrg   rh   addupdater   textwrapdedentr'   r  r   re  runget_urlr(   savejoinr5   r  )r   r  r  r  r  citation
model_cards          rQ   r  zORPOTrainer.create_model_card	  sG   " ))+4::$$o6rww}}TZZM^M^MlMl?m**88JJ <5Dc"6Dt9D4::$$&78HHYDOO$?? $  )!!**%-?-AeiiF[eii'')ae.0%Z!

 	TYY%9%9;GHrS   )NNNNNNNN)NNNNNr\   )Fr  r   N)Fr  F)r(  )FN)NNr)  )NNN):__name__
__module____qualname____doc__r   r	   r
   r   r  Modulerh   r"   r   r   r   r   r   r   r   r   r]   r   tuplerl   optim	Optimizerlr_schedulerLambdaLRr   r   r   r   r   staticmethod
LongTensorboolintr   r   FloatTensorr  r  r&  r   r<  r   rD  rM  rX  r0  r@  r   rv  rt  r  r  r  __classcell__)r   s   @rQ   r1   r1   N   s   #J J CG%)04+/EI >B59Vbhl&*FJmoryy#=>?m z"m  -	m
  (m uWd3<.@%@ABm #)+=?UWeef
m Xb/&9:;m D12m %++//1I1I1R1RRSm (0%,,9UW\WcWc9c0d'em d^m "(N+;T+A"BCm^/
bY8E/299:T4U+V Ybf Yv  $)"&)-A"CtU%5%55667A" A"  A" 	A"
 &A" 
c5###	$A" A"Fa"..a  %00a 
u  %"3"3U5F5FHYHY[`[l[ll	m	a@  "'"&#((9!!(9  (9 (9  	(9
 !(9 
		(9 (9TO_YYO_'+CtU=M=M7M1N,N'OO_	u  %"3"3U5F5FHYHYY	ZO_j 07	5 CtU%5%556675 O,	5v _bii/0 S%c 1223 
u||U5<<c5<<6G1H#HII	J0%S%:J:J5J0K %PS %8 ,0(/_bii/0(/ S%c 1223(/ #	(/
 d3i((/T@T#u*%5 @7?C[ @jn @ 04+/!'00 0 'tn	0
 d3i(0 0 
0d-S%Z( -huo -QU -$!0/ %)&*,0	<ISM<I sm<I CcD()	<IrS   r1   )Orr   r  rk  r  r}   collectionsr   
contextlibr   pathlibr   typingr   r   r   r	   r
   numpyr   pandasrq  rl   torch.nnr  torch.nn.functional
functionalr   
accelerater   datasetsr   r   torch.utils.datar   transformersr   r   r   r   r   r   r   r   r   r   r   transformers.trainer_callbackr   transformers.trainer_utilsr   transformers.utilsr   r   
data_utilsr   r    orpo_configr"   utilsr#   r$   r%   r&   r'   r(   r)   r*   r+   r,   peftr-   r.   r/   re  torch_xla.core.xla_modelcore	xla_modelr2  r1   r^   rS   rQ   <module>r     s     	    # "  : :       #   '    : 5 C H #   OO ))wI' wIrS   