
import inspect
import os
import warnings
from collections import defaultdict
from dataclasses import FrozenInstanceError, replace
from pathlib import Path
from typing import Any, Callable, Optional, Union

import pandas as pd
import torch
import torch.nn as nn
from accelerate import PartialState
from accelerate.utils import gather_object
from datasets import Dataset
from transformers import (
    BaseImageProcessor,
    DataCollator,
    FeatureExtractionMixin,
    PreTrainedModel,
    PreTrainedTokenizerBase,
    ProcessorMixin,
    Trainer,
    is_wandb_available,
)
from transformers.trainer_callback import TrainerCallback
from transformers.trainer_pt_utils import nested_detach
from transformers.trainer_utils import EvalPrediction
from transformers.utils import is_peft_available, is_rich_available

from ..data_utils import maybe_apply_chat_template
from .reward_config import RewardConfig
from .utils import (
    RewardDataCollatorWithPadding,
    compute_accuracy,
    decode_and_strip_padding,
    disable_dropout_in_model,
    generate_model_card,
    get_comet_experiment_url,
    log_table_to_comet_experiment,
    print_rich_table,
)


if is_peft_available():
    from peft import PeftModel, get_peft_model, prepare_model_for_kbit_training

if is_wandb_available():
    import wandb


def _tokenize(batch: dict[str, list[Any]], tokenizer: PreTrainedTokenizerBase) -> dict[str, list[Any]]:
    """Tokenize a batch from a reward modelling dataset."""
    new_examples = {
        "input_ids_chosen": [],
        "attention_mask_chosen": [],
        "input_ids_rejected": [],
        "attention_mask_rejected": [],
    }
    for chosen, rejected in zip(batch["chosen"], batch["rejected"]):
        tokenized_chosen = tokenizer(chosen)
        tokenized_rejected = tokenizer(rejected)
        new_examples["input_ids_chosen"].append(tokenized_chosen["input_ids"])
        new_examples["attention_mask_chosen"].append(tokenized_chosen["attention_mask"])
        new_examples["input_ids_rejected"].append(tokenized_rejected["input_ids"])
        new_examples["attention_mask_rejected"].append(tokenized_rejected["attention_mask"])

    return new_examples


class RewardTrainer(Trainer):
    _tag_names = ["trl", "reward-trainer"]

    def __init__(
        self,
        model: Optional[Union[PreTrainedModel, nn.Module]] = None,
        args: Optional[RewardConfig] = None,
        data_collator: Optional[DataCollator] = None,
        train_dataset: Optional[Dataset] = None,
        eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None,
        processing_class: Optional[
            Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin]
        ] = None,
        model_init: Optional[Callable[[], PreTrainedModel]] = None,
        compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None,
        callbacks: Optional[list[TrainerCallback]] = None,
        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
        preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
        peft_config: Optional[dict] = None,
    ):
        """
        Initialize RewardTrainer.

        Args:
            model (`transformers.PreTrainedModel`):
                The model to train, preferably an `AutoModelForSequenceClassification`.
            args (`RewardConfig`):
                The arguments to use for training.
            data_collator (`transformers.DataCollator`):
                The data collator to use for training. If None is specified, the default data collator
                (`RewardDataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of
                the sequences in the batch, given a dataset of paired sequences.
            train_dataset (`datasets.Dataset`):
                The dataset to use for training.
            eval_dataset (`datasets.Dataset`):
                The dataset to use for evaluation.
            processing_class (`PreTrainedTokenizerBase` or `BaseImageProcessor` or `FeatureExtractionMixin` or `ProcessorMixin`, *optional*):
                Processing class used to process the data. If provided, will be used to automatically process the
                inputs for the model, and it will be saved along the model to make it easier to rerun an interrupted
                training or reuse the fine-tuned model.
            model_init (`Callable[[], transformers.PreTrainedModel]`):
                The model initializer to use for training. If None is specified, the default model initializer will be
                used.
            compute_metrics (`Callable[[transformers.EvalPrediction], dict]`, *optional*, defaults to `compute_accuracy`):
                The metrics to use for evaluation. If no metrics are specified, the default metric (`compute_accuracy`)
                will be used.
            callbacks (`list[transformers.TrainerCallback]`):
                The callbacks to use for training.
            optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
                The optimizer and scheduler to use for training.
            preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
                The function to use to preprocess the logits before computing the metrics.
            peft_config (`dict`, defaults to `None`):
                The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped
                in a PEFT model.
        """
        if not is_peft_available() and peft_config is not None:
            raise ValueError(
                "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, "
                "please install it to use the PEFT models"
            )
        elif is_peft_available() and peft_config is not None:
            if not isinstance(model, PeftModel):
                if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_quantized", False):
                    _supports_gc_kwargs = "gradient_checkpointing_kwargs" in list(
                        inspect.signature(prepare_model_for_kbit_training).parameters
                    )

                    prepare_model_kwargs = {"use_gradient_checkpointing": args.gradient_checkpointing}

                    if not _supports_gc_kwargs and args.gradient_checkpointing_kwargs is not None:
                        warnings.warn(
                            "You passed `gradient_checkpointing_kwargs` in the trainer's kwargs, but your peft "
                            "version does not support it. please update to the latest version of peft to use "
                            "`gradient_checkpointing_kwargs`.",
                            UserWarning,
                        )
                    elif _supports_gc_kwargs and args.gradient_checkpointing_kwargs is not None:
                        prepare_model_kwargs["gradient_checkpointing_kwargs"] = args.gradient_checkpointing_kwargs

                    model = prepare_model_for_kbit_training(model, **prepare_model_kwargs)

                model = get_peft_model(model, peft_config)

        # Disable dropout in the model
        if args.disable_dropout:
            disable_dropout_in_model(model)

        if compute_metrics is None:
            compute_metrics = compute_accuracy

        if data_collator is None:
            if processing_class is None:
                raise ValueError(
                    "A processing_class must be specified when using the default RewardDataCollatorWithPadding"
                )

            max_length = args.max_length

            data_collator = RewardDataCollatorWithPadding(processing_class)

            if args.remove_unused_columns:
                try:  # `args` may be a frozen dataclass, in which case fall back to `dataclasses.replace`
                    args.remove_unused_columns = False
                except FrozenInstanceError:
                    args = replace(args, remove_unused_columns=False)
                warnings.warn(
                    "When using RewardDataCollatorWithPadding, you should set `remove_unused_columns=False` in your "
                    "RewardConfig we have set it for you, but you should do it yourself in the future.",
                    UserWarning,
                )

            self.use_reward_data_collator = True
        else:
            self.use_reward_data_collator = False

        # The batches contain `input_ids_chosen`/`input_ids_rejected` rather than a single `input_ids`
        # column, so the base `Trainer` cannot estimate the number of tokens; pre-set the flag to
        # silence the corresponding warning.
        self.warnings_issued["estimate_tokens"] = True

        if "input_ids_chosen" not in train_dataset.column_names:
            with PartialState().main_process_first():
                fn_kwargs = {"tokenizer": processing_class}
                train_dataset = train_dataset.map(
                    maybe_apply_chat_template, fn_kwargs={"tokenizer": processing_class}
                )
                train_dataset = train_dataset.map(
                    _tokenize,
                    batched=True,
                    fn_kwargs=fn_kwargs,
                    num_proc=args.dataset_num_proc,
                )
                # This filter also removes examples longer than `max_length`
                train_dataset = train_dataset.filter(
                    lambda x: len(x["input_ids_chosen"]) <= max_length and len(x["input_ids_rejected"]) <= max_length,
                    num_proc=args.dataset_num_proc,
                )
                if eval_dataset is not None:
                    eval_dataset = eval_dataset.map(
                        maybe_apply_chat_template, fn_kwargs={"tokenizer": processing_class}
                    )
                    eval_dataset = eval_dataset.map(
                        _tokenize,
                        fn_kwargs=fn_kwargs,
                        batched=True,
                        num_proc=args.dataset_num_proc,
                    )
                    eval_dataset = eval_dataset.filter(
                        lambda x: len(x["input_ids_chosen"]) <= max_length
                        and len(x["input_ids_rejected"]) <= max_length,
                        num_proc=args.dataset_num_proc,
                    )

        super().__init__(
            model=model,
            args=args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            processing_class=processing_class,
            model_init=model_init,
            compute_metrics=compute_metrics,
            callbacks=callbacks,
            optimizers=optimizers,
            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
        )

        # Add tags for models that have been loaded with the correct transformers version
        if hasattr(self.model, "add_model_tags"):
            self.model.add_model_tags(self._tag_names)

    def compute_loss(
        self,
        model: Union[PreTrainedModel, nn.Module],
        inputs: dict[str, Union[torch.Tensor, Any]],
        return_outputs=False,
        num_items_in_batch=None,
    ) -> Union[torch.Tensor, tuple[torch.Tensor, dict[str, torch.Tensor]]]:
        rewards_chosen = model(
            input_ids=inputs["input_ids_chosen"],
            attention_mask=inputs["attention_mask_chosen"],
            return_dict=True,
        )["logits"]
        rewards_rejected = model(
            input_ids=inputs["input_ids_rejected"],
            attention_mask=inputs["attention_mask_rejected"],
            return_dict=True,
        )["logits"]
        # Calculate loss, optionally modulated by a per-example margin
        if "margin" in inputs:
            loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected - inputs["margin"]).mean()
        else:
            loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected).mean()

        if self.args.center_rewards_coefficient is not None:
            loss += self.args.center_rewards_coefficient * torch.mean((rewards_chosen + rewards_rejected) ** 2)

        if return_outputs:
            return loss, {
                "rewards_chosen": rewards_chosen,
                "rewards_rejected": rewards_rejected,
            }
        return loss

    def prediction_step(
        self,
        model: Union[PreTrainedModel, nn.Module],
        inputs: dict[str, Union[torch.Tensor, Any]],
        prediction_loss_only: bool,
        ignore_keys: Optional[list[str]] = None,
    ) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
        inputs = self._prepare_inputs(inputs)
        if ignore_keys is None:
            if hasattr(self.model, "config"):
                ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", [])
            else:
                ignore_keys = []

        with torch.no_grad():
            loss, logits_dict = self.compute_loss(model, inputs, return_outputs=True)

        if prediction_loss_only:
            return (loss, None, None)

        loss = loss.detach()
        logits = tuple(v for k, v in logits_dict.items() if k not in ignore_keys)
        logits = nested_detach(logits)
        # Stack chosen against rejected, average over the sequence dimension and softmax over the
        # pair to obtain preference probabilities
        logits = torch.stack(logits).mean(dim=2).softmax(dim=0).T

        labels = torch.zeros(logits.shape[0])
        labels = self._prepare_inputs(labels)

        return loss, logits, labels

    def evaluate(self, *args, **kwargs):
        num_print_samples = kwargs.pop("num_print_samples", 4)
        self.visualize_samples(num_print_samples)
        return super().evaluate(*args, **kwargs)

    def visualize_samples(self, num_print_samples: int):
        """
        Visualize the reward model logits prediction

        Args:
            num_print_samples (`int`, defaults to `4`):
                The number of samples to print. Set to `-1` to print all samples.
        """
        eval_dataloader = self.get_eval_dataloader()
        table = defaultdict(list)
        for _, inputs in enumerate(eval_dataloader):
            _, logits, _ = self.prediction_step(self.model, inputs, prediction_loss_only=False)
            chosen_text = decode_and_strip_padding(inputs["input_ids_chosen"], self.processing_class)
            rejected_text = decode_and_strip_padding(inputs["input_ids_rejected"], self.processing_class)
            table["chosen_text"].extend(gather_object(chosen_text))
            table["rejected_text"].extend(gather_object(rejected_text))
            table["logits"].extend(
                gather_object([[round(inner_item, 4) for inner_item in item] for item in logits.tolist()])
            )
            if num_print_samples >= 0 and len(table["chosen_text"]) >= num_print_samples:
                break
        df = pd.DataFrame(table)
        if self.accelerator.process_index == 0:
            if is_rich_available():
                print_rich_table(df[:num_print_samples])
            if "wandb" in self.args.report_to:
                import wandb

                if wandb.run is not None:
                    wandb.log({"completions": wandb.Table(dataframe=df)})

            if "comet_ml" in self.args.report_to:
                log_table_to_comet_experiment(
                    name="completions.csv",
                    table=df,
                )

    # Write a model card next to each checkpoint before delegating to the base implementation
    def _save_checkpoint(self, model, trial):
        if self.args.hub_model_id is None:
            model_name = Path(self.args.output_dir).name
        else:
            model_name = self.args.hub_model_id.split("/")[-1]
        self.create_model_card(model_name=model_name)
        super()._save_checkpoint(model, trial)

    def create_model_card(
        self,
        model_name: Optional[str] = None,
        dataset_name: Optional[str] = None,
        tags: Union[str, list[str], None] = None,
    ):
        """
        Creates a draft of a model card using the information available to the `Trainer`.

        Args:
            model_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the model.
            dataset_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the dataset used for training.
            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                Tags to be associated with the model card.
        """
        if not self.is_world_process_zero():
            return

        if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path):
            base_model = self.model.config._name_or_path
        else:
            base_model = None

        if tags is None:
            tags = set()
        elif isinstance(tags, str):
            tags = {tags}
        else:
            tags = set(tags)

        if hasattr(self.model.config, "unsloth_version"):
            tags.add("unsloth")

        tags.update(self._tag_names)

        model_card = generate_model_card(
            base_model=base_model,
            model_name=model_name,
            hub_model_id=self.hub_model_id,
            dataset_name=dataset_name,
            tags=tags,
            wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None,
            comet_url=get_comet_experiment_url(),
            trainer_name="Reward",
        )

        model_card.save(os.path.join(self.args.output_dir, "README.md"))