
    bi;                     H   d dl Z d dlZd dlZd dlmZmZmZmZ d dlZd dl	m
Z
 d dlm
c mZ d dlmZ d dlmZmZmZmZmZmZmZmZmZ d dlmZ d dlmZ d dlmZ dd	l m!Z! dd
l"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+m,Z,m-Z-  e       rd dl.m/Z/  e       rd dl0Z0 G d de'      Z1y)    N)AnyCallableOptionalUnion)Dataset)	AutoModelForCausalLMBaseImageProcessorDataCollatorFeatureExtractionMixinGenerationConfigPreTrainedModelPreTrainedTokenizerBaseProcessorMixinis_wandb_available)TrainerCallback)EvalPrediction)is_peft_available   )prepare_deepspeed)unwrap_model_for_generation   )	GKDConfig)
SFTTrainer)DataCollatorForChatMLdisable_dropout_in_modelempty_cachegenerate_model_cardget_comet_experiment_url)
PeftConfigc                       e Zd ZddgZ	 	 	 	 	 	 	 	 	 	 	 	 	 ddeeeej                  e	f      deeej                  e	f   dee
   dee   dee   d	eeeee	ef   f      d
eeeeeef      deeegef      deee      deej0                  j2                  ej0                  j4                  j6                  f   deeej8                  ej8                  gej8                  f      ded   dee   f fdZe	 dd       Zd dZ ed!d       Z!	 d!dej                  dee	eej8                  e"f   f   dee#   dej8                  f fdZ$	 	 	 d"dee	   dee	   dee	ee	   df   fdZ% xZ&S )#
GKDTrainertrlgkdNmodelteacher_modelargsdata_collatortrain_dataseteval_datasetprocessing_classcompute_metrics	callbacks
optimizerspreprocess_logits_for_metricspeft_configr   formatting_funcc                 L   d|_         t        ||j                        }t        |   ||||||||	|
|||       |j
                  i }nIt        |t              st        d      |j
                  }|d   dv r|d   nt        t        |d         |d<   t        |t              rt        j                  |fi |}|j                  rt        | j                         | j                   rt#        || j$                        | _        n"| j$                  j)                  |d      | _        |j*                  | _        |j,                  | _        |j.                  | _        |j0                  | _        t3        |j4                  |j.                  dd	|j6                  rdnd| j8                  j:                  
      | _        t?        | j                  j<                  d      rQ| j                  j<                  j@                  0| j                  j<                  j@                  | j<                  _         y y y )NF)	tokenizer
max_length)r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   zfYou passed teacher_model_init_kwargs to the GKDConfig, but your teacher_model is already instantiated.torch_dtype)autoNT)evaluation_moder   )max_new_tokenstemperature	do_sampletop_k	use_cachepad_token_ideos_token_id)!remove_unused_columnsr   r3   super__init__teacher_model_init_kwargs
isinstancestr
ValueErrorgetattrtorchr   from_pretraineddisable_dropoutr   r$   is_deepspeed_enabledr   acceleratorr%   prepare_modellmbdabetar8   seq_kdr   r7   gradient_checkpointingr*   r<   generation_confighasattrr=   )selfr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   rA   	__class__s                  R/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/trl/trainer/gkd_trainer.pyr@   zGKDTrainer.__init__>   s   & &+"-8HUYUdUde''%-+!*G#+ 	 	
 ))1(*%M3/x  )-(F(F% -];~M *-8U$=m$LM &m4 mS)0@@lRklM $TZZ0$$!2=$BRBR!SD!%!1!1!?!?_c!?!dDZZ
II	++kk!1..((#::e..;;"
 DJJ00.A

,,99E26**2N2N2[2[D""/ F B    c           	         | |z  } ||z  }t        j                  | d      }t        j                  |d      }|dk(  rt        j                  ||dd      }n|dk(  rt        j                  ||dd      }nt        j                  ||j
                        }t        j                  t        j                  |t        j                  d|z
        z   |t        j                  |      z   g      d      }	t        j                  |	|dd      }
t        j                  |	|dd      }||
z  d|z
  |z  z   }|
|d	k7  }||   }|d
k(  rW|!|j                         j                         z  S |j                         |j                  d      |j                  d      z  z  S |dk(  r|j                         S |dk(  r|j                         S |S )a  
        Compute the generalized Jensen-Shannon Divergence loss for knowledge distillation using F.kl_div. See Eq. (1)
        of https://huggingface.co/papers/2306.13649 for the definition.

        Args:
            student_logits:
                Tensor of shape (batch_size, sequence_length, vocab_size)
            teacher_logits:
                Tensor of shape (batch_size, sequence_length, vocab_size)
            labels:
                Tensor of shape (batch_size, sequence_length) with -100 for padding tokens to ignore when computing
                loss
            beta:
                Interpolation coefficient between 0 and 1 (default: 0.5)
            temperature:
                Softmax temperature (default: 1.0)
            reduction:
                Specifies the reduction to apply to the output (default: 'batchmean')

        Returns:
            loss: Scalar tensor with the generalized JSD loss
        )dimr   noneT)	reduction
log_targetr   )dtype	batchmeansummean)Flog_softmaxkl_divrF   tensorr\   	logsumexpstacklogr_   sizer`   )student_logitsteacher_logitslabelsrM   r8   rZ   student_log_probsteacher_log_probsjsdmixture_log_probs
kl_teacher
kl_studentmasks                rT   generalized_jsd_losszGKDTrainer.generalized_jsd_loss   s   8 (+5'+5 MM.bAMM.bA19((,.?6^bcCQY((,.?6^bcC <<,=,C,CDD %.1t81DDFWZ_ZcZcdhZiFijk! "35FRXeijJ"35FRXeijJ #q4x:&==C T>Dd)C #-3-?3779txxz)lSWWYRURZRZ[\R]`c`h`hij`kRkEll%779& 88:JrU   c                     ||d   |d         }| j                   j                          t        j                         5  | j                  |d   |d         }d d d        |d   j                  d   }|j
                  d d |dz
  dd d f   }j
                  d d |dz
  dd d f   }	|d   d d |d f   }
| j                  ||	|
| j                        }t                |r||fS |S # 1 sw Y   xY w)	N	input_idsattention_mask)ru   rv   promptsr   rW   rk   )ri   rj   rk   rM   )	r%   evalrF   no_gradshapelogitsrs   rM   r   )rR   r$   inputsreturn_outputsnum_items_in_batchoutputs_studentoutputs_teacherprompt_lengthsshifted_student_logitsshifted_teacher_logitsshifted_labelslosss               rT   compute_losszGKDTrainer.compute_loss   s,   [)!"23
 	!]]_ 	"00 -%&67 1 O	  	*003!0!7!7>A;MPR;RTU8U!V!0!7!7>A;MPR;RTU8U!V)!^_*<= ((11!	 ) 
 	 +9o&BdB1	 	s   C((C1c                     | j                  |d   |j                  dd       |d      }|j                  }t        j                  |      }|j                         }|d|||k(  <   d|||k(  <   |||fS )Nrw   prompt_attention_maskT)ru   rv   rP   return_dict_in_generater]   r   )generateget	sequencesrF   	ones_likeclone)r$   r|   rP   r<   generated_outputsgenerated_tokensnew_attention_mask
new_labelss           rT   generate_on_policy_outputsz%GKDTrainer.generate_on_policy_outputs   s     "NNY'!::&=tD/$(	 + 
 -66"__-=>%++-
 #59Jz\12CD/<?@!3Z??rU   r|   r~   returnc                 X   | j                   rnt        | j                  | j                        5 }| j	                  ||| j
                  | j                  j                        \  }}}ddd       |d<   |d<   |d<   t        j                         | j                  k  rdt        || j                        5 }| j	                  ||| j
                  | j                  j                        \  }}}ddd       |d<   |d<   |d<   t        	| -  |||      }|S # 1 sw Y   xY w# 1 sw Y   7xY w)aa  
        Perform a training step for the Generalized Knowledge Distillation (GKD) model.

        This method implements the on-policy learning approach described in the GKD paper. With probability
        `self.lmbda`, it generates new responses using the student model, which are then used for training instead of
        the original inputs.
        Nru   rv   rk   )rN   r   r%   rJ   r   rP   r*   r<   randomrL   r?   training_step)
rR   r$   r|   r~   unwrapped_modelnew_input_idsr   r   r   rS   s
            rT   r   zGKDTrainer.training_step  s6    ;;,T-?-?AQAQR Ve@D@_@_#VT-C-CTEZEZEgEgA=1: #0F;'9F#$)F8==?djj(,UD4D4DE @D@_@_#VT-C-CTEZEZEgEgA=1: #0F;'9F#$)F8w$UF4FG#  s   7D37D D D)
model_namedataset_nametagsc                    | j                         syt        | j                  j                  d      r^t        j
                  j                  | j                  j                  j                        s!| j                  j                  j                  }nd}|t               }nt        |t              r|h}nt        |      }t        | j                  j                  d      r|j                  d       |j                  | j                         t        j                  d      }t!        ||| j"                  ||t%               r.t&        j(                  t&        j(                  j+                         ndt-               d|dd	      }|j/                  t        j
                  j1                  | j2                  j4                  d
             y)a  
        Creates a draft of a model card using the information available to the `Trainer`.

        Args:
            model_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the model.
            dataset_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the dataset used for training.
            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                Tags to be associated with the model card.
        N_name_or_pathunsloth_versionunslothan          @inproceedings{agarwal2024on-policy,
            title        = {{On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes}},
            author       = {Rishabh Agarwal and Nino Vieillard and Yongchao Zhou and Piotr Stanczyk and Sabela Ramos Garea and Matthieu Geist and Olivier Bachem},
            year         = 2024,
            booktitle    = {The Twelfth International Conference on Learning Representations, {ICLR} 2024, Vienna, Austria, May 7-11, 2024},
            publisher    = {OpenReview.net},
            url          = {https://openreview.net/forum?id=3zKtaqxLhW},
        }GKDzPOn-Policy Distillation of Language Models: Learning from Self-Generated Mistakesz
2306.13649)
base_modelr   hub_model_idr   r   	wandb_url	comet_urltrainer_nametrainer_citationpaper_titlepaper_idz	README.md)is_world_process_zerorQ   r$   configospathisdirr   setrB   rC   addupdate
_tag_namestextwrapdedentr   r   r   wandbrunget_urlr   savejoinr&   
output_dir)rR   r   r   r   r   citation
model_cards          rT   create_model_cardzGKDTrainer.create_model_card3  sG   " ))+4::$$o6rww}}TZZM^M^MlMl?m**88JJ <5Dc"6Dt9D4::$$&78HHYDOO$?? $  )!!**%-?-AeiiF[eii'')ae.0%j!

 	TYY%9%9;GHrU   )NNNNNNNNN)NNNNN)Ng      ?g      ?r^   )FN)N)NNN)'__name__
__module____qualname__r   r   r   r   nnModulerC   r   r
   r   dictr   r	   r   r   r   r   listr   tuplerF   optim	Optimizerlr_schedulerLambdaLRTensorr@   staticmethodrs   r   r   r   intr   r   __classcell__)rS   s   @rT   r!   r!   ;   s   J CG@D$(04+/EI FJ59Vbhl.2.2!T\oryy#=>?T\ _bii<=T\ y!	T\
  -T\  (T\ uWd3<.@%@ABT\ #)+=?UWeef
T\ "(N+;T+A"BCT\ D12T\ %++//1I1I1R1RRST\ (0%,,9UW\WcWc9c0d'eT\ l+T\  "(+!T\l ZeD DL!CF @ @. rvYY(,S%c8I2J-J(Kaijman	@ %)&*,0	>ISM>I sm>I CcD()	>IrU   r!   )2r   r   r   typingr   r   r   r   rF   torch.nnr   torch.nn.functional
functionalra   datasetsr   transformersr   r	   r
   r   r   r   r   r   r   transformers.trainer_callbackr   transformers.trainer_utilsr   transformers.utilsr   modelsr   models.utilsr   
gkd_configr   sft_trainerr   utilsr   r   r   r   r   peftr   r   r!    rU   rT   <module>r      sy    
   1 1     
 
 
 : 5 0 & 6 ! #  vI vIrU   