
    bi+                        d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' d d
l(m)Z) d dl*m+Z+m,Z, d dl-m.Z.m/Z/m0Z0 d dl1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@ ddlAmBZB ddlCmDZDmEZEmFZFmGZG  e'       rd dlHZHdZI G d de$      ZJy)    N)defaultdict)Path)CallableOptionalUnion)Accelerator)	broadcastgather_object)Dataset)
DataLoader)
BaseImageProcessorDataCollatorWithPaddingFeatureExtractionMixinGenerationConfigPreTrainedTokenizerBaseProcessorMixinTrainerTrainerCallbackTrainerControlis_wandb_available)#get_reporting_integration_callbacks)DEFAULT_CALLBACKSDEFAULT_PROGRESS_CALLBACK)CallbackHandlerExportableStatePrinterCallback)is_rich_available   )unwrap_model_for_generation)OnlineTrainerStatebatch_generationdisable_dropout_in_model	exact_divfirst_true_indicesforward
get_rewardprepare_deepspeedprint_rich_tableselective_log_softmaxtruncate_response   )
RLOOConfig)empty_cachegenerate_model_cardget_comet_experiment_urllog_table_to_comet_experiment      ?c                       e Zd ZddgZ	 	 	 	 ddedeeeee	e
f      dej                  dej                  deej                  eee   gee   f   f   d	ed
ee   deeeeeef   f      deej,                  j.                  ej,                  j0                  j2                  f   deee      ddfdZdefdZdefdZd Zdde fdZ! fdZ"	 	 	 ddee   dee   deeee   df   fdZ# xZ$S )RLOOTrainertrlrlooNconfigprocessing_classpolicy
ref_policyreward_modeltrain_datasetdata_collatoreval_dataset
optimizers	callbacksreturnc                 <   ||u rt        d      || _        |}|| _        || _        |t	        | j                        }d | j                  j
                  _        d | j                  j
                  _        || _        || _	        || _
        t        |      | _        || _        || _        |	\  | _        | _        d | _        |j$                  't'        |j(                  | j                  z        |_        t+        |j,                        }|| _        |j0                  |_        |j4                  |j,                  z  |j6                  z  |_        t'        |j4                  |j2                  z        |_        t'        |j8                  |j2                  z        |_        t?        |j<                  |j6                  d      |_         t?        |j8                  |j6                  d      |_!        tE        jF                  |j$                  |j<                  z        |_$        tK        jL                  t'        tO        jN                               |jP                        }tS        |d      jU                         }|jV                   d|jX                   d| |_-        |jX                  |j\                  dz  z   | _/        |j`                  dkD  r(tc        d	|jH                  |j`                  z        | _2        t?        |j8                  |jf                  d
      | _4        |||fD ](  }tk        |tl        jn                        stq        |       * |jr                  r*|jr                  dk(  r| j                  j                  |_:        || _;        | jy                  |jH                         tz        t}        | j                  j~                        z   }|
|n||
z   | _@        t        | j                  | jv                  | j                  | j                  | j                         | _B        | j                  | j                  j                  rt        nt               t               | _H        t        | j                         | j                         | j                  j                  | j                  gz   D cg c]  }tk        |t              s| c}      | _M        d| _N        d | _O        t        | j.                  j                  dd       d u| _Q        t        | j.                  j                  dd       d u| _R        d | _S        | j                  j                  r| j                          | j                  j                  r+t        j                  | j                  j                  d       d | _Z        t        | jv                  d      r%| jv                  j                  | j                         t        | j                  | jh                  d| j                  d      | __        tK        j                  |jX                         |j                  | jv                  | j                  | j                        \  | _;        | _        | __        tK        j                  | j^                         t        | j                  |j                  | j                  d      | _c        |j                  | j                        | _c        | j                  rtk        | j                  tl        jn                        r;t        | j                  |j4                  |j                  |j                        | _	        t        | j                  |j4                  |j                  |j                        | _        | jv                  | _g        y | j                  j                  | j.                  jP                        | _        tk        | j                  tl        jn                        r5| j                  j                  | j.                  jP                        | _	        y y c c}w )Nz`policy` and `ref_policy` cannot be the same object. If you want `ref_policy` to be the same as `policy`, you must mass a copy of it, or `None` if you use peft.)gradient_accumulation_stepsz5`batch_size` must be a multiple of `num_mini_batches`z;`local_batch_size` must be a multiple of `num_mini_batches`devicer   __i r+   z/`local_batch_size` must be a multiple of rloo_keos)num_training_steps)is_local_process_zerois_world_process_zerostateful_callbacksdeepspeed_pluginfsdp_pluginT)exist_okadd_model_tags)
batch_sizeshuffle
collate_fn	drop_last)rO   rQ   rR   )i
ValueErrorargsr7   r8   r   generation_configeos_token_idpad_token_idr9   r:   r;   lentrain_dataset_lenr<   r=   	optimizerlr_scheduleroptimizer_cls_and_kwargstotal_episodesintnum_train_epochsr   rB   acceleratornum_processes
world_sizeper_device_train_batch_sizenum_mini_batcheslocal_batch_sizemicro_batch_sizerO   r#   mini_batch_sizelocal_mini_batch_sizemathceilnum_total_batchestorchtensortimerD   r	   itemexp_nameseedrun_nameprocess_index
local_seednum_sample_generationsmaxsample_generations_freqrloo_klocal_dataloader_batch_size
isinstancennModuler"   
stop_tokenstop_token_idmodelcreate_optimizer_and_schedulerr   r   	report_tor?   r   callback_handleradd_callbackdisable_tqdmr   r   r   controlr    rH   rI   r   statecurrent_floshp_search_backendgetattris_deepspeed_enabledis_fsdp_enabledhub_model_idpush_to_hubinit_hf_reposhould_saveosmakedirs
output_dirbackup_modelhasattrrN   
_tag_namesr   
dataloadermanual_seedprepareper_device_eval_batch_sizeeval_dataloaderr'   fp16bf16	deepspeedto)selfr6   r7   r8   r9   r:   r;   r<   r=   r>   r?   rT   r`   time_tensortime_intmoduledefault_callbackscbs                     S/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/trl/trainer/rloo_trainer.py__init__zRLOOTrainer.__init__L   s     [ 
 	 0  3D4I4IJM  	%%2 6:%%2$(*!$]!3*(,6))(,%
 &"%d&;&;d>T>T&T"UD!d>^>^_&%33,,t/O/OORVRgRgg 	 !$D$D$Dt$V Wd33dooEF(OOT224k 
 &/!!4#8#8:w&
" "&$//1"
 ll3tyy{#3K<N<NO[!,113==/DII;b
C))k&?&?&&HH&&*+.q$2H2HDLgLg2g+hD(+4!!4;;0a,
( z<8 	1F&")),(0	1 ??t%7!%!6!6!C!CD
++#55 	, 	
 .0STXT]T]TgTg0hh.7.?*EVYbEb /NNDJJ(=(=t~~tO`O`!
 	TYY-C-C/Ibc%''"&"<"<">"&"<"<">!22<<~M Q[\^`oQp 

 !%$+D,<,<,B,BDVX\$]ei$i!&t'7'7'='=}dS[__ 99  99  KK		,,t<  4::/0JJ%%doo6
 %77))
 	$))$6A6I6I$**VZVdVdfjfufu6v3
DNDO$//*)66))	 
  +2243G3GH$$$++RYY7$5%%t'G'GTXT]T]%! 0!A!A499diiDO "ZZDN"oo001A1A1H1HIDO$++RYY7$($5$5$8$89I9I9P9P$Q! 8q s   `-`c                     | j                   S Nr   r   s    r   get_train_dataloaderz RLOOTrainer.get_train_dataloader   s        c                     | j                   S r   )r   r   s    r   get_eval_dataloaderzRLOOTrainer.get_eval_dataloader   s    ###r   c                 >(  ^ | j                   }| j                  }| j                  }| j                  }| j                  | _        | j
                  }| j                  }| j                  }| j                  ^|j                  }^fd}	t         |	             }
t        |j                  |j                  dz   ddd      }|j                  d       t        j                         }|j                   |j"                  |j$                  f}t'        j(                  ||      }t'        j(                  ||      }t'        j(                  ||      }t'        j(                  ||      }t'        j(                  ||      }t'        j(                  ||      }|j+                          d	| j,                  _        d	| j,                  _        |j2                  |j"                  z  d
z  | j,                  _        |j6                  | j8                  z  | j,                  _        |j<                  p|j<                  dk  rFt?        j@                  | j,                  j4                  |j<                  z        | j,                  _        n|j<                  | j,                  _        |jB                  p|jB                  dk  rFt?        j@                  | j,                  j4                  |jB                  z        | j,                  _!        n|jB                  | j,                  _!        |jD                  p|jD                  dk  rFt?        j@                  | j,                  j4                  |jD                  z        | j,                  _"        n|jD                  | j,                  _"        | jF                  jI                  || j,                  | jJ                        | _%        tM        d|j2                  dz         D ]c  }| j,                  xj0                  d|jN                  z  z  c_        tQ        |
      }t'        jR                         5  |d   jU                  |      }|jW                  |jX                  d      }|jZ                  d   }g }g }g }g }g }g }t]        | j                  | j                  | j                   j^                        5 }ta        |||jb                  |jd                  |      \  }} d d d        tM        d	|jZ                  d	   |jb                        D ]  }!||!|!|jb                  z    }"|!|!|jb                  z    }#|#d d |d f   }$ |!|!|jb                  z    }%tg        |%|$      }&~%ti                tk        ||#|jd                        }'|'jl                  d d |dz
  df   }(|(|j                  dz   z  }(tg        |(|$      })~'~(ti                |$}*|jn                  !tq        |jn                  |jd                  |$      }*t'        jr                  |"|*fd      }+tu        |*|jd                  k(        dz
  },tw        |tx        jz                        rt}        ||+|jd                  |      \  }-}.}-nKt'        j~                   ||j                  |+d            t&        j                        jU                  |      }.|j                  |$       |j                  |*       |j                  |&       |j                  |)       |j                  |,       |j                  |.        t'        jr                  |d	      }t'        jr                  |d	      }t'        jr                  |d	      }t'        jr                  |d	      }t'        jr                  |d	      }t'        jr                  |d	      }~&~)~.ti                t        j                          t'        j                  ||j                  k(  d      }/|j                  "||/ xx   | j                   j                  z  cc<   t'        j                  |jZ                  d   |j                        jW                  |jZ                  d	   d      }0|0|j                  d      kD  }1t'        j                  ||1t              }t'        j                  ||1t              }||z
  }2|j                  rS||j                         z
  |j                         dz   z  }t'        j                  ||j                   |j                        }|j                  r|j                   |2z  }3|1j                  d      dz
  |1j                         j                         j                  dd      z
  }4t'        j                  |2      }5|j                  dd      jU                  |2j                        }6|5j                  d|4|6       |3j                  d      }7|5|3z   }8|8j                  d      }9n&|2j                  d      }:|j                   |:z  }7|7|z   }9|9j                  |jX                  d      }9|9j                  d	      |9z
  |jX                  dz
  z  };|9|;z
  }<|<j                         }<|j                  r'|<|<j                         z
  |<j                         dz   z  }<ti                d d d        tM        |j                         D ]>  }=t        j                  j                  |j                        }>d	}?tM        d	|j                  |j                        D ]  }@|@|j                  z   }A|>|@|A }Bd	}CtM        d	|j                  |j                        D ]  }D|j                  |      5  D|j                  z   }EB|D|E }F<|F   }G|F   }H|F   }I|F   }Jtk        ||I|jd                        }K|Kjl                  d d dz
  df   }%|%|j                  dz   z  }%tg        |%|H      }Lt'        j                  |L1|F   t              }L|L|Jz
  j                         }M|Lj                  d      }L|Jj                  d      }J|L|Jz
  }Nt'        j                  |N      }O|G |Oz  }P|G t'        j                  |Od|j                  z
  d|j                  z         z  }Qt'        j                  |P|Q      }R|Rj                         }S|S}T|j                  |T       |j                          |j                          t'        jR                         5  QPkD  j                         j                         }Ut&        jx                  j                  j                  |%d      }Vt'        j                  |%d      t'        j                  |V|%z  d      z
  }WdNd
z  j                         z  }X|X||=|?Cf<   |U||=|?|Cf<   S||=|?|Cf<   |Wj                         ||=|?|Cf<   Mj                         ||=|?|Cf<   d d d        d d d        Cdz  }C |?dz  }?~K~%~L~N~O~P~Q~S~T~U~V~W~X~G~H~I~Jti                 A t'        jR                         5  2j                  d      j                         }Y j                  d      j                         }Z7j                         }[t        | j,                  j0                  t        j                         |z
  z        }\i }]|\|]d<   | j                  j                  |Y      j                         j                         |]d<   | j                  j                  |Z      j                         j                         |]d<   | j                  j                  |[      j                         j                         |]d<   | j                  j                  9      j                         j                         |]d<   | j                  j                  j                               j                         j                         |]d<   | j                  j                  |      j                         j                         |]d<   | j                  j                  |      j                         j                         |]d<   | j                  j                  |      j                         j                         |]d<   | j                  j                  |      j                         j                         |]d<   | j                  j                  |      j                         j                         |]d <   | j                  j                  |      j                         j                         |]d!<   | j                  j                  |      j                         j                         |]d"<   |j                  k(  j                         j                         |]d#<   | j                  j                         d	   |]d$<   | j,                  j0                  |]d%<   | j,                  j0                  |jX                  | j8                  z  z  | j,                  _t        | j                  |]       d d d        ~2~Y~Z~| j                  j                          | j,                  xj.                  dz  c_        | jF                  j                  || j,                  | jJ                        | _%        | jJ                  j                  rS| j                  |d &       | jF                  j                  | j                   | j,                  | jJ                        | _%        ti                t        j                          |j                  d	kD  s;|dz
  | j                  z  d	k(  sR| j                  d'       f | jF                  j                  || j,                  | jJ                        | _%        | jJ                  j                  rU| j                  |d d (       | jF                  j                  | j                   | j,                  | jJ                        | _%        y y # 1 sw Y   xY w# 1 sw Y   	nxY w# 1 sw Y   ZxY w# 1 sw Y   _xY w# 1 sw Y   xY w))Nc               3   (   K   	  E d {    7 wr    r   s   r   repeat_generatorz+RLOOTrainer.train.<locals>.repeat_generator   s     %%% %s   gHz>        r1   Tmax_new_tokenstemperaturetop_ktop_p	do_samplez===training policy===rC   r   r   r+   	input_idsgather_deepspeed3_paramsskip_special_tokensdtype)dimg:0yE>)r   keepdim)r   indexsrcg      ?epszobjective/klzobjective/entropyzobjective/non_score_rewardzobjective/rlhf_rewardzobjective/scoreszpolicy/approxkl_avgzpolicy/clipfrac_avgzloss/policy_avgzval/clipfrac_avgzpolicy/entropy_avgz	val/ratiozval/ratio_varzval/num_eos_tokenslrepisode)trial)sampling)r   metrics)~rT   r`   rZ   r   model_wrappedr9   r:   r7   r   rD   iterr   response_lengthr   printrn   num_ppo_epochsrd   rB   rl   zerostrainr   global_stepr   rk   	max_stepsr]   rY   r_   logging_stepsri   rj   
eval_steps
save_stepsr   on_train_beginr   rangerO   nextno_gradr   repeatrx   shaper   ds3_gather_for_generationr!    local_rollout_forward_batch_sizerW   r)   r-   r%   logitsr~   r*   catr$   rz   r{   r|   r&   rm   batch_decodefloatappendgccollectanyrV   missing_eos_penaltyarange	unsqueezemasked_fillINVALID_LOGPROBnormalize_rewardmeanstdclampreward_clip_rangetoken_level_klkl_coefsizelongfliplrargmax
zeros_likereshaper   scatter_sumflattennormalize_advantagenprandompermutationre   rh   rc   
accumulateexp	cliprangerv   backwardstep	zero_grad
functionalsoftmax	logsumexpr^   gather_for_metricsro   varr[   get_last_lrepochlogon_step_endr   _save_checkpointon_saveru   rw   generate_completionson_train_end)_r   rT   r`   rZ   r   r9   r:   r7   rD   r   iter_dataloaderrU   
start_timestats_shapeapproxkl_statspg_clipfrac_statspg_loss_statsvf_clipfrac_statsentropy_statsratio_statsupdatedataqueriescontext_length	responsespostprocessed_responseslogprobsref_logprobsscoressequence_lengthsunwrapped_modelquery_responseslogitssiqueryquery_responseresponser   logprob
ref_output
ref_logitsref_logprobpostprocessed_responsepostprocessed_query_responsesequence_length_scorecontain_eos_tokenresponse_idxspadding_maskkl	kl_rewardeos_indiceslast_rewardscores_shapednon_score_rewardrewardrlhf_rewardsequence_klbaseline
advantagesppo_epoch_idxb_indsminibatch_idxmini_batch_startmini_batch_endmini_batch_indsgradient_accumulation_idxmicro_batch_startmicro_batch_endmicro_batch_indsmb_advantagemb_responsesmb_query_responsesmb_logprobsoutputnew_logprobs	new_ratiologprobs_diffratio	pg_losses
pg_losses2pg_loss_maxpg_losslosspg_clipfrac	prob_distentropyapproxklmean_klmean_entropymean_non_score_rewardr   r   r   s_                                                                                                 @r   r   zRLOOTrainer.train   s   yy&&NN	

!ZZ__
((00__
##	& /12,//))D0
 	12YY[
**D,A,A4CcCcd[@!KKFCK?!KKFCK?kk+f= "#



 $ 6 69N9N NSTT

&*&9&9D<R<R&R

#)!!A%+/99TZZ5I5IDL^L^5^+_

(+/+=+=

(??&"(,		$**2F2F2X(Y

%(,

%??&"(,		$**2F2F2X(Y

%(,

%,,;;D$**dll[At559: A	9FJJ!doo"55(D I{+..v6!..a8!(q!1	*,'!#%  1JJ 0 0499KnKn 	$/?'==(55)0,OW	 q'--"2D4Y4YZ -)A#AD,Q,Q(QRE%4QT=b=b9b%cN-a.@AH$QT-R-R)RSF3FHEGM!(^EUEbEb!cJ!+!2!21nq6H26M3M!NJ$"2"2T"99J"7
H"MK"JM .6*))51B ..0@0M0Mx2.
 4999eE[=\^_3`0&89OScSpSp9p&qtu&uO!,		:&0(*FHXHeHegu'5! !&( 0 = =>Zpt = u #(++	!
 "V*  $$X.+223IJOOG, ''4$++O<MM%([-)` "IIi3	*/))4KQ*O' 99Xq1$yyq9#(99-=q#A 61-k5


 %*II.EIYIfIf.fln$o!++7--.$))2O2OO. !&Y__Q-?	HXHX Y ` `ajapapqrasuv w,/?/I/I!/LL ,,X|_U$00|_] , (($v{{}49LMF"[[$2H2H1H$J`J`aF &&!% 2I #/"3"3A"6":\=N=N=P=W=W=Y=`=`efpt=`=u"uK"'"2"22"6K$*NN2q$9$<$<RXX$FM((Qk}(U (1}}Q'7$(94F"(**Q-K #%&&)K(,}{'B$"2V";K *11$++rB'OOA.<qQ(83
'//1
 ++",z/@"@Z^^EUX\E\!]JSIX "'t':':!; I"..t/D/DE !(-a1F1FHbHb(c F"$%58R8R%RN&,-=n&MO01--21d6P6PRVRrRr-s 77)(33E: 5x.?$BbBb.bO/>?PQ`/a, ,66F+GL+45E+FL1@AQ1R.*23C*DK &-U4FHXHeHe%fF%+]]1nq6H26M3M%NF"d&6&6&==F ,A+VL+0+<+< ,l;K.Lo,L
 *6)C(H(H(JI+7+;+;A+>L*5//!*<K,8;,FM$)IIm$<E *6(=I*6UCRVR`R`L`behlhvhvbv9w)wJ*/))Iz*JK&1&6&6&8G $+D (006%NN,%//1!& x/9I/E.L.L.N.S.S.U,1HH,?,?,G,GTV,G,W	*///&b*IEIIV_bhVhnpLq*q+.-2B1H1H1J+Jjr}mMf/f g$/ !2-Pi2i j jqm]Le.e fipiuiuiwm]Le.e fgpgugugwM=Jc,c dxU5xl 2Q61o77p "Q&M
 mUI"GT;	7T\$l4F  MMF"I"X  "&&)..*!)	q1668(8(=(=(?%$**,,		j0HIJ!$*.*:*:*M*Mg*V*[*[*]*b*b*d'/3/?/?/R/RS_/`/e/e/g/l/l/n+,$$778MNSSUZZ\ 45 483C3C3V3VWb3c3h3h3j3o3o3q/0.2.>.>.Q.QRXR]R]R_.`.e.e.g.l.l.n*+151A1A1T1TUc1d1i1i1k1p1p1r-.151A1A1T1TUf1g1l1l1n1s1s1u-.-1-=-=-P-PQ^-_-d-d-f-k-k-m)*.2.>.>.Q.QRc.d.i.i.k.p.p.r*+040@0@0S0STa0b0g0g0i0n0n0p,-'+'7'7'J'J;'W'\'\'^'c'c'e$+/+;+;+N+N{+[+_+_+a+f+f+h(1:>N>[>[1[0`0`0b0g0g0i,- $ 1 1 = = ? B%)ZZ%7%7	"#'::#5#5tG]G]9]#^

 !3"4 G\6""$JJ""a'"00<<T4::t||\DL||''%%e4%8#44<<TYY

TXT`T`aMJJL**Q.FQJ$B^B^3^bc3c))4)8CA	9H ,,99$

DLLY<<##!!%tT!B0088DJJPTP\P\]DL $m	 	I I|x xU5x 5xH" "sn   $BAO+&'AOV2AO+<FAPCAO8 AP2PAPOAO(O#AO+O+AO5	O8APO=APPAPPAP	r   c                    | j                   }| j                  }t        | j                   j                  dddd      }t	        t
              }t        | j                  | j                  | j                   j                        5 }| j                  D ]  }|d   }t        j                         5  |j                  d   }	t        |||j                  d	   |j                  |      \  }
}|
d d |	d f   }|}|j                   !t#        |j                   |j                  |      }|d
   j%                  t'        |j)                  |d                   |d   j%                  t'        |j)                  |                   t        j*                  ||fd      }t-        | j.                  t0        j2                        r't5        | j.                  ||j                  |	      \  }}}n^t        j6                  | j/                  |j)                  |d            t        j8                        j;                  |j<                        }|d   j%                  | j                  j?                  |      j9                         jA                         jC                                d d d        |s n d d d        tE        jF                  |      }| j                  jH                  rtK               rtM        |jN                  d	d        d|jP                  v r5d	d l)}|jT                  % |jV                  d |jX                  |      i       d|jP                  v rt[        d|       y y y # 1 sw Y   xY w# 1 sw Y   xY w)Ngaz?r   r1   Tr   r   r   r+   r   r+  r   zmodel responser   r6     wandbcompletions)	dataframecomet_mlzcompletions.csv)nametable).rT   r7   r   r   r   listr   r   r`   r   r   rl   r   r   r!   rW   r~   r*   extendr
   r   r   rz   r:   r{   r|   r&   rm   r   r   rD   r
  cpunumpypd	DataFrameis_main_processr   r(   ilocr   rf  runr  Tabler0   )r   r   rT   r7   rU   rk  r'  batchr+  r   r,  r5  r-  r2  r3  r6  dfrf  s                     r   r  z RLOOTrainer.generate_completions8  s!   yy00,9944$
 D!(JJ((499CfCf
 .	-- +k*]]_ &l%*[[^N(8'A(55))%NA  .a.@AH-5*))51B ..0@0M0Mx2. 'N))%&6&C&CE_c&C&de *+22%&6&C&CDZ&[\ 4999eE[=\^_3`0!$"3"3RYY?&0 --8,99*	'5! !& -- 0 = =>Zpt = u #(++	!
 "9@@A  'N))$*:*:*M*Me*T*Z*Z*\*`*`*b*h*h*jkM&lP W+.	^ \\% ++ " U!34$..(99(EII}kekkB.GHIT^^+-* , ,Y&l &l.	 .	s+   *M",G%M
M"M"MM""M+c                    | j                   j                  *t        | j                   j                        j                  }n(| j                   j                  j                  d      d   }| j                  |       t        | !  ||       y )N/r   )
model_name)	rT   r   r   r   rj  splitcreate_model_cardsuperr  )r   r   r   rz  	__class__s       r   r  zRLOOTrainer._save_checkpoint  sl    99!!)dii22388J//55c:2>J*5 .r   rz  dataset_nametagsc                    | j                         syt        | j                  j                  d      r^t        j
                  j                  | j                  j                  j                        s!| j                  j                  j                  }nd}|t               }nt        |t              r|h}nt        |      }t        | j                  j                  d      r|j                  d       |j                  | j                         t        j                  d      }t!        ||| j"                  ||t%               r.t&        j(                  t&        j(                  j+                         ndt-               d|dd	      }|j/                  t        j
                  j1                  | j2                  j4                  d
             y)a  
        Creates a draft of a model card using the information available to the `Trainer`.

        Args:
            model_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the model.
            dataset_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the dataset used for training.
            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                Tags to be associated with the model card.
        N_name_or_pathunsloth_versionunslotha          @inproceedings{ahmadian2024back,
            title        = {{Back to Basics: Revisiting REINFORCE-Style Optimization for Learning from Human Feedback in LLMs}},
            author       = {Arash Ahmadian and Chris Cremer and Matthias Gall{'{e}} and Marzieh Fadaee and Julia Kreutzer and Olivier Pietquin and Ahmet {"{U}}st{"{u}}n and Sara Hooker},
            year         = 2024,
            booktitle    = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), {ACL} 2024, Bangkok, Thailand, August 11-16, 2024},
            publisher    = {Association for Computational Linguistics},
            pages        = {12248--12267},
            editor       = {Lun{-}Wei Ku and Andre Martins and Vivek Srikumar},
        }RLOOz`Back to Basics: Revisiting REINFORCE-Style Optimization for Learning from Human Feedback in LLMsz
2402.14740)
base_modelrz  r   r  r  	wandb_url	comet_urltrainer_nametrainer_citationpaper_titlepaper_idz	README.md)rI   r   r   r6   r   pathisdirr  setrz   straddr  r   textwrapdedentr.   r   r   rf  rt  get_urlr/   savejoinrT   r   )r   rz  r  r  r  citation
model_cards          r   r|  zRLOOTrainer.create_model_card  sG   " ))+4::$$o6rww}}TZZM^M^MlMl?m**88JJ <5Dc"6Dt9D4::$$&78HHYDOO$?? 	$ 	 )!!**%-?-AeiiF[eii'')ae.0%z!

 	TYY%9%9;GHr   )NN)NNN)F)NNN)%__name__
__module____qualname__r   r,   r   r   r   r   r   r   r{   r|   r   rl  r  r   r   r   dicttuplerl   optim	Optimizerr[   LambdaLRr   r   r   r   r   r   boolr  r  r|  __classcell__)r~  s   @r   r3   r3   I   s   J <@EIVb59`R`R #)+=?UWeef
`R 		`R II`R BIIxcT%[0H'IIJ`R `R   78`R uWd3<.@%@AB`R %++//1I1I1R1RRS`R D12`R 
`RDj $Z $B^H
JT JZ/ %)&*,0	?ISM?I sm?I CcD()	?Ir   r3   )Kr   ri   r   r  rn   collectionsr   pathlibr   typingr   r   r   ro  r   pandasrp  rl   torch.nnr{   
accelerater   accelerate.utilsr	   r
   datasetsr   torch.utils.datar   transformersr   r   r   r   r   r   r   r   r   r   transformers.integrationsr   transformers.trainerr   r   transformers.trainer_callbackr   r   r   transformers.utilsr   models.utilsr   trainer.utilsr    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   rloo_configr,   utilsr-   r.   r/   r0   rf  r   r3   r   r   r   <module>r     s    
  	   #  , ,     " 5  '   J M [ [ 0 6    $ l l C
I' C
Ir   