
import contextlib
import dataclasses
import os
import warnings
from collections import defaultdict
from collections.abc import Mapping
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, Optional, TypeVar, Union

import torch
import torch.nn as nn
from accelerate import PartialState
from datasets import Dataset, IterableDataset
from packaging import version
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BaseImageProcessor,
    DataCollator,
    FeatureExtractionMixin,
    PreTrainedModel,
    PreTrainedTokenizerBase,
    ProcessorMixin,
    Trainer,
    TrainingArguments,
    is_wandb_available,
)
from transformers.data.data_collator import DataCollatorMixin
from transformers.trainer_callback import TrainerCallback
from transformers.trainer_utils import EvalPrediction
from transformers.utils import is_peft_available

from ..data_utils import (
    is_conversational,
    is_conversational_from_value,
    maybe_convert_to_chatml,
    pack_dataset,
    truncate_dataset,
)
from ..models import clone_chat_template, get_act_offloading_ctx_manager
from .sft_config import SFTConfig
from .utils import (
    ConstantLengthDataset,
    generate_model_card,
    get_comet_experiment_url,
    pad,
    peft_module_casting_to_bf16,
)

if is_peft_available():
    import peft
    from peft import PeftConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training

if is_wandb_available():
    import wandb

# Type variable constrained to nested list/mapping structures, used by `remove_none_values`.
TListOrMapping = TypeVar("TListOrMapping", list, Mapping)


def remove_none_values(example: TListOrMapping) -> TListOrMapping:
    """
    Recursively removes entries with `None` values from a nested structure (list or dictionary).

    Args:
        example (`list` or `Mapping`):
            Input nested structure (list or dictionary) from which to remove `None`.

    Example:
    ```python
    >>> example = [{
    ...     "a": {"aa": None,
    ...           "ab": 1},
    ...     "b": "my_string",
    ... }]
    >>> remove_none_values(example)
    [{'a': {'ab': 1}, 'b': 'my_string'}]
    ```
    """
    if isinstance(example, list):
        return [remove_none_values(value) if isinstance(value, (dict, list)) else value for value in example]
    elif isinstance(example, Mapping):
        return {
            key: remove_none_values(value) if isinstance(value, (dict, list)) else value
            for key, value in example.items()
            if value is not None
        }
    else:
        raise TypeError("Input must be a list or a dictionary.")
s   *B$%0B)c                       e Zd ZU dZeed<   dZeed<   dZeed<   dZ	eed<   dZ
ee   ed	<   d
Zeed<   deeee   eeeef   f      deeef   fdZy)DataCollatorForLanguageModelinga  
    Data collator used for language modeling data. Inputs are dynamically padded to the maximum length of a batch.

    This collator expects each example in the input list to be a dictionary containing at least the `"input_ids"` key.
    If the input contains a `"completion_mask"`, it is used to set the labels to `-100` for tokens that are not in the
    completion. If `"assistant_masks"` are present, they are used to set the labels to `-100` for tokens that are not
    in the assistant part of the sequence. The collator returns a dictionary containing the following keys:
    - `"input_ids"`: Tensor of input IDs, padded to the maximum length of the batch.
    - `"attention_mask"`: Tensor of attention mask, padded to the maximum length of the batch.
    - `"position_ids"`: Tensor of position IDs, padded to the maximum length of the batch.
    - `"labels"`: Tensor of labels, padded to the maximum length of the batch. If `completion_only_loss` is set to
    `True`, tokens that are not in the completion are set to -100. If `assistant_masks` are present, tokens that are
    not in the assistant part of the sequence are set to -100.

    Args:
        pad_token_id (`int`):
            Token ID to use for padding.
        completion_only_loss (`bool`, *optional*, defaults to `True`):
            When the input contains a completion mask (`completion_mask`), the labels are set to -100 for the tokens
            that are not in the completion.
        padding_free (`bool`, *optional*, defaults to `False`):
            If set to `True`, the sequences will be flattened into a single sequence, and the position IDs will be
            generated accordingly. The attention mask will be set to 1 for all tokens.
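        return_position_ids (`bool`, *optional*, defaults to `True`):
            Whether to include `"position_ids"` in the returned batch.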
        pad_to_multiple_of (`int` or `None`, *optional*, defaults to `None`):
            If set, the sequences will be padded to a multiple of this value.
        return_tensors (`str`, *optional*, defaults to `"pt"`):
            Type of Tensor to return. Only `"pt"` is currently supported.

    Examples:
    ```python
    >>> from trl import DataCollatorForLanguageModeling

    >>> collator = DataCollatorForLanguageModeling(pad_token_id=0)
    >>> examples = [{"input_ids": [1, 2, 3]}, {"input_ids": [4, 5]}]
    >>> collator(examples)
    {'input_ids': tensor([[  1,  2,  3],
                          [  4,  5,  0]]),
     'attention_mask': tensor([[  1,  1,  1],
                               [  1,  1,  0]]),
     'position_ids': tensor([[0, 1, 2],
                             [0, 1, 0]]),
     'labels': tensor([[   1,    2,    3],
                       [   4,    5, -100]])}

    >>> # With completion mask
    >>> examples = [
    ...     {"input_ids": [1, 2, 3], "completion_mask": [0, 1, 1]},
    ...     {"input_ids": [4, 5], "completion_mask": [0, 1]},
    ... ]
    >>> collator(examples)
    {'input_ids': tensor([[  1,  2,  3],
                          [  4,  5,  0]]),
     'attention_mask': tensor([[  1,  1,  1],
                               [  1,  1,  0]]),
     'position_ids': tensor([[0, 1, 2],
                             [0, 1, 0]]),
     'labels': tensor([[-100,    2,    3],
                       [-100,    5, -100]])}

    >>> # With padding_free
    >>> collator = DataCollatorForLanguageModeling(pad_token_id=0, padding_free=True)
    >>> collator(examples)
    {'input_ids': tensor([[ 1, 2, 3, 4, 5]]),
     'attention_mask': tensor([[1, 1, 1, 1, 1]]),
     'position_ids': tensor([[0, 1, 2, 0, 1]]),
     'labels': tensor([[-100,    2,    3, -100,    5]])}
    ```
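
    `pad_to_multiple_of` rounds the padded batch length up to the nearest multiple. A sketch of the expected
    output (assuming examples without a completion mask):

    ```python
    >>> collator = DataCollatorForLanguageModeling(pad_token_id=0, pad_to_multiple_of=4)
    >>> collator([{"input_ids": [1, 2, 3]}, {"input_ids": [4, 5]}])["input_ids"]
    tensor([[1, 2, 3, 0],
            [4, 5, 0, 0]])
    ```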
    """

    pad_token_id: int
    completion_only_loss: bool = True
    padding_free: bool = False
    return_position_ids: bool = True
    pad_to_multiple_of: Optional[int] = None
    return_tensors: str = "pt"

    def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
        # Convert each example to tensors
        input_ids = [torch.tensor(example["input_ids"]) for example in examples]

        # If position IDs were precomputed (e.g. by packing) and we train padding-free, keep them as-is
        has_packed_position_ids = self.return_position_ids and "position_ids" in examples[0] and self.padding_free
        if not has_packed_position_ids:
            attention_mask = [torch.ones_like(ids) for ids in input_ids]
        if self.return_position_ids:
            if "position_ids" in examples[0]:
                position_ids = [torch.tensor(example["position_ids"]) for example in examples]
            else:
                position_ids = [torch.arange(len(ids)) for ids in input_ids]
        if "labels" in examples[0]:
            labels = [torch.tensor(example["labels"]) for example in examples]
        else:
            labels = [torch.tensor(example["input_ids"]) for example in examples]
        if self.completion_only_loss and "completion_mask" in examples[0]:
            completion_mask = [torch.tensor(example["completion_mask"]) for example in examples]
        if "assistant_masks" in examples[0]:
            assistant_masks = [torch.tensor(example["assistant_masks"]) for example in examples]

        output = {}
        if self.padding_free:
            # Flatten the whole batch into a single sequence
            output["input_ids"] = torch.cat(input_ids, dim=0).unsqueeze(0)
            if not has_packed_position_ids:
                output["attention_mask"] = torch.cat(attention_mask, dim=0).unsqueeze(0)
            if self.return_position_ids:
                output["position_ids"] = torch.cat(position_ids, dim=0).unsqueeze(0)
            output["labels"] = torch.cat(labels, dim=0).unsqueeze(0)
            if self.completion_only_loss and "completion_mask" in examples[0]:
                completion_mask = torch.cat(completion_mask, dim=0).unsqueeze(0)
                output["labels"][completion_mask == 0] = -100
            if "assistant_masks" in examples[0]:
                assistant_masks = torch.cat(assistant_masks, dim=0).unsqueeze(0)
                output["labels"][assistant_masks == 0] = -100
        else:
            # Pad every field to the maximum length of the batch
            output["input_ids"] = pad(
                input_ids,
                padding_value=self.pad_token_id,
                padding_side="right",
                pad_to_multiple_of=self.pad_to_multiple_of,
            )
            output["attention_mask"] = pad(
                attention_mask, padding_value=0, padding_side="right", pad_to_multiple_of=self.pad_to_multiple_of
            )
            if self.return_position_ids:
                output["position_ids"] = pad(
                    position_ids, padding_value=0, padding_side="right", pad_to_multiple_of=self.pad_to_multiple_of
                )
            output["labels"] = pad(
                labels, padding_value=-100, padding_side="right", pad_to_multiple_of=self.pad_to_multiple_of
            )
            if self.completion_only_loss and "completion_mask" in examples[0]:
                completion_mask = pad(
                    completion_mask, padding_value=0, padding_side="right", pad_to_multiple_of=self.pad_to_multiple_of
                )
                output["labels"][completion_mask == 0] = -100  # mask everything that is not in the completion
            if "assistant_masks" in examples[0]:
                assistant_masks = pad(
                    assistant_masks, padding_value=0, padding_side="right", pad_to_multiple_of=self.pad_to_multiple_of
                )
                output["labels"][assistant_masks == 0] = -100
        return output


class SFTTrainer(Trainer):
    """
    Trainer for Supervised Fine-Tuning (SFT) method.

    This class is a wrapper around the [`transformers.Trainer`] class and inherits all of its attributes and methods.

    Example:

    ```python
    from datasets import load_dataset
    from trl import SFTTrainer

    dataset = load_dataset("roneneldan/TinyStories", split="train[:1%]")

    trainer = SFTTrainer(model="Qwen/Qwen2-0.5B-Instruct", train_dataset=dataset)
    trainer.train()
    ```
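
    A configuration object can be passed as well. A minimal sketch (see [`SFTConfig`] for the full set of
    options):

    ```python
    from trl import SFTConfig

    training_args = SFTConfig(output_dir="Qwen2-0.5B-SFT", max_length=512)
    trainer = SFTTrainer(model="Qwen/Qwen2-0.5B-Instruct", args=training_args, train_dataset=dataset)
    trainer.train()
    ```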

    Args:
        model (`Union[str, PreTrainedModel]`):
            Model to be trained. Can be either:

            - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
              path to a *directory* containing model weights saved using
              [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
              using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in
              `args.model_init_kwargs`.
            - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
        args ([`SFTConfig`], *optional*, defaults to `None`):
            Configuration for this trainer. If `None`, a default configuration is used.
        data_collator (`DataCollator`, *optional*):
            Function to use to form a batch from a list of elements of the processed `train_dataset` or `eval_dataset`.
            Will default to a custom [`DataCollatorForLanguageModeling`].
        train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
            Dataset to use for training. SFT supports both [language modeling](#language-modeling) type and
            [prompt-completion](#prompt-completion) type. The format of the samples can be either:

            - [Standard](dataset_formats#standard): Each sample contains plain text.
            - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
              and content).

            The trainer also supports processed datasets (tokenized) as long as they contain an `input_ids` field.
        eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
            Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
        processing_class ([`~transformers.PreTrainedTokenizerBase`], *optional*, defaults to `None`):
            Processing class used to process the data. If `None`, the processing class is loaded from the model's name
            with [`~transformers.AutoTokenizer.from_pretrained`].
        callbacks (list of [`~transformers.TrainerCallback`], *optional*, defaults to `None`):
            List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
            in [here](https://huggingface.co/docs/transformers/main_classes/callback).

            If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
            method.
        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
            A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your
            model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
        optimizer_cls_and_kwargs (`Tuple[Type[torch.optim.Optimizer], Dict[str, Any]]`, *optional*, defaults to `None`):
            A tuple containing the optimizer class and keyword arguments to use. Overrides `optim` and `optim_args` in
            `args`. Incompatible with the `optimizers` argument.

            Unlike `optimizers`, this argument avoids the need to place model parameters on the correct devices before
            initializing the Trainer.
        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*, defaults to `None`):
            A function that preprocesses the logits right before caching them at each evaluation step. Must take two
            tensors, the logits and the labels, and return the logits once processed as desired. The modifications made
            by this function will be reflected in the predictions received by `compute_metrics`.

            Note that the labels (second parameter) will be `None` if the dataset does not have them.
        peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`):
            PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
        formatting_func (`Optional[Callable]`):
            Formatting function applied to the dataset before tokenization. Applying the formatting function explicitly
            converts the dataset into a [language modeling](#language-modeling) type.
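
            For example, a function along these lines (an illustrative sketch; the field names are hypothetical)
            maps raw fields to a single text field:

            ```python
            def formatting_func(example):
                return f"### Question: {example['question']}\n### Answer: {example['answer']}"
            ```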
    trlsftNmodelargsdata_collatortrain_dataseteval_datasetprocessing_classcompute_loss_funccompute_metrics	callbacks
optimizersoptimizer_cls_and_kwargspreprocess_logits_for_metricspeft_configr.   formatting_funcc                 ,   t        |t              r|n|j                  j                  }|#|j	                  d      d   }t        | d      }n[t        |t              rKt        |t
              s;|j                         }|j                  |d<   |j                  d       t        d%i |}|t        j                  |      }|j                  L|j                  }|j                  |      }|&t        d| d|j                  j                    d      ||_        |j$                  %t        |t              st'        j(                  d	       t        |t              r| j+                  ||      }|j,                  t.        j0                  j3                  |j,                        rQ|j,                  j5                  d
      r6t7        |j,                  d      5 }|j9                         |_        d d d        nt=        |||j,                        \  }}|| j?                  |||      }|j@                  xs |jB                  xr |jD                  dk(  | _         | j@                  r|t        d      |jB                  r$|jD                  dk(  rt'        j(                  d       |j                  jF                  dk7  rt'        j(                  d       |jH                  dk(  r!|jB                  st'        j(                  d       |jJ                  tM        tO        |            }d|v | _%        n|jJ                  | _%        ||jP                  xs |jP                  xs |j                  }|j                  |      }|&t        d| d|j                  j                    d      tS        || jJ                  | j@                  |j                  jF                  dk(  |jT                        }|jB                  r=|jD                  dk(  r.|j                  jF                  dk7  rt'        j(                  d       |jV                  rtY        |d         st        d      |jZ                  d u xs |jZ                  j]                  dd       }|r| jJ                  r|rt        d      | j_                  ||||jB                  |d      }||j`                  |jB                  n|j`                  }t        |tb              r8|je                         D ci c]  \  }}|| j_                  ||||||       }}}n| j_                  |||||d       }tg        th              tg        th              d!| _5        d| _6        tn        |   |||||||||	|
||"       | jr                  jt                  rtw        | jx                  #      | _=        nt}        j~                         | _=        t        | jx                  d$      r&| jx                  j                  | j                         y y # 1 sw Y   xY wc c}}w )&N/z-SFT	hub_tokenpush_to_hub_tokenzThe specified `eos_token` ('zC') is not found in the vocabulary of the given `processing_class` (zX). Ensure that the `eos_token` exists in the vocabulary before using it as an EOS token.zYou passed model_init_kwargs to the `SFTConfig`, but your model is already instantiated. The `model_init_kwargs` will be ignored.)z.jinjaz.j2zutf-8)encodingffdzHPassing a custom data collator is not supported when using padding-free.wrappedzYou are passing `padding_free=True` with the 'wrapped' packing strategy, which is not recommended. Please refer to the documentation to understand why this is not recommended.flash_attention_2a  Padding-free training is enabled, but the attention implementation is not set to 'flash_attention_2'. Padding-free training flattens batches into a single sequence, and 'flash_attention_2' is the only known attention mechanism that reliably supports this. Using other implementations may lead to unexpected behavior. To ensure compatibility, set `attn_implementation='flash_attention_2'` in the model configuration, or verify that your attention mechanism can handle flattened sequences.r'   zYou are using a per_device_train_batch_size of 1 with padding-free training. Using a batch size of 1 anihilate the benefits of padding-free training. Please consider increasing the batch size to at least 2.promptzThe specified `pad_token` ('z[). Ensure that the `pad_token` exists in the vocabulary before using it as a padding token.)rA   rB   rC   rD   rE   a  You are using packing, but the attention implementation is not set to 'flash_attention_2'. Packing flattens batches into a single sequence, and 'flash_attention_2' is the only known attention mechanism that reliably supports this. Using other implementations may lead to cross-contamination between batches. To avoid this, either disable packing by setting `packing=False`, or set `attn_implementation='flash_attention_2'` in the model configuration.r   zYou set `assistant_only_loss=True`, but the dataset is not conversational. This option is only supported for conversational datasets.skip_prepare_datasetFaE  A formatting function was provided while `completion_only_loss=True`, which is incompatible. Using a formatter converts the dataset to a language modeling type, conflicting with completion-only loss. To resolve this, apply your formatting function before passing the dataset, or disable `completion_only_loss` in `SFTConfig`.traineval)r   r   )rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   )rp   add_model_tagsrj   )Cr6   ri   config_name_or_pathsplitr(   r   to_dictr   popr   from_pretrained	eos_tokenconvert_tokens_to_ids
ValueError	__class__rb   eos_token_idmodel_init_kwargswarningswarn_create_model_from_pathchat_template_pathospathisfileendswithopenreadchat_templater%   _prepare_peft_modelrC   packingpacking_strategy_attn_implementationper_device_train_batch_sizerB   nextiter	pad_tokenr@   rE   assistant_only_lossr    dataset_kwargsget_prepare_dataseteval_packingr8   r:   r   r7   _metrics_total_train_tokenssuper__init__rq   activation_offloadingr&   rp    maybe_activation_offload_context
contextlibnullcontexthasattrr   
_tag_names)r]   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   model_id
model_name	dict_argsr   r   chat_template_filefirst_exampler   rA   preprocess_datasetr   r=   datasetr   s                               r>   r   zSFTTrainer.__init__N  s   ( 'uc258R8R<!,R0J
|401D/0D)9TI%)^^Ik"MM-.)y)D #,<<XF>>%I+AA)LL# 29+ >++;+E+E+N+N*O PII 
 -9) !!-j6LMM; eS!00=E"".ww~~d5564;R;R;[;[\m;n$11GD OHZ5G5L5L5N$2O O +>eEUW[WnWn*o'' ",,UKFE
 !--b$,,2a4CXCX\aCa( !kll|| 5 5 Bp ||004GGJ //14T\\% $$, m!45M(0M(AD%(,(A(AD%  b*:*D*DbHXHbHbI+AA)LL# 29+ >++;+E+E+N+N*O PLL 
 <)%)%>%>!..$)LL$E$EI\$\#'#:#:M LL%%.115HHMMX ##,=mA>N,O9  "00D8v@S@S@W@WXnpu@v<v((_ Q  !11/t||_V]M '*.*;*;*C$,,IZIZlD1 -9,>,>,@$(C T227<LdT[]lnqrr$L $
 $(#8#8$&6gX^$L
 #.d"3[=NO#$  	''%-/+!%=*G 	 	
  99**4RY]YcYc4dD14>4J4J4LD1 4::/0JJ%%doo6 1UO OB$s   X:!XX
model_pathr4   c                 "   |j                   xs i }|j                  d      }t        |t        j                        s|dk(  s|n5t        |t
              rt        t        |      }||d<   nt        d| d      t        j                  |fi |}|S )z0Creates a model from a path or model identifier.torch_dtypeautozInvalid `torch_dtype` passed to `SFTConfig`. Expected either 'auto' or a string representing a `torch.dtype` (e.g., 'float32'), but got .)
r   r   r6   rV   dtyperi   getattrr   r   r   )r]   r   rq   r   r   rp   s         r>   r   z"SFTTrainer._create_model_from_path  s     228b'++M:k5;;/;&3HKL_S)!%5K/:m,>>I]!M  %44ZUCTUrk   c                 N   t               st        d      t        |t              st	        dt        |       d      t        |t              r|S t        |dd      xs t        |dd      }d}t        |dd      rU|j                         D ]B  \  }}|j                  j                  dk(  s |j                  j                  j
                  dv } n |r,|s*| j                  ||      }t        j                  |d	      }n|j                   r| j#                  ||      }t%        j&                  t(        j*                        t%        j&                  d
      k\  rt        |dd      r|rt-        ||d      }nt-        ||      }|j.                  rt        |dd      r|st1        |       |S )z#Prepares a model for PEFT training.z9To use PeftModel, you need to install the `peft` library.z#Expected PeftConfig object but got z[. If you want to use the PeftModel, you need to pass a PeftConfig object to the SFTTrainer.is_loaded_in_4bitFis_loaded_in_8bit
Params4bit>   cpumeta)gradient_checkpointingz0.12)autocast_adapter_dtype)r   ImportErrorr6   r.   r   typer/   r   named_parametersr   rb   datadevice _prepare_model_for_kbit_trainingdataclassesreplacer   _enable_gradient_checkpointingr   parsepeft__version__r0   bf16r-   )r]   rp   r|   rq   is_qlorais_sharded_qlora_params           r>   r   zSFTTrainer._prepare_peft_model*  s    "YZZ+z25d;6G5H IA A 
 eY'L 5"5u=kPcejAk 5-u5!224 5??++|;',zz'8'8'='='P$ ,99%FE&&tEJD((77tDE MM$**+w}}V/DD2E: "5+eTE"5+6E 99(;UCL\'.rk   c                 T    |j                   |j                  xs i d}t        |fi |S )z-Prepares a quantized model for kbit training.)use_gradient_checkpointinggradient_checkpointing_kwargs)r   r   r1   )r]   rp   rq   prepare_model_kwargss       r>   r   z+SFTTrainer._prepare_model_for_kbit_training[  s7     +/*E*E-1-O-O-USU 

 /uM8LMMrk   c                     |j                   xs i }d|vxs |d   }|r@t        |d      r|j                          |S d }|j                         j	                  |       |S )z-Enables gradient checkpointing for the model.use_reentrantenable_input_require_gradsc                 &    |j                  d       y )NT)requires_grad_)moduleinputr`   s      r>   make_inputs_require_gradzKSFTTrainer._enable_gradient_checkpointing.<locals>.make_inputs_require_gradp  s    ))$/rk   )r   r   r   get_input_embeddingsregister_forward_hook)r]   rp   rq   r   r   r   s         r>   r   z)SFTTrainer._enable_gradient_checkpointingd  st    (,(J(J(Pb%#@@rDabqDr 	 u:;002 0 **,BBC[\rk   r   r   dataset_namec           	         t        |t              r|S t        |t              r|j                  t              }t        t        t        |            j                               }d|v }i }	t        |t              r|j                  |	d<   t               j                         5  |rt        j                  dt               6|s4t        |t              r	d| d|	d<   fd}
	  |j                  |
fdd	i|	}|s%t        t        |            }t%        |      rZt        |t              r	d| d|	d<   t        t        |            j                         } |j                  t&        fdd|v rdnd i|	}t        t        |            }t)        |      sDt        |t              r	d| d|	d<   d } |j                  |fd|j*                  id|v rdnd d|	}t        |t              r	d| d|	d<   d } |j                  |fd||j,                  |j.                  di|	}|rd|j0                  t3        d      t        |t              r	d| d|	d<   |j5                  d      }t7        ||j0                  |j8                  |	      }n<|j0                  0t        |t              r	d| d|	d<   t;        ||j0                  |	      }|j<                  r,|j5                  ddhj?                  |j@                              }d d d        |S # t         $ r>}t        j                  d
| dt"                |j                  |
fddi|	}Y d }~Nd }~ww xY w# 1 sw Y   |S xY w)NrJ   num_proczYou passed a dataset that is already processed (contains an `input_ids` field) together with a formatting function. Therefore `formatting_func` will be ignored. Either remove the `formatting_func` or pass a dataset that is not already processed.z Applying formatting function to z datasetdescc                     d |       iS )Ntextrj   )r3   r}   s    r>   _funcz*SFTTrainer._prepare_dataset.<locals>._func  s    "OG$<==rk   batchedFzDFailed to apply the formatting function due to the following error: a3  . This may be because the function is designed for batched input. Please update it to process one example at a time (i.e., accept and return a single example). For now, we will attempt to apply the function in batched mode, but note that batched formatting is deprecated and will be removed in version 0.21.TzConverting z dataset to ChatMLremove_columnsconversationszAdding EOS to c                     d| v r!| d   j                  |      s| d   |z   | d<   | S d| v r| d   j                  |      s| d   |z   | d<   | S )Nr   
completion)r   )r3   r   s     r>   add_eosz,SFTTrainer._prepare_dataset.<locals>.add_eos  sj    !W,WV_5M5Mi5X.5fo	.IGFO  ' *W4W\=R=[=[\e=f4;L4II4UGL1&rk   r   messages)	fn_kwargsr   zTokenizing c                 F   d| v r
t        |       ru |j                  | d   fd| j                  d      i| j                  di       } |j                  | d   | d   z   fd| j                  d      i| j                  di       }n2 || d         j                  } || d   | d   z         j                  }|d t	        |       |k(  st        j                  d       dgt	        |      z  dgt	        |      t	        |      z
  z  z   }||d	}|S t        |       ri |j                  | d
   fd|| j                  d      d| j                  di       }d|v rd|d   vrt        d      dD ci c]  }||v s|||    }}|S d || |         j                  i}|S c c}w )Nr   toolschat_template_kwargsr   )r   zMismatch between tokenized prompt and the start of tokenized prompt+completion. This may be due to unexpected tokenizer behavior, whitespace issues, or special token handling. Verify that the tokenizer is processing text consistently.r   r'   )rJ   rM   r   T)return_dictreturn_assistant_tokens_maskr   rN   u?  You're using `assistant_only_loss=True`, but at least one example has no assistant tokens. This usually means the tokenizer's chat template doesn't generate assistant masks — it may be missing the `{% generation %}` keyword. Please check the template and ensure it's correctly configured to support assistant masking.)rJ   rN   rJ   )r    apply_chat_templater   rJ   rZ   r   r   RuntimeError)	r3   ru   dataset_text_fieldr   
prompt_idsprompt_completion_idsrM   	processedks	            r>   tokenizez-SFTTrainer._prepare_dataset.<locals>.tokenize  s   7*,W5)M)9)M)M ' 1*&-kk'&:* #*++.Db"I*J
 5Y4D4X4X ' 1GL4I I5&-kk'&:5 #*++.Db"I51 *:wx?P)Q)[)[J4D%,X%69N%N5'i 2
  55Fs:G:U$MM!m ,-#J*?1#MbIcfijtfuIuBv*v2G\k$l	, %$' -W5(L(8(L(L '
 3),0=P&-kk'&:	)
 #*++.Db"I)I  1I=!9UfKgBg&2%/'" !" Cc(uQfgktftIaL(uI(u %$ *56FGTfLg6h6r6r(sI$$ )vs   .	F8Fr   )ru   r  r   z6When packing is enabled, `max_length` can't be `None`.zPacking zTruncating rK   )!r6   r)   r   with_transformr9   r7   r   r   keysdataset_num_procr   main_process_firstr   r   UserWarningmap	ExceptionDeprecationWarningr!   r"   r    r   r  r   
max_lengthr   select_columnsr#   r   r$   use_liger_kernelintersectioncolumn_names)r]   r   ru   rq   r   r}   r   r  is_processed
map_kwargsr   er   r   r
  s        `         r>   r   zSFTTrainer._prepare_datasetw  s    g45N gw',,-?@G Dg/4467"l2 
gw'%)%:%:Jz"^..0 Q	s*|Y  	 *<gw/+KL>Ya)bJv&>M)gkk%MM*MG   $T'] 3/>!'73/:<.HZ-[
6*#'W#6#;#;#=L)gkk/:I\:Y_c %G !%T'] 3(7!'73/=l^8-T
6*' *gkk#.0@0J0J"K5?<5OzUY %	G gw/+6|nH)MJv&3%j &'++ -=.2.E.E/3/G/G ! ??*$%]^^gw/+3L>)JJv&!00=&wAVAVXbc,gw/+6|nH)MJv&*7DOOZP$$!00+~1N1[1[\c\p\p1qrcQ	sf A ! 	MMM^_`^a b+ +
 + *gkk%LLLG	M'Q	sf s8   !AM#L8HM	M3M MMMMc                 0    | j                   
g d| _         y y )N)rJ   rL   rK   rM   rN   )_signature_columns)r]   s    r>    _set_signature_columns_if_neededz+SFTTrainer._set_signature_columns_if_needed'  s    
 ""*'D# +rk   c                    | j                   j                  rdnd}t        |   ||d|      \  }}|dk(  rd|v rI| j                  j                  |d   j                               j                         j                         }n}d|v rnt        j                  |d   j                  d      |d   j                        }	| j                  j                  |	      j                         j                         }nt        d	      | xj                  |z  c_        | j                  g| j                  |   d
<   d|v r#| j                  j                   s|j"                  dddddf   j%                         }
|d   dddf   j%                         }|
j'                  d      }|dk7  }||k(  |z  }|j                         }|j                         }| j                  j                  |      }| j                  j                  |      }|j                         }|dkD  r!|j                         |z  j                         nd}| j                  |   d   j)                  |       |r||fS |S )zQ
        Compute training loss and additionally compute token accuracies
        r   r   T)return_outputsnum_items_in_batchrQ   rK   r'   )r   z6Expected 'attention_mask' or 'position_ids' in inputs.
num_tokensrL   .Nr   rO   rR   r   g        mean_token_accuracy)rp   trainingr   compute_lossacceleratorgather_for_metricssumitemrV   rW   sizer   r   r   r   rq   r  logits
contiguousargmaxappend)r]   rp   inputsr  r   modelossoutputsnum_tokens_in_batchlocal_num_tokensshift_logitsshift_labelspredictionsmaskcorrect_predictionstotal_tokenscorrect_tokens	total_sumaccuracyr   s                      r>   r$  zSFTTrainer.compute_loss5  sR    **--w6'.6$CU / 
w 7?  6)&*&6&6&I&I&QaJbJfJfJh&i&m&m&o&t&t&v#6)#(<<~0F0K0KA0NW]^lWmWtWt#u &*&6&6&I&IJZ&[&_&_&a&f&f&h# !YZZ$$(;;$-1-E-E,FdL) vdii&@&@">>#ssA+6AACL!(+CG4??AL '--"-5K  4'D $/,#>$"F88:L0446N "--@@PN++>>|LL %((*IDMPQM**,y8>>@WZHMM$ 56==hG"0g:d:rk   c                 f    | j                   5  t        |   |i |cd d d        S # 1 sw Y   y xY wN)r   r   training_step)r]   rq   kwargsr   s      r>   r?  zSFTTrainer.training_stepf  s3    22 	:7($9&9	: 	: 	:s   '0logs
start_timec           	         | j                   j                  rdnd}| j                  |   j                         D ci c]  \  }}|t	        |      t        |      z   }}}|dk(  r&|j                         D ci c]  \  }}d| | }}}i ||}t        |   ||       | j                  |   j                          y c c}}w c c}}w )Nr   r   eval_)	rp   r#  r   r:   r'  rZ   r   logclear)r]   rA  rB  r/  r=   valmetricsr   s          r>   rE  zSFTTrainer.logj  s    **--w6<@MM$<O<U<U<WXS3C3s8++XX 6>:A--/Jhc3se}c)JGJ"$"'"D*%d!!# Y
 Ks   "C 9Cc                    | j                   j                  *t        | j                   j                        j                  }n(| j                   j                  j                  d      d   }| j                  |       t        | !  ||       y )Nr   r   )r   )	rq   hub_model_idr   
output_dirnamer   create_model_cardr   _save_checkpoint)r]   rp   trialr   r   s       r>   rN  zSFTTrainer._save_checkpointx  sl    99!!)dii22388J//55c:2>J*5 .rk   r   tagsc           
      h   | j                         syt        | j                  j                  d      r^t        j
                  j                  | j                  j                  j                        s!| j                  j                  j                  }nd}|t               }nt        |t              r|h}nt        |      }t        | j                  j                  d      r|j                  d       |j                  | j                         t        ||| j                  |t!        |      t#               r.t$        j&                  t$        j&                  j)                         ndt+               d      }|j-                  t        j
                  j/                  | j0                  j2                  d             y)a  
        Creates a draft of a model card using the information available to the `Trainer`.

        Args:
            model_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the model.
            dataset_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the dataset used for training.
            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                Tags to be associated with the model card.
        Nr   unsloth_versionunslothSFT)
base_modelr   rJ  r   rP  	wandb_url	comet_urltrainer_namez	README.md)is_world_process_zeror   rp   r   r   r   isdirr   setr6   ri   addupdater   r*   rJ  r7   r   wandbrunget_urlr+   savejoinrq   rK  )r]   r   r   rP  rU  
model_cards         r>   rM  zSFTTrainer.create_model_card  s-   " ))+4::$$o6rww}}TZZM^M^MlMl?m**88JJ <5Dc"6Dt9D4::$$&78HHYDOO$(!!**%d-?-AeiiF[eii'')ae.0	

 	TYY%9%9;GHrk   )NNNNNNNN)NNNNNN)FNr>  )NNN)1rb   rc   rd   re   r   r   ri   nnModuler   r	   r(   r   r   r   r   r8   r   r   r   r   r   r   r7   r   tuplerV   optim	Optimizerlr_schedulerLambdaLRr   r   Tensorr   r   r   r   r   rh   r   r  r$  r?  floatrE  rN  rM  __classcell__)r   s   @r>   rm   rm     s{   HT J
 ?C04CGEI 04FJ59jvaehl.2;?#C7S"))_45C7 uY(99:;C7  -	C7
  g&> ?@C7 uWd3<.@%@ABC7 #)+=?UWeef
C7 $H-C7 "(N+;T+A"BCC7 D12C7 (5;;#8#898EKKD\D\DeDe;ffgC7 #+5ekk6K6K1LdSVX[S[n1\+]"^C7 (0%,,9UW\WcWc9c0d'eC7  l+!C7" "(D63;"78#C7J# Y ? ./ /s /R[ /`o /bNo NY N[j NO 9 Yh &nw/0n   79KMces stn 	n
 n "(D63;"78n n 
w'	(n`.;b:$S%Z( $huo $QU $/ %)&*,0	1ISM1I sm1I CcD()	1Irk   rm   )Kr   r   r   r   collectionsr   collections.abcr   r   pathlibr   typingr   r   r	   r
   r   rV   torch.nnrd  
accelerater   datasetsr   r   	packagingr   transformersr   r   r   r   r   r   r   r   r   r   r   transformers.data.data_collatorr   transformers.trainer_callbackr   transformers.trainer_utilsr   transformers.utilsr   
data_utilsr    r!   r"   r#   r$   modelsr%   r&   
sft_configr(   utilsr)   r*   r+   r,   r-   r   r.   r/   r0   r1   r^  r7   r2   r9   r@   rm   rj   rk   r>   <module>r     s      	  # # !  : :   # -     > 9 5 0  I !  [[ )49A A> A> Q&7 Q Qhp
I p
Irk   