
import itertools
from contextlib import contextmanager
from copy import deepcopy
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Literal, Optional, Union

import torch.nn as nn
from packaging import version
from transformers import AutoTokenizer, PreTrainedModel, PreTrainedTokenizer

from .modeling_value_head import AutoModelForCausalLMWithValueHead, AutoModelForSeq2SeqLMWithValueHead

SUPPORTED_ARCHITECTURES = (
    AutoModelForCausalLMWithValueHead,
    AutoModelForSeq2SeqLMWithValueHead,
)

if TYPE_CHECKING:
    from accelerate import Accelerator
    from deepspeed.runtime.engine import DeepSpeedEngine
    from torch.nn import Module
    from torch.nn.parallel.distributed import DistributedDataParallel


@dataclass
class ChatMlSpecialTokens:
    """Dataclass for special tokens used in ChatML, including system, user, assistant, bos, eos, and pad tokens."""

    bos_token: str = "<|im_start|>"
    eos_token: str = "<|im_end|>"
    pad_token: str = "<|im_end|>"

    @property
    def system(self):
        return f"{self.bos_token}system"

    @property
    def user(self):
        return f"{self.bos_token}user"

    @property
    def assistant(self):
        return f"{self.bos_token}assistant"

    @property
    def chat_template(self):
        return (
            "{% for message in messages %}"
            f"{{{{'{self.bos_token}' + message['role'] + '\n' + message['content'] + '{self.eos_token}' + '\n'}}}}"
            "{% endfor %}"
            "{% if add_generation_prompt %}"
            f"{{{{ '{self.assistant}\n' }}}}"
            "{% endif %}"
        )


FORMAT_MAPPING = {"chatml": ChatMlSpecialTokens}


def setup_chat_format(
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    format: Optional[Literal["chatml"]] = "chatml",
    resize_to_multiple_of: Optional[int] = None,
) -> tuple[PreTrainedModel, PreTrainedTokenizer]:
    """
    Set up the chat format by adding special tokens to the tokenizer, setting the correct format, and extending the
    embedding layer of the model based on the new special tokens.

    <Tip warning="true"> We recommend using [`clone_chat_template`] instead of this function.

    </Tip>

    If the model already has a chat template, this will throw an error. If you want to overwrite it, please set
    `tokenizer.chat_template` to `None`.

    Args:
        model (`~transformers.PreTrainedModel`): The model to be modified.
        tokenizer (`~transformers.PreTrainedTokenizer`): The tokenizer to be modified.
        format (`Optional[Literal["chatml"]]`): The format to be set. Defaults to "chatml".
        resize_to_multiple_of (`int` or `None`): If not `None`, the embedding layer is resized to a multiple of this value. Defaults to `None`.

    Returns:
        model (`~transformers.PreTrainedModel`):
            The modified model.
        tokenizer (`~transformers.PreTrainedTokenizer`):
            The modified tokenizer.
    """
    # Raise if the tokenizer already has a chat template
    if tokenizer.chat_template is not None:
        raise ValueError(
            "Chat template is already added to the tokenizer. If you want to overwrite it, please set it to None"
        )

    if format not in FORMAT_MAPPING:
        raise ValueError(f"Format {format} not available. Please use one of {FORMAT_MAPPING.keys()}")

    chat_format = FORMAT_MAPPING[format]()

    # Set the special tokens and the chat template on the tokenizer
    tokenizer.eos_token = chat_format.eos_token
    tokenizer.pad_token = chat_format.pad_token
    tokenizer.bos_token = chat_format.bos_token
    tokenizer.add_special_tokens({"additional_special_tokens": [chat_format.bos_token, chat_format.eos_token]})
    tokenizer.chat_template = chat_format.chat_template

    # Resize the embedding layer, optionally padding the vocabulary size up to a multiple of `resize_to_multiple_of`
    model.resize_token_embeddings(
        new_num_tokens=len(tokenizer.vocab),
        pad_to_multiple_of=resize_to_multiple_of if resize_to_multiple_of is not None else None,
    )
    # Propagate the new special tokens to the model config and the generation config
    if getattr(model, "config", None) is not None:
        model.config.pad_token_id = tokenizer.pad_token_id
        model.config.bos_token_id = tokenizer.bos_token_id
        model.config.eos_token_id = tokenizer.eos_token_id
    if getattr(model, "generation_config", None) is not None:
        model.generation_config.bos_token_id = tokenizer.bos_token_id
        model.generation_config.eos_token_id = tokenizer.eos_token_id
        model.generation_config.pad_token_id = tokenizer.pad_token_id

    return model, tokenizer


def clone_chat_template(
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    source_tokenizer_path: str,
    resize_to_multiple_of: Optional[int] = 64,
) -> tuple[PreTrainedModel, PreTrainedTokenizer]:
    """
    Clones a chat template from a source tokenizer to the target tokenizer and updates the model accordingly.

    This function:
    - Copies the chat template from a source tokenizer to the target tokenizer.
    - Adds any new tokens from the source tokenizer to the target tokenizer.
    - Sets and synchronizes the EOS token across the tokenizer and model.
    - Resizes the model's token embeddings to match the new vocabulary size, optionally rounding it up to a multiple of
      a specified value.

    Args:
        model (`PreTrainedModel`):
            Model to update.
        tokenizer (`PreTrainedTokenizer`):
            Tokenizer to update.
        source_tokenizer_path (`str`):
            Path or identifier of the pretrained tokenizer to clone from.
        resize_to_multiple_of (`int` or `None`, *optional*, defaults to `64`):
            The embedding layer will be resized to the new vocabulary size. If this is not `None`, it will round up the
            new vocabulary size to the nearest multiple of this value.

    Returns:
        model (`PreTrainedModel`):
            Updated model with resized token embeddings and EOS token configured.
        tokenizer (`~transformers.PreTrainedTokenizer`):
            Updated tokenizer with the chat template and special tokens applied.

    Example:
    ```python
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from trl import clone_chat_template

    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
    model, tokenizer = clone_chat_template(model, tokenizer, "Qwen/Qwen3-0.6B")
    ```
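
    If `resize_to_multiple_of` is set, the new vocabulary size is rounded up before the embedding matrix is
    resized. For instance (illustrative numbers), a merged vocabulary of 128,260 tokens with
    `resize_to_multiple_of=64` yields an embedding matrix with 128,320 rows, the next multiple of 64.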
    """
    # Load the source tokenizer and copy its chat template
    tokenizer_source = AutoTokenizer.from_pretrained(source_tokenizer_path)
    tokenizer.chat_template = tokenizer_source.get_chat_template()

    # Add any tokens that exist in the source tokenizer but not in the target tokenizer
    tokenizer.add_tokens(list(tokenizer_source.added_tokens_decoder.values()))

    # Synchronize the EOS token across the tokenizer, the model config and the generation config
    tokenizer.eos_token = tokenizer_source.eos_token
    model.config.eos_token_id = tokenizer.eos_token_id
    model.generation_config.eos_token_id = tokenizer.eos_token_id

    # Resize the embedding layer, optionally rounding the new vocabulary size up to a multiple of
    # `resize_to_multiple_of`
    model.resize_token_embeddings(
        new_num_tokens=len(tokenizer.vocab),
        pad_to_multiple_of=resize_to_multiple_of if resize_to_multiple_of is not None else None,
    )

    return model, tokenizer


def remove_hooks(model: "DeepSpeedEngine") -> None:
    """Removes the optimizer hooks from a DeepSpeed ZeRO-3 model."""
    if not hasattr(model, "optimizer"):  # nothing to do if the engine has no optimizer attached
        return
    if model.optimizer is not None and hasattr(model.optimizer, "parameter_offload"):
        optimizer_offload = model.optimizer.parameter_offload
    elif model.optimizer is not None:
        optimizer_offload = model.optimizer
    else:
        raise RuntimeError("The model optimizer is None, which is not yet supported.")

    for param in iter_params(optimizer_offload.module, recurse=True):
        param.ds_active_sub_modules.clear()

    for hook in optimizer_offload.forward_hooks:
        hook.remove()
    for hook in optimizer_offload.backward_hooks:
        hook.remove()

    optimizer_offload.forward_hooks = []
    optimizer_offload.backward_hooks = []


def get_all_parameters(sub_module, recurse=False):
    return itertools.chain(sub_module.named_parameters(recurse=recurse), sub_module.ds_external_parameters())


def iter_params(module, recurse=False):
    return [param for _, param in get_all_parameters(module, recurse)]


def add_hooks(model: "DeepSpeedEngine") -> None:
    """Adds the optimizer hooks from a DeepSpeed ZeRO-3 model."""
    import deepspeed

    if not hasattr(model, "optimizer"):  # nothing to do if the engine has no optimizer attached
        return
    if model.optimizer is not None and hasattr(model.optimizer, "parameter_offload"):
        optimizer_offload = model.optimizer.parameter_offload
    elif model.optimizer is not None:
        optimizer_offload = model.optimizer
    else:
        raise RuntimeError("The model optimizer is None, which is not yet supported.")

    if version.parse(deepspeed.__version__) >= version.parse("0.16.4"):
        # Method was renamed in recent DeepSpeed releases
        optimizer_offload._register_deepspeed_module(optimizer_offload.module)
    else:
        optimizer_offload._register_hooks_recursively(optimizer_offload.module)


@contextmanager
def unwrap_model_for_generation(
    model: Union["DistributedDataParallel", "DeepSpeedEngine"],
    accelerator: "Accelerator",
    gather_deepspeed3_params: bool = True,
):
    """
    Context manager to unwrap distributed or accelerated models for generation tasks.

    Args:
        model (`Union[DistributedDataParallel, DeepSpeedEngine]`):
            Model to be unwrapped.
        accelerator (`~accelerate.Accelerator`):
            Accelerator instance managing the model.
        gather_deepspeed3_params (`bool`, *optional*, defaults to `True`):
            Whether to gather weights for DeepSpeed ZeRO Stage 3 models. If `False`, skips parameter gathering, which
            can be more memory-efficient but may lead to slower generation times.

    Yields:
        Unwrapped model.

    Example:
    ```python
    with unwrap_model_for_generation(model, accelerator) as unwrapped_model:
        generated_outputs = unwrapped_model.generate(input_ids)
    ```
    """
    unwrapped_model = accelerator.unwrap_model(model)
    if accelerator.state.deepspeed_plugin is not None and accelerator.state.deepspeed_plugin.zero_stage == 3:
        if not gather_deepspeed3_params:
            yield accelerator.unwrap_model(model)
        else:
            import deepspeed

            # Gather the sharded ZeRO-3 parameters so that generation can run on the full weights
            with deepspeed.zero.GatheredParameters(model.parameters()):
                remove_hooks(model)
                yield accelerator.unwrap_model(model)
                add_hooks(model)
    else:
        yield unwrapped_model


def prepare_deepspeed(model: "Module", accelerator: "Accelerator"):
    """Prepares the model for DeepSpeed inference or evaluation by initializing it with the appropriate configuration.

    Adapted from accelerate:
    https://github.com/huggingface/accelerate/blob/739b135f8367becb67ffaada12fe76e3aa60fefd/src/accelerate/accelerator.py#L1473
    """
    import deepspeed

    deepspeed_plugin = accelerator.state.deepspeed_plugin
    config_kwargs = deepcopy(deepspeed_plugin.deepspeed_config)
    stage = config_kwargs["zero_optimization"]["stage"]

    if model is not None:
        hidden_size = (
            max(model.config.hidden_sizes)
            if getattr(model.config, "hidden_sizes", None)
            else getattr(model.config, "hidden_size", None)
        )
        if hidden_size is not None and stage == 3:
            # Scale the ZeRO-3 buckets to the model width
            config_kwargs.update(
                {
                    "zero_optimization.reduce_bucket_size": hidden_size * hidden_size,
                    "zero_optimization.stage3_param_persistence_threshold": 10 * hidden_size,
                    "zero_optimization.stage3_prefetch_bucket_size": 0.9 * hidden_size * hidden_size,
                }
            )

    # Only ZeRO-3 shards this model; for lower stages, disable ZeRO for it entirely
    if stage != 3:
        config_kwargs["zero_optimization"]["stage"] = 0
    model, *_ = deepspeed.initialize(model=model, config=config_kwargs)
    model.eval()
    return model


def prepare_fsdp(model, accelerator):
    # Wrap the model with FSDP using the settings from the accelerator's FSDP plugin, unless it is
    # already wrapped (e.g. through manual wrapping)
    from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP

    if not isinstance(model, FSDP):
        accelerator.state.fsdp_plugin.set_auto_wrap_policy(model)
        fsdp_plugin = accelerator.state.fsdp_plugin
        kwargs = {
            "sharding_strategy": fsdp_plugin.sharding_strategy or fsdp_plugin.reshard_after_forward,
            "cpu_offload": fsdp_plugin.cpu_offload,
            "auto_wrap_policy": fsdp_plugin.auto_wrap_policy,
            "mixed_precision": fsdp_plugin.mixed_precision_policy,
            "sync_module_states": fsdp_plugin.sync_module_states,
            "backward_prefetch": fsdp_plugin.backward_prefetch,
            "forward_prefetch": fsdp_plugin.forward_prefetch,
            "use_orig_params": fsdp_plugin.use_orig_params,
            "param_init_fn": fsdp_plugin.param_init_fn,
            "ignored_modules": fsdp_plugin.ignored_modules,
            "limit_all_gathers": fsdp_plugin.limit_all_gathers,
            "device_id": accelerator.device,
        }
        model = FSDP(model, **kwargs)
    model.eval()
    return model


class _ForwardRedirection:
    """Implements the `forward-redirection`.

    Taken from Pytorch-lightning:
    https://github.com/Lightning-AI/pytorch-lightning/blob/02311d03fb982560246eead7c08104481fac9579/src/lightning/pytorch/strategies/strategy.py#L602

    A method call to a wrapped module gets rerouted through the wrapper's `forward` method instead.

    """

    def __call__(
        self, wrapper_module: nn.Module, original_module: nn.Module, method: callable, *args: Any, **kwargs: Any
    ):
        """Reroutes a method call through the `wrapper_module`'s `forward` method.

        Args:
            wrapper_module: The module that has `original_module` wrapped.
            original_module: The module that was wrapped inside `wrapper_module`.
            method: The method (typically bound to `original_module`) that should be called on the
                `original_module` after inputs get redirected through the `wrapper_module`'s `forward` method.
            *args: The positional arguments to `method`. They will get passed to a patched `forward` method
                instead.
            **kwargs: The keyword arguments to `method`. They will get passed to a patched `forward` method
                instead.
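
        Example (illustrative sketch; `wrapper`, `base`, and `hidden_states` are hypothetical placeholders
        for a distributed wrapper, the wrapped module, and its inputs):
        ```python
        redirection = _ForwardRedirection()
        # Calls `base.score(hidden_states)` while routing the call through `wrapper.forward`, so that
        # wrapper-side logic (e.g. DeepSpeed or FSDP hooks) still runs.
        output = redirection(wrapper, base, base.score, hidden_states)
        ```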

        """
        original_forward = original_module.forward

        def wrapped_forward(*_args: Any, **_kwargs: Any) -> Any:
            # Unpatch immediately before calling `method`, because `method` itself may call `forward`
            original_module.forward = original_forward
            out = method(*_args, **_kwargs)
            self.on_after_inner_forward(wrapper_module, original_module)
            return out

        # Patch the original module's `forward` so the wrapper redirects the arguments back to the real method
        original_module.forward = wrapped_forward

        wrapper_output = wrapper_module(*args, **kwargs)
        self.on_after_outer_forward(wrapper_module, original_module)
        return wrapper_output

    def on_after_inner_forward(self, wrapper_module: nn.Module, original_module: nn.Module) -> None:
        pass

    def on_after_outer_forward(self, wrapper_module: nn.Module, original_module: nn.Module) -> None:
        pass