
from __future__ import annotations

import copy
import functools
import inspect
import os
import re
import warnings
from collections.abc import Sequence
from contextlib import nullcontext
from typing import Any, Optional, Union

import accelerate
import torch
import transformers
from accelerate import FullyShardedDataParallelPlugin
from accelerate.hooks import add_hook_to_module, remove_hook_from_module
from accelerate.utils import is_npu_available, is_xpu_available
from huggingface_hub import file_exists
from huggingface_hub.errors import EntryNotFoundError, HFValidationError
from packaging import version
from safetensors.torch import storage_ptr, storage_size
from transformers import PreTrainedModel

from ..import_utils import is_auto_gptq_available, is_gptqmodel_available, is_torch_tpu_available
from .constants import (
    CONFIG_NAME,
    EMBEDDING_LAYER_NAMES,
    INCLUDE_LINEAR_LAYERS_SHORTHAND,
    SAFETENSORS_WEIGHTS_NAME,
    TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING,
    TRANSFORMERS_MODELS_TO_C3A_TARGET_MODULES_MAPPING,
    TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING,
    TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING,
    TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING,
    TRANSFORMERS_MODELS_TO_LNTUNING_TARGET_MODULES_MAPPING,
    TRANSFORMERS_MODELS_TO_LOHA_TARGET_MODULES_MAPPING,
    TRANSFORMERS_MODELS_TO_LOKR_TARGET_MODULES_MAPPING,
    TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING,
    TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING,
    TRANSFORMERS_MODELS_TO_RANDLORA_TARGET_MODULES_MAPPING,
    TRANSFORMERS_MODELS_TO_SHIRA_TARGET_MODULES_MAPPING,
    TRANSFORMERS_MODELS_TO_VBLORA_TARGET_MODULES_MAPPING,
    TRANSFORMERS_MODELS_TO_VERA_TARGET_MODULES_MAPPING,
    WEIGHTS_NAME,
    bloom_model_postprocess_past_key_value,
    starcoder_model_postprocess_past_key_value,
)

mlu_available = False
if version.parse(accelerate.__version__) >= version.parse("0.29.0"):
    from accelerate.utils import is_mlu_available

    mlu_available = is_mlu_available()


def infer_device() -> str:
    if torch.cuda.is_available():
        return "cuda"
    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        return "mps"
    elif mlu_available:
        return "mlu"
    elif is_xpu_available():
        return "xpu"
    elif is_npu_available():
        return "npu"
    return "cpu"


def prepare_model_for_kbit_training(model, use_gradient_checkpointing=True, gradient_checkpointing_kwargs=None):
    r"""
    Note this method only works for `transformers` models.

    This method wraps the entire protocol for preparing a model before running a training. This includes:
        1- Cast the layernorm in fp32 2- making output embedding layer require grads 3- Add the upcasting of the lm
        head to fp32 4- Freezing the base model layers to ensure they are not updated during training


    Args:
        model (`transformers.PreTrainedModel`):
            The loaded model from `transformers`
        use_gradient_checkpointing (`bool`, *optional*, defaults to `True`):
            If True, use gradient checkpointing to save memory at the expense of slower backward pass.
        gradient_checkpointing_kwargs (`dict`, *optional*, defaults to `None`):
            Keyword arguments to pass to the gradient checkpointing function, please refer to the documentation of
            `torch.utils.checkpoint.checkpoint` for more details about the arguments that you can pass to that method.
            Note this is only available in the latest transformers versions (> 4.34.1).
    """
    loaded_in_kbit = getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False)
    is_gptq_quantized = getattr(model, "quantization_method", None) == "gptq"
    is_aqlm_quantized = getattr(model, "quantization_method", None) == "aqlm"
    is_eetq_quantized = getattr(model, "quantization_method", None) == "eetq"
    is_torchao_quantized = getattr(model, "quantization_method", None) == "torchao"
    is_hqq_quantized = getattr(model, "quantization_method", None) == "hqq" or getattr(model, "hqq_quantized", False)

    if gradient_checkpointing_kwargs is None:
        gradient_checkpointing_kwargs = {}

    for name, param in model.named_parameters():
        # freeze base model's layers
        param.requires_grad = False

    if (
        not is_gptq_quantized
        and not is_aqlm_quantized
        and not is_eetq_quantized
        and not is_hqq_quantized
        and not is_torchao_quantized
    ):
        # cast all non INT8 parameters to fp32
        for param in model.parameters():
            if (
                (param.dtype == torch.float16) or (param.dtype == torch.bfloat16)
            ) and param.__class__.__name__ != "Params4bit":
                param.data = param.data.to(torch.float32)

    if (
        loaded_in_kbit
        or is_gptq_quantized
        or is_aqlm_quantized
        or is_eetq_quantized
        or is_hqq_quantized
        or is_torchao_quantized
    ) and use_gradient_checkpointing:
        # When having `use_reentrant=False` + gradient_checkpointing, there is no need for this hack
        if "use_reentrant" not in gradient_checkpointing_kwargs or gradient_checkpointing_kwargs["use_reentrant"]:
            # For backward compatibility
            if hasattr(model, "enable_input_require_grads"):
                model.enable_input_require_grads()
            else:

                def make_inputs_require_grad(module, input, output):
                    output.requires_grad_(True)

                model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

        # To support older transformers versions, check if the model supports gradient_checkpointing_kwargs
        _supports_gc_kwargs = "gradient_checkpointing_kwargs" in list(
            inspect.signature(model.gradient_checkpointing_enable).parameters
        )

        if not _supports_gc_kwargs and len(gradient_checkpointing_kwargs) > 0:
            warnings.warn(
                "gradient_checkpointing_kwargs is not supported in this version of transformers. The passed kwargs will be ignored."
                " if you want to use that feature, please upgrade to the latest version of transformers.",
                FutureWarning,
            )

        gc_enable_kwargs = (
            {} if not _supports_gc_kwargs else {"gradient_checkpointing_kwargs": gradient_checkpointing_kwargs}
        )

        # enable gradient checkpointing for memory efficiency
        model.gradient_checkpointing_enable(**gc_enable_kwargs)
    return model


def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.

    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): input ids
        pad_token_id (`int`): The id of the `padding` token.
        decoder_start_token_id (`int`): The id of the `start` token.
    """
    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
    shifted_input_ids[:, 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")
    # replace possible -100 values in labels by `pad_token_id`
    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

    return shifted_input_ids


class AuxiliaryTrainingWrapper(torch.nn.Module):
    """Wrap a specific module so that it can be trained and saved in a way that is tangential to how
    PEFT normally works, e.g. fully training a classification layer instead of using an adapter.

    """

    def __init__(self, module_to_save, adapter_name, **kwargs):
        """Extra kwargs will be passed to `self.init_modules` and `self.update`."""
        super().__init__()
        self.original_module = module_to_save
        self._active_adapter = adapter_name
        self._disable_adapters = False
        self._adapters = set()

        self.init_modules(adapter_name, **kwargs)

        self.update(adapter_name, **kwargs)
        self.check_module()

    def init_modules(self, adapter_name, **kwargs):
        """A place to initialize PyTorch modules in `__init__` before the call to `self.update()`."""
        raise NotImplementedError

    def _error_message_name(self):
        """Returns a user friendly identifier for error messages, e.g. for type compatibility error messages from
        `check_module()` so that the user can backtrack where the error comes from. A generic "training wrapper" is
        less helpful than "modules_to_save", for example.
        """
        return "training wrapper"

    def check_module(self):
        """Perform some sanity checks on the module to ensure that it works"""
        # container modules cannot be called directly, so they cannot be wrapped
        forbidden_classes = (torch.nn.ModuleDict, torch.nn.ModuleList, torch.nn.ParameterDict, torch.nn.ParameterList)
        if isinstance(self.original_module, forbidden_classes):
            cls_name = self.original_module.__class__
            raise TypeError(f"{self._error_message_name()} cannot be applied to modules of type {cls_name}")

        # local import to avoid circular import
        from peft.tuners.tuners_utils import BaseTunerLayer

        if isinstance(self.original_module, BaseTunerLayer):
            # e.g. applying a training wrapper to a tuner layer makes no sense
            cls_name = self.original_module.__class__
            raise TypeError(f"{self._error_message_name()} cannot be applied to modules of type {cls_name}")

    @property
    def disable_adapters(self) -> bool:
        # use a property to ensure that disable_adapters is not set directly, instead use the enable_adapters method
        return self._disable_adapters

    @property
    def active_adapter(self) -> Union[list[str], str]:
        # use a property to ensure that active_adapter is not set directly, instead use the set_adapter method
        return self._active_adapter

    @property
    def active_adapters(self) -> list[str]:
        if isinstance(self._active_adapter, str):
            return [self._active_adapter]
        return self._active_adapter

    def _hasattr_wrapped(self, name, modules):
        """Infrastructure to enable the implementing class to delegate attributes to other modules.
        Returns True if the implementing class knows how to handle attribute `name`.

        Gets passed `modules` which is PyTorch's internal list of assigned modules from `nn.Module`.
        """
        return False

    def _getattr_wrapped(self, name, modules):
        """If `_hasattr_wrapped` returns True for `name`, then this function should return the corresponding
        value associated with `name`.
        """
        return None

    def __getattr__(self, name: str):
        # Note: PyTorch's nn.Module.__getattr__ only triggers when the normal attribute lookup fails,
        # so we forward to it first and only then try the wrapped module(s).
        try:
            return super().__getattr__(name)
        except AttributeError:
            pass

        if "_modules" not in self.__dict__:
            raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")

        modules = self.__dict__["_modules"]
        if self.disable_adapters:
            return getattr(self.original_module, name)
        elif self._hasattr_wrapped(name, modules):
            return self._getattr_wrapped(name, modules)

        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")

    def update(self, adapter_name, **kwargs):
        """Called when this instance should be part of an adapter's training.
        Adds the given adapter to the list of adapters that this instance is training along with.

        Additional kwargs are expected to be the same kwargs that are also passed for initializing this class.
        """
        if adapter_name not in self._adapters:
            self._adapters.add(adapter_name)

    def _create_new_hook(self, old_hook):
        r"""
        Creates a new hook based on the old hook. Use it only if you know what you are doing !
        """
        old_hook_cls = getattr(accelerate.hooks, old_hook.__class__.__name__)
        old_hook_attr = old_hook.__dict__
        filtered_old_hook_attr = {}
        old_hook_init_signature = inspect.signature(old_hook_cls.__init__)
        for k in old_hook_attr.keys():
            if k in old_hook_init_signature.parameters:
                filtered_old_hook_attr[k] = old_hook_attr[k]
        new_hook = old_hook_cls(**filtered_old_hook_attr)
        return new_hook

    def _check_forward_args(self, x, *args, **kwargs):
        """Check if the arguments are compatible with the configs and state of the model"""
        adapter_names = kwargs.get("adapter_names", None)
        if adapter_names is None:
            return

        if len(x) != len(adapter_names):
            msg = (
                "Length of `adapter_names` should be the same as the number of inputs, but got "
                f"{len(adapter_names)} and {len(x)} respectively."
            )
            raise ValueError(msg)

    def _forward_wrapped(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
        raise NotImplementedError

    def _forward_wrapped_mixed_batch(
        self, x: torch.Tensor, adapter_name: str, *args: Any, **kwargs: Any
    ) -> torch.Tensor:
        raise NotImplementedError

    def _forward_wrapped_passthrough(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
        """The forward call when no adapter is involved in the forward computation, only the base model"""
        raise NotImplementedError

    def _mixed_batch_forward(
        self, input: torch.Tensor, *args: Any, adapter_names: list[str], **kwargs: Any
    ) -> torch.Tensor:
        # This is a special method that handles the case when users pass the argument `adapter_names`. This is an
        # extra argument that allows mixing different adapters in the same batch at inference time.

        SUPPORTED_MODULES = (torch.nn.Linear, torch.nn.Embedding, torch.nn.Conv1d, torch.nn.Conv2d, torch.nn.Conv3d)

        module_names = ", ".join([module.__name__ for module in SUPPORTED_MODULES])

        if not isinstance(self.original_module, SUPPORTED_MODULES):
            raise TypeError(f"Mixed batching is only supported for the following modules: {module_names}.")

        unique_adapters = set(adapter_names)
        sub_batch_indices_list = []

        for adapter in unique_adapters:
            sub_batch_indices_list.append([index for index, item in enumerate(adapter_names) if item == adapter])

        results = [0 for _ in range(len(input))]

        for i, active_adapter in enumerate(unique_adapters):
            sub_batch = input[sub_batch_indices_list[i]]

            if active_adapter == "__base__":
                output = self._forward_wrapped_passthrough(sub_batch, *args, **kwargs)
            else:
                output = self._forward_wrapped_mixed_batch(sub_batch, active_adapter, *args, **kwargs)

            for index, j in enumerate(sub_batch_indices_list[i]):
                results[j] = output[index]

        return torch.stack(results)

    def forward(self, x: torch.Tensor, *args, **kwargs):
        self._check_forward_args(x, *args, **kwargs)
        adapter_names = kwargs.pop("adapter_names", None)

        if self.disable_adapters or not any(adapter in self._adapters for adapter in self.active_adapters):
            return self._forward_wrapped_passthrough(x, *args, **kwargs)

        if adapter_names is None:
            return self._forward_wrapped(x, *args, **kwargs)
        return self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs)

    def enable_adapters(self, enabled: bool):
        """Toggle the enabling and disabling of adapters

        Args:
            enabled (bool): True to enable adapters, False to disable adapters
        """
        if enabled:
            self._disable_adapters = False
        else:
            self._disable_adapters = True

    def set_adapter(self, adapter_name: Union[str, list[str]]):
        """Set the active adapter

        Args:
            adapter_name (str): The name of the adapter to set as active
        """
        if isinstance(adapter_name, str):
            self._active_adapter = adapter_name
            return

        self._active_adapter = []
        for name in adapter_name:
            if name not in self._adapters:
                raise ValueError(f"Adapter {name} not found in {self._adapters}")
            self._active_adapter.append(name)

    def delete_adapter(self, adapter_name: str, new_active_adapters: Optional[list[str]]) -> None:
        """Delete an adapter from the layer, set a new active adapter if necessary"""
        raise NotImplementedError

    def adapter_state_dict(self, adapter_name, state_dict):
        """Return the state dict of this module for a given adapter."""
        raise NotImplementedError

    def adapter_state_dict_load_map(self, adapter_name):
        """Return a mapping from the key present in disk-loaded state dict
        and how it should be represented in the loaded model's state dict.

        The default should be a 1:1 mapping but it is important to define a mapping as it also serves as the
        ground-truth for which keys are supposed to be loaded from a saved state dict.
        """
        raise NotImplementedError

    def unload_and_optionally_merge_module(
        self, merge: bool, safe_merge: bool, adapter_names: Optional[list[str]]
    ) -> torch.nn.Module:
        """Handles unloading when called from PEFT models. Returns the wrapped module
        and handles merging onto the wrapped module if requested.
        """
        raise NotImplementedError


class ModulesToSaveWrapper(AuxiliaryTrainingWrapper):
    """Wraps a module that is supposed to be trained (i.e. `requires_grad_(True)`) and saved after training."""

    def __init__(self, module_to_save, adapter_name):
        super().__init__(module_to_save, adapter_name)

    def init_modules(self, adapter_name):
        # we treat each adapter separately, so we have multiple adapters, same (copied) module for each
        self.modules_to_save = torch.nn.ModuleDict({})

    def _error_message_name(self):
        return "modules_to_save"

    def _forward_wrapped(self, x, *args, **kwargs):
        if not self.active_adapters:
            return self._forward_wrapped_passthrough(x, *args, **kwargs)
        return self.modules_to_save[self.active_adapters[0]](x, *args, **kwargs)

    def _forward_wrapped_mixed_batch(self, x, adapter_name, *args, **kwargs):
        return self.modules_to_save[adapter_name](x, *args, **kwargs)

    def _forward_wrapped_passthrough(self, x, *args, **kwargs):
        return self.original_module(x, *args, **kwargs)

    def _hasattr_wrapped(self, name, modules):
        return self.active_adapters[0] in modules["modules_to_save"]

    def _getattr_wrapped(self, name, modules):
        return getattr(modules["modules_to_save"][self.active_adapters[0]], name)

    def update(self, adapter_name, **kwargs):
        super().update(adapter_name)

        context_manager = nullcontext()
        for _, param in self.original_module.named_parameters():
            num_params = param.numel()
            # if using DeepSpeed Zero 3 the weights are sharded and numel() is 0
            if num_params == 0 and hasattr(param, "ds_numel"):
                import deepspeed

                context_manager = deepspeed.zero.GatheredParameters(
                    self.original_module.parameters(), modifier_rank=0
                )
                break

        if adapter_name not in self.modules_to_save:
            with context_manager:
                self.modules_to_save[adapter_name] = copy.deepcopy(self.original_module)

        if hasattr(self.modules_to_save[adapter_name], "_hf_hook"):
            old_hook = self.modules_to_save[adapter_name]._hf_hook
            new_hook = self._create_new_hook(old_hook)
            remove_hook_from_module(self.modules_to_save[adapter_name])
            add_hook_to_module(self.modules_to_save[adapter_name], new_hook)

        self.original_module.requires_grad_(False)
        if adapter_name in self.active_adapters:
            self.modules_to_save[adapter_name].requires_grad_(True)

    def enable_adapters(self, enabled: bool):
        """Takes care of setting the required_grad flag on the wrapped module.
        If adapters are enabled, gradients for the module are required as well.
        """
        super().enable_adapters(enabled)
        if enabled:
            self.original_module.requires_grad_(False)
            self.modules_to_save[self.active_adapters[0]].requires_grad_(True)
        else:
            self.original_module.requires_grad_(True)
            self.modules_to_save.requires_grad_(False)

    def set_adapter(self, adapter_names: Union[str, list[str]]):
        """Set the active adapter

        Additionally, this function will set the specified adapter to trainable (i.e., requires_grad=True). If this is
        not desired, use the following code.

        ```py
        >>> for name, param in model_peft.named_parameters():
        ...     if ...:  # some check on name (ex. if 'lora' in name)
        ...         param.requires_grad = False
        ```

        Args:
            adapter_names (list[str], str): The name of the adapter to set as active
        """
        if isinstance(adapter_names, str):
            adapter_names = [adapter_names]
        if len(adapter_names) > 1:
            raise ValueError(f"Attempted to set multiple ({len(adapter_names)}) adapters at once for modules_to_save.")
        adapter_name = adapter_names[0]

        if adapter_name not in self._adapters:
            raise ValueError(f"Adapter {adapter_name} not found in {self._adapters}")

        self.modules_to_save[self.active_adapters[0]].requires_grad_(False)
        self.modules_to_save[adapter_name].requires_grad_(True)
        self._active_adapter = adapter_name

    def delete_adapter(self, adapter_name: str, new_active_adapters: Optional[list[str]]) -> None:
        """
        Delete the adapter if present.

        This method will also set a new active adapter if the deleted adapter was the active adapter. It is important
        that the new adapter is chosen by the caller in a deterministic way, so that the same adapter is chosen on all
        layers.
        """
        if adapter_name not in self.modules_to_save:
            return

        if isinstance(new_active_adapters, (list, tuple)) and len(new_active_adapters) > 1:
            name = self.__class__.__name__
            raise ValueError(
                f"Attempted to set multiple ({len(new_active_adapters)}) adapters at once for {name}, "
                "which is not allowed."
            )

        if adapter_name in self._adapters:
            self._adapters.remove(adapter_name)

        if not new_active_adapters:
            del self.modules_to_save[adapter_name]
            self._active_adapter = []
            return

        new_active_adapter = new_active_adapters[0]
        if new_active_adapter not in self.modules_to_save:
            del self.modules_to_save[adapter_name]
            self._active_adapter = []
            return

        if new_active_adapter != self.active_adapters[0]:
            self.set_adapter(new_active_adapter)
        del self.modules_to_save[adapter_name]

    def adapter_state_dict_load_map(self, adapter_name):
        # this wrapper may not serve every adapter; only return keys for adapters it manages
        if adapter_name not in self._adapters:
            return {}
        return {k: f"modules_to_save.{adapter_name}.{k}" for k in self.modules_to_save[adapter_name].state_dict()}

    def adapter_state_dict(self, adapter_name, state_dict):
        if adapter_name not in self._adapters:
            return {}
        return {
            k: state_dict[f"modules_to_save.{adapter_name}.{k}"]
            for k in self.modules_to_save[adapter_name].state_dict()
        }

    def unload_and_optionally_merge_module(
        self, merge: bool, safe_merge: bool, adapter_names: Optional[list[str]]
    ) -> torch.nn.Module:
        """Unloading in case of `ModulesToSave` means to simply return the wrapped module.

        However, if the wrapped module is itself a tuner, we'll call merge on it before.
        """
        new_module = self.modules_to_save[self.active_adapters[0]]
        if hasattr(new_module, "base_layer"):
            # check if the module is itself a tuner layer
            if merge:
                new_module.merge(safe_merge=safe_merge, adapter_names=adapter_names)
            new_module = new_module.get_base_layer()
        return new_module


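# Illustrative sketch (not part of the original module): `ModulesToSaveWrapper` is applied automatically
# when a PEFT config lists extra layers under `modules_to_save`. The model checkpoint and layer name
# below are assumptions for the example.
def _example_modules_to_save():  # pragma: no cover - documentation sketch only
    from transformers import AutoModelForSequenceClassification

    from peft import LoraConfig, get_peft_model

    base_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    # the classification head is fully trained and saved alongside the LoRA weights
    config = LoraConfig(task_type="SEQ_CLS", modules_to_save=["classifier"])
    peft_model = get_peft_model(base_model, config)
    wrapped = peft_model.base_model.model.classifier
    assert isinstance(wrapped, ModulesToSaveWrapper)

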
class TrainableTokensWrapper(AuxiliaryTrainingWrapper):
    """Wraps a module (typically an embedding layer) that is supposed to be re-trained selectively (i.e.
    solely updating a few columns) using the `TrainableTokensLayer` PEFT method.

    Supports weight-tying to another adapter when passed a `tied_adapter` which is expected to be a
    `TrainableTokensLayer`.
    """

    def __init__(
        self,
        module_to_save: torch.nn.Module,
        adapter_name: str,
        token_indices: list[int],
        tied_adapter=None,
    ) -> None:
        super().__init__(module_to_save, adapter_name, token_indices=token_indices, tied_adapter=tied_adapter)

    @property
    def original_module(self):
        # the wrapped module lives inside the token adapter, so always fetch it from there
        return self.token_adapter.base_layer

    def init_modules(self, adapter_name, token_indices, tied_adapter):
        # local import to avoid circular imports
        from peft.tuners.trainable_tokens import TrainableTokensLayer

        self.token_adapter = TrainableTokensLayer(self.original_module, adapter_name, token_indices, tied_adapter)

    def _error_message_name(self):
        return "trainable_token_indices"

    def _hasattr_wrapped(self, name, modules):
        return name == "weight"

    def _getattr_wrapped(self, name, modules):
        # some models query the embedding weights directly; return the (possibly merged) adapter weights
        if name == "weight":
            return modules["token_adapter"].get_merged_weights(self.token_adapter.active_adapters)

        raise RuntimeError(
            f"This code should've never been reached, probably a bad check in `_hasattr_wrapped` for {name}. "
            "Please file an issue under https://github.com/huggingface/peft/issues."
        )

    def _forward_wrapped(self, x, *args, **kwargs):
        if self.disable_adapters:
            return self._forward_wrapped_passthrough(x, *args, **kwargs)
        return self.token_adapter(x)

    def _forward_wrapped_mixed_batch(self, x, adapter_name, *args, **kwargs):
        return self.token_adapter.forward_adapters(x, [adapter_name])

    def _forward_wrapped_passthrough(self, x, *args, **kwargs):
        # the token adapter knows how to deal with disabled adapters, so we can simply pass through
        return self.token_adapter(x, *args, **kwargs)

    def update(self, adapter_name, **kwargs):
        if adapter_name not in self._adapters:
            self.token_adapter.update_layer(adapter_name, **kwargs)

        super().update(adapter_name)

    def adapter_state_dict_load_map(self, adapter_name):
        if self.token_adapter.tied_adapter:
            return {}
        return {"token_adapter.trainable_tokens_delta": f"token_adapter.trainable_tokens_delta.{adapter_name}"}

    def adapter_state_dict(self, adapter_name, state_dict):
        if self.token_adapter.tied_adapter:
            # weight-tied layers are stored by transformers; we only keep them in sync during training,
            # therefore return an empty state dict
            return {}
        return {
            f"token_adapter.{k}": state_dict[f"token_adapter.{k}.{adapter_name}"]
            for k in ["trainable_tokens_delta"]
        }

    def enable_adapters(self, enabled: bool):
        """Enables/disables the underlying `TrainableTokens` adapter.
        Also handles the internal adapter disable flag.
        """
        super().enable_adapters(enabled)
        self.token_adapter.enable_adapters(enabled)

    def set_adapter(self, adapter_names: Union[str, list[str]]):
        super().set_adapter(adapter_names)
        self.token_adapter.set_adapter(adapter_names)

    def delete_adapter(self, adapter_name: str, new_active_adapters: Optional[list[str]]) -> None:
        """
        Delete the adapter if present.

        This method will also set a new active adapter if the deleted adapter was the active adapter. It is important
        that the new adapter is chosen by the caller in a deterministic way, so that the same adapter is chosen on all
        layers.
        """
        self.token_adapter.delete_adapter(adapter_name)

        if isinstance(new_active_adapters, (list, tuple)) and len(new_active_adapters) > 1:
            name = self.__class__.__name__
            raise ValueError(
                f"Attempted to set multiple ({len(new_active_adapters)}) adapters at once for {name}, "
                "which is not allowed."
            )

        if adapter_name in self._adapters:
            self._adapters.remove(adapter_name)

        if not new_active_adapters:
            self._active_adapter = []
            return

        if new_active_adapters[0] not in self.token_adapter.trainable_tokens_delta:
            self._active_adapter = []
            return

        new_active_adapter = new_active_adapters[0]
        self.set_adapter(new_active_adapter)

    def unload_and_optionally_merge_module(
        self, merge: bool, safe_merge: bool, adapter_names: Optional[list[str]]
    ) -> torch.nn.Module:
        """Unloading for `TrainableTokensWrapper` means to return the wrapped module, e.g. the embedding layer and,
        if requested, merging the `TrainableTokens` adapter onto the wrapped module.
        """
        if merge:
            self.token_adapter.merge(safe_merge=safe_merge, adapter_names=adapter_names)
        return self.token_adapter.get_base_layer()


def _get_input_embeddings_name(model, default=None):
    if not hasattr(model, "get_input_embeddings"):
        return default

    input_embeddings = model.get_input_embeddings()
    for name, module in model.named_modules():
        if module is input_embeddings:
            return name

    return default


def _get_submodules(model, key):
    parent = model.get_submodule(".".join(key.split(".")[:-1]))
    target_name = key.split(".")[-1]
    target = model.get_submodule(key)
    return parent, target, target_name


def _freeze_adapter(model, adapter_name):
    for n, p in model.named_parameters():
        if adapter_name in n:
            p.requires_grad = False


def _set_trainable(
    model,
    adapter_name,
    module_names,
    strict_module_check=False,
    wrapper_cls: Optional[AuxiliaryTrainingWrapper] = None,
    **wrapper_kwargs,
):
    """Wraps modules that are supposed to be re-trained either normally, i.e. marking them to require gradients and
    saving them alongside other modules, or with certain methods that go alongside PEFT methods, such as retraining
    specific token indices using selective read/write.

    Note that you need to validate beforehand if there are layers targeted by multiple wrappers, e.g. if the
    'embedding' layer is configured for both `ModulesToSaveWrapper` and `TrainableTokensWrapper` there would be
    conflicts down the line.

    The default is to wrap the module in a `ModulesToSaveWrapper` wrapper.

    If `strict_module_check` is set, this method raises an ValueError, similar to BaseTuner.inject_adapter when none of
    the requested modules in `module_names` is not found in the model.
    """
    if wrapper_cls is None:
        wrapper_cls = ModulesToSaveWrapper

    if not module_names:
        return []

    trainable_modules = []
    found_modules = set()
    # disable removal of duplicates to support targeting tied weights
    key_list = [key for key, _ in model.named_modules(remove_duplicates=False)]

    for key in key_list:
        target_module_found = any(key.endswith(target_key) for target_key in module_names)
        if target_module_found:
            parent, target, target_name = _get_submodules(model, key)
            if isinstance(target, wrapper_cls):
                target.update(adapter_name, **wrapper_kwargs)
                target.set_adapter(target.active_adapter)
            else:
                new_module = wrapper_cls(target, adapter_name, **wrapper_kwargs)
                new_module.set_adapter(adapter_name)
                setattr(parent, target_name, new_module)
                trainable_modules.append(new_module)
            found_modules.add(key)

    not_found = set(module_names).difference(found_modules)
    if strict_module_check and not_found:
        raise ValueError(
            f"Target modules {not_found} not found in the base model. Please check the target modules and try again."
        )

    return trainable_modules


def _set_adapter(model, adapter_name):
    def check_adapter_name(adapter_name):
        if isinstance(adapter_name, str):
            return adapter_name

        # adapter_name is a list of str
        if len(adapter_name) > 1:
            raise ValueError("Only one adapter can be set at a time for modules_to_save")
        elif len(adapter_name) == 0:
            raise ValueError("Please specify at least one adapter to set")
        adapter_name = adapter_name[0]
        return adapter_name

    for module in model.modules():
        if isinstance(module, AuxiliaryTrainingWrapper):
            # only check the adapter_name if we actually encounter an AuxiliaryTrainingWrapper, otherwise don't care
            adapter_name = check_adapter_name(adapter_name)

            # if the adapter is found in this module, set it as the active adapter, else disable the adapters of this
            # module
            if adapter_name in module._adapters:
                module.enable_adapters(True)
                module.set_adapter(adapter_name)
            else:
                module.enable_adapters(False)


def _prepare_prompt_learning_config(peft_config, model_config):
    # In case of VLMs, focus on the language model portion of the config.
    if "text_config" in model_config:
        model_config = model_config["text_config"]

    if peft_config.num_layers is None:
        if "num_hidden_layers" in model_config:
            num_layers = model_config["num_hidden_layers"]
        elif "num_layers" in model_config:
            num_layers = model_config["num_layers"]
        elif "n_layer" in model_config:
            num_layers = model_config["n_layer"]
        else:
            raise ValueError("Please specify `num_layers` in `peft_config`")
        peft_config.num_layers = num_layers

    if peft_config.token_dim is None:
        if "hidden_size" in model_config:
            token_dim = model_config["hidden_size"]
        elif "n_embd" in model_config:
            token_dim = model_config["n_embd"]
        elif "d_model" in model_config:
            token_dim = model_config["d_model"]
        else:
            raise ValueError("Please specify `token_dim` in `peft_config`")
        peft_config.token_dim = token_dim

    if peft_config.num_attention_heads is None:
        if "num_attention_heads" in model_config:
            num_attention_heads = model_config["num_attention_heads"]
        elif "n_head" in model_config:
            num_attention_heads = model_config["n_head"]
        elif "num_heads" in model_config:
            num_attention_heads = model_config["num_heads"]
        elif "encoder_attention_heads" in model_config:
            num_attention_heads = model_config["encoder_attention_heads"]
        else:
            raise ValueError("Please specify `num_attention_heads` in `peft_config`")
        peft_config.num_attention_heads = num_attention_heads

    # For grouped-query attention, adjust the token dimension for prefix tuning.
    if peft_config.peft_type == "PREFIX_TUNING" and "num_key_value_heads" in model_config:
        num_key_value_heads = model_config["num_key_value_heads"]
        peft_config.token_dim = peft_config.token_dim // peft_config.num_attention_heads * num_key_value_heads
        peft_config.num_attention_heads = num_key_value_heads

    if getattr(peft_config, "encoder_hidden_size", None) is None:
        setattr(peft_config, "encoder_hidden_size", peft_config.token_dim)

    return peft_config


def _get_no_split_modules(model) -> set[str]:
    """
    Get the modules of the model that should not be split when using device_map. We iterate through the modules to get
    the underlying `_no_split_modules`.

    Returns:
        `List[str]`: List of modules that should not be split
    """
    _no_split_modules: set[str] = set()
    if not hasattr(model, "_no_split_modules"):
        return _no_split_modules

    modules_to_check = [model]
    while len(modules_to_check) > 0:
        module = modules_to_check.pop(-1)
        # if the module does not appear in _no_split_modules, we also check the children
        if module.__class__.__name__ not in _no_split_modules:
            if isinstance(module, PreTrainedModel):
                if module._no_split_modules is not None:
                    _no_split_modules = _no_split_modules | set(module._no_split_modules)
            modules_to_check += list(module.children())

    return _no_split_modules


def fsdp_auto_wrap_policy(model):
    if hasattr(FullyShardedDataParallelPlugin, "get_module_class_from_name"):
        get_module_class_from_name = FullyShardedDataParallelPlugin.get_module_class_from_name
    else:
        from accelerate.utils.dataclasses import get_module_class_from_name
    from torch.distributed.fsdp.wrap import _or_policy, lambda_auto_wrap_policy, transformer_auto_wrap_policy

    from ..tuners import PrefixEncoder, PromptEmbedding, PromptEncoder

    default_transformer_cls_names_to_wrap = ",".join(_get_no_split_modules(model))
    transformer_cls_names_to_wrap = os.environ.get(
        "FSDP_TRANSFORMER_CLS_TO_WRAP", default_transformer_cls_names_to_wrap
    ).split(",")
    transformer_cls_to_wrap = {PrefixEncoder, PromptEncoder, PromptEmbedding}
    for layer_class in transformer_cls_names_to_wrap:
        if len(layer_class) == 0:
            continue
        transformer_cls = get_module_class_from_name(model, layer_class)
        if transformer_cls is None:
            raise Exception("Could not find the transformer layer class to wrap in the model.")
        transformer_cls_to_wrap.add(transformer_cls)

    def lambda_policy_fn(module):
        if (
            len(list(module.named_children())) == 0
            and getattr(module, "weight", None) is not None
            and module.weight.requires_grad
        ):
            return True
        return False

    lambda_policy = functools.partial(lambda_auto_wrap_policy, lambda_fn=lambda_policy_fn)
    transformer_wrap_policy = functools.partial(
        transformer_auto_wrap_policy,
        transformer_layer_cls=transformer_cls_to_wrap,
    )

    auto_wrap_policy = functools.partial(_or_policy, policies=[lambda_policy, transformer_wrap_policy])
    return auto_wrap_policy


def transpose(weight, fan_in_fan_out):
    if not fan_in_fan_out:
        return weight

    if isinstance(weight, torch.nn.Parameter):
        return torch.nn.Parameter(weight.T)
    return weight.T


def _is_valid_match(key: str, target_key: str):
    """
    Helper function to match module names target_key and key. Makes sure that either the key is exactly the target_key
    or the target_key is a submodule of key
    """
    if key.endswith(target_key):
        if len(key) > len(target_key):
            return key.endswith("." + target_key)  # must be a sub module
        return True
    return False


def _get_batch_size(input_ids: Optional[torch.Tensor], inputs_embeds: Optional[torch.Tensor]) -> int:
    """Get the batch size based on either input_ids or input_embeds

    Raises an ValueError if both are None.
    """
    if (input_ids is None) and (inputs_embeds is None):
        raise ValueError("You have to provide either input_ids or inputs_embeds")

    if input_ids is not None:
        batch_size = input_ids.shape[0]
    else:
        batch_size = inputs_embeds.shape[0]
    return batch_size


    configquantization_configrC   N)r9   r  rT   r  )rj   methods     r>   get_quantization_configr  &  sB    
 	x ELL"78U148FB||///r=   c           	     <   | yt               rddlm} ny| j                  }| j                  }| j
                  }t        | d      r| j                  }n| j                   }t        | d      r| j                  d   }nd} |d||||xr |dk(   |xr |d	k(   
      }|S )zW
    Get the right AutoGPTQQuantLinear class based on the quantization config file
    Nr   )dynamically_import_QuantLinearuse_exllamaexllama_configr   r   Fr   )
use_tritondesc_act
group_sizebitsdisable_exllamadisable_exllamav2)
r   auto_gptq.utils.import_utilsr  r  r  r  r9   r  r  r  )gptq_quantization_configr  r  r  r  r  exllama_versionQuantLinears           r>   get_auto_gptq_quant_linearr  3  s      'O'00H)44J#((D'7.::2BBB')9:2AA)L0(A_-AB*C!/CDK r=   c           
        | yt               syddlm} | j                  }| j                  }| j
                  }t        | d      r| j                  nd}| j                  }t        | d      r| j                  nd} ||||||||d      }	|	S )	zS
    Get the right GPTQQuantLinear class based on the quantization config file
    Nr   )hf_select_quant_linearcheckpoint_formatrD   metaauto_trainable)r  r  r  sym
device_mapr  r  backend)
r   gptqmodel.utils.importerr  r  r  r  r9   r  r  r  )
r  r  r  r  r  r  r  r  r  r  s
             r>   get_gptqmodel_quant_linearr  W  s      '!#?'00H)44J#((D +-@A 	!22 
 #
&
&C,34Lf,U#(([_D(+ 	K r=   c                    | j                   j                  dk(  r*t               r ddl}|j                  j                  |       }nt        |       }| j                   |t        |       fS )a  
    Unique identifier to a tensor storage. Multiple different tensors can share the same underlying storage. For
    example, "meta" tensors all share the same storage, and thus their identifier will all be equal. This identifier is
    guaranteed to be unique and constant for this tensor's storage during its lifetime. Two tensor storages with
    non-overlapping lifetimes may have the same id.

    This method is the exact same copy of
    https://github.com/huggingface/transformers/blob/main/src/transformers/pytorch_utils.py#L282C1-L300C58 but we added
    it here manually to avoid import issue with old versions of transformers.
    xlar   N)devicer   r   	torch_xla_XLAC_xla_get_tensor_idr   r   )tensorr  	unique_ids      r>   id_tensor_storager  |  sU     }}U"'='?
 	OO66v>	'	==)\&%999r=   c                    | j                         D ]I  }|j                  s|j                  |      |_        &|j                  t        j
                        |_        K y)a?  
def cast_mixed_precision_params(model, dtype):
    """
    Cast all non-trainable parameters of the model to the given `dtype`. The `dtype` can be `torch.float16` or
    `torch.bfloat16` as per the mixed-precision training you are performing. The trainable parameters are cast to full
    precision. This is meant to reduce the GPU memory usage when using PEFT methods by using half-precision dtype for
    non-trainable parameters. Having the trainable parameters in full-precision preserves training stability when using
    automatic mixed-precision training.

    Args:
        model (`torch.nn.Module`):
            The model to cast the non-trainable parameters of.
        dtype (`torch.dtype`):
            The dtype to cast the non-trainable parameters to. The `dtype` can be `torch.float16` or
            `torch.bfloat16` as per the mixed-precision training you are performing.
    """
    for p in model.parameters():
        if not p.requires_grad:
            p.data = p.to(dtype)
        else:
            p.data = p.to(torch.float32)



    True values are `y`, `yes`, `t`, `true`, `on`, and `1`; False value are `n`, `no`, `f`, `false`, `off`, and `0`;
    )yyesttrueon1r   )r}  noffalseoff0r   zinvalid truth value )lowerr|   )values    r>   str_to_boolr    s:     KKME44	7	7/w788r=   c           
     
   d}t        t        j                  j                  dd            r|S 	 t	        | |fi |}|S # t
        t        f$ r Y |S t        $ r*}t        j                  d| d| d|  d       Y d}~|S d}~ww xY w)zCheck if a file exists on HF Hub, if check was not successful returns None instead of erroring.

def check_file_exists_on_hf_hub(repo_id: str, filename: str, **kwargs) -> Optional[bool]:
    """Check if a file exists on HF Hub, if check was not successful returns None instead of erroring.

    Respect offline mode if set.

    """
    exists: Optional[bool] = None
    if str_to_bool(os.environ.get("HF_HUB_OFFLINE", "0")):
        # user set offline mode, cannot check
        return exists

    try:
        exists = file_exists(repo_id, filename, **kwargs)
    except (HFValidationError, EntryNotFoundError):
        # not a valid repo or the file does not exist; `exists` stays None
        pass
    except Exception as e:
        warnings.warn(
            f"Unable to fetch remote file due to the following error {e} - silently ignoring the lookup"
            f" for the file {filename} in {repo_id}."
        )

    return exists


def match_target_against_key(target_pattern: str, key: str):
    """Backing function for `target_modules` config parameter.

    Having this as its own function ensures that target key matching can be implemented in the same way everywhere.
    """
    return re.fullmatch(target_pattern, key)


def get_pattern_key(pattern_keys: Sequence[str], key_to_match: str) -> str:
    """Match a substring of key_to_match in pattern keys"""
    for key in pattern_keys:
        match = re.match(rf"(.*\.)?({key})$", key_to_match)
        if match:
            return key
    return key_to_match



    Currently trainable tokens and modules to save are considered additional trainable modules.
    r&  N)r   r[  embed_tokenszbThe embedding layer is already marked to be trained fully, either specify `modules_to_save=[..., "z ", ...]` or `trainable_tokens={'z': x}` but not both.T)r   r  r  rS  tie_word_embeddingsFr   rx   )r   r  r  rS  rT  )rT   r  r   r[  dictrs  r|   itemsrQ  r   _tied_weights_keysr`   r   rv  rV  rS  )rj   r  r  r   target_layers
layer_namer&  target_layerrS  r}  module_keysrV  s               r>    set_additional_trainable_modulesr,    s    {-t4@ulN_ae9fg{5t<Hk994@'??M3E>JJ')L)LMM!+/@$G& - ?2$33?. A00<~=RT  ,9+>+>+@ 	'L-*^$(2+	 2E:((455579OP AF@X@XY1388AGGCL"$56YKY!668FFM($(2+99,G"779GG Q 5 ;? IJ Zs   *F c               \   t        j                  t        j                        t        j                  d      k\  }|rddlm}	 nt        d      t        | | j                  |       }
t        |
d      r|
j                         nd }t        |
dd       }||t        |dd       }|Vt        |dd       }t        | d|	      } || j                  t        j                  ||f| j                  	      |||||
      }|S  ||||j                         | j                  ||| j                  ||	      }|S )Nz4.53.1r   )create_masks_for_generatezDYour transformers version is too old, please upgrade it to >= 4.53.1get_decoder5_prepare_4d_causal_attention_mask_with_cache_positiontoken_type_idsr.  )rX   )r  input_embedsattention_maskcache_positionpast_key_valuesr1  position_ids)sequence_lengthtarget_lengthrX   r4  r  r  r5  r6  )r   parsetransformers__version__transformers.masking_utilsr.  ImportErrorrT   base_model_prefixr9   r/  r  r7   emptyrX   get_max_cache_shape)rj   model_inputr3  r5  r4  r  r7  r6  transformers_ge_4_53_1r.  
base_modeldecodercausal_mask_creation_functionr1  s                 r>   create_attention_maskrF  $  s>    %]]<+C+CDV^H__H`aa  7 7?J*1*m*Lj$$&RVG$+J8oqu$v!$,1D(/9prv(w% %, .>E(/7RTm(n%6<<j/%B%++V))+)%	
,  7+)==?++)!<<+%

 r=   )r  r   )TN)r~   r  r   intr   rG  r   )FN)r  z"Optional[AuxiliaryTrainingWrapper])r  zset[str])rw  r   r  r   )r~   Optional[torch.Tensor]r  rH  r  rG  )rj   r  r  r   )r  r  r  ztuple[torch.device, int, int])r  r   r  rG  )r  r   r  r   r  zOptional[bool])r  r   rw  r   )r  zSequence[str]r   r   r  r   )b
__future__r   r5  r  rc   r  r  rg   collections.abcr   
contextlibr   typingr   r   r   r   r7   r:  r	   accelerate.hooksr
   r   accelerate.utilsr   r   huggingface_hubr   huggingface_hub.errorsr   r   	packagingr   safetensors.torchr   r   r   import_utilsr   r   r   	constantsr   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r;   r9  r;  r/   __all__r?   rv   r   r   Moduler   r#  rQ  rs  r{  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r!  r,  rF  r<   r=   r>   <module>rW     sx   #    	 	  $ " ' '    5 H ? ' H  7 ( a a     2 7==''(MGMM(,CC1$&M4Ur*t"uxx t"nt3 tnN35 N3b	'$ 6:6
 46r.61h4'T	 
!H"J:2),9 2-:z2r=   