
    bi              	          d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlZddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZmZ ddlmZmZmZ ddlmZ  ee       Z!d Z"d Z#d-dZ$d-dZ%d Z&d.dZ'd.dZ(d/dZ)d.dZ*d0de+de+de,fdZ-	 d1de+de+de,de,fdZ.dej^                  j`                  dejb                  fd Z2dej^                  j`                  d!e3fd"Z4d#ejj                  jl                  d$e3fd%Z7dej^                  j`                  fd&Z8dej^                  j`                  d'ej^                  j`                  fd(Z9dej^                  j`                  d'eej^                  j`                  ge,f   fd)Z:d* Z;d+e3d'e3fd,Z<y)2    N)defaultdict)nullcontext)Path)Callable   )
get_logger   )FSDP_MODEL_NAMEOPTIMIZER_NAMESAFE_WEIGHTS_NAMEWEIGHTS_NAME)get_module_class_from_name)get_non_persistent_buffersis_peft_model)get_module_children_bottom_upis_compiled_modulesave)is_torch_versionc                  t    dt         j                  vrdt         j                  d<   dt         j                  d<   y)z[
    Enables RAM efficient loading of Hugging Face models for FSDP in the environment.
    ACCELERATE_USE_FSDPTrueFSDP_CPU_RAM_EFFICIENT_LOADINGNosenviron     V/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/accelerate/utils/fsdp_utils.py!enable_fsdp_ram_efficient_loadingr   %   s.    
 BJJ.,2

()39BJJ/0r   c                  *    dt         j                  d<   y)z\
    Disables RAM efficient loading of Hugging Face models for FSDP in the environment.
    Falser   Nr   r   r   r   "disable_fsdp_ram_efficient_loadingr"   /   s     4;BJJ/0r   c                     |r%t        |       rddlm}  || | j                        S |ddlm}  || |      S | j                         S )Nr   )get_peft_model_state_dictadapter_name)get_model_state_dictoptions)r   peftr$   active_adapter'torch.distributed.checkpoint.state_dictr'   
state_dict)modeladapter_only
sd_optionsr$   r'   s        r   _get_model_state_dictr1   6   sI    e,2(U=Q=QRR P#E:>>!!r   c                     |r&t        |       rddlm}  || || j                        S |ddlm}  || ||      S | j                  |      S )Nr   )set_peft_model_state_dictr%   )set_model_state_dictr(   )r   r*   r3   r+   r,   r4   load_state_dict)r.   r-   r/   r0   r3   r4   s         r   _set_model_state_dictr6   E   sO    e,2(
I]I]^^ P#E:zJJ$$Z00r   c           	          d }| j                   dk(  rXddlm} ddlm}  || j
                  |j                  k(  t        | j                  dd      t        | j                  dd            }|S )	Nr   r   )StateDictOptionsStateDictTypeoffload_to_cpuF
rank0_only)full_state_dictcpu_offloadbroadcast_from_rank0)	fsdp_versionr,   r8   2torch.distributed.fsdp.fully_sharded_data_parallelr:   state_dict_typeFULL_STATE_DICTgetattrstate_dict_config)fsdp_pluginr0   r8   r:   s       r   _prepare_sd_optionsrG   T   sj    J 1$LT%'77=;X;XX = =?OQVW!()F)FV[!\

 r   c                    dd l mc m} ddlm} ddlm} ddlm}	 t        j                  |d       | j                  |	j                  k(  r1|j                  dkD  }
|
| j                  _        |
| j                  _        | j                   dk(  r2|j                  || j                  | j                  | j"                        n	t%               }t'        |       }|5  t)        |||      }| j                  |	j                  k(  r|dk(  r	t*         d	nt*         d
| d	}t        j,                  j/                  ||      }|j0                  dk(  rt2        j5                  d|        t7        j8                  ||       t2        j5                  d|        ni| j                  |	j:                  k(  r|dk(  rt*         d|j0                   d	nt*         d
| d|j0                   d	}t        j,                  j/                  ||      }t2        j5                  d|        t7        j8                  ||       t2        j5                  d|        n| j                  |	j<                  k(  rt        j,                  j/                  |t*         d
|       }t        j                  |d       t2        j5                  d|        d|i}|j9                  ||j?                  |       |              t2        j5                  d|        d d d        y # 1 sw Y   y xY w)Nr   DefaultSavePlannerFullyShardedDataParallelr9   Texist_okr	   r/   r0   .bin_zSaving model to zModel saved to _rankr.   r-   storage_writerplanner) torch.distributed.checkpointdistributed
checkpoint,torch.distributed.checkpoint.default_plannerrJ   rA   rL   r:   r   makedirsrB   rC   num_processesrE   r;   r<   r@   optim_state_dict_configr   rG   r1   r
   pathjoinprocess_indexloggerinfotorchr   LOCAL_STATE_DICTSHARDED_STATE_DICTFileSystemWriter)rF   acceleratorr.   
output_dirmodel_indexr/   dist_cprJ   FSDPr:   is_multi_processctxr0   r-   weights_nameoutput_model_fileckpt_dirs                    r   save_fsdp_modelrp   e   s   22OcPKK
T*""m&C&CC '44q87G%%43C%%0 ##q( 	;..0M0M{OrOr	
 ]  %[1J	 6*5|Xbc
&&-*G*GG7Ba7Go.d3P_O``abmannrMsL "Z F((A-./@.ABC

:'89o.?-@AB((M,J,JJ !# ##5)B)B(C4H'(+eK<U<U;VVZ[ 
 !#Z FKK*+<*=>?JJz#45KK/*;)<=>((M,L,LLww||J?2C1[M0RSHKK40KK*8*56!:.JLL%&77A*,  
 KK/(45?6 6 6s   H#LL
c                    dd l mc m} ddlm} ddlm} ddlm}	 |j                          | j                  |	j                  k(  r1|j                  dkD  }
|
| j                  _        |
| j                  _        | j                  dk(  r2|j                  || j                  | j                  | j                         n	t#               }t%        |       }|5  | j                  |	j                  k(  rt'        |      |urK|j(                  dk7  r<|j*                  s0| j,                  s| j                  dk(  rt/        d      	 d d d        y |dk(  r	t0         dnt0         d| d}t2        j4                  j7                  ||      }t8        j;                  d	|        |j*                   xs |j<                  }|rt?        j@                  |d
      }ni }t8        j;                  d|        no| j                  |	jB                  k(  r|dk(  rt0         d|j(                   dnt0         d| d|j(                   d}t2        j4                  j7                  ||      }t8        j;                  d	|        t?        j@                  |d
      }t8        j;                  d|        n| j                  |	jD                  k(  rt0         |vr)t2        j4                  j7                  |t0         d|       n|}t8        j;                  d	|        dtG        |||      i}|jA                  ||jI                  |       |              |d   }t8        j;                  d|        tK        |||      }d d d        |S # 1 sw Y   S xY w)Nr   )DefaultLoadPlannerrK   r9   r	   zzSet the `sync_module_states` flag to `True` so that model states are synced across processes when initializing FSDP objectrP   rQ   zLoading model from Tweights_onlyzModel loaded from rR   r.   rO   )r-   storage_readerrU   )&rV   rW   rX   rY   rr   rA   rL   r:   wait_for_everyonerB   rC   r[   rE   r;   r<   r@   r\   r   rG   typer_   is_fsdp2sync_module_states
ValueErrorr
   r   r]   r^   r`   ra   is_main_processrb   loadrc   rd   r1   FileSystemReaderr6   )rF   rf   r.   	input_dirrh   r/   ri   rr   rj   r:   rk   rl   r0   rm   input_model_file
load_modelr-   ro   load_results                      r   load_fsdp_modelr      si   22OcP!!#""m&C&CC '44q87G%%43C%%0 ##q( 	;..0M0M{OrOr	
 ]  %[1J	 -q&&-*G*GGE{$&;+D+D+IR]RfRf"55+:R:RVW:W$3  -q -q 8Ca7Go.d3P_O``abmannrMsL!ww||I|DKK-.>-?@A(111P[5P5PJ"ZZ(8tL

KK,-=,>?@((M,J,JJ !# ##5)B)B(C4H'(+eK<U<U;VVZ[ 
  "ww||I|DKK-.>-?@A$44HJKK,-=,>?@((M,L,LL &&y8 Y?*;1[M(JK 
 KK-hZ89!#8\fp#qrJLL%&77A*,  
 $G,JKK,XJ78+E:Leop[-q\ ]-q\ s   A)M+ H!M++M5c                 0   dd l mc m} ddlm} ddlm} ddlm}	 t        j                  |d       | j                  dk(  r2|j                  || j                  | j                  | j                        n	t               }
t        |       }|
5  | j                  dk(  rdd	lm}  ||||
      }n|j%                  ||      }| j                  |	j&                  k(  r|j(                  dk(  r|dk(  r	t*         dnt*         d| d}t        j,                  j/                  ||      }t0        j3                  d|        t5        j6                  ||       t0        j3                  d|        nt        j,                  j/                  |t*         d|       }t        j                  |d       t0        j3                  d|        |j7                  d|i|j9                  |       |              t0        j3                  d|        d d d        y # 1 sw Y   y xY w)Nr   rI   rK   r9   TrM   r	   r   )get_optimizer_state_dictr(   rP   rQ   zSaving Optimizer state to zOptimizer state saved in 	optimizerrS   )rV   rW   rX   rY   rJ   rA   rL   r:   r   rZ   r@   rB   rE   r\   r   rG   r,   r   optim_state_dictrC   r_   r   r]   r^   r`   ra   rb   r   re   )rF   rf   r   r.   rg   optimizer_indexri   rJ   rj   r:   rl   r0   r   optim_stateoptim_state_nameoutput_optimizer_filero   s                    r   save_fsdp_optimizerr      s   22OcPKK
T* ##q( 	;..0M0M{OrOr	
 ]  %[1J	 @##q(X25)ZXK//yAK&&-*G*GG((A-/>!/C~&d+NK[[\]l\mmqIr ! )+ZAQ(R%89N8OPQ

;(=>78M7NOPww||J>2B!OCT0UVHKK40KK4XJ?@LL'5&77A*,  
 KK3H:>?5@ @ @s   E8HHc                 x   dd l mc m} ddlm} ddlm}	 |j                          | j                  dk(  r2|j                  || j                  | j                  | j                        n	t               }
t        |       }|
5  | j                  |	j                  k(  rd }|j                  dk(  s| j                  j                  s |dk(  r	t          dnt          d| d}t"        j$                  j'                  ||      }t(        j+                  d|        t-        j.                  |d	      }t(        j+                  d
|        nt          |vr)t"        j$                  j'                  |t          d|       n|}t(        j+                  d|        d|j1                         i}|j/                  |||j3                  |             |d   }t(        j+                  d|        | j                  dk(  r&|j5                  |||      }|j7                  |       nddlm}  |||||       d d d        y # 1 sw Y   y xY w)Nr   rK   r9   r	   rP   rQ   zLoading Optimizer state from Trs   zOptimizer state loaded from zLoading Optimizer from r   )checkpoint_idru   zOptimizer loaded from )r.   optimr   )set_optimizer_state_dictr(   )rV   rW   rX   rA   rL   r:   rv   r@   rB   rE   r\   r   rG   rC   r_   r<   r   r   r]   r^   r`   ra   rb   r|   r-   r}   optim_state_dict_to_loadr5   r,   r   )rF   rf   r   r.   r~   r   r/   ri   rj   r:   rl   r0   r   optimizer_nameinput_optimizer_filero   flattened_osdr   s                     r   load_fsdp_optimizerr     s'   22cP!!#
 ##q( 	;..0M0M{OrOr	
 ]  %[1J	 !X&&-*G*GGK((A-[5X5X5c5c/>!/C~&d+NK[[\]l\mmqIr  (*ww||I~'N$;<P;QRS#jj)=DQ:;O:PQR %%i7 Y>*:!O;L(MN 
 KK1(<=&	(<(<(>?KLL&&77A  
 &k2KKK0
;<##q( 99Yit9uM%%m4X$UI{JWC!X !X !Xs   >F)H00H9checkpoint_dir	save_pathsafe_serializationc                 p   ddl mc m} ddlmc mc m} i }t        |      }|j                  d       |j                  ||j                  |       |j                         d       |r	|t        z  n|t        z  }t        |j                               dk(  r|t        |      d      }t        |||       |S )z
    Passthrough to `torch.distributed.checkpoint.format_utils.dcp_to_torch_save`

    Will save under `save_path` as either `model.safetensors` or `pytorch_model.bin`.
    r   NTrM   )ru   rU   no_distr	   )r   )rV   rW   rX   )torch.distributed.checkpoint.format_utilsformat_utilsr   mkdir_load_state_dictr}   _EmptyStateDictLoadPlannerr   r   lenkeyslistr   )r   r   r   ri   dist_cp_format_utilsr-   s         r   )_distributed_checkpoint_to_merged_weightsr   J  s     32LLJYIOOTO"))//?$??A	 *  2D	--UaIaI :??"Z 0 34
Y3EFr   output_pathremove_checkpoint_dirc                    t        |       } ddlm} t        dd      st	        d      | j                         s| dz  j                         }| dz  j                         }d|  d	}|r#|r!|d
z  }|d|  d|  dz  }|dz  }t	        |      |r|dz  }|d|  dz  }t	        |      |r|dz  }|d|  dz  }t	        |       |       }|j                  rlt        j                  d|         t        | ||      }	t        j                  d|	        |r-t        j                  d|         t        j                  |        |j                          y)a?  
    Merge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if
    `SHARDED_STATE_DICT` was used for the model. Weights will be saved to `{output_path}/model.safetensors` if
    `safe_serialization` else `pytorch_model.bin`.

    Note: this is a CPU-bound process.

    Args:
        checkpoint_dir (`str`):
            The directory containing the FSDP checkpoints (can be either the model or optimizer).
        output_path (`str`):
            The path to save the merged checkpoint.
        safe_serialization (`bool`, *optional*, defaults to `True`):
            Whether to save the merged weights with safetensors (recommended).
        remove_checkpoint_dir (`bool`, *optional*, defaults to `False`):
            Whether to remove the checkpoint directory after merging.
    r   )PartialStatez>=z2.3.0z/`merge_fsdp_weights` requires PyTorch >= 2.3.0`pytorch_model_fsdp_0optimizer_0zTried to load from z) but couldn't find a valid metadata file.zE However, potential model and optimizer checkpoint directories exist.zPlease pass in either z/pytorch_model_fsdp_0 or z/optimizer_0zinstead.z8 However, a potential model checkpoint directory exists.zPlease try passing in z/pytorch_model_fsdp_0 instead.z< However, a potential optimizer checkpoint directory exists.z/optimizer_0 instead.zMerging FSDP weights from z.Successfully merged FSDP weights and saved to z"Removing old checkpoint directory N)r   accelerate.stater   r   rz   existsr{   r`   ra   r   shutilrmtreerv   )
r   r   r   r   r   model_path_existsoptimizer_path_existserrstater   s
             r   merge_fsdp_weightsr   f  s   ( .)N-D'*JKK   "+.DDLLN!/-!? G G I#N#33\]!6ZZC+N+;;TUcTddpqqC:C o MMC+N+;;YZZC o #QQC+N+;;PQQCo NE00@AB=nk[mn	DYKPQ KK<^<LMNMM.)	r   r.   devicec                 *   	 t        |dd       }|s S i 	|D ]W  }|j                  d      }dj                  |d d       |d   }}|j                  |      }t        ||      }d 	t	        |      <   Y dt
        j                  j                  f	 fd}|S )N_tied_weights_keys.modulec                 F   t        t              }| j                  d      D ]0  \  }}t        |      v s|t        |         j	                  |       2  |       } |j                         D ]0  \  }}|D ]&  }|   }|t        | |      |<   t        | ||       ( 2 | S )NF)recurse)r   r   named_parametersidappenditemsrD   setattr)	r   params_to_tienparamid_key_param_names
param_name_tied_paramsparam_init_fns	          r   param_init_fn_tied_paramz7ensure_weights_retied.<locals>.param_init_fn_tied_param  s     $D)///> 	3HAu%yL(bi(//2	3 v& %2$7$7$9 	7 FL* 7
$V,= ,36:+FL(FJ67	7 r   )rD   splitr^   get_submoduler   rb   nnModule)
r   r.   r   _tied_namesnamer   modr   r   r   s
   `        @r   ensure_weights_retiedr     s    %!5t<K L 'zz#88D"I.Rj!!$'Z("&RY' 2 $#r   full_sdc                 b   ddl m} ddlm} |j	                         }i }d }d }| j
                  rt        |j                         |j                               D ]  \  \  }	}
}|j                  }|
j                         j                  |j                        }
|j                  |
d|j                                 ||
||j                        } |||	|
      \  }} ||||      }|||	<    n|j                         D ]  \  }	}|j                  }t!        j"                  |j%                         |j                  |j&                        }|j                  |d|j                                 ||||j                        } |||	|      \  }} ||||      }|||	<    |j)                  |d	       |S )
a  
    Loads the full state dict (could be only on rank 0) into the sharded model. This is done by broadcasting the
    parameters from rank 0 to all other ranks. This function modifies the model in-place.

    Args:
        accelerator (`Accelerator`): The accelerator instance
        model (`torch.nn.Module`):
            The model to load the state dict into, expected to be on meta device or a VRAM spike can occur
        full_sd (`dict`): The full state dict to load, can only be on rank 0
    r   N)distribute_tensorc                    	 | j                  |      }t        t        d      }d }|xr |j                  t        j                  k(  }	|j                  j                  r|	s|j                  }|d uxr |j                         |fS # t        $ r5 |j                  dd      \  }}| j                  |      }t	        ||      }Y w xY w)Nr   r	   float8_e4m3fn)get_parameter_or_bufferAttributeErrorrsplitr   rD   hasattrrb   dtyper   is_floating_pointis_contiguous)
r.   r   empty_param	old_parambase_param_namelocal_param_name	submoduleis_torch_e4m3fn_availablecasting_dtypeis_param_float8_e4m3fns
             r   _infer_parameter_dtypez:fsdp2_load_full_state_dict.<locals>._infer_parameter_dtype  s    	=55jAI %,E?$C!!:!g{?P?PTYTgTg?g..7M%OOM$B)@)@)BMQQ  	=0:0A0A#q0I-O-++O<I	+;<I		=s   B ;C ?C c                 R    || j                  |      } |r| j                         } | S )N)r   )to
contiguous)tensorto_contiguousr   s      r   _cast_and_contiguousz8fsdp2_load_full_state_dict.<locals>._cast_and_contiguous  s.    YYUY+F&&(Fr   )srcgroup)r   r   T)assign)torch.distributedrW   torch.distributed.tensorr   r-   r{   zipr   valuesdevice_meshdetachr   device_type	broadcast	get_group
placementsrb   emptysizer   r5   )rf   r.   r   distr   meta_sharded_sd
sharded_sdr   r   r   
full_paramsharded_paramr   sharded_tensorr   r   full_tensors                    r   fsdp2_load_full_state_dictr    s    %: &&(OJR$ ""7:7==?OLbLbLd7e 	43$Zm'33K#**,//0G0GHJNN:1K4I4I4KNL.z;H`H`aN+A,(M=
 2.-Q^_N%3Jz"	4 *9)>)>)@ 	4%J'33K++m&8&8&:;CZCZbobubuvKNN;A[5J5J5LNM.{KIaIabN+A,(M=
 2.-Q^_N%3Jz"	4 
*T2Lr   r   mappingc                     ddl m} i }d||<   	 | j                  D ]%  }|d   D cg c]  }||j                      c}|d<   ' yc c}w # t        $ r t	        d      w xY w)a  
    Switches the parameters of the optimizer to new ones (sharded parameters in usual case). This function modifies the
    optimizer in-place.

    Args:
        optimizer (`torch.optim.Optimizer`): Optimizer instance which contains the original model parameters
        mapping (`dict`): Mapping from the original parameter (specified by `data_ptr`) to the sharded parameter

    Raises:
        KeyError:
            If a parameter in the optimizer couldn't be switched to its sharded version. This should never happen and
            indicates a bug. If we kept the original params instead of raising, the training wouldn't be numerically
            correct and weights wouldn't get updated.
    r   )DTensor_local_tensorparamszA parameter in the optimizer couldn't be switched to its sharded version. This breaks the training. Please raise an issue on GitHub.N)r   r  param_groupsdata_ptrKeyError)r   r  r  accessor_mappingparam_groupps         r   !fsdp2_switch_optimizer_parametersr    s~     1 /W
$11 	YKBMhBW$XQWQZZ%8$XK!	Y$X 
  S
 	

s   A	 AA	 A	 	Ac                 d   ddl m} t        | j                  j                  |      }t        |d      dd D ]v  \  }}t        |j                  d            dkD  r|j                  dd      \  }}nd}|}|r|j                  |      n|} ||      s[ ||d	
      }|j                  ||       x |S )a8  
    Applies the activation checkpointing to the model.

    Args:
        accelerator (`Accelerator`): The accelerator instance
        model (`torch.nn.Module`): The model to apply the activation checkpointing to

    Returns:
        `torch.nn.Module`: The model with the activation checkpointing applied
    r   )checkpoint_wrapperT)return_fqnsNr   r   r	   F)preserve_rng_state);torch.distributed.algorithms._checkpoint.checkpoint_wrapperr  fsdp2_prepare_auto_wrap_policyr   rF   r   r   r   r   r   register_module)	rf   r.   r  auto_wrap_policy_func
layer_namelayerparent_name
child_nameparent_modules	            r   fsdp2_apply_acr  7  s     ;;;L;L;X;XZ_`:5dSTWUWX 
=
Ez$%)&0&7&7Q&?#KK#J<G++K8U /&uGE))*e<
= Lr   returnc                 X   ddl m}m}m} t	        ||      xs# t        |      xr t	        |j                  |      }|r|S | j                  j                  }|j                  |       |j                         }t        | dd      }|j                  |j                  |j                  xs  |       |"|t        | j                   j"                           ndd}	d}
|j%                         D ]"  \  }}|j&                  j(                  dk(  s d}
 n |j*                  r|
st-        |dd	      }t/        j0                  |j3                         D ci c]  \  }}||v s|| c}}      }|j5                  t7        j8                  d
            }t;        |d      r|j=                          t?        ||      }|2tA        |      dd D ]!  } ||      st	        ||      r ||fi |	 # t	        ||      s	 ||fi |	 |j*                  rtC        | ||       |j*                  r|
sjE                         D ]c  \  }}|j5                  | j8                        }d|v r'|jG                  dd      \  }}|jI                  |      }n|}|}|jK                  ||d       e t;        |d      r|j=                          t        |dd      }| jL                  dk7  rU||t6        jN                  k7  r@|j5                  t6        jN                        }| jP                  rtS        jT                  d       |S c c}}w )a"  Prepares the model for FSDP2 in-place. Also returns the model to avoid misuse of the original model.

    Args:
        accelerator (`Accelerator`): The accelerator instance
        model (`torch.nn.Module`): The model to prepare

    Returns:
        `torch.nn.Module`: Prepared model
    r   )
FSDPModuleMixedPrecisionPolicyfully_shardtorch_device_meshN)reshard_after_forwardoffload_policy	mp_policymeshF
Params4bitT)r   fqnsmetatie_weightsr   r   r	   )
persistentr   noz~FSDP upcast of low precision parameters to fp32 (since mixed_precision != 'no') may affect the precision of model checkpoints.)+torch.distributed.fsdpr  r   r!  
isinstancer   	_orig_modr   rF   set_auto_wrap_policyr-   rD   r#  r>   mixed_precision_policytupleparallelism_configfsdp_dim_namesr   	__class____name__cpu_ram_efficient_loadingr   copydeepcopynamed_buffersr   rb   r   r   r*  r  r   r  r   r   r   register_buffermixed_precisionfloat32r{   warningswarn)rf   r.   r  r   r!  is_type_fsdpfsdp2_pluginoriginal_sdr&  fsdp2_kwargsmodel_has_params4bitr   r   non_persistent_buffer_fqnskvoriginal_non_persistent_buffersr  r   fqnbuffer_tensor
parent_fqnlocal_buffer_namer  model_dtypes                            r   fsdp2_prepare_modelrN  X  s&    UTeZ0 5!Mj*&M  $$00L%%e,""$K; 3T:D ".!C!C&22!88R<P<RNRN^U;99HHIJdhL !--/ e ??##|3#'  --6J &@tZ^%_"*.--#113Wdaq<V7VQTW+
' f-. 5-(:<O(3E:3B? 	4F$V,Z
5SF3l3	4 eZ(E*\*-- 	#;{C--6J"A"G"G"I 
	^C),,[-?-?@Mcz03

30B-
- % 3 3J ?$'! %))*;]W\)]
	^  5-( %$/K""d*0C{V[VcVcGc '&&MM Q Ls Xs   L&
L&
c                    
 ddl m}m}  j                  }t	        |t
        j                        r|j                  }||u rt        |dd      }|g }t        |      } j                   j                  }t               
|D ]0  }t        ||      }|t        d| d      
j                  |       2 dt        j                   j"                  dt$        f 
fd	}	|	S ||u r(dt        j                   j"                  dt$        f fd
}	|	S y)a!  Prepares the auto wrap policy based on its type, done to mimic the behaviour of FSDP1 auto wrap policy.

    Args:
        fsdp2_plugin (`FullyShardedDataParallelPlugin`):
            Instance of `FullyShardedDataParallelPlugin` containing the configuration options
        auto_wrap_policy_type (`str`):
            Either `transformer` or `size`
        model (`torch.nn.Module`):
            The model to wrap

    Returns:
        `Callable[[torch.nn.Module], bool]`:
            The auto wrap policy function to be applied to the model
    r   )size_based_auto_wrap_policytransformer_auto_wrap_policy_no_split_modulesNz+Could not find the transformer layer class z in the model.r   r  c                 H    j                   yt        | t                    S )NF)transformer_cls_names_to_wrapr.  r2  )r   rA  transformer_cls_to_wraps    r   policyz.fsdp2_prepare_auto_wrap_policy.<locals>.policy  s%    99Afe,C&DEEr   c                 b    t        d | j                         D              }|j                  kD  S )Nc              3   <   K   | ]  }|j                           y w)N)numel).0r  s     r   	<genexpr>zAfsdp2_prepare_auto_wrap_policy.<locals>.policy.<locals>.<genexpr>  s     #K!AGGI#Ks   )sum
parametersmin_num_params)r   module_num_paramsrA  s     r   rV  z.fsdp2_prepare_auto_wrap_policy.<locals>.policy  s.     ##Kv7H7H7J#K K$|'B'BBBr   )torch.distributed.fsdp.wraprP  rQ  auto_wrap_policyr.  	functoolspartialfuncrD   r   rT  setr   rz   addrb   r   r   bool)rA  r.   rP  rQ  fnno_split_modulesrT  layer_classtransformer_clsrV  rU  s   `         @r   r  r    s    f		&	&B"i''(WW	))"5*=tD#!(,-=(>%55A,8,V,V)"%%8 	9K8LO& #N{m[i!jkk#''8		9	F588?? 	Ft 	F M 
*	*	C588?? 	Ct 	C M r   c                      ddl m}  |di | S )a  
    Returns a `GradScaler` for FSDP2, as the current implementation of `get_grad_scaler` doesn't accept other args. We
    need this as current `get_grad_scaler` accepts only `distributed_type` as arg, which doesn't differentiate between
    FSDP1 and FSDP2
    r   )
GradScalerr   )torch.amp.grad_scalerrm  )kwargsrm  s     r   get_fsdp2_grad_scalerrp    s     1r   named_paramsc                    | j                         D ci c]  \  }}|j                  dd      | } }}| j                         D ci c]+  \  }}|j                  d      r|j                  dd      n||- } }}| j                         D ci c]  \  }}|j                  dd      | } }}| S c c}}w c c}}w c c}}w )a6  Removes parameter name modifiers in order to map them back to their original names.

    See huggingface/accelerate#3554 for more context.

    Args:
        named_params (`dict`): The named parameters dictionary to canonicalize.

    Returns:
        `dict`: The canonicalized named parameters dictionary
    z._checkpoint_wrapped_module z
_orig_mod.z
._orig_mod)r   replace
startswith)rq  rF  rG  s      r   fsdp2_canonicalize_namesrv  	  s     Q]PbPbPde1AII;R@!CeLeXdXjXjXlPTPQSTq||L'A		,#q!KL  @L?Q?Q?STtq!AIIlB/2TLT f Us   B/0B5B;)FN)r   F)r   )T)TF)=r8  rb  r   r   r>  collectionsr   
contextlibr   pathlibr   typingr   rb   loggingr   	constantsr
   r   r   r   dataclassesr   modelingr   r   otherr   r   r   versionsr   r6  r`   r   r"   r1   r6   rG   rp   r   r   r   strrg  r   r   r   r   r   r   dictr  r   	Optimizerr  r  rN  r  rp  rv  r   r   r   <module>r     s     	   # "      W W 3 ? J J & 
H	:;"1"76tEP-@`0Xfc c gk : kp44&)4?C4cg4n+$ +$ +$\J588?? JT JZ
1F1F 
QU 
>uxx BnEHHOO n nb2 2HV[V^V^VeVeUfhlUlLm 2j 4 D r   