
import os
import warnings
from dataclasses import dataclass
from typing import TYPE_CHECKING, Optional, Union

from accelerate.utils.dataclasses import TorchContextParallelConfig, TorchTensorParallelConfig
from accelerate.utils.versions import is_torch_version


if TYPE_CHECKING:
    from accelerate import Accelerator


@dataclass
class ParallelismConfig:
    """
    A dataclass to configure parallelisms applied to the model. Inspired by torchtitan's `ParallelDims`
    https://github.com/pytorch/torchtitan/blob/main/torchtitan/distributed/parallel_dims.py

    Args:
        dp_replicate_size (`int`, defaults to `1`):
            The size of the data parallel group. If `dp_replicate_size` is set to 1, the data parallel replication
            group will not be used.
        dp_shard_size (`int`, defaults to `1`):
            The size of the model shard group. If `dp_replicate_size > 1` and `tp_size > 1`, `dp_shard_size` must also
            be greater than 1, as composing DDP + TP is currently not supported.
        tp_size (`int`, defaults to `1`):
            The size of the tensor parallel group. If `tp_size` is set to `1`, the tensor parallel group will not be
            used.
        cp_size (`int`, defaults to `1`):
            The size of the context parallel group. Currently not supported, but reserved for future use and enabled
            for downstream libraries.
        tp_handler (`~utils.TorchTensorParallelConfig`, defaults to `None`):
            The handler for the tensor parallel group.
        cp_handler (`~utils.TorchContextParallelConfig`, defaults to `None`):
            The handler for the context parallel group.

    You may obtain different distributed data parallel paradigms by configuring `dp_replicate_size` and `dp_shard_size`
    together (see the example after this list):
        - `dp_replicate_size == 1` and `dp_shard_size > 1`, we obtain Fully Sharded Data Parallel (FSDP).
        - `dp_replicate_size > 1` and `dp_shard_size > 1`, we obtain Hybrid Sharded Data Parallel (HSDP).
        - `dp_replicate_size > 1` and `dp_shard_size == 1` is an invalid configuration; to use pure DP, use
          `DistributedDataParallelKwargs` instead.
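
    Example (a minimal sketch; it assumes an 8-process launch and that the installed Accelerate version
    accepts a `parallelism_config` argument on `Accelerator`, as recent releases do):

    ```python
    from accelerate import Accelerator
    from accelerate.parallelism_config import ParallelismConfig

    # 2-way replication x 2-way sharding (HSDP) composed with 2-way tensor parallelism -> 8 processes
    pc = ParallelismConfig(dp_replicate_size=2, dp_shard_size=2, tp_size=2)
    accelerator = Accelerator(parallelism_config=pc)
    ```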

    Ndp_replicate_sizedp_shard_sizetp_sizecp_size
tp_handler
cp_handlerc                     d| j                    d| j                   d| j                   d| j                   d| j                   d| j
                   d| j                   dS )	Nz'ParallelismConfig(
 	dp_replicate_size=z,
	dp_shard_size=z,
	tp_size=z,
	cp_size=z,
	total_size=z
	tp_handler=z,
	cp_handler=z)
)r   r   r   r   
total_sizer   r   selfs    X/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/accelerate/parallelism_config.py__repr__zParallelismConfig.__repr__F   sy    ##'#9#9": ;#112 3 ' ' OO, - OO, - OO,C1		
    c                     dd l }dg}|j                  | j                  j                         D ci c]3  \  }}||vr*|t	        |d      r|j                  |j                        n|5 c}}       y c c}}w )Nr   device_mesh__dict__)copydeepcopyr   itemshasattr)r   r   _non_serializable_fieldskvs        r   to_jsonzParallelismConfig.to_jsonR   sm    $1?  !MM//1Aq44 :0F4==,AM	
s   8A1
c                 R    g }| j                   r|dgz  }| j                  r|dgz  }|S )zENames of enabled dimensions across which data parallelism is applied.dp_replicatedp_shard)dp_replicate_enableddp_shard_enabledr   dimss     r   dp_dim_nameszParallelismConfig.dp_dim_names_   s9     $$^$$D  ZL Dr   c                 R    g }| j                   r|dgz  }| j                  r|dgz  }|S )z]Names of enabled dimensions which will receive the same batch (non-data parallel dimensions).tpcp)
tp_enabled
cp_enabledr*   s     r   non_dp_dim_namesz"ParallelismConfig.non_dp_dim_namesi   s2     ??TFND??TFNDr   c                 R    g }| j                   r|dgz  }| j                  r|dgz  }|S )zlNames of enabled dimensions which will be flattened into a joint mesh across which is model sharded in FSDP.r'   r/   )r)   r1   r*   s     r   dp_shard_cp_dim_namesz'ParallelismConfig.dp_shard_cp_dim_namess   s5       ZL D??TFNDr   c                 v    g }| j                   r|dgz  }| j                  r|dgz  }| j                  r|dgz  }|S )z@Names of enabled dimensions across which loss should be averagedr&   r'   r/   )r(   r)   r1   r*   s     r   dp_cp_dim_namesz!ParallelismConfig.dp_cp_dim_names}   sK     $$^$$D  ZL D??TFNDr   c                 :    g }| j                   r|dgz  }|dgz  }|S )z^Names of enabled dimensions across which FSDP is applied, including data parallel replication.r&   dp_shard_cp)r(   r*   s     r   fsdp_dim_namesz ParallelismConfig.fsdp_dim_names   s0     $$^$$Dr   c                 h    | j                   | j                  z  | j                  z  | j                  z  S )zSThe total size of the parallelism configuration, which is the product of all sizes.)r   r   r   r   r   s    r   r   zParallelismConfig.total_size   s-     %%(:(::T\\IDLLXXr   c                 4    | j                   | j                  z  S )zhThe size of the non-data parallel dimensions, which is the product of tensor and context parallel sizes.)r   r   r   s    r   non_data_parallel_sizez(ParallelismConfig.non_data_parallel_size   s     ||dll**r   c                 4    | j                   | j                  z  S )z_The size of the data parallel dimensions, which is the product of data parallel replication and)r   r   r   s    r   data_parallel_sizez$ParallelismConfig.data_parallel_size   s     %%(:(:::r   c                      | j                   dkD  S )zKTrue if data parallel replication is enabled, i.e. `dp_replicate_size > 1`.   )r   r   s    r   r(   z&ParallelismConfig.dp_replicate_enabled   s     %%))r   c                      | j                   dkD  S )zDTrue if data parallel sharding is enabled, i.e. `dp_shard_size > 1`.r@   )r   r   s    r   r)   z"ParallelismConfig.dp_shard_enabled   s     !!A%%r   c                      | j                   dkD  S )z:True if tensor parallelism is enabled, i.e. `tp_size > 1`.r@   )r   r   s    r   r0   zParallelismConfig.tp_enabled        ||ar   c                      | j                   dkD  S )z;True if context parallelism is enabled, i.e. 
`cp_size > 1`.r@   )r   r   s    r   r1   zParallelismConfig.cp_enabled   rC   r   c                 4    | j                   | j                  z   S )z$Names of all active mesh dimensions.)r,   r2   r   s    r   active_mesh_dimsz"ParallelismConfig.active_mesh_dims   s       4#8#888r   device_typec                    t        dd      rddlm} nt        d      | j	                         }t        |      dk(  ry|\  }} ||||      }| j                  r|| j                     j                  d       | j                  r|| j                     j                  d	       | j                  r|| j                     j                  d
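
    # Illustrative property values, derived from the definitions above, for dp_replicate_size=2,
    # dp_shard_size=2, tp_size=2, cp_size=1:
    #   dp_dim_names          -> ["dp_replicate", "dp_shard"]
    #   non_dp_dim_names      -> ["tp"]
    #   dp_shard_cp_dim_names -> ["dp_shard"]
    #   fsdp_dim_names        -> ["dp_replicate", "dp_shard_cp"]
    #   active_mesh_dims      -> ["dp_replicate", "dp_shard", "tp"]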
       |S )a!  Builds a device mesh for the given device type based on the parallelism configuration.
        This method will also create required joint meshes (e.g. `dp_shard_cp`, `dp_cp`, `dp`).

        Args:
            device_type (`str`): The type of device for which to build the mesh, e
        z>=z2.2.0r   )init_device_meshz4Building a device_mesh requires to have torch>=2.2.0N)mesh_dim_namesdpr8   dp_cp)
r	   torch.distributed.device_meshrI   RuntimeError	_get_meshlenr,   _flattenr4   r6   )r   rG   rI   meshrJ   
mesh_shaper   s          r   build_device_meshz#ParallelismConfig.build_device_mesh   s     D'*FUVV~~t9>%)"
&)

 ))*33D9%%223<<]K,,-66w?r   c                     | j                   &|"| j                  |      | _         | j                   S d|?| j                   j                  |k7  r&t        d| j                   j                   d| d      | j                   S )Nz@You need to pass a device_type e.g cuda to build the device meshz4The device_mesh is already created with device type z@. However, you are trying to get a device mesh with device_type z<. Please check if you correctly initialized your device_mesh)r   rT   rG   

    def _get_mesh(self) -> tuple[tuple[str, ...], tuple[int, ...]]:
        """Generate mesh shape and dimension names for torch.distributed.init_device_mesh()."""
        # Build the mesh dims from the enabled parallelisms, then apply the canonical ordering.
        mesh_dims = {parallelism: self._sizes[parallelism] for parallelism in self.active_mesh_dims}
        mesh_order = ["dp_replicate", "dp_shard", "cp", "tp"]
        sorted_items = sorted(mesh_dims.items(), key=lambda x: mesh_order.index(x[0]))
        return tuple(zip(*sorted_items))

    def __post_init__(self):
        if self.dp_replicate_size is None:
            self.dp_replicate_size = int(os.environ.get("PARALLELISM_CONFIG_DP_REPLICATE_SIZE", "1"))
        if self.dp_shard_size is None:
            self.dp_shard_size = int(os.environ.get("PARALLELISM_CONFIG_DP_SHARD_SIZE", "1"))
        if self.tp_size is None:
            self.tp_size = int(os.environ.get("PARALLELISM_CONFIG_TP_SIZE", "1"))
        if self.cp_size is None:
            self.cp_size = int(os.environ.get("PARALLELISM_CONFIG_CP_SIZE", "1"))

        if self.tp_size > 1 and self.tp_handler is None:
            self.tp_handler = TorchTensorParallelConfig()
        if self.cp_size > 1 and self.cp_handler is None:
            self.cp_handler = TorchContextParallelConfig()

        if self.dp_replicate_size < 1:
            raise ValueError(f"dp_replicate_size must be at least 1, but got {self.dp_replicate_size}")
        if self.dp_shard_size < 1:
            raise ValueError(f"dp_shard_size must be at least 1, but got {self.dp_shard_size}")
        if self.tp_size < 1:
            raise ValueError(f"tp_size must be at least 1, but got {self.tp_size}")
        if self.cp_size < 1:
            raise ValueError(f"cp_size must be at least 1, but got {self.cp_size}")

        if (self.tp_size > 1 or self.cp_size > 1) and self.dp_replicate_size > 1 and self.dp_shard_size == 1:
            raise ValueError(
                "Tensor/Context parallelism (tp/cp_size > 1) cannot be used with pure data parallelism "
                "(dp_replicate_size > 1 and dp_shard_size == 1). Please set dp_shard_size > 1 and "
                "dp_replicate_size == 1 to compose FSDP + TP/CP for 2D parallel, or set dp_replicate_size > 1 "
                "and dp_shard_size > 1 to compose HSDP + TP/CP for 3D parallel."
            )

        self._sizes = {
            "dp_replicate": self.dp_replicate_size,
            "dp_shard": self.dp_shard_size,
            "tp": self.tp_size,
            "cp": self.cp_size,
        }

    def _set_size(self, parallelism: str, size: int):
        assert parallelism in self._sizes.keys(), f"Parallelism must be one of {self._sizes.keys()}"
        self._sizes[parallelism] = size
        setattr(self, f"{parallelism}_size", size)

    def _validate_accelerator(self, accelerator: "Accelerator"):
        _warnings = set()
        if not accelerator.multi_device and self.total_size == 1:
            # Single-process run without a parallelism setup, nothing to validate.
            return

        # Default to pure data parallel replication so that DDP works out of the box.
        if self.total_size == 1:
            self._set_size("dp_replicate", accelerator.num_processes)

        if self.total_size != accelerator.num_processes:
            raise ValueError(
                f"ParallelismConfig total_size ({self.total_size}) does not match "
                f"num_processes ({accelerator.num_processes}). Please adjust dp_replicate_size/"
                "dp_shard_size/tp_size/cp_size."
            )

        if self.total_size > 1 and not (accelerator.is_fsdp2 or accelerator.multi_device):
            raise ValueError(
                "ParallelismConfig is only compatible with DistributedType.FSDP (version 2) or "
                f"DistributedType.Multi{{Device}}, but got {accelerator.distributed_type}."
            )

        for parallelism, size in self._sizes.items():
            if size == 1 and getattr(self, f"{parallelism}_handler", None) is not None:
                _warnings.add(
                    f"ParallelismConfig.{parallelism}_handler is set, but {parallelism}_size is set to 1. "
                    "This handler will be ignored."
                )

        if _warnings and accelerator.is_main_process:
            warnings.warn(
                "ParallelismConfig has the following warnings:\n" + "\n".join(_warnings),
                UserWarning,
            )