
    uki)=                       d dl mZ d dlmZ d dlZd dlZd dlmZ d dlZd dl	m
Z
 d dl	mZ d dl	mZ d dlmZ  ej                  e      Z ej$                  d	d
d      Z ej(                  ddd      Z ej(                  dd
d      Z G d d      Z e       Z	 	 	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZddZd Zy)    )annotations)SequenceN)Any)clusters)config)
xla_bridge)_jaxjax_check_proxy_envsTz1Checks proxy vars in user envs and emit warnings.)namedefaulthelpjax_enable_recoverabilityFzYAllows a multi-controller JAX job to continue running, even after some tasks have failed.jax_enable_preemption_servicez^Enables the preemption service. See multihost_utils.reached_preemption_sync_point for details.c                      e Zd ZU dZded<   dZded<   dZded<   dZd	ed
<   dZded<   dZ	ded<   dZ
ded<   	 	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZd Zd Zy)Stater   int
process_id   num_processesNz+_jax.DistributedRuntimeService | Any | Noneservicez*_jax.DistributedRuntimeClient | Any | Noneclientz
Any | Nonepreemption_sync_manager
str | Nonecoordinator_address
int | Nonepartition_indexc                   |xs t         j                  j                  d      }t        |t              r|g}|It         j                  j                  d      x}r(t        t        t        |j                  d                  }|dk7  r1d ||||fv r)t        j                  j                  ||||||      \  }}}}|t        d      |t        d      |t        d      t        |t              st        d| d	t        |       d
      t        |t              st        d| d	t        |       d
      d|cxk  r|k  sn t        d| d| d
      || _        d|j                  dd      d   z   }|xs  t         j                  j                  d|      }|t        d      |rZdj!                  d |D              }t"        j%                  d|       t'        j(                  d|       t'        j(                  d|       || _        g }t,        j.                  r=t         j                  j1                         D cg c]  }d|j3                         v r| }}t5        |      dkD  r/dj!                  |      dz   }d| d}t"        j7                  |       |dk(  rK| j8                  t;        d      t"        j%                  d|       t=        j>                  ||||	      | _        || _         | jB                  t;        d      t=        jD                  |||d |tF        j.                  !      | _!        t"        j%                  d"|       | jB                  jI                          | jK                          |
zt         j                  j                  d#      }t         j                  j                  d$      }|t	        |      }
|
| _)        y |%tM        jN                  d%tP               t	        |      }
|
| _)        y c c}w )&NJAX_COORDINATOR_ADDRESSJAX_LOCAL_DEVICE_IDS,
deactivatez&coordinator_address should be defined.z$Number of processes must be defined.z6The process id of the current process must be defined.z5process_id must be a nonnegative int. Got process_id=z	 of type .z8num_processes must be a positive int. Got num_processes=r   zbprocess_id and num_processes must be nonnegative, with process_id < num_processes. Got process_id=z, num_processes=z[::]::r   JAX_COORDINATOR_BIND_ADDRESSz+coordinator_bind_address should be defined.c              3  2   K   | ]  }t        |        y wN)str).0xs     O/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/jax/_src/distributed.py	<genexpr>z#State.initialize.<locals>.<genexpr>}   s      BAQ Bs   z4JAX distributed initialized with visible devices: %sjax_cuda_visible_devicesjax_rocm_visible_devices_proxy z. zHJAX detected proxy variable(s) in the environment as distributed setup: zpOn some systems, this may cause a hang of distributed.initialize and you may need to unset these ENV variable(s)z2distributed.initialize should only be called once.z&Starting JAX distributed service on %s)heartbeat_timeoutshutdown_timeoutT)init_timeoutuse_compressionr0   recoverablez+Connecting to JAX distributed service on %sJAX_PARTITION_INDEXJAX_SLICE_INDEXzLJAX_SLICE_INDEX has been deprecated. Please use JAX_PARTITION_INDEX instead.)*osenvironget
isinstancer   listmapsplitr   
ClusterEnv$auto_detect_unset_distributed_params
ValueError	TypeErrortyper   rsplitjoinloggerinfor   updater   _CHECK_PROXY_ENVSvaluekeyslowerlenwarningr   RuntimeErrorr	   get_distributed_runtime_servicer   r   get_distributed_runtime_client_ENABLE_RECOVERABILITYconnect"initialize_preemption_sync_managerwarningswarnDeprecationWarningr   )selfr   r   r   local_device_idscluster_detection_methodinitialization_timeoutcoordinator_bind_addressheartbeat_timeout_secondsshutdown_timeout_secondsr   env_ids default_coordinator_bind_addressvisible_devices
proxy_varskeyvarsrM   jax_partition_indexjax_slice_indexs                       r*   
initializezState.initializeA   s    / E::>>*CD "C(*+

?U0V%VW%Vc#w}}S'9:; L0$mZAQRR



B
B!&$ IM:7G "?@@=>>OPPj#& ((2|9T*=M<NaQ R RmS) ++8/4CVBWWXZ [ [+m+ ))34D]OSTV W W  3D (/1D1K1KCQR1STU1V'V$ 8 !Q "

/M/O!Q   'DEE B1A BBokkH/Zmm.@mm.@ DOJ#%::??#4 0C399;.  0j 0 :XXj!D(d
RSWRX	6	6 
 nnWQ		!OPPkk
24L 99
"M535dl
 'D{{MNN55Z6L0I*002DK KK=?RSKK++-JJNN+@A

'89o		(12 +D &,	

 o.*De0s   9O3c                   | j                   r!| j                   j                          d | _         | j                  r!| j                  j                          d | _        | j                  r"| j                  j                          d | _        y y r&   )r   shutdownr   r   rW   s    r*   rh   zState.shutdown   sh    ## ""++-%)d"{{
kkdk||
lldl     c                    t         j                  st        j                  d       y | j                  t        d      t        j                         | _        | j                  j                  | j                         y )NzwThe JAX preemption service is disabled. You can enable it using the jax_enable_preemption_service configuration option.z8Preemption sync manager should only be initialized once.)
_ENABLE_PREEMPTION_SERVICErI   rE   rF   r   rN   r	   create_preemption_sync_managerrf   r   ri   s    r*   rS   z(State.initialize_preemption_sync_manager   sn    %++kkA ##/
DF F 	++- 	   ++DKK8rj   )
NNNNN,  Nd   rn   N)r   r   r   r   r   r   rX   int | Sequence[int] | NonerY   r   rZ   r   r[   r   r\   r   r]   r   r   r   )__name__
__module____qualname__r   __annotations__r   r   r   r   r   r   rf   rh   rS    rj   r*   r   r   8   s    *c-9='6=7;&
4;(,:,$(z( $/:$ 48-1*.@D8</28<2514/3w+&0w+ *w+ (w+ $>	w+
 ,6w+ *-w+ ,6w+ -0w+ ,/w+ #-w+r9rj   r   c                    t        j                         rt        d      |
|	t        j                  dt
               |	}
t        j                  | |||||||||

       y)a  Initializes the JAX distributed system.

  Calling :func:`~jax.distributed.initialize` prepares JAX for execution on
  multi-host GPU and Cloud TPU. :func:`~jax.distributed.initialize` must be
  called before performing any JAX computations.

  The JAX distributed system serves a number of roles:

    * It allows JAX processes to discover each other and share topology information,
    * It performs health checking, ensuring that all processes shut down if any process dies, and
    * It is used for distributed checkpointing.

  If you are using TPU, Slurm, or Open MPI, all arguments are optional: if omitted, they
  will be chosen automatically.

  The ``cluster_detection_method`` may be used to choose a specific method for detecting those
  distributed arguments. You may pass any of the automatic ``spec_detect_methods`` to this
  argument though it is not necessary in the TPU, Slurm, or Open MPI cases.  For other MPI
  installations, if you have a functional ``mpi4py`` installed, you may pass
  ``cluster_detection_method="mpi4py"`` to bootstrap the required arguments.

  Otherwise, you must provide the ``coordinator_address``,
  ``num_processes``, ``process_id``, and ``local_device_ids`` arguments
  to :func:`~jax.distributed.initialize`. When all four arguments are provided, cluster
  environment auto detection will be skipped.

  Please note: on some systems, particularly HPC clusters that only access external networks
  through proxy variables such as HTTP_PROXY, HTTPS_PROXY, etc., the call to
  :func:`~jax.distributed.initialize` may timeout.  You may need to unset these variables
  prior to application launch.

  Args:
    coordinator_address: the IP address of process `0` and a port on which that
      process should launch a coordinator service. The choice of
      port does not matter, so long as the port is available on the coordinator
      and all processes agree on the port.
      May be ``None`` only on supported environments, in which case it will be chosen automatically.
      Note that special addresses like ``localhost`` or ``127.0.0.1`` usually mean that the program
      will bind to a local interface and are not suitable when running in a multi-host environment.
    num_processes: Number of processes. May be ``None`` only on supported environments, in
      which case it will be chosen automatically.
    process_id: The ID number of the current process. The ``process_id`` values across
      the cluster must be a dense range ``0``, ``1``, ..., ``num_processes - 1``.
      May be ``None`` only on supported environments; if ``None`` it will be chosen automatically.
    local_device_ids: Restricts the visible devices of the current process to ``local_device_ids``.
      If ``None``, defaults to all local devices being visible to the process except when processes
      are launched via Slurm and Open MPI on GPUs. In that case, it will default to a single device per process.
    cluster_detection_method: An optional string to attempt to autodetect the configuration of the distributed
      run.  Note that "mpi4py" method requires you to have a working ``mpi4py`` install in your environment,
      and launch the applicatoin with an MPI-compatible job launcher such as ``mpiexec`` or ``mpirun``.
      Legacy auto-detect options "ompi" (OMPI) and "slurm" (Slurm) remain enabled. "deactivate" bypasses
      automatic cluster detection.
    initialization_timeout: Time period (in seconds) for which connection will
      be retried. If the initialization takes more than the timeout specified,
      the initialization will error. Defaults to 300 secs i.e. 5 mins.
    heartbeat_timeout_seconds: The time (in seconds) after which a process is
      considered dead if it hasn't successfully sent any heartbeats. Defaults
      to 100 seconds.
    shutdown_timeout_seconds: The time (in seconds) a terminating process will
      wait for all other processes to also terminate. Defaults to 300 seconds.
    coordinator_bind_address: the address and port to which the coordinator service
      on process `0` should bind. If this is not specified, the default is to bind to
      all available addresses on the same port as ``coordinator_address``. On systems
      that have multiple network interfaces per node it may be insufficient to only
      have the coordinator service listen on one address/interface.
    slice_index: DEPRECATED: Use ``partition_index`` instead.
    partition_index: The partition index assigned to this process' local devices. If any process sets ``partition_index``,
      then all processes must do so. If ``None`` the partition indices will be chosen automatically.

  Raises:
    RuntimeError: If :func:`~jax.distributed.initialize` is called more than once
      or if called after the backend is already initialized.

  Examples:

  Suppose there are two GPU processes, and process 0 is the designated coordinator
  with address ``10.0.0.1:1234``. To initialize the GPU cluster, run the
  following commands before anything else.

  On process 0:

  >>> jax.distributed.initialize(coordinator_address='10.0.0.1:1234', num_processes=2, process_id=0)  # doctest: +SKIP

  On process 1:

  >>> jax.distributed.initialize(coordinator_address='10.0.0.1:1234', num_processes=2, process_id=1)  # doctest: +SKIP
  zjax.distributed.initialize() must be called before any JAX calls that might initialise the XLA backend. This includes any computation, but also calls to jax.devices, jax.device_put, and others.NzH`slice_index` has been deprecated. Please use `partition_index` instead.)r\   r]   r   )r   backends_are_initializedrN   rT   rU   rV   global_staterf   )r   r   r   rX   rY   rZ   r\   r]   r[   slice_indexr   s              r*   rf   rf      s|    D ((*
 t u u mm
T
 "O-}j*,D02J4M3K*9  ;rj   c                 &    t         j                  duS )z3Check if the JAX distributed system is initialized.N)rx   r   ru   rj   r*   is_initializedr{   M  s    			D	((rj   c                 ,    t         j                          y)z_Shuts down the distributed system.

  Does nothing if the distributed system is not running.
  N)rx   rh   ru   rj   r*   rh   rh   Q  s    
 rj   )NNNNNrn   ro   rn   NNN)r   r   r   r   r   r   rX   rp   rY   r   rZ   r   r\   r   r]   r   r[   r   ry   r   r   r   )returnbool)
__future__r   collections.abcr   loggingr7   typingr   rT   jax._srcr   r   r   jax._src.libr	   	getLoggerrq   rE   	bool_flagrH   
bool_staterQ   rl   r   rx   rf   r{   rh   ru   rj   r*   <module>r      sH   # $  	      			8	$ %F$$		<  +**	$		  /V..	(	F	 [9 [9z w15+/(,>B6:-003/26:)--1s;(s;%s; "<s; *4	s;
 (+s; +.s; *-s; *4s; 's; !+s;l)rj   