
    uki$                    2   d dl mZ d dlmZ d dlmZ d dlmZ d dlZd dl	Z
d dlZd dlZd dlZd dlZd dlZd dlmZ  ej$                  e      Zdd  e
j*                  ddd	      e
j,                  j/                  d	      z  efd
Z G d dej4                        Zy)    )annotations)contextmanager)cache)chainN)clusters      c                .    fd}| |S  ||       S )Nc                      fd}|S )Nc                 F   t        t        g            D ]  \  }}t        j                  dj                   d|dd| dt                      t        j                  |       	  | i |t        j                  dj                   d|dz    d	       c S  y # $ r$}|t              k(  rt        d
      |Y d }~nd }~ww xY w	 t        j                  dj                   d|dz    d	       # t        j                  dj                   d|dz    d	       w xY w)NzTrying z in z.2fz seconds, attempt /z	Finished z after r	   z	 attemptsz(Retry failed with all attempts exhausted)		enumerater   loggerdebug__name__lentimesleepRuntimeError)	argskwargsite
exceptionsfuncinitial_delaywaits	        X/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/jax/_src/clusters/k8s_cluster.pyretry_driverz4retry.<locals>.retry_decorator.<locals>.retry_driver(   s#   E=/489 $!QDMM?$qg-?s!CI;
O	
 	

1	t&v&
 ,,gacU)<  	R#d)^IJPQQ 	R ,,gacU)<&,,gacU)<s*   )BC#C=C5CC55+D  )r   r!   r   r   r   s   ` r    retry_decoratorzretry.<locals>.retry_decorator'   s          r"   )r   r   r   r   r#   s    ``` r    retryr%   !   s     & 
\4  r$   c                     e Zd ZdZedd       Zeed               Zeed               Z	ee e
e      d                      Zeed               Zeed               Zeed	               Zedd
       Zedd       Zedd       Zy)
K8sCluster55527c                   dt         j                  v r	 dd l}|j                  j                          |j                  j                         | _        |j                  j                         | _        |j                  j"                  j$                  | _        yy# t        t        f$ r= t        j                  dj                  t        j                  d      ddg             Y yw xY w)	NKUBERNETES_SERVICE_HOSTr   
zKubernetes environment detected, but the `kubernetes` package is not installed to enable automatic bootstrapping in this environment. To enable automatic bootstrapping, please install jax with the [k8s] extra. For example:z    pip install jax[k8s]z)    pip install jax[k8s,<MORE-EXTRAS...>]FT)osenviron
kubernetesImportErrorModuleNotFoundErrorwarningswarnjointextwrapfillconfigload_incluster_configclient	CoreV1Api	_core_api
BatchV1Api
_batch_apir   ApiException_ApiException)clsk8ss     r    is_env_presentzK8sCluster.is_env_presentF   s     BJJ.  
jj&&(jj**,cmzz,,.cn**//<<c) ./ 
))MM78
 '7 
	
 s   B A	C#"C#c              #    K   	 d  y # | j                   $ rr}d|j                   d|j                   g}|j                  dk(  r&|j                  t	        j
                  dd             t        dj                  |            |d }~ww xY ww)NzKubernetes API Error: z - i  a  It appears that the Kubernetes service account (SA) associated with this job does not have the permission for pod introspection. Please either grant the default SA permission to read pod info, or create a dedicated service account with the permission and associated with the job. For an example on setting up the service account, see the example/k8s directory in the JAX repo. For more details, please refer to https://docs.jax.dev/en/latest/multi_process.html#kubernetes-exampleP   )widthr+   )r>   statusreasonappendr4   r5   r   r3   )r?   r   err_msgs      r    _handle_api_exceptionz K8sCluster._handle_api_exceptiona   s     4 4)!((3qxxjABg	
Sx}}Q 	
 		 7+,!34s%   B	 BB
A-BB

Bc                P    t        d      j                         j                         S )Nz7/var/run/secrets/kubernetes.io/serviceaccount/namespace)openreadstripr?   s    r    
_namespacezK8sCluster._namespaceu   s!     ?
dfUUWr$   )r   c                   t        j                  t        j                  d            }| j	                         5  | j
                  j                  | j                         d|       j                  \  }d d d        |S # 1 sw Y   S xY w)NHOSTNAMEzstatus.podIP=)	namespacefield_selector)	socketgethostbynamer,   getenvrI   r:   list_namespaced_podrO   items)r?   ippods      r    _podzK8sCluster._pod|   s    
 
		bii
3	4B		"	"	$ mm//.."&rd+ 0  	 s
 J
 Js   ;A>>Bc                    | j                         5  | j                  j                  | j                         j                  j
                  d   | j                               cd d d        S # 1 sw Y   y xY w)Nzjob-name)namerR   )rI   r<   read_namespaced_jobr[   metadatalabelsrO   rN   s    r    _jobzK8sCluster._job   s_     
	"	"	$ ^^//XXZ  ''
3s~~?O 0   s   AA++A4c                   | j                         5  | j                  j                  | j                               j                  }d d d        | j                         j                  j                  xs i D ]\  }|j                  j                  dk(  s|j                  j                  xs i }t        fd|j	                         D              sZ|c S  y # 1 sw Y   xY w)NNonec              3  L   K   | ]  \  }}j                  |      |k(    y wN)get).0kv
pod_labelss      r    	<genexpr>z+K8sCluster._headless_svc.<locals>.<genexpr>   s#     G$!Qz~~a A%Gs   !$)rI   r:   list_namespaced_servicerO   rX   r[   r_   r`   spec
cluster_ipselectorall)r?   servicessvcsvc_selectorrj   s       @r    _headless_svczK8sCluster._headless_svc   s     
	"	"	$ O66s~~7GHNNhO $$++1rJ 				&xx((.BG,2D2D2FGGJ	 O Os   4CC!c                    | j                         j                  j                  D ]  }|j                  du s|c S  t	        d| j                         j                  j
                   d      )NTz:Cannot automatically initialize distributed workload: pod z does not have a controller.)r[   r_   owner_references
controllerr   r]   )r?   owners     r    _controllerzK8sCluster._controller   sm    
 $$55 			T	! XXZ  %%&&BD r$   c                Z   | j                         }| j                         }| j                         }|j                  dk(  rd|j                  j
                  v r@dj                  |j                  j                  |j                  j
                  d         }n|j                  j                  | j                         }d}|r?|d|j                  j                   dz  }dd	d
dddd|j                  j                   dg}	n0|dz  }dddddd
dddd|j                  j                   dd	d
dddddg}	t        dj                  t        j                  |      g|	z               dj                  |j                  j                  |j                  j                        }|r_t        dt!        j"                  ddd      t         j$                  j'                  d      z  t(        j*                        d         }
 |
|       |xs | j,                  }d!j                  ||"      S t        d#      )$NJobzjobset.sigs.k8s.io/jobset-namez{job_name}-0.{subdomain})job_name	subdomainzSPods within a job need a headless service in order to communicate with each other. zA headless service 'z' is found that targets this job, but it is not specified as the job subdomain. Please add the following to the job specification: z```z	kind: Jobzspec:z  ...z  template:z	    spec:z      subdomain: z3To fix, add the following to the job specification:zapiVersion: v1zkind: Servicez	metadata:z  name: jaxpodsz   publishNotReadyAddresses: truez  clusterIP: Nonez  selector:z    job-name: z---z      subdomain: jaxpodsr+   g      ?r   g      ?   )r   r   r   c                .    t        j                  |        y re   )rT   rU   )hostnames    r    wait_for_hostz9K8sCluster.get_coordinator_address.<locals>.wait_for_host   s       *r$   z{hostname}:{port})r   portz=In K8s, cluster automatic bootstrap only supports Job/JobSet.)ry   ra   r[   kindr_   r`   formatr]   rm   r}   rt   r   r3   r4   r5   r%   nplogspacerandomrandrT   gaierror_coordinator_port)r?   timeout_secsoverride_coordinator_portrw   jobrZ   coordinator_hostnamerr   rH   fix_msgr   r   s               r    get_coordinator_addressz"K8sCluster.get_coordinator_address   sL   "J
((*C
((*C%	)S\\-@-@	@9@@<<$$LL''(HI  A  
 88%!!##,  $S\\%6%6$7 8D DG !#,,"3"3!45	G LLG0!s||0012(%G* TYYg(>'?''IJK
K9@@<<$$HH&&  A  

 
 Ra(299>>!+<<
+
+ ,
-&?#*?*?d ''% (   G r$   c                J    | j                         j                  j                  S re   )ra   rm   parallelismrN   s    r    get_process_countzK8sCluster.get_process_count  s     88:??&&&r$   c                l    	 t        t        j                  d         S # t        $ r t	        d      w xY w)NJOB_COMPLETION_INDEXzlTo enable automatic bootstrap in a K8s cluster, jobs must be indexed by setting `completionMode: "Indexed"`.)intr,   r-   KeyErrorr   rN   s    r    get_process_idzK8sCluster.get_process_id  s>    2344 	G s    3N)returnbool)r   z
int | Noner   z
str | Noner   str)r   r   )r   
__module____qualname__r   classmethodrA   r   rI   r   rO   r%   
ValueErrorr[   ra   rt   ry   r   r   r   r"   r$   r    r'   r'   @   s     4 4  4$  	 
 J   	   	   	  
 	 
 Z Zx ' '  r$   r'   )
__future__r   
contextlibr   	functoolsr   	itertoolsr   loggingnumpyr   r,   rT   r   r4   r1   jax._srcr   	getLoggerr   r   r   r   r   	Exceptionr%   
ClusterEnvr'   r"   r$   r    <module>r      s    # %     	      
		8	$ 
	RA	!2	2	!>Z$$ Zr$   