
    bi_                         d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	Z
ddlZddlmZ 	 ddlmZ dd	lmZ d
 Zd Zd Z	 ddZd Z G d d      Z G d d      Zd Zy# e$ r dZY 8w xY w)zTF-specific utils import.    N)partial)ceil)uuid4)get_context)SharedMemory   )configc                    t        | t              r| S t        j                  rdd l}nt        d      | d   }i }|j                         D ]  \  }}t        |t        j                        r*t        j                  | D cg c]  }||   	 c}      ||<   Jt        ||j                        r&|j                  | D cg c]  }||   	 c}      ||<   t        j                  | D cg c]  }||   	 c}      ||<    |S c c}w c c}w c c}w )Nr   FCalled a Tensorflow-specific function but Tensorflow is not installed.)
isinstancedictr	   TF_AVAILABLE
tensorflowImportErroritemsnpndarraystackTensorarray)featurestffirstbatchkvfs          R/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/datasets/utils/tf_utils.pyminimal_tf_collate_fnr   $   s    (D!			bccQKEE :1a$xxx 8!1 89E!H299%xxx 8!1 89E!Hxxx 8!1 89E!H: L !9 8 8s   =C=
9D
#D
c                 :    t        |       }d|v r|d   |d<   |d= |S )Nlabellabels)r   )r   r   s     r   #minimal_tf_collate_fn_with_renamingr#   8   s-    !(+E%.h'NL    c                 ,   t         j                  j                  |       rt        | j                        S t         j                  j                  |       xs@ t         j                  j                  |       xs t         j                  j                  |       S N)patypesis_listis_numeric_pa_type
value_type
is_integeris_floating
is_decimal)pa_types    r   r*   r*   @   sc    	xx !'"4"45588w'h288+?+?+HhBHHL_L_`gLhhr$   c                 B   t        | t        j                        s| j                         } d}t        | t        j                        r|| j                            }d}nqt        j                  t        j                  |       dk(        r|| d   | d   dz    }n7t        | t        j                        r||    }nt        dt        |              |+|j                         D 	
ci c]  \  }	}
|	|v s|	dv r|	|
 }}	}
|rft        t        |j                               d         }t        |      D 	
cg c])  }|j                         D 	
ci c]  \  }	}
|	|
|    c}
}	+ }}	}}
 ||fi |}|rJi }|j                         D ]3  \  }}t        j                  ||         }|j!                  |      }|||<   5 |S g }|j                         D ]?  \  }}t        j                  ||         }|j!                  |      }|j#                  |       A |S c c}
}	w c c}
}	w c c}
}	}w )NTF   r   zUnexpected type for indices: )r!   	label_idsr"   )r   r   r   numpyintegeritemalldiffRuntimeErrortyper   lenlistvaluesranger   astypeappend)indicesdatasetcols_to_retain
collate_fncollate_fn_argscolumns_to_np_typesreturn_dict
is_batchedr   keyvalueactual_sizei	out_batchcol
cast_dtyper   s                    r   np_get_batchrP   F   s    grzz*--/J'2::&'
	 A%	&
WR[1_5	GRZZ	( :4=/JKK! $kkm
Un$/O(O J
 
 $u||~.q12JOP[J\]]Q%++-@JC#uQx-@]]u00E	288: 	#OCHHU3Z(ELL,E"IcN		#  	288: 	$OCHHU3Z(ELL,EU#		$
 5
 A]s   #H1H	HHHc	           	      $    t         j                  rddlnt        d      t	        d      rj
                  nft	        j                  j                  d      r!j                  j                  j                  n%t               dkD  rt        j                  d       dt        t         |||d	      j                         D 	cg c]  }	j                  j!                  |	       c}	j#                  j%                  dj&                        g
      fd       }
j(                  j*                  j-                  t                     }|rJHj/                  dj1                  dj&                              } fd}|j3                  ||      }n!|r|j5                  |j7                               }||j9                  ||      }|j;                  |
      }|fd}nfd}|j;                  |      S c c}	w )a  Create a tf.data.Dataset from the underlying Dataset. This is a single-process method - the multiprocess
    equivalent is multiprocess_dataset_to_tf.

    Args:
        dataset (`Dataset`): Dataset to wrap with tf.data.Dataset.
        cols_to_retain (`List[str]`): Dataset column(s) to load in the
            tf.data.Dataset. It is acceptable to include column names that are created by the `collate_fn` and
            that do not exist in the original dataset.
        collate_fn(`Callable`): A function or callable object (such as a `DataCollator`) that will collate
            lists of samples into a batch.
        collate_fn_args (`Dict`): A  `dict` of keyword arguments to be passed to the
            `collate_fn`. Can be empty.
        columns_to_np_types (`Dict[str, np.dtype]`): A `dict` mapping column names to numpy dtypes.
        output_signature (`Dict[str, tf.TensorSpec]`): A `dict` mapping column names to
            `tf.TensorSpec` objects.
        shuffle(`bool`): Shuffle the dataset order when loading. Recommended True for training, False for
            validation/evaluation.
        batch_size (`int`, default `None`): Size of batches to load from the dataset. Defaults to `None`, which implies that
            the dataset won't be batched, but the returned dataset can be batched later with `tf_dataset.batch(batch_size)`.
        drop_remainder(`bool`, default `None`): Drop the last incomplete batch when loading. If not provided,
            defaults to the same setting as shuffle.

    Returns:
        `tf.data.Dataset`
    r   Nr   random_index_shuffleindex_shufflei zto_tf_dataset() can be memory-inefficient on versions of TensorFlow older than 2.9. If you are iterating over a dataset with a very large number of samples, consider upgrading to TF >= 2.9.F)rB   rC   rD   rE   rF   rG   )input_signaturec                     j                  | g      }t        j                               D ci c]  \  }}|||    c}}S c c}}w )N)inpTout)py_function	enumeratekeys)rA   outputrL   rI   rF   	getter_fnr   touts       r   fetch_functionz%dataset_to_tf.<locals>.fetch_function   sR    	   

 .77J7O7O7Q-RS61cVAYSSSs   A   r2   )dtype)rJ   c                     j                  | dk(        r(j                  j                  ddj                        }  || t	              dz
        }| |fS )Nr2   r_   l            )shapemaxvalra   r1   )indexseed	max_index)
reduce_allrandomuniformint64r;   )statere   shuffled_indexrB   rR   r   s      r   scan_random_indexz(dataset_to_tf.<locals>.scan_random_index   sZ    }}Ub[) 		))U"(()S1EUXY`UadeUefN.((r$   )drop_remainderc           
          | j                         D ci c]%  \  }}|j                  ||   j                        ' c}}S c c}}w r&   r   ensure_shaperc   
input_dictrI   valoutput_signaturer   s      r   ensure_shapesz$dataset_to_tf.<locals>.ensure_shapes   sA    [e[k[k[mnxsTWC.>s.C.I.IJJnnns   *Ac                     | j                         D ci c](  \  }}|j                  ||   j                  dd        * c}}S c c}}w )Nr1   rq   rs   s      r   rw   z$dataset_to_tf.<locals>.ensure_shapes   sJ    _i_o_o_qrS[SVX[C.>s.C.I.I!".MNNrrrs   -A)r	   r   r   r   hasattrrR   ri   experimentalrS   r;   warningswarnr   rP   r=   dtypesas_dtypefunction
TensorSpecrk   dataDatasetr>   fillcastscanshufflecardinalityr   map)rB   rC   rD   rE   rF   rv   r   
batch_sizero   ra   r^   
tf_dataset	base_seedrn   rw   r\   rR   r   r]   s   `   ``         @@@@r   dataset_to_tfr   v   s   H bcc r)*!66	''	9!yy55CCw<*$MM*
  $%'/I 4G3M3M3OP%BIIu%PD[["--bhh"?!@[AT BT &&s7|4J'3GGD"(((CGD		)  __Y0AB
	''
(>(>(@A
%%j%P
/J	o
	s >>-((W Qs   "Hc                   *    e Zd Zd Zd Zd Zd Zd Zy)SharedMemoryContextc                      g | _         g | _        y r&   )created_shmsopened_shmsselfs    r   __init__zSharedMemoryContext.__init__   s    r$   c                     t        t        |      ||      }|r| j                  j                  |       |S | j                  j                  |       |S )N)sizenamecreate)r   intr   r@   r   )r   r   r   r   shms        r   get_shmzSharedMemoryContext.get_shm   sM    D	VD$$S) 
 ##C(
r$   c                     | j                  |t        j                  |      t        j                  |      j                  z  |      }t        j
                  |||j                        S )N)r   r   r   )ra   buffer)r   r   prodra   itemsizer   buf)r   r   rc   ra   r   r   s         r   	get_arrayzSharedMemoryContext.get_array   sI    ll2775>BHHUO<T<T+T]cldzz%uSWW==r$   c                     | S r&    r   s    r   	__enter__zSharedMemoryContext.__enter__       r$   c                     | j                   D ]"  }|j                          |j                          $ | j                  D ]  }|j                           y r&   )r   closeunlinkr   )r   exc_type	exc_value	tracebackr   s        r   __exit__zSharedMemoryContext.__exit__   sI    $$ 	CIIKJJL	 ## 	CIIK	r$   N)__name__
__module____qualname__r   r   r   r   r   r   r$   r   r   r      s    >r$   r   c                   >    e Zd Zd Zd Zd Zed        Zed        Zy)NumpyMultiprocessingGeneratorc                    || _         || _        || _        || _        |j	                         D cg c]  \  }}|t
        j                  u s| c}}| _        |j	                         D ci c]+  \  }}||| j                  vr|nt        j                  d      - c}}| _	        || _
        || _        || _        |	| _        |
| _        |j	                         D ci c]U  \  }}||| j                  vrt        |j                   j"                        n!t        |j                   j"                        dz   W c}}| _        y c c}}w c c}}w c c}}w )NU1r1   )rB   rC   rD   rE   r   r   str_string_columnsra   rF   rv   r   r   ro   num_workersr   rc   rankcolumns_to_ranks)r   rB   rC   rD   rE   rF   rv   r   r   ro   r   rN   ra   specs                 r   r   z&NumpyMultiprocessingGenerator.__init__  s2    ,$.5H5N5N5PezsETY]_]d]dTdse 2779$
U #T%8%88bhhtnL$
  !1$,& .335!
T D4G4G)GTZZ__%SQUQ[Q[Q`Q`MadeMee!
 f$
!
s   D>D>+0EAE
c              #     K   t        | j                  t        t        t	        | j
                        | j                  z                    }| j                  | j
                  | j                  | j                  || j                        \  }}}t        d      }g }g }g }t        |      D 	cg c]  }	|j                          }
}	t        |      D 	cg c]  }	|j                          }}	| j
                  | j                  | j                  | j                  | j                   | j"                  | j$                  d}t'               5 }t        |      D ]  }t)        t+                     }d| d| d d }|j-                  |       | j"                  j/                         D ci c]0  \  }}||j1                  | d| d|ft2        j4                  d      2 }}}|j-                  |       ||   }||k(  r||}nd }||||
|   ||   d	|}|j7                  | j8                  |d
      }|j;                          |j-                  |        d}|sht        |      D ]U  }|
|   j=                  d      st?        d      |
|   jA                          ||   }tC        d |jE                         D              rd} nt'               5 }|j/                         D ci c]0  \  }}||j1                  ||    d| || j                   |   d      2 }}}|j/                         D ci c]  \  }}|t3        jF                  |       }}}| j$                  D ];  }||   jI                  d||   jJ                  d          jM                  d      ||<   = 	 d d d         ||   jO                          X |sh|D ]  }|jQ                           	 d d d        y c c}	w c c}	w c c}}w c c}}w c c}}w # 1 sw Y   dxY w# 1 sw Y   y xY ww)Nspawn)rB   rC   rD   rE   rF   r   r   dw__
   _shapeTrc   ra   r   )worker_namerA   extra_batcharray_ready_eventarray_loaded_event)targetkwargsdaemonF<   )timeoutzData loading worker timed out!c              3   L   K   | ]  }t        j                  |d k          yw)r   N)r   any).0rc   s     r   	<genexpr>z9NumpyMultiprocessingGenerator.__iter__.<locals>.<genexpr>e  s     P266%!),Ps   "$Ur2   ))minr   r   r   r;   rB   r   distribute_batchesro   r   r   r>   EventrC   rD   rE   rF   r   r   r   strr   r@   r   r   r   rk   Processworker_loopstartwaitTimeoutErrorclearr   r=   copyviewrc   squeezesetjoin)r   r   per_worker_batchesfinal_batchfinal_batch_workerctxnamesshape_arraysworkersr   array_ready_eventsarray_loaded_events	base_argsshm_ctxrL   worker_random_idr   rN   r   worker_shape_arraysworker_indicesfinal_batch_argworker_kwargsworkerend_signal_receivedarray_shapesbatch_shm_ctxrc   arraysarr
string_cols                                  r   __iter__z&NumpyMultiprocessingGenerator.__iter__*  sJ    $**CS5F5X0Y,Z[>B>U>ULL$//4+>+>T\\?
;K); '"383EFaciikFF49+4FGqsyy{GG ||"11//#33#'#;#; $ 5 5"11
	 !" H	g;' '#&uw<  #A3a(8'9:3B?[) &*%:%:%@%@%B'!T **k]!C5+GPTw^`^f^fos*tt'# ' ##$78!3A!6**{/F&1O&*O#.-#2);A)>*=a*@!  ! D,<,<][_`v&5'8 #(){+ %1A-a055b5A*+KLL&q)//1#/?LP,:M:M:OPP /3+ -. - /;.@.@.B" !+U  !8!8#(8*AcU 3&+&*&>&>s&C',	 "9 " " " EKLLN!SS#rwws|"3!S!S*.*=*= J &z 2 7 7!F:<N<T<TUW<X;Y8Z [ c cdf g #:.& !L'*..0K%1 *R " OH	 H	 GG"'X" "T aH	 H	s   B(P*OPO'APAO?5O!
C0O?O35O'O3" O-AO3$O?6O?P!O?'O33O<8O??PPc                     | S r&   r   r   s    r   __call__z&NumpyMultiprocessingGenerator.__call__  r   r$   c                 @   	
 dt         j                  d<   t        j                  rdd l}nt        d      |j                  j                  g d       
 	f
d}t               5 }|j                         D ci c]0  \  }}||j                  	 d| d|ft        j                  d	
      2 c}}|D ]
  } ||        | ||       j                         D ]
  \  }}d|d d   
j                          d d d        y c c}}w # 1 sw Y   y xY w)N3TF_CPP_MIN_LOG_LEVELr   r   GPUc           	        
 t        | 	
d      }i }t               5 }j                         D ]|  \  }}||   }|v r-|j                  d      j	                  |j
                  dz         }|j
                  |   d d  |j                   d| |j
                  |d      ||<   |||   d d  ~ j                          j                          j                          d d d        y # 1 sw Y   y xY w)NT)rA   rB   rC   rD   rE   rF   rG   r   )r2   r   r   )
rP   r   r   r   reshaperc   r   r   r   r   )rA   r   
out_arraysr   rN   rO   r   r   r   rD   rE   rC   rF   rB   r   r   r   s          r   send_batch_to_parentzGNumpyMultiprocessingGenerator.worker_loop.<locals>.send_batch_to_parent  s    -% /$7 E J$& +- (;'@'@'B /OC!#JEn, !&

4 0 8 8u9L M+0;;L%a(&3&=&=&-q.ekk\` '> 'JsO */JsOA&/ "%%'"'')"((*%+ + +s   C C))C2r   r   Fr   r2   )osenvironr	   r   r   r   set_visible_devicesr   r   r   r   rk   r   )rB   rC   rD   rE   rF   r   r   rA   r   r   r   r   r   r   r   rN   r   r   r   r   s   ````` `  ```       @r   r   z)NumpyMultiprocessingGenerator.worker_loop  s$    .1

)*#fgg
		%%b%0	+ 	+B !" 	$g "2!7!7!9C W&&+auF'CD7Z\ZbZbkp&qqL
 ! ,$U+,&$[1*002 
Ua!!#	$ 	$	$ 	$s   2D5D;A
DDDc                    t        j                  t        |             }|rt         j                  j	                  |       t        |      }|||z  z
  }t        j
                  ||g      \  }}|st        |      dk(  rd }|j                  d|      }t        |      }	|	|	|z  z
  }
t        j
                  ||
g      \  }}|j                  d||      }t        j
                  ||j                  d   d      }|D cg c]  }t        j                  |d       }}t        t        |            D ]4  }t        j                  ||   ||   j                  dd      gd      ||<   6 |t        |      }nd }|||fS c c}w )Nr   r2   r1   )axis)r   aranger;   ri   r   splitr   rc   r   r>   concatenate)rB   r   ro   r   r   rA   num_samplesincomplete_batch_cutofflast_incomplete_batchnum_batchesfinal_batches_cutofffinal_batchesper_worker_indicesr   rL   incomplete_batch_worker_idxs                   r   r   z0NumpyMultiprocessingGenerator.distribute_batches  s   ))CL)IIg&'l #.z1I"J)+'<S;T)U&&S!671<$(!//"j1'l*kK.GH!#'4H3I!J//"k:>XXgw}}Q/?aHRdebjj;ees=)* 	uA$&NN4Fq4I=YZK[KcKcdegiKj3krs$tq!	u !,*-m*<'*.'!#8:UUU fs   :E9N)	r   r   r   r   r   r   staticmethodr   r   r   r$   r   r   r     sA     
D_B E$ E$N V Vr$   r   c
                    t         j                  rddl}
nt        d      t	        | |||||||||	
      }|
j
                  j                  j                  ||      }|rt        t        |       |z        }n t        t        t        |       |z              }|j                  |
j
                  j                  j                  |            S )ao  Create a tf.data.Dataset from the underlying Dataset. This is a multi-process method - the single-process
    equivalent is dataset_to_tf.

    Args:
        dataset (`Dataset`): Dataset to wrap with tf.data.Dataset.
        cols_to_retain (`List[str]`): Dataset column(s) to load in the
            tf.data.Dataset. It is acceptable to include column names that are created by the `collate_fn` and
            that do not exist in the original dataset.
        collate_fn(`Callable`): A function or callable object (such as a `DataCollator`) that will collate
            lists of samples into a batch.
        collate_fn_args (`Dict`): A  `dict` of keyword arguments to be passed to the
            `collate_fn`. Can be empty.
        columns_to_np_types (`Dict[str, np.dtype]`): A `dict` mapping column names to numpy dtypes.
        output_signature (`Dict[str, tf.TensorSpec]`): A `dict` mapping column names to
            `tf.TensorSpec` objects.
        shuffle(`bool`): Shuffle the dataset order when loading. Recommended True for training, False for
            validation/evaluation.
        batch_size (`int`, default `None`): Size of batches to load from the dataset. Defaults to `None`, which implies that
            the dataset won't be batched, but the returned dataset can be batched later with `tf_dataset.batch(batch_size)`.
        drop_remainder(`bool`, default `None`): Drop the last incomplete batch when loading. If not provided,
            defaults to the same setting as shuffle.
        num_workers (`int`): Number of workers to use for loading the dataset. Should be >= 1.

    Returns:
        `tf.data.Dataset`
    r   Nr   )
rB   rC   rD   rE   rF   rv   r   r   ro   r   )rv   )r	   r   r   r   r   r   r   from_generatorr   r;   r   applyrz   assert_cardinality)rB   rC   rD   rE   rF   rv   r   r   ro   r   r   data_generatorr   dataset_lengths                 r   multiprocess_dataset_to_tfr    s    L bcc2%'/)%N //Qa/bJS\Z78T#g,";<=BGG00CCNSTTr$   )F)__doc__r   r{   	functoolsr   mathr   uuidr   r4   r   pyarrowr'   multiprocessr   multiprocess.shared_memoryr   r    r	   r   r#   r*   rP   r   r   r   r  r   r$   r   <module>r      s      	       $7 (i ej-`n)b @mV mV`=Us  Ls   A! !A+*A+