
    bi*                     D   d dl mZmZ ddlmZmZmZ ddlmZm	Z	 ddl
mZ ddlmZmZmZ ddlmZ ddlmZ dd	lmZ  ej,                  e      Z ed
ee      Z	 	 	 	 	 ddee   deee      dee   dee   dee   ded   defdZ	 	 	 ddee   dee   dee   dedef
dZy)    )OptionalTypeVar   )Dataset_concatenate_map_style_datasets_interleave_map_style_datasets)DatasetDictIterableDatasetDict)DatasetInfo)IterableDataset_concatenate_iterable_datasets_interleave_iterable_datasets)
NamedSplit)logging)LiteralDatasetTypeNdatasetsprobabilitiesseedinfosplitstopping_strategyfirst_exhaustedall_exhaustedreturnc                    ddl m} ddlm} | st	        d      t        |       D ]  \  }}	t        |	||f      s|t        |	t        t        f      rA|	st	        d| d      t	        d| dt        |	       d	t        t        |	             d
      t	        d| dt        |	      j                   d      |dk(  rt        |	|      r||fn||f\  }
}t        |	
      rt	        d|
j                   dj                   d| d       |dvrt	        | d      
|u rt        | |||||      S t        | |||||      S )u  
    Interleave several datasets (sources) into a single dataset.
    The new dataset is constructed by alternating between the sources to get the examples.

    You can use this function on a list of [`Dataset`] objects, or on a list of [`IterableDataset`] objects.

        - If `probabilities` is `None` (default) the new dataset is constructed by cycling between each source to get the examples.
        - If `probabilities` is not `None`, the new dataset is constructed by getting examples from a random source at a time according to the provided probabilities.

    The resulting dataset ends when one of the source datasets runs out of examples except when `oversampling` is `True`,
    in which case, the resulting dataset ends when all datasets have ran out of examples at least one time.

    Note for iterable datasets:

    In a distributed setup or in PyTorch DataLoader workers, the stopping strategy is applied per process.
    Therefore the "first_exhausted" strategy on an sharded iterable dataset can generate less samples in total (up to 1 missing sample per subdataset per worker).

    Args:
        datasets (`List[Dataset]` or `List[IterableDataset]`):
            List of datasets to interleave.
        probabilities (`List[float]`, *optional*, defaults to `None`):
            If specified, the new dataset is constructed by sampling
            examples from one source at a time according to these probabilities.
        seed (`int`, *optional*, defaults to `None`):
            The random seed used to choose a source for each example.
        info ([`DatasetInfo`], *optional*):
            Dataset information, like description, citation, etc.
            <Added version="2.4.0"/>
        split ([`NamedSplit`], *optional*):
            Name of the dataset split.
            <Added version="2.4.0"/>
        stopping_strategy (`str`, defaults to `first_exhausted`):
            Two strategies are proposed right now, `first_exhausted` and `all_exhausted`.
            By default, `first_exhausted` is an undersampling strategy, i.e the dataset construction is stopped as soon as one dataset has ran out of samples.
            If the strategy is `all_exhausted`,  we use an oversampling strategy, i.e the dataset construction is stopped as soon as every samples of every dataset has been added at least once.
            Note that if the strategy is `all_exhausted`, the interleaved dataset size can get enormous:
            - with no probabilities, the resulting dataset will have `max_length_datasets*nb_dataset` samples.
            - with given probabilities, the resulting dataset will have more samples if some datasets have really low probability of visiting.
    Returns:
        [`Dataset`] or [`IterableDataset`]: Return type depends on the input `datasets`
        parameter. `Dataset` if the input is a list of `Dataset`, `IterableDataset` if the input is a list of
        `IterableDataset`.

    Example:

        For regular datasets (map-style):

        ```python
        >>> from datasets import Dataset, interleave_datasets
        >>> d1 = Dataset.from_dict({"a": [0, 1, 2]})
        >>> d2 = Dataset.from_dict({"a": [10, 11, 12]})
        >>> d3 = Dataset.from_dict({"a": [20, 21, 22]})
        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42, stopping_strategy="all_exhausted")
        >>> dataset["a"]
        [10, 0, 11, 1, 2, 20, 12, 10, 0, 1, 2, 21, 0, 11, 1, 2, 0, 1, 12, 2, 10, 0, 22]
        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42)
        >>> dataset["a"]
        [10, 0, 11, 1, 2]
        >>> dataset = interleave_datasets([d1, d2, d3])
        >>> dataset["a"]
        [0, 10, 20, 1, 11, 21, 2, 12, 22]
        >>> dataset = interleave_datasets([d1, d2, d3], stopping_strategy="all_exhausted")
        >>> dataset["a"]
        [0, 10, 20, 1, 11, 21, 2, 12, 22]
        >>> d1 = Dataset.from_dict({"a": [0, 1, 2]})
        >>> d2 = Dataset.from_dict({"a": [10, 11, 12, 13]})
        >>> d3 = Dataset.from_dict({"a": [20, 21, 22, 23, 24]})
        >>> dataset = interleave_datasets([d1, d2, d3])
        >>> dataset["a"]
        [0, 10, 20, 1, 11, 21, 2, 12, 22]
        >>> dataset = interleave_datasets([d1, d2, d3], stopping_strategy="all_exhausted")
        >>> dataset["a"]
        [0, 10, 20, 1, 11, 21, 2, 12, 22, 0, 13, 23, 1, 10, 24]
        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42)
        >>> dataset["a"]
        [10, 0, 11, 1, 2]
        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42, stopping_strategy="all_exhausted")
        >>> dataset["a"]
        [10, 0, 11, 1, 2, 20, 12, 13, ..., 0, 1, 2, 0, 24]
        For datasets in streaming mode (iterable):

        >>> from datasets import interleave_datasets
        >>> d1 = load_dataset('allenai/c4', 'es', split='train', streaming=True)
        >>> d2 = load_dataset('allenai/c4', 'fr', split='train', streaming=True)
        >>> dataset = interleave_datasets([d1, d2])
        >>> iterator = iter(dataset)
        >>> next(iterator)
        {'text': 'Comprar Zapatillas para niña en chancla con goma por...'}
        >>> next(iterator)
        {'text': 'Le sacre de philippe ier, 23 mai 1059 - Compte Rendu...'
        ```
    r   )r   )r   z/Unable to interleave an empty list of datasets.aExpected a list of Dataset objects or a list of IterableDataset objects, but element at position   is an empty dataset dictionary.Dataset at position  has at least one split: N
Please pick one to interleave with the other datasets, for example: dataset[''] is a .r   Unable to interleave a  (at position 0) with a  (at position K). Expected a list of Dataset objects or a list of IterableDataset objects.r   z: is not supported. Please enter a valid stopping_strategy.)r   r   r   )arrow_datasetr   iterable_datasetr   
ValueError	enumerate
isinstancer	   r
   listnextitertype__name__r   r   )r   r   r   r   r   r   r   r   idatasetdataset_type
other_types               K/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/datasets/combine.pyinterleave_datasetsr9      s   H '1JKK) 
7'G_#=>'K1D#EF${|}{~ : :  !*1#-FtG}o Vddhimnuivdwcxxz|  stusvv|  ~B  CJ  ~K  ~T  ~T  }U  UV  W  6.8'.J/*Q`biPj %L* G\2),*?*?)@@XYcYlYlXmm{|}{~  J  K ).  DD-..hijjw-mTEUf
 	
 -mTEUf
 	
    dsetsaxisc                 ~   | st        d      t        |       D ]  \  }}t        |t        t        f      s|t        |t
        t        f      rA|st        d| d      t        d| dt        |       dt        t        |             d      t        d| dt        |      j                   d	      |d
k(  r,t        |t              rt        t        fnt        t        f\  }}t        |      rt        d|j                   dj                   d| d       t        u rt        | |||      S t        | |||      S )a  
    Converts a list of [`Dataset`] with the same schema into a single [`Dataset`].

    Args:
        dsets (`List[datasets.Dataset]`):
            List of Datasets to concatenate.
        info (`DatasetInfo`, *optional*):
            Dataset information, like description, citation, etc.
        split (`NamedSplit`, *optional*):
            Name of the dataset split.
        axis (`{0, 1}`, defaults to `0`):
            Axis to concatenate over, where `0` means over rows (vertically) and `1` means over columns
            (horizontally).

            <Added version="1.6.0"/>

    Example:

    ```py
    >>> ds3 = concatenate_datasets([ds1, ds2])
    ```
    z0Unable to concatenate an empty list of datasets.r   r   r    r!   r"   r#   r$   r%   r   r&   r'   r(   r)   )r   r   r<   )r,   r-   r.   r   r   r	   r
   r/   r0   r1   r2   r3   r   r   )r;   r   r   r<   r4   r5   r6   r7   s           r8   concatenate_datasetsr>      s   : KLL& 
7'G_#=>'K1D#EF${|}{~ : :  !*1#-FtG}o Vddhimnuivdwcxxz|  stusvv|  ~B  CJ  ~K  ~T  ~T  }U  UV  W  6.8'.J/*Q`biPj %L* G\2),*?*?)@@XYcYlYlXmm{|}{~  J  K ). w.u4uSWXX-e$eRVWWr:   )NNNNr   )NNr   )typingr   r   r*   r   r   r   dataset_dictr	   r
   r   r   r+   r   r   r   splitsr   utilsr   utils.py_utilsr   
get_loggerr3   loggerr   r/   floatintr9   r>    r:   r8   <module>rI      s   $ c c :  l l   # 
		H	% mWo>
 ,0"&"&EVI
;I
DK(I
 3-I
 ;
	I

 JI
 ABI
 I
\ #'"&	9X9X
;
9X J9X 	9X
 9Xr:   