
    biz                    n   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZ	d dl
Z
d dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZm Z m!Z!m"Z"m#Z#m$Z$ d dl%Z&d dl'Z(d dl)Z*d dl+Z,d dl-m.Z/ d dl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6 d dl7m8Z8 d dl9m:Z:m;Z; d dl<m=Z= d dl>m?Z? ddl@mAZA ddlBmCZCmDZDmEZE ddlFmGZG ddlHmIZI ddlJmKZKmLZLmMZMmNZNmOZOmPZPmQZQmRZRmSZS ddlTmUZUmVZVmWZWmXZXmYZYmZZZ ddl[m\Z\m]Z] ddl^m_Z_ ddl`maZambZbmcZcmdZd ddlemfZfmgZgmhZhmiZi ddljmkZl ddlmmnZn ddlompZp ddlqmrZrmsZsmtZtmuZumvZv ddlwmxZxmyZymzZzm{Z{ dd l|m}Z} erd dl~Z~d dlZd dlZd dlZ ene      Ze$eef   Zd! Zd"ed#eeef   fd$Zd"ed%ed&ed'ee   fd(Zdd)eeef   d*e#eI   d+eIfd,Zd-eeee f      d+eeef   fd.Zd)eeef   d+eeee f      fd/Z	 dd0eeeef      d1ed2ed+eeee,j$                  f      fd3Z G d4 d5      Z G d6 d7e      Z G d8 d9e      Z G d: d;e      Z G d< d=e      Z G d> d?e      Z G d@ dAe      Z G dB dCe      Z G dD dEe      Z G dF dGe      ZdHee   fdIZ G dJ dKe      Z G dL dMe      Zd+e,j$                  fdNZ G dO dPe      ZdQe$ee,j$                  f   dRe$eee,jF                  e,jH                  e,jJ                  f   dSefdTZdUe"dQe$ee,j$                  f   dSefdVZdUe"dQe$ee,j$                  f   dSefdWZ G dX dYe      Z G dZ d[e      Z G d\ d]e      Z G d^ d_e      Z G d` dae      Zd"edbeIdceee$eedf   f   d+efddZd)edbeIdceee$eedf   f   d+efdeZe G df dg             Z G dh die      Ze G dj dk             Ze G dl dm             Zdn Zdoe$edpf   d+e$edpf   fdqZ G dr ds      Z G dt dueE      Z	 	 	 ddvee   dwe#e\   dxe#ea   dyed+ef
dzZ	 	 	 	 	 dd{ee   d|e#ee      d}e#e   dwe#e\   dxe#ea   d~erd   d+efdZdededed+efdZd Zy)    N)Counter)IterableIterator)deepcopy)	dataclass)partial)BytesIO)cycleislice)Path)TYPE_CHECKINGAnyBinaryIOCallableOptionalUnion)
CommitInfoCommitOperationAddCommitOperationDeleteDatasetCardDatasetCardDataHfApi)RepoFile)HfHubHTTPErrorRepositoryNotFoundError)Pool)	HTTPError   )config):PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDEDDatasetDatasetInfoMixin)sanitize_patterns)Features)	FeatureTypeListValue_align_features!_check_if_features_can_be_aligned%_fix_for_backward_compatible_features_visitcast_to_python_objectsrequire_decoding)ArrowFormatterPythonFormatterTableFormatterTensorFormatterget_format_type_from_aliasget_formatter)DatasetInfoDatasetInfosDict)	_split_re)
NamedSplitSplit	SplitDict	SplitInfo)cast_table_to_featuresembed_table_storageread_schema_from_file
table_cast)tqdm)
get_logger)MetadataConfigs)Literalasdictglob_pattern_to_regexiflatmap_unorderedstring_to_dict)_merge_gen_kwargs_number_of_shards_in_gen_kwargs_shuffle_gen_kwargs_split_gen_kwargs)PathLikec                     | S N )xs    T/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/datasets/iterable_dataset.pyidentity_funcrQ   R   s    H    examplecolumn_mappingc                     t         fd|D              rJt        dt        |       dt        |j                                dt	        |      t	               z
   d      t         fd|j                         D              rXt        dt        |       dt        |j                                dt	               t	        |j                               z
   d      |j                         D ci c]  \  }}| |    c}}S c c}}w )Nc              3   &   K   | ]  }|v 
 y wrM   rN   .0colrS   s     rP   	<genexpr>z%_rename_columns_fn.<locals>.<genexpr>W   s     
8#3g
8   zError when renaming z to z
: columns z are not in the dataset.c              3   &   K   | ]  }|v  
 y wrM   rN   rW   s     rP   rZ   z%_rename_columns_fn.<locals>.<genexpr>[   s     
=c3'>
=r[   z are already in the dataset.)any
ValueErrorlistvaluessetitems)rS   rT   original_column_namenew_column_names   `   rP   _rename_columns_fnre   V   sI   

8
88"4#7"8T.BWBWBY=Z<[[efijxfy|  AH  }I  gI  fJ  Jb  c
 	
 
=^%:%:%<
=="4#7"8T.BWBWBY=Z<[[efijqfrux  zH  zO  zO  zQ  vR  gR  fS  So  p
 	

 6D5I5I5K1 / 	!566  s   -Didxnamecolumnc                 <    || v rt        d| d| d      |||   iS )NzError when adding z	: column z is already in the dataset.)r^   )rS   rf   rg   rh   s       rP   add_column_fnrj   e   s4    w-dV9TFB]^__&+rR   batchtry_featuresreturnc                 2   t         j                  j                  |       }|*	 t        |t        j                  |j
                              }t        j                  |j                        S # t        t         j                  t         j                  f$ r Y Iw xY wrM   )paTablefrom_pydictr>   schematype	TypeErrorArrowInvalidArrowNotImplementedErrorr$   from_arrow_schema)rk   rl   pa_tables      rP   _infer_features_from_batchry   k   sx    xx##E*H	!(BIIl6G6G,HIH %%hoo66 2??B,G,GH 		s   )A+ +(BBexamplesc           
          | D ci c]  }|D ]  }|d   }}}|D cg c]!  }| D cg c]  }|j                  |       c}# }}}t        t        ||            S c c}}w c c}w c c}}w rM   )getdictzip)rz   rS   rY   colsarrayss        rP   _examples_to_batchr   u   sl     &.A'A#CIACADADHISx8Gw{{38IFID&!"" B8Is   A	A(A# A(#A(c              #      K   t        |       dk(  rdnt        | t        t        |                      }t        |      D ]+  }| j	                         D ci c]  \  }}|||    c}} - yc c}}w w)z3Convert a batch (dict of examples) to examples listr   N)lennextiterrangerb   )rk   
n_examplesirY   arrays        rP   _batch_to_examplesr   ~   sd     %jAo3uT$u+5F/G+HJ: >/4{{}=esE!H}==>=s   AA3A-$A3iterable
batch_sizedrop_last_batchc              #     K   ||dk  rBdt         j                  j                  t        | D cg c]  \  }}|	 c}}d            f yt	        |       }|D ]  \  }}t        ||dz
        }||fgt        |      z   }t        |      |k  r|r yt        | \  }	}
dj                  d |	D              }|t         j                  j                  t        |
d            f  yc c}}w w)	a  Convert and group examples in Arrow tables of size `batch_size`.

    Args:
        iterable (`Iterable[Tuple[Key, dict]]`):
            An examples iterable containing tuples (example_key, example) of type (int/str, dict)
        batch_size (`Optional[int]`):
            Size of each sub-table to yield. If None or <= 0, yields the full table.
        drop_last_batch (`bool`, defaults to `False`):
            Drop the last batch if it is smaller than `batch_size`.
    Nr   allT)only_1d_for_numpyr   _c              3   2   K   | ]  }t        |        y wrM   strrX   keys     rP   rZ   z$_convert_to_arrow.<locals>.<genexpr>   s     43s84   )
ro   rp   from_pylistr,   r   r   r_   r   r~   join)r   r   r   r   rS   iteratorr   iterator_batchkey_examples_listkeysrz   new_keys               rP   _convert_to_arrowr      s      Z1_HH  !7S[8\ZQ8\pt!uv
 	
 	H~H  fW*q.9!7^,tN/CC !J.?/0h((4t44rxx++,B8_c,deeef	 9]s   ,C+C%B1C+c            	       X   e Zd ZdZddZdeeeef      fdZ	e
deeg eeeej                  f      f      fd       Ze
defd       Ze
dee   fd       Zd	ej*                  j,                  dd fd
Zddededd fdZddededee   fdZe
defd       ZdefdZdedefdZdefdZy)_BaseExamplesIterablez?Base class for the examples iterable used by an IterableDatasetrm   Nc                     d | _         y rM   _state_dictselfs    rP   __init__z_BaseExamplesIterable.__init__   s
    8<rR   c                 0    t        t        |        d      )zWAn examples iterable should yield tuples (example_key, example) of type (int/str, dict)z doesn't implement __iter__ yetNotImplementedErrorrs   r   s    rP   __iter__z_BaseExamplesIterable.__iter__   s    !T$ZL0O"PQQrR   c                      y rM   rN   r   s    rP   
iter_arrowz _BaseExamplesIterable.iter_arrow       rR   c                      yNFrN   r   s    rP   is_typedz_BaseExamplesIterable.is_typed   s    rR   c                      y rM   rN   r   s    rP   featuresz_BaseExamplesIterable.features   r   rR   	generatorc                 0    t        t        |        d      )z
        Either shuffle the shards/sources of the dataset, or propagate the shuffling to the underlying iterable.
        If the order of the shards must stay fixed (when using .skip or .take for example), then this method returns self.
        z+ doesn't implement shuffle_data_sources yetr   r   r   s     rP   shuffle_data_sourcesz*_BaseExamplesIterable.shuffle_data_sources   s    
 "T$ZL0["\]]rR   
num_shardsindexc                 0    t        t        |        d      )ZEither keep only the requested shard, or propagate the request to the underlying iterable.z) doesn't implement shard_data_sources yetr   r   r   r   
contiguouss       rP   shard_data_sourcesz(_BaseExamplesIterable.shard_data_sources   s    !T$ZL0Y"Z[[rR   c                     |rT| j                   |z  }| j                   |z  }||z  t        ||      z   }||z   ||k  rdndz   }t        t        ||            S t        t        || j                   |            S )Nr   r   )r   minr_   r   )r   r   r   r   divmodstartends           rP   split_shard_indices_by_workerz3_BaseExamplesIterable.split_shard_indices_by_worker   su    //Z/C//J.C%K#eS/1E#+eckq9CeS)**eT__jABBrR   c                 0    t        t        |        d      )Nz! doesn't implement num_shards yetr   r   s    rP   r   z _BaseExamplesIterable.num_shards   s    !T$ZL0Q"RSSrR   c                 0    t        t        |        d      )Nz' doesn't implement _init_state_dict yetr   r   s    rP   _init_state_dictz&_BaseExamplesIterable._init_state_dict   s    !T$ZL0W"XYYrR   
state_dictc                 4    fd | j                   |      S )Nc                     |+t        | t              r|D ]  } | |   ||         | |<    | S |=t        | t              r-t        t	        |             D ]  } | |   ||         | |<    | S |S rM   )
isinstancer}   r_   r   r   )state	new_stater   r   _inner_load_state_dicts       rP   r   zE_BaseExamplesIterable.load_state_dict.<locals>._inner_load_state_dict   s    $E4)@$ TC!7c
IcN!SE#JT&:eT+Bs5z* NA5eAh	!ME!HNrR   r   )r   r   r   s     @rP   load_state_dictz%_BaseExamplesIterable.load_state_dict   s    		 &d&6&6
CCrR   c                 n    | j                   rt        j                  | j                         S t        d      )NzPState dict is not initialized, please call ex_iterable._init_state_dict() first.)r   copyr   RuntimeErrorr   s    rP   r   z _BaseExamplesIterable.state_dict   s,    ==!1!122mnnrR   )rm   NT) __name__
__module____qualname____doc__r   r   tupleKeyr}   r   propertyr   r   ro   rp   r   boolr   r$   r   nprandom	Generatorr   intr   r_   r   r   r   r   r   rN   rR   rP   r   r      sR   I=R(5d#34 R HXb(5bhh;O2P.P%QR   $   (8,  ^bii.A.A ^F] ^\S \ \Ri \C CC C]abe]f C TC T TZ$ ZD$ D4 DoD orR   r   c                        e Zd Zdedeeef   f   def fdZdefdZd Z	de
j                  j                  dd fd	Zdd
ededd fdZedefd       Z xZS )ExamplesIterablegenerate_examples_fn.kwargsc                 >    t         |           || _        || _        y rM   )superr   r   r   )r   r   r   	__class__s      rP   r   zExamplesIterable.__init__   s    $8!rR   rm   c                 X    dd| j                   j                  d| _        | j                  S Nr   )	shard_idxshard_example_idxrs   r   r   r   r   s    rP   r   z!ExamplesIterable._init_state_dict   '    )*DNNLcLcdrR   c              #     K   | j                   r| j                   d   nd}t        t        | j                  | j                        |d       D ]  }| j                   r| j                   d   nd}t         | j
                  di ||d       D ])  }| j                   r| j                   dxx   dz  cc<   | + | j                   sv| j                   dxx   dz  cc<   d| j                   d<    y wNr   r   max_num_jobsr   r   rN   )r   r   rJ   r   r   r   )r   shard_idx_start	gen_kwagsshard_example_idx_startkey_examples        rP   r   zExamplesIterable.__iter__   s     ;?;K;K$**;7QR 1$++DOO \^most 	:IOSO_O_d&6&67J&Kef#%&?d&?&?&L)&LNegkl "##$$%89Q>9!!"   -2-89  !45	:s   B?C+)C+r   c                 D    t        | j                  | j                  |      S rM   )#ShuffledDataSourcesExamplesIterabler   r   r   s     rP   r   z%ExamplesIterable.shuffle_data_sources  s    243L3Ldkk[deerR   r   r   c                     t        | j                  | j                        }| j                  |||      }t	        |D cg c]  }||   	 c}      }t        | j                  |      S c c}w Keep only the requested shard.r   r   )rJ   r   r   r   rG   r   r   r   r   r   r   gen_kwargs_listshard_indicesr   requested_gen_kwargss           rP   r   z#ExamplesIterable.shard_data_sources  sc    +DKKdooV:::uYc:d0m1\/!2D1\] 9 9;OPP 2]   A(c                 ,    t        | j                        S rM   rH   r   r   s    rP   r   zExamplesIterable.num_shards      .t{{;;rR   r   )r   r   r   r   r   r   r}   r   r   r   r   r   r   r   r   r   r   r   __classcell__r   s   @rP   r   r      s    Xc5d;K6K-L VZ 
 $  
:fbii.A.A fFX fQS Q QRd Q <C < <rR   r   c                        e Zd Zdedeeef   f   dedej                  j                  f fdZ
defdZd Zdd	ed
eddfdZ xZS )r   r   .r   r   c                 F    t         |   ||       t        |      | _        y rM   r   r   r   r   )r   r   r   r   r   s       rP   r   z,ShuffledDataSourcesExamplesIterable.__init__  s!     	-v6!),rR   rm   c                 X    dd| j                   j                  d| _        | j                  S r   r   r   s    rP   r   z4ShuffledDataSourcesExamplesIterable._init_state_dict  r   rR   c              #     K   t        | j                        }t        || j                        }| j                  r| j                  d   nd}t        t        || j                        |d      D ]  }| j                  r| j                  d   nd}t         | j                  di ||d      D ])  }| j                  r| j                  dxx   dz  cc<   | + | j                  sv| j                  dxx   dz  cc<   d| j                  d<    yw)*Shuffle the kwargs order to shuffle shardsr   r   r   Nr   r   rN   )	r   r   rI   r   r   r   rJ   r   r   )r   rngkwargs_with_shuffled_shardsr   r   r   r   s          rP   r   z,ShuffledDataSourcesExamplesIterable.__iter__  s    t~~&&9#t{{&K#;?;K;K$**;7QR9XZiko
 
	:I PTO_O_d&6&67J&Kef#%&?d&?&?&L)&LNegkl "##$$%89Q>9!!"   -2-89  !45
	:s   C D#)Dr   r   r   c                     t        | j                        }t        || j                        }t	        | j
                  |      j                  |||      S r   r   )r   r   rI   r   r   r   r   r   r   r   r   r  r  s         rP   r   z6ShuffledDataSourcesExamplesIterable.shard_data_sources,  sO    t~~&&9#t{{&K# 9 9;VWjj* k 
 	
rR   r   )r   r   r   r   r   r   r}   r   r   r   r   r   r   r   r   r  r  s   @rP   r   r     sk    -$,S%T	2B-B$C-MQ-^`^g^g^q^q- $  :"
S 
 
Rd 
rR   r   c                        e Zd Zdedeeej                  f   f   def fdZ	e
d        ZdefdZd Zd	 Zd
ej                   j"                  dd fdZddededd fdZe
defd       Z xZS )ArrowExamplesIterablegenerate_tables_fn.r   c                 >    t         |           || _        || _        y rM   )r   r   r  r   )r   r  r   r   s      rP   r   zArrowExamplesIterable.__init__6  s    "4rR   c                     | j                   S rM   _iter_arrowr   s    rP   r   z ArrowExamplesIterable.iter_arrow;      rR   rm   c                 X    dd| j                   j                  d| _        | j                  S r   r   r   s    rP   r   z&ArrowExamplesIterable._init_state_dict?  r   rR   c              #     K   t               }| j                  r| j                  d   nd}t        t        | j                  | j
                        |d       D ]  }| j                  r| j                  d   nd}d} | j                  di |D ]  \  }}|t        |      z   |k  r|t        |      z  }&|j                  t        j                        D ]V  }|j                  |      }	t        |	      D ]5  }
||k\  r)| j                  r| j                  dxx   dz  cc<   ||
f |dz  }7 X  | j                  s| j                  dxx   dz  cc<   d| j                  d<    y w)Nr   r   r   r   max_chunksizer   rN   )r/   r   r   rJ   r   r   r  r   	to_readerr   'ARROW_READER_BATCH_SIZE_IN_DATASET_ITERformat_batchr   )r   	formatterr   r   r   r   r   rx   pa_subtableformatted_batchrS   s              rP   r   zArrowExamplesIterable.__iter__C  st    #%	;?;K;K$**;7QR 1$++DOO \^most 	:IOSO_O_d&6&67J&Kef# !!8!8!8!E9!E /X$s8}48OO%X6%#+#5#5FDrDr#5#s /K&/&<&<[&IO#5o#F /,0GG#// $ 0 01D E J E"%w,.)Q.)//	/   -2-89  !45#	:s   D6E#9*E#c              #     K   | j                   r| j                   d   nd}t        t        | j                  | j                        |d       D ]  }| j                   r| j                   d   nd}d} | j
                  di |D ]K  \  }}|t        |      z  }||k  r| j                   r | j                   dxx   t        |      z  cc<   ||f M | j                   s| j                   dxx   dz  cc<   d| j                   d<    y wr   )r   r   rJ   r   r   r  r   )r   r   r   r   r   r   rx   s          rP   r  z!ArrowExamplesIterable._iter_arrowY  s    ;?;K;K$**;7QR 1$++DOO \^most 	:IOSO_O_d&6&67J&Kef# !!8!8!8!E9!E $X!S]2!$(??##$$%89S]J98m#$   -2-89  !45	:s   CD)Dr   c                 D    t        | j                  | j                  |      S rM   )(ShuffledDataSourcesArrowExamplesIterabler  r   r   s     rP   r   z*ArrowExamplesIterable.shuffle_data_sourcesi  s    78O8OQUQ\Q\^ghhrR   r   r   c                     t        | j                  | j                        }| j                  |||      }t	        |D cg c]  }||   	 c}      }t        | j                  |      S c c}w r   )rJ   r   r   r   rG   r  r  r   s           rP   r   z(ArrowExamplesIterable.shard_data_sourcesl  sc    +DKKdooV:::uYc:d0m1\/!2D1\]$T%<%<>RSS 2]r   c                 ,    t        | j                        S rM   r  r   s    rP   r   z ArrowExamplesIterable.num_shardss  r  rR   r   )r   r   r   r   r   r   ro   rp   r}   r   r   r   r   r   r  r   r   r   r   r   r   r   r  r  s   @rP   r  r  5  s    8CsBHH}9M4M+N X\ 
     $  :,: ibii.A.A iF] iTS T TRi T <C < <rR   r  c                        e Zd Zdedeeej                  f   f   dede	j                  j                  f fdZdefdZd Zd	 Zdd
ededdfdZ xZS )r%  r  .r   r   c                 F    t         |   ||       t        |      | _        y rM   r  )r   r  r   r   r   s       rP   r   z1ShuffledDataSourcesArrowExamplesIterable.__init__y  s!     	+V4!),rR   rm   c                 X    dd| j                   j                  d| _        | j                  S r   r   r   s    rP   r   z9ShuffledDataSourcesArrowExamplesIterable._init_state_dict  r   rR   c              #     K   t        | j                        }t        || j                        }t	               }| j
                  r| j
                  d   nd}t        t        || j                        |d      D ]  }| j
                  r| j
                  d   nd}d} | j                  di |D ]  \  }}	|t        |	      z   |k  r|t        |	      z  }&|	j                  t        j                        D ]V  }
|j                  |
      }t        |      D ]5  }||k\  r)| j
                  r| j
                  dxx   dz  cc<   ||f |dz  }7 X  | j
                  s| j
                  dxx   dz  cc<   d| j
                  d<    yw)	r  r   r   r   Nr   r  r   rN   )r   r   rI   r   r/   r   r   rJ   r   r  r   r  r   r  r  r   )r   r  r  r   r   r   r   r   r   rx   r!  r"  rS   s                rP   r   z1ShuffledDataSourcesArrowExamplesIterable.__iter__  s    t~~&&9#t{{&K##%	;?;K;K$**;7QR9XZiko
 	:I PTO_O_d&6&67J&Kef# !!8!8!8!E9!E /X$s8}48OO%X6%#+#5#5FDrDr#5#s /K&/&<&<[&IO#5o#F /,0GG#// $ 0 01D E J E"%w,.)Q.)//	/   -2-89  !45'	:s   EF*Fc              #   N  K   t        | j                        }t        || j                        }| j                  r| j                  d   nd}t        t        || j                        |d       D ]  }| j                  r| j                  d   nd}d} | j                  di |D ]K  \  }}|t        |      z  }||k  r| j                  r | j                  dxx   t        |      z  cc<   ||f M | j                  s| j                  dxx   dz  cc<   d| j                  d<    y wr   )
r   r   rI   r   r   r   rJ   r   r  r   )	r   r  r  r   r   r   r   r   rx   s	            rP   r  z4ShuffledDataSourcesArrowExamplesIterable._iter_arrow  s.    t~~&&9#t{{&K#;?;K;K$**;7QR9XZiko
 	:I PTO_O_d&6&67J&Kef# !!8!8!8!E9!E $X!S]2!$(??##$$%89S]J98m#$   -2-89  !45	:s   C9D%<)D%r   r   r  c                     t        | j                        }t        || j                        }t	        | j
                  |      j                  |||      S r  )r   r   rI   r   r  r  r   r  s         rP   r   z;ShuffledDataSourcesArrowExamplesIterable.shard_data_sources  sO    t~~&&9#t{{&K#$T%<%<>YZmm* n 
 	
rR   r   )r   r   r   r   r   r   ro   rp   r}   r   r   r   r   r   r   r  r   r   r  r  s   @rP   r%  r%  x  sv    -$S%RXX*>%>?- - 99&&	- $  :6:(
S 
 
Ri 
rR   r%  c                   
    e Zd Zddedee   def fdZed        Z	ed        Z
ed        Zdefd	Zd
 Zdeeeej&                  f      fdZdej,                  j.                  dd fdZddededd fdZedefd       Z xZS )RebatchedArrowExamplesIterableex_iterabler   r   c                 L    t         |           || _        || _        || _        y rM   )r   r   r0  r   r   )r   r0  r   r   r   s       rP   r   z'RebatchedArrowExamplesIterable.__init__  s%    &$.rR   c                     | j                   S rM   r  r   s    rP   r   z)RebatchedArrowExamplesIterable.iter_arrow  r  rR   c                 .    | j                   j                  S rM   r0  r   r   s    rP   r   z'RebatchedArrowExamplesIterable.is_typed      (((rR   c                 .    | j                   j                  S rM   r0  r   r   s    rP   r   z'RebatchedArrowExamplesIterable.features  r5  rR   rm   c                     | j                   j                         d ddd| j                  j                  d| _        | j                  S )Nr   )examples_iterableprevious_state	batch_idxnum_chunks_since_previous_statecropped_chunk_lengthrs   r0  r   r   r   r   r   s    rP   r   z/RebatchedArrowExamplesIterable._init_state_dict  sE    !%!1!1!B!B!D"/0$%NN++
 rR   c              #   8   K   | j                   E d {    y 7 wrM   )r0  r   s    rP   r   z'RebatchedArrowExamplesIterable.__iter__  s     ####s   c              #   
  K   | j                   r7| j                   d   r(| j                  j                  | j                   d          | j                  j                  r| j                  j                         }nt	        | j                  d      }| j
                  | j
                  dk  ri| j                   r| j                   d   dkD  ryt        j                  |D cg c]  \  }}|	 c}}      }| j                   rd| j                   d<   d|f yg }g }d}| j                   r| j                   d   nd}| j                   r| j                   d	   nd}	| j                   r)| j                  j                         }
|
| j                   d<   |D ]6  \  }}t        |j                  | j
                  
            D ]  \  }}|dkD  r|dz  }|dk(  r|	dk(  r|dz  }"|dk(  r'|	dkD  r"|j                  |	t        |      |	z
        }d}d}	t        |      dk(  r]|t        |      z   | j
                  k  r1|j                  |       |j                  |       |t        |      z  }|t        |      z   | j
                  k(  r|j                  |       |j                  |       dj                  d |D              }| j                   rF| j                   dxx   dz  cc<   | j                   dxx   t        |      z  cc<   d| j                   d	<   |t        j                  j!                  |      f g }g }d}| j                   s
| j                   d<   |dz   | j                   d<   | j
                  |z
  }|j                  | d| d       |j                  |j                  d|             dj                  d |D              }| j                   rF| j                   dxx   dz  cc<   | j                   dxx   t        |      z  cc<   || j                   d	<   |t        j                  j!                  |      f | d| dg}|j                  |t        |      |z
        g}t        |      |z
  }| j                   s
| j                   d<   || j                   d<    | j                   s| j                  j                         }
9 | j"                  s|rdj                  d |D              }| j                   rD
| j                   d<   | j                   dxx   dz  cc<   d| j                   d<   d| j                   d	<   |t        j                  j!                  |      f yyyc c}}w w)z-Iterate over sub-tables of size `batch_size`.r:  r   r   Nr   r;  r   r<  r=  r  r   c              3   2   K   | ]  }t        |        y wrM   r   rX   _keys     rP   rZ   z=RebatchedArrowExamplesIterable._iter_arrow.<locals>.<genexpr>       &ITs4y&Ir   z[:]c              3   2   K   | ]  }t        |        y wrM   r   rC  s     rP   rZ   z=RebatchedArrowExamplesIterable._iter_arrow.<locals>.<genexpr>  rE  r   [z:]c              3   2   K   | ]  }t        |        y wrM   r   rC  s     rP   rZ   z=RebatchedArrowExamplesIterable._iter_arrow.<locals>.<genexpr>-  s     ATs4yAr   )r   r0  r   r   r   r   ro   concat_tablesr   	enumerater  slicer   appendr   rp   from_batchesr   )r   r   r   rx   all_pa_tablekeys_bufferchunks_bufferchunks_buffer_sizenum_chunks_to_skipchunk_length_to_cropr:  r   r<  chunkr   r=  s                   rP   r  z*RebatchedArrowExamplesIterable._iter_arrow  sA     0 01A B,,T-=-=>N-OP&&''224H()9)9aHH??"doo&:D$4$4[$AA$E++,R+!XX,RSL01  -%%TXTdTdT--.OPjkKOK[K[t//0FGab!--88:N1?D-.% 4	?MC:CHDVDVeietetDVDu:v 1n6/%)&!+&'1,1E1J&!+&'1,1E1I!KK(<c%jK_>_`E)*&+,(u:?%E
2T__D&&s+!((/&#e*4&'#e*4G&&s+!((/!hh&I[&IIG''((5:5(()JKsS`OaaKCD(()?@!288#8#8#GGG"$K$&M)*&''=K(()9:NmpqNq(()JK+/??=O+O(&&#b1E0Fa'HI!((Q8L)MN!hh&I[&IIG''((5:5(()JKsS`OaaKCW(()?@!288#8#8#GGG&)U!,@+A#D"EK%*[[1Es5zThGh%i$jM),U6J)J&''=K(()9:Nm(()JKc1nd !%!1!1!<!<!>i4	?j ##hhA[AAG5C  !12  -2-FG  !BC;<  !7828800??? )6#E -Ss'   CU%U
(IU%<D9U%7-U%&B?U%r   c                 v    t        | j                  j                  |      | j                  | j                        S rM   )r/  r0  r   r   r   r   s     rP   r   z3RebatchedArrowExamplesIterable.shuffle_data_sources5  s1    -11)<dootOcOc
 	
rR   r   r   c                 |    t        | j                  j                  |||      | j                  | j                        S Nr   )r/  r0  r   r   r   r   s       rP   r   z1RebatchedArrowExamplesIterable.shard_data_sources:  s:    -//
Ej/YOO  
 	
rR   c                 .    | j                   j                  S rM   r0  r   r   s    rP   r   z)RebatchedArrowExamplesIterable.num_shardsA      ***rR   Fr   )r   r   r   r   r   r   r   r   r   r   r   r   r}   r   r   r   r   r   ro   rp   r  r   r   r   r   r   r   r  r  s   @rP   r/  r/    s    /$9 /xPS} /gk /     ) ) ) )	 $ 	 $T@XeCM&:; T@l
bii.A.A 
Ff 


S 
 
Rr 
 +C + +rR   r/  c                       e Zd Zdedee   f fdZed        Zed        Z	ed        Z
defdZd	 Zdeeeej$                  f      fd
Zdej*                  j,                  dd fdZddededd fdZedefd       Z xZS )SelectColumnsIterabler0  column_namesc                 >    t         |           || _        || _        y rM   )r   r   r0  r_  )r   r0  r_  r   s      rP   r   zSelectColumnsIterable.__init__G  s    &(rR   c                 H    | j                   j                  r| j                  S y rM   )r0  r   r  r   s    rP   r   z SelectColumnsIterable.iter_arrowL  s"    &&### 'rR   c                 .    | j                   j                  S rM   r4  r   s    rP   r   zSelectColumnsIterable.is_typedQ  r5  rR   c                 .    | j                   j                  S rM   r7  r   s    rP   r   zSelectColumnsIterable.featuresU  r5  rR   rm   c                 X    | j                   j                         | _        | j                  S rM   r0  r   r   r   s    rP   r   z&SelectColumnsIterable._init_state_dictY  %    ++<<>rR   c              #      K   | j                   D ]'  \  }}|| j                  D ci c]  }|||   
 c}f ) y c c}w wrM   )r0  r_  )r   rf   rowcs       rP   r   zSelectColumnsIterable.__iter__]  sF     (( 	>HC4+<+<=a3q6	===	>=s   "?:?c              #      K   | j                   j                         D ]3  \  }}t        |      dkD  s||j                  | j                        f 5 y wNr   )r0  r   r   selectr_  )r   rf   rx   s      rP   r  z!SelectColumnsIterable._iter_arrowa  sN     !--88: 	>MC8}q 8??4+<+<===	>s
   /A"Ar   c                 `    t        | j                  j                  |      | j                        S rM   )r^  r0  r   r_  r   s     rP   r   z*SelectColumnsIterable.shuffle_data_sourcesf  s'    $T%5%5%J%J9%UW[WhWhiirR   r   r   c                 f    t        | j                  j                  |||      | j                        S rX  )r^  r0  r   r_  r   s       rP   r   z(SelectColumnsIterable.shard_data_sourcesi  s3    $//
Ej/Y[_[l[l
 	
rR   c                 .    | j                   j                  S rM   rZ  r   s    rP   r   z SelectColumnsIterable.num_shardsn  r[  rR   r   )r   r   r   r   r_   r   r   r   r   r   r   r}   r   r   r   r   r   ro   rp   r  r   r   r   r   r   r   r   r  r  s   @rP   r^  r^  F  s    )$9 )c )
 $ $ ) ) ) ) $  >>XeCM&:; >
jbii.A.A jF] j
S 
 
Ri 

 +C + +rR   r^  c                        e Zd Zdededef fdZed        Zed        Zde	fdZ
d	 Zd
ej                  j                  dd fdZddededd fdZedefd       Z xZS )StepExamplesIterabler0  stepoffsetc                 L    t         |           || _        || _        || _        y rM   )r   r   r0  rr  rs  )r   r0  rr  rs  r   s       rP   r   zStepExamplesIterable.__init__t  s$    &	rR   c                 .    | j                   j                  S rM   r4  r   s    rP   r   zStepExamplesIterable.is_typed{  r5  rR   c                 .    | j                   j                  S rM   r7  r   s    rP   r   zStepExamplesIterable.features  r5  rR   rm   c                 X    | j                   j                         | _        | j                  S rM   re  r   s    rP   r   z%StepExamplesIterable._init_state_dict  rf  rR   c              #      K   t        | j                        }	 t        t        || j                              }t        |      | j                  kD  r|| j                      ny KwrM   )r   r0  r_   r   rr  r   rs  )r   ex_iteratorrk   s      rP   r   zStepExamplesIterable.__iter__  sU     4++,TYY78E5zDKK'DKK(( s   A"A$r   c                 x    t        | j                  j                  |      | j                  | j                        S )Nrr  rs  )rq  r0  r   rr  rs  r   s     rP   r   z)StepExamplesIterable.shuffle_data_sources  s2    #11)<499UYU`U`
 	
rR   r   r   c                 ~    t        | j                  j                  |||      | j                  | j                        S )Nr   r{  )rq  r0  r   rr  rs  r   s       rP   r   z'StepExamplesIterable.shard_data_sources  s8    #//
Ej/Y;;
 	
rR   c                 .    | j                   j                  S rM   rZ  r   s    rP   r   zStepExamplesIterable.num_shards  r[  rR   r   )r   r   r   r   r   r   r   r   r   r}   r   r   r   r   r   r   r   r   r  r  s   @rP   rq  rq  s  s    $9  c  ) ) ) ) $  
bii.A.A 
F\ 


S 
 
Rh 
 +C + +rR   rq  c                        e Zd Z	 ddee   ded   f fdZed        Zed        Z	d Z
defd	Zd
 Zdej                  j                   dd fdZedefd       Z	 ddededd fdZ xZS )#CyclingMultiSourcesExamplesIterableex_iterablesstopping_strategyfirst_exhaustedall_exhaustedc                     t         |           || _        || _        |dk(  rt        j
                  | _        y t        j                  | _        y )Nr  )r   r   r  r  r   r   r]   bool_strategy_func)r   r  r  r   s      rP   r   z,CyclingMultiSourcesExamplesIterable.__init__  sC    
 	(!2 .?/-Q"&&XZX^X^rR   c                 4    | j                   d   j                  S rk  r  r   r   s    rP   r   z,CyclingMultiSourcesExamplesIterable.is_typed        #,,,rR   c                 4    | j                   d   j                  S rk  r  r   r   s    rP   r   z,CyclingMultiSourcesExamplesIterable.features  r  rR   c           	   #      K   | j                   r| j                   d   nd}t        t        t        t	        | j
                                    |dz   d       D ]#  }| j                   r|| j                   d<   | |}% y wNex_iterable_idxr   r   )r   r   r
   r   r   r  )r   r  next_ex_iterable_idxs      rP   _get_indices_iteratorz9CyclingMultiSourcesExamplesIterable._get_indices_iterator  s}     AEAQAQ$**+<=WX$*5s4;L;L7M1N+OQ`cdQdfj$k 	3 6J  !23!!2O		3s   A:A<rm   c                    d| j                   D cg c]  }|j                          c}d gt        | j                         z  dgt        | j                         z  | j                  j                  d| _        | j
                  S c c}w )Nr   F)r  r  previous_statesis_exhaustedrs   )r  r   r   r   r   r   r   r0  s     rP   r   z4CyclingMultiSourcesExamplesIterable._init_state_dict  sw     OSO`O`a[99;a $vD,=,=(>>"Gc$*;*;&<<NN++
  bs   Bc              #   z  K   d gt        | j                        z  }| j                  rdt        t        | j                              D ]C  }| j                  d   |   | j                  |   j	                  | j                  d   |          E | j                  D cg c]  }t        |       }}| j                         }| j                  r"t        j                  | j                  d         n(t        j                  t        | j                        d      }|D ]  }| j                  |      r y ||   t        ||   d      ||<   ||   }| j                  r+t        | j                  d   |         | j                  d   |<   t        ||   d      ||<   ||   du rd||<   | j                  rd| j                  d   |<   d ||<   | j                  r?| j                  |   j                         | j                  d   |<   d | j                  d   |<   t        | j                  |         ||<   |dus|  y c c}w w)Nr  r  Fr  T)r   r  r   r   r   r   r  r   r   fullr  r   r   r   )r   nextsr   r0  	iteratorsindices_iteratorr  results           rP   r   z,CyclingMultiSourcesExamplesIterable.__iter__  sA    T..//3t0012 a##$56q9E%%a(889I9IJ[9\]^9_`a ;?:K:KL;T+&L	L557 ;?:J:JBHHT%%n56PRPWPWX[\`\m\mXnpuPv 	 " 	A&&|4Qx	!e4a1XF9A$BRBRSaBbcdBe9f  !23A6IaL%0E!H Qx5 "&Q##:>D$$^4Q7a##:>:K:KA:N:_:_:aD$$^4Q7=AD$$%67:#D$5$5a$89	!U"3	 Ms   AH;>H;H6,F H;.H;r   c                     | j                   D cg c]  }|j                  |       }}t        || j                        S c c}w )z*Shuffle each underlying examples iterable.)r  r   r  r  r   r   r0  r  s       rP   r   z8CyclingMultiSourcesExamplesIterable.shuffle_data_sources  s?    W[WhWhi88Cii2<AWAWXX js   ?c                 :    t        d | j                  D              S )Nc              3   4   K   | ]  }|j                     y wrM   r   rX   r0  s     rP   rZ   zACyclingMultiSourcesExamplesIterable.num_shards.<locals>.<genexpr>       Ok;))O   r   r  r   s    rP   r   z.CyclingMultiSourcesExamplesIterable.num_shards      OT=N=NOOOrR   r   r   c           
          t        | j                  D cg c]  }|j                  |||       c}| j                        S c c}w )r   r   r  )r  r  r   r  r   r   r   r   r   s        rP   r   z6CyclingMultiSourcesExamplesIterable.shard_data_sources  sD     3cgctctuW_X((Uz(Ru"44
 	
us   A)r  r   )r   r   r   r_   r   rB   r   r   r   r   r  r}   r   r   r   r   r   r   r   r   r   r  r  s   @rP   r  r    s     J[_01_ ##EF_ - - - -3 $  (TYbii.A.A YFk Y
 PC P P 7;

&)
	.
rR   r  c                        e Zd ZdZdee   f fdZed        Zed        Z	ed        Z
defdZd	 Zd
 Zdej                   j"                  dd fdZedefd       Z	 ddededd fdZ xZS )2VerticallyConcatenatedMultiSourcesExamplesIterablea  
    VerticallyConcatenatedMultiSourcesExamplesIterable simply chains the input iterables.
    It doesn't require the examples iterables to always yield the same columns.
    Instead, this is handled by the `IterableDataset` class or `FormattedExamplesIterable`.

    For information, `IterableDataset` merges the features of all the datasets to concatenate into one.
    We use `IterableDataset._resolve_features` to obtain the features of all the datasets to concatenate.

    Then for each example, `IterableDataset` and `FormattedExamplesIterable` automatically fill missing columns with None.
    This is done with `_apply_feature_types_on_example`.
    r  c                 0    t         |           || _        y rM   r   r   r  r   r  r   s     rP   r   z;VerticallyConcatenatedMultiSourcesExamplesIterable.__init__      (rR   c                 4    | j                   d   j                  S rk  r  r   s    rP   r   z;VerticallyConcatenatedMultiSourcesExamplesIterable.is_typed  r  rR   c                 4    | j                   d   j                  S rk  r  r   s    rP   r   z;VerticallyConcatenatedMultiSourcesExamplesIterable.features  r  rR   c                 T    t        d | j                  D              r| j                  S y )Nc              3   8   K   | ]  }|j                   d u  y wrM   )r   r  s     rP   rZ   zPVerticallyConcatenatedMultiSourcesExamplesIterable.iter_arrow.<locals>.<genexpr>#  s     Wk{%%T1W   )r   r  r  r   s    rP   r   z=VerticallyConcatenatedMultiSourcesExamplesIterable.iter_arrow!  s(    WTEVEVWW### XrR   rm   c                     d| j                   D cg c]  }|j                          c}| j                  j                  d| _        | j                  S c c}w )Nr   )r  r  rs   r  r   r   r   r   r  s     rP   r   zCVerticallyConcatenatedMultiSourcesExamplesIterable._init_state_dict&  sM     OSO`O`a[99;aNN++

  bs   Ac              #      K   | j                   r| j                   d   nd}t        | j                  |d       D ]0  }|E d {    | j                   s| j                   dxx   dz  cc<   2 y 7 +wr  )r   r   r  r   ex_iterable_idx_startr0  s      rP   r   z;VerticallyConcatenatedMultiSourcesExamplesIterable.__iter__.  sn     GKGWGW 0 01B C]^!$"3"35JDQ 	9K"""  !23q83	9"s   =A-A+ A-A-c              #      K   | j                   r| j                   d   nd}t        | j                  |d       D ]>  }|j                         E d {    | j                   s(| j                   dxx   dz  cc<   @ y 7 +wr  )r   r   r  r   r  s      rP   r  z>VerticallyConcatenatedMultiSourcesExamplesIterable._iter_arrow5  sw     GKGWGW 0 01B C]^!$"3"35JDQ 	9K"--///  !23q83	9/s   AA;A9A;A;r   c                     t        |      }t        | j                        }|j                  |       |D cg c]  }|j	                  |       }}t        |      S c c}w )zTShuffle the list of examples iterable, as well as each underlying examples iterable.)r   r_   r  shuffler   r  )r   r   r  r  r0  s        rP   r   zGVerticallyConcatenatedMultiSourcesExamplesIterable.shuffle_data_sources<  sY     y!D--.L!Wcd88CddA,OO es   Ac                 :    t        d | j                  D              S )Nc              3   4   K   | ]  }|j                     y wrM   r  r  s     rP   rZ   zPVerticallyConcatenatedMultiSourcesExamplesIterable.num_shards.<locals>.<genexpr>H  r  r  r  r   s    rP   r   z=VerticallyConcatenatedMultiSourcesExamplesIterable.num_shardsF  r  rR   r   r   c           
      v    t        | j                  D cg c]  }|j                  |||       c}      S c c}w r   r   )r  r  r   r  s        rP   r   zEVerticallyConcatenatedMultiSourcesExamplesIterable.shard_data_sourcesJ  s<     BcgctctuW_X((Uz(Ru
 	
u   6r   )r   r   r   r   r_   r   r   r   r   r   r   r}   r   r   r  r   r   r   r   r   r   r   r  r  s   @rP   r  r    s    
)T*?%@ ) - - - - $ $ $  99P,,P	=P PC P P 7;

&)
	=
rR   r  r_  c                     t        |       }t        d |j                         D              s'|D cg c]  }||   dkD  s| }}t        d| d      yc c}w )zBCheck the column names to make sure they don't contain duplicates.c              3   &   K   | ]	  }|d k(    yw)r   NrN   )rX   counts     rP   rZ   z&_check_column_names.<locals>.<genexpr>V  s     8euz8s   r   zAThe examples iterables can't have duplicated columns but columns z are duplicated.N)r   r   r`   r^   )r_  counterrY   duplicated_columnss       rP   _check_column_namesr  S  sd    l#G8w~~'788-4Icq8HcIIOPbOccst
 	
 9Is
   AAc                        e Zd ZdZdee   f fdZed        Zed        Z	de
fdZd Zd	ej                  j                  dd fd
Zedefd       Z	 ddededd fdZ xZS )4HorizontallyConcatenatedMultiSourcesExamplesIterablea5  
    HorizontallyConcatenatedMultiSourcesExamplesIterable merges examples together for the input list of iterables.
    It also checks that there are no duplicate columns (otherwise we don't know which one to keep).
    This check is done once when yielding the first example.

    However it doesn't fill missing columns with None.
    Instead, this is handled by the `IterableDataset` class or `FormattedExamplesIterable`.

    For information, `IterableDataset` merges the features of all the datasets to concatenate into one.
    We use `IterableDataset._resolve_features` to obtain the features of all the datasets to concatenate.

    Then for each example, `IterableDataset` and `FormattedExamplesIterable` automatically fill missing columns with None.
    This is done with `_apply_feature_types_on_example`.
    r  c                 0    t         |           || _        y rM   r  r  s     rP   r   z=HorizontallyConcatenatedMultiSourcesExamplesIterable.__init__m  r  rR   c                 4    | j                   d   j                  S rk  r  r   s    rP   r   z=HorizontallyConcatenatedMultiSourcesExamplesIterable.is_typedr  r  rR   c                 4    | j                   d   j                  S rk  r  r   s    rP   r   z=HorizontallyConcatenatedMultiSourcesExamplesIterable.featuresv  r  rR   rm   c                     | j                   D cg c]  }|j                          c}| j                  j                  d| _        | j                  S c c}w )N)r  rs   r  r  s     rP   r   zEHorizontallyConcatenatedMultiSourcesExamplesIterable._init_state_dictz  sJ    OSO`O`a[99;aNN++
  bs   Ac           	   #   (  K   | j                   D cg c]  }t        |       }}t        j                         D ]  }g }g }t	        |      D ]3  }	 t        |      \  }}|j                  |       |j                  |       5 |r`|dk(  r"t        |D 	cg c]  }|D ]  }	|	  c}	}       i }
|D ]  }|
j                  |        dj                  d |D              }||
f  y  y c c}w # t        $ r |j                  |       Y w xY wc c}	}w w)Nr   r   c              3   2   K   | ]  }t        |        y wrM   r   r   s     rP   rZ   zPHorizontallyConcatenatedMultiSourcesExamplesIterable.__iter__.<locals>.<genexpr>  s     "<3s8"<r   )r  r   	itertoolsr  r_   r   rM  StopIterationremover  updater   )r   r0  ex_iteratorsr   r   rz   ry  r   rS   column_namenew_exampler   s               rP   r   z=HorizontallyConcatenatedMultiSourcesExamplesIterable.__iter__  s    =A=N=NOk[)OO" 	ADH#L1 55#'#4LCKK$OOG,	5 6'H(h`g(hQ\(h(hi ' 0G&&w/0(("<t"<<{**'	 P % 5 ''45 )is@   DC'+D0C,?DD"A
D,D	DD			Dr   c                     | S )z^Doesn't shuffle the wrapped examples iterable since it would break the alignment between them.rN   r   s     rP   r   zIHorizontallyConcatenatedMultiSourcesExamplesIterable.shuffle_data_sources  s	     rR   c                      yNr   rN   r   s    rP   r   z?HorizontallyConcatenatedMultiSourcesExamplesIterable.num_shards  s    rR   r   r   c           
      v    t        | j                  D cg c]  }|j                  |||       c}      S c c}w r  )r  r  r   r  s        rP   r   zGHorizontallyConcatenatedMultiSourcesExamplesIterable.shard_data_sources  s<     DcgctctuW_X((Uz(Ru
 	
ur  r   )r   r   r   r   r_   r   r   r   r   r   r}   r   r   r   r   r   r   r   r   r   r  r  s   @rP   r  r  ]  s    )T*?%@ )
 - - - - $  .,,	? C   7;

&)
	?
rR   r  c            
            e Zd Z	 	 ddee   dej                  j                  deee	      de
d   f fdZed        Zed        Zd	 Zd
efdZdej                  j                  d
d fdZ	 ddeded
d fdZ xZS )+RandomlyCyclingMultiSourcesExamplesIterabler  r   probabilitiesr  r  c                 T    t         |   ||       t        |      | _        || _        y rM   )r   r   r   r   r  )r   r  r   r  r  r   s        rP   r   z4RandomlyCyclingMultiSourcesExamplesIterable.__init__  s)     	'89!),*rR   c                 4    | j                   d   j                  S rk  r  r   s    rP   r   z4RandomlyCyclingMultiSourcesExamplesIterable.is_typed  r  rR   c                 4    | j                   d   j                  S rk  r  r   s    rP   r   z4RandomlyCyclingMultiSourcesExamplesIterable.features  r  rR   c              #     K   t        | j                        }t        | j                        }d}| j                  r| j                  d   nd}| j                  r| j                  d   |j
                  _        | j                  ~	 t        |j                  d||      |d       D ]Z  }|dz   |z  }| j                  r7|| j                  d<   |dk(  r#|j
                  j                  | j                  d<   t        |       \ }	 t        |j                  ||| j                        |d       D ]Z  }|dz   |z  }| j                  r7|| j                  d<   |dk(  r#|j
                  j                  | j                  d<   t        |       \ w)N  bit_generator_index_offsetr   bit_generator_statesizer   )r  p)r   r   r   r  r   bit_generatorr   r  r   integersr   choice)r   r  num_sourcesrandom_batch_sizeindex_offsetr   s         rP   r  zARandomlyCyclingMultiSourcesExamplesIterable._get_indices_iterator  s    t~~&$++, IMIYIYt''(DE_`&*&6&67L&MC#%QBS TVbdhi !A$01$48I#IL''IU(()EF'1,FIFWFWF]F]D,,-BCa&L!  JJ{1BdFXFXJY[gim !A %11$48I#IL''IU(()EF'1,FIFWFWF]F]D,,-BCa&L! s   FFrm   c                 L   | j                   j                  j                  d| j                  D cg c]  }|j	                          c}d gt        | j                        z  dgt        | j                        z  | j                  j                  d| _        | j                  S c c}w )Nr   F)r  r  r  r  r  rs   )	r   r  r   r  r   r   r   r   r   r  s     rP   r   z<RandomlyCyclingMultiSourcesExamplesIterable._init_state_dict  s    #'>>#?#?#E#E*+OSO`O`a[99;a $vD,=,=(>>"Gc$*;*;&<<NN++
  bs   B!c                     | j                   D cg c]  }|j                  |       }}t        ||| j                  | j                        S c c}w )z;Shuffle the data sources of each wrapped examples iterable.r   r  r  )r  r   r  r  r  r  s       rP   r   z@RandomlyCyclingMultiSourcesExamplesIterable.shuffle_data_sources  sQ    W[WhWhi88Cii:,,"44	
 	
 js   Ar   r   c           
          t        | j                  D cg c]  }|j                  |||       c}| j                  | j                  | j
                        S c c}w r  )r  r  r   r   r  r  r  s        rP   r   z>RandomlyCyclingMultiSourcesExamplesIterable.shard_data_sources  sT     ;cgctctuW_X((Uz(RuNN""	
 	
us   A)Nr  r   )r   r   r   r_   r   r   r   r   r   floatrB   r   r   r   r   r  r}   r   r   r   r   r  r  s   @rP   r  r    s    
 04IZ	+01	+ 99&&	+  U,		+
 ##EF	+ - - - -!:	 $ 	 
bii.A.A 
Fs 
 7;	
	
&)	
	6	
rR   r  c                    t        | t        j                        r| S t        | t        j                  t        j
                  f      rt        j                  j                  |       S t        j                  rHdt        j                  v r6dd l}t        | |j                  |j
                  f      r| j                         S | S )Npolarsr   )r   ro   rp   pd	DataFrameSeriesfrom_pandasr   POLARS_AVAILABLEsysmodulesr  to_arrow)outputpls     rP   _table_output_to_arrowr    s    &"((#&2<<34xx##F++8s{{#:fr||RYY78??$$MrR   c                       e Zd Z	 	 	 	 	 	 	 	 	 	 ddedededeee      dedee	   dedeee      d	ee
   d
ed   dee   dee	   f fdZed        Zed        Zed        Zde
fdZd Zd Zddee	   deeeej0                  f      fdZdej6                  j8                  dd fdZd de	de	dd fdZede	fd       Z xZ S )!MappedExamplesIterabler0  functionwith_indicesinput_columnsbatchedr   r   remove_columns	fn_kwargs
formattingFormattingConfigr   /max_num_running_async_map_functions_in_parallelc                    t         |           || _        || _        || _        || _        || _        || _        || _        || _	        |	xs i | _
        |
| _        || _        |xs t        j                  | _        |
r|
j                   rt#        |t$              sSt'        d|
j(                  j+                          dt-        |       j.                   dt-        |      j.                   d      |j
                  |r|ndk7  rQt'        d|
j(                  j+                          dt-        |       j.                   d|r|nd d|j
                  d	      g | _        y )	NzThe z-formatted z" has underlying iterablethat is a z- instead of a RebatchedArrowExamplesIterable.r   z has batch_size=z/ which isdifferent from ex_iterable.batch_size=z from its underlying iterable.)r   r   r0  r   r  r   r   r  r  r  r  r  	_featuresr   /MAX_NUM_RUNNING_ASYNC_MAP_FUNCTIONS_IN_PARALLELr  is_tabler   r/  r^   format_type
capitalizers   r   _owned_loops_and_tasks)r   r0  r   r  r  r  r   r   r  r  r  r   r  r   s                rP   r   zMappedExamplesIterable.__init__  sv    	& $.,(*"b$!;uv?u?u 	< *--k+IJ :11<<>?{4PT:K^K^J_ `!!%k!2!;!; <<ik  '''JqI :11<<>?{4PT:K^K^J__o  Fpz  LM  pN N>&1&<&<%>>\^ 
 ce#rR   c                 b    | j                   r#| j                   j                  r| j                  S y y rM   )r  r  r  r   s    rP   r   z!MappedExamplesIterable.iter_arrow:  s(    ??t77###  8?rR   c                     | j                   d uS rM   r   r   s    rP   r   zMappedExamplesIterable.is_typed?  s    }}D((rR   c                     | j                   S rM   r
  r   s    rP   r   zMappedExamplesIterable.featuresC      ~~rR   rm   c                     | j                   j                         d dd| j                  j                  d| _        | j                  S )Nr   )r9  r:  !num_examples_since_previous_stateprevious_state_example_idxrs   r>  r   s    rP   r   z'MappedExamplesIterable._init_state_dictG  sB    !%!1!1!B!B!D"12*+NN++
 rR   c              #      K   | j                   rP| j                   j                  r:t               }| j                  d      D ]  \  }}||j	                  |      f  y | j                         E d {    y 7 w)Nr   r  )r  r  r/   r  
format_row_iter)r   r   r   rx   s       rP   r   zMappedExamplesIterable.__iter__Q  si     ??t77')I!%!1!1!1!B :X9//999: zz|##s   A0A:2A83A:c              #   R   	
K    j                   r j                   d   nd
 j                   rG j                   d   r8 j                  j                   j                   d           j                   d   }nd}t         j                         j                  r>t         j                  j                        }t        |t              r|j                  nd nd 
 fd
fd fd fd fd	 fd
 fd	g t        j                   j                        r3	 t        j                          j"                  j%                  f       nd 	
 fd}	  |       } j&                  r	d |D        }|D ]H  \  }} j                   r& j                   d    j                   dxx   dz  cc<   |dkD  r|dz  }C||f J y # t        $ r t        j                          Y w xY w# t(        t*        f$ r rt,        j/                  dt1               d       D ]  }|j3                  d        	 j5                  t        j6                           # t        j8                  t:        f$ r t,        j/                  d       Y  w xY w w xY ww)Nr  r   r:  r  c               3   (  K   D ]  \  } }j                   j                   dk  rnt        j                   dz
        }| |fgt        |      z   }t        | \  }}dj	                  d |D              } j
                  r5j                   )j                   dkD  rt        |      j                   k  r y t        |      }
r 
|      n|}t        t        |            D cg c]  }	|z   	 }}	t        |      z  	|| |ff  y c c}w w)Nr   r   r   c              3   2   K   | ]  }t        |        y wrM   r   r   s     rP   rZ   zLMappedExamplesIterable._iter.<locals>.iter_batched_inputs.<locals>.<genexpr>w  s     8Cs3x8r   )	r   r   r_   r~   r   r   r   r   r   )r   rS   r   r   r   rz   rk   r   indicescurrent_idxformat_dictr   r   s            rP   iter_batched_inputsz9MappedExamplesIterable._iter.<locals>.iter_batched_inputsk  s     ( ,W .$//Q2F $//A*=> 
 '*7^$4tN7K$K!!$&7!8hhh8488((3!+H7*84.9E*u49#>O:P4QRq;?RRs7|+U|++/,* Ss   C"D%D1!Dc               3   X   K   D ]   \  } }t        |      }dz  dz
  | |ff " y wr  )r}   )r   rS   r   r   s     rP   iter_inputsz1MappedExamplesIterable._iter.<locals>.iter_inputs  sB      ( 6W w-q !AoW~556s   '*c                 <   j                   r| rt        t        |             }| D cg c]"  }t        | |         t        | |         k7  s!|$ }}|r>t	        d| d|D cg c]  }t        | |          c} d| dt        | |          d	      y y y c c}w c c}w )Nz!Column lengths mismatch: columns z have length z while z has length .)r  r   r   r   r^   )processed_inputs	first_colrY   bad_colsr   s       rP   validate_function_outputz>MappedExamplesIterable._iter.<locals>.validate_function_output  s    || 0 &6!78	#3s;KC;P7QUXYijsYtUu7uC  $;H:]  {C  TDsvTWXhilXmTn  TD  SE E!!*<<LY<W8X7YYZ\   !1|
 TDs   "BB Bc                     | \  }}j                   |gnj                   D cg c]  }||   	 c}}d}j                  r||fz  }t        |      }|||j                  fS c c}w )NrN   )r  r  r}   r  )	r   r  r   rS   rY   fn_argsadditional_argsinputsr   s	           rP   prepare_inputsz4MappedExamplesIterable._iter.<locals>.prepare_inputs  st    &LC#'#5#5#=wi\`\n\nCoUXGCLCoG O  G:%']F7OT^^CC Dps   A"c                      |       j                   r(j                   D ]  }||v r||= || d   u s||v s||=  i ||}|S r  r  )r   r.  r'  ri  transformed_inputsr   r*  s        rP   prepare_outputsz5MappedExamplesIterable._iter.<locals>.prepare_outputs  sn    $%56"",, 0AF{"1I';q>9aCS>S,Q/	0
 "@F!?.>!?%%rR   c                 b     | |      \  }}}} 	j                   g ||i |} | ||      S )z8Utility to apply the function on a selection of columns.r   
r   r  r.  r,  r-  r  r'  r/  r3  r   s
          rP   apply_functionz4MappedExamplesIterable._iter.<locals>.apply_function  sG    :HV]:^7FG_i,t}}UgUU9U";8HIIrR   c                 ~   K    | |      \  }}}} 	j                   g ||i | d{   } | ||      S 7 w)zLUtility to apply the function on a selection of columns. Same code but asyncNr5  r6  s
          rP   async_apply_functionz:MappedExamplesIterable._iter.<locals>.async_apply_function  sU     :HV]:^7FG_i%2T]]%[G%[o%[QZ%[[";8HII  \s   )=;=c            	   3     K   j                   r        n        } t        j                  j                        rj                  r:j
                  j                         }|j                  d<   d }j                  d   }g }| D ]  \  }}|j                  |       j                  j                   ||                   t              j                  k\  rj                  t        j                  t        j                              \  }}rjt        |      j                  k\  rRj                  t        j                  t        j                              \  }}rt        |      j                  k\  rRt              dj                  z  k\  rj                  d          rd   j                         r|j!                  d      j!                  d      }	}||	j#                         f j                  r6|	u r2j                  d<   dj                  d<   j                  d<   d\  }}rd   j                         rj                  sމsj
                  j                         }d   }} rA|d   j                  d         f |j!                  d      j!                  d      f r@y y j                  rQj                   rEj
                  j                         j                  d<   dj                  d<   j                  d<   | D ]  \  }}j                  rj                   sj                  d<   | 
||      f j                  sGj                   sTj
                  j                         j                  d<   dj                  d<   j                  d<    y w)	Nr:  r  )return_when
   r   r  NN)r  inspectiscoroutinefunctionr   r   r0  r   rM  create_taskr   r  run_until_completeasynciowaitFIRST_COMPLETEDdonepopr  )inputs_iteratorr:  previous_state_taskr  r  r   r   rF  pendingtaskr7  r9  r   r"  r$  loopr   taskss             rP   iter_outputsz2MappedExamplesIterable._iter.<locals>.iter_outputs  so    7;||13O**4==9##%)%5%5%@%@%BN9GD$$%56*.'151A1AB^1_.=?&5 ANA{NN1%LL!1!12F{TU2V!WX5zT%Y%YY(,(?(?#LLG<S<ST)g $G8l8l(l,0,C,C 'U@W@W X-MD' $G8l8l(l 5zR$*^*^%^^//a9E!HMMO")++a.%))A,4..++8K0KAOD,,-=>TUD,,-PQMgD,,-IJBL?N,?  E!HMMO '',?,GE)-)9)9)D)D)F.3Bi+5@27A8 !!*d&=&=eAh&GGGKKNEIIaL0  ##||=A=M=M=X=X=Z(()9:PQ(()LMIT(()EF&5 	YNA{''#||MXD,,-IJ^K;;;''<<AEAQAQA\A\A^D,,-=>TUD,,-PQMXD,,-IJ	Ys:   E>OCOO"O&O*A%OB'O8OAOc              3   J   K   | ]  \  }}t        |      D ]  }||f 
  y wrM   )r   )rX   r   transformed_batchtransformed_examples       rP   rZ   z/MappedExamplesIterable._iter.<locals>.<genexpr>  s>      ../ABS/T , -..s   !#r   z
Canceling z async tasks.KeyboardInterrupt)msgzTasks canceled.)r   r0  r   r   r  r3   r  r   r1   recursive_tensorizer?  r@  r   rC  get_running_loopr   new_event_loopr  rM  r  	ExceptionrR  loggerdebugr   cancelrB  gatherCancelledErrorr^   )r   num_examples_to_skipr   rN  outputsr   rQ  rK  r7  r9  r   r!  r"  r$  r   rL  r/  r3  rM  r*  s   `       @@@@@@@@@@@@rP   r  zMappedExamplesIterable._iterY  s    HLHXHXd&&'CD^_ 0 01A B,,T-=-=>N-OP#'#3#34W#X #$ (()
 ??%doo&A&ABI;EiQ`;a)77gkKK	,6	6
		D	&	J	J %'&&t}}50//1 ''..e}=D8	Y 8	Yt	"nG||29
 -4 /((##(8(89I(J(V$$%HIQNI'!+(A-(.../Q   0--/0^ ,- 		z#e*]CD! 9DKK$7K894++GNNE,BC   ..
; 4LL!234		sh   D!J'1G ,J'2A)G? J'G<9J';G<<J'?AJ$!I.-J$..JJ$JJ$$J'r  c           
   #   6  K   | j                   rt        | j                   j                        n	t               }| j                  j
                  r| j                  j                         }n:t        | j                  | j                  r| j                  nd| j                        }| j                  rG| j                  d   r8| j                  j                  | j                  d          | j                  d   }nd}| j                  r8|6| j                  j                         | j                  d<   d| j                  d<   | j                  r| j                  d   nd}|D ]  \  }}| j                  r2| j                  &t        |      | j                  k  r| j                  r y | j                  |j                  |      gn| j                  D cg c]  }||   	 c}}	| j                   rR| j                  r5|	j#                  t%        t        |            D 
cg c]  }
||
z   	 c}
       n|	j#                  |        | j&                  |	i | j(                  }t+        |      }t-        |t.        j0                        s2t3        d|j4                   dt7        |       d	|j4                   d
      | j8                  rJ| j8                  D ];  }||j:                  v s|j=                  |j:                  j?                  |            }= |B|t        |      z  }| j                  r | j                  dxx   t        |      z  cc<   ||f tA        |jC                  |            D ]C  \  }
}|dz  }| j                  r| j                  dxx   dz  cc<   |dkD  r|dz  }9| d|
 |f E | j                  sZ| j                  j                         | j                  d<   d| j                  d<   | j                  dxx   t        |      z  cc<    y c c}w c c}
w w)Nr   r   r   r:  r  r   r  z(Provided `function` which is applied to z returns a variable of type z*. Make sure provided `function` returns a z to update the dataset.r  r   )"r  r3   r  r.   r0  r   r   r  r   r   r   r   r   r   r  r  r  rM  r   r   r  r  r   ro   rp   rt   
table_typers   r  r_  remove_columnr   rK  r  )r   r  r   r   r]  r   r   rx   rY   function_argsr   r  output_tablerh   r!  s                  rP   r  z"MappedExamplesIterable._iter_arrow  s    RVRaRaM$//2M2M$Ngugw	&&''224H(  .2ll4?? $ 4 4H
  0 01A B,,T-=-=>N-OP#'#3#34W#X #$  9151A1A1L1L1ND-.DED@AHLHXHXd&&'CD^_% 3	TMCOO/MDOO3(( %%- ''12/3/A/ABhsmB 
   <<!((5XCW)Xa+/)XY!((5"T]]MDT^^DF1&9LlBHH5>y?S?S>TTpF|n$NyOcOcNdd{}  """11 kF!:!::'3'A'A,B[B[BaBabhBi'jk $s8},##$$%ABc(mSB<''&/0F0FUb0F0c&d 4NA{1$K''(()LMQRRM+a/,1,  E1#,334 ##9=9I9I9T9T9VD$$%56LMD$$%HI$$%ABc(mSBg3	T C *Ys-   G
PP<PP
 B5PCP5A$Pr   c                 >   t        | j                  j                  |      | j                  | j                  | j
                  | j                  | j                  | j                  | j                  | j                  | j                  | j                  | j                        S )&Shuffle the wrapped examples iterable.r   r  r  r  r   r   r  r  r  r   r  )r  r0  r   r   r  r  r  r   r   r  r  r  r   r  r   s     rP   r   z+MappedExamplesIterable.shuffle_data_sourcesd  sz    %11)<]]**,,LL 00..nn]]<@<p<p
 	
rR   r   r   c                 D   t        | j                  j                  |||      | j                  | j                  | j
                  | j                  | j                  | j                  | j                  | j                  | j                  | j                  | j                        S )r   r   rg  )r  r0  r   r   r  r  r  r   r   r  r  r  r   r  r   s       rP   r   z)MappedExamplesIterable.shard_data_sourcesu  s    %//
Ej/Y]]**,,LL 00..nn]]<@<p<p
 	
rR   c                 .    | j                   j                  S rM   rZ  r   s    rP   r   z!MappedExamplesIterable.num_shards  r[  rR   )
FNFr  FNNNNNrM   r   )!r   r   r   r   r   r   r   r_   r   r   r}   r$   r   r   r   r   r   r   r   r  r   r   r   ro   rp   r  r   r   r   r   r   r   r  r  s   @rP   r  r    s   
 #-1$( %.2$(37'+IM,e*,e ,e 	,e
  S	*,e ,e SM,e ,e !c+,e D>,e /0,e 8$,e :B#,e\ $ $ ) )   $  $AFFT# FT(5QTVXV^V^Q^K_B` FTP
bii.A.A 
F^ 
"
S 
 
Rj 
" +C + +rR   r  inputmaskmask_column_namec                    t        | t        j                        rkt        |t        t        j                  t        j
                  f      s*t        j                  |gt        j                               }| j                  ||      S ||iS )N)rs   )	r   ro   rp   r_   ArrayChunkedArrayr   bool_append_column)rj  rk  rl  s      rP   	_add_maskrr    sb    
 %"$rxx AB88TF4D""#3T:: $''rR   mask_functionc                4     | |g|i |}t        |||      S rM   rr  rs  rj  rl  argsr   rk  s         rP   add_maskrx    s&    000DUD"233rR   c                P   K    | |g|i | d {   }t        |||      S 7 wrM   ru  rv  s         rP   async_add_maskrz    s4      u6t6v66DUD"233 7s   &$&c                        e Zd ZdZ	 	 	 	 	 	 ddedededeee	      dedee
   dee   d	ed
   f fdZ fdZddee
   f fdZdee
   dd fdZdde
de
dd fdZede
fd       Z xZS )FilteredExamplesIterablez
===MASK===r0  r   r  r  r  r   r  r  r  c	                 (   || _         |j                  r/t        i |j                  | j                  t        d      i      }	nd }	t        
|   |t        t        j                  |      rt        nt        || j                        |||||||		       y )Nr   )rl  )	r0  r   r  r  r  r   r  r  r   )rs  r   r$   r   rl  r'   r   r   r   r?  r@  rz  rx  )r   r0  r   r  r  r  r   r  r  r   r   s             rP   r   z!FilteredExamplesIterable.__init__  s     & ^;#7#7 ^9N9NPUV\P] ^_HH#")"="=h"GX!%!6!6
 &'!! 	 	
rR   c              #      K   t         |          D ]2  \  }}t        |      }|j                  | j                        s-||f 4 y wrM   )r   r  r}   rG  rl  )r   r   rS   r   s      rP   r  zFilteredExamplesIterable._iter  sE     !GMO 	#LC7mG{{40017l"	#s
   ;A	Ar  c              #      K   t         |   |      D ]B  \  }}|| j                     }||j                  | j                        j	                  |      f D y w)Nr  )r   r  rl  dropfilter)r   r  r   rx   rk  r   s        rP   r  z$FilteredExamplesIterable._iter_arrow  s^     "W0}0M 	IMCD112Dx}}T%:%:;BB4HHH	Is   AAseedrm   c           
          t        | j                  j                  |      | j                  | j                  | j
                  | j                  | j                  | j                  | j                        S )rf  r   r  r  r  r   r  r  )
r|  r0  r   rs  r  r  r  r   r  r  )r   r  s     rP   r   z-FilteredExamplesIterable.shuffle_data_sources  sZ    '11$7''**,,LLnn	
 		
rR   r   r   c           
          t        | j                  j                  |||      | j                  | j                  | j
                  | j                  | j                  | j                  | j                        S )r   r   r  )
r|  r0  r   rs  r  r  r  r   r  r  r   s       rP   r   z+FilteredExamplesIterable.shard_data_sources  sa    '//
Ej/Y''**,,LLnn	
 		
rR   c                 .    | j                   j                  S rM   rZ  r   s    rP   r   z#FilteredExamplesIterable.num_shards  r[  rR   )FNFr  NNrM   r   )r   r   r   rl  r   r   r   r   r_   r   r   r}   r   r  r  r   r   r   r   r  r  s   @rP   r|  r|    s    # #-1$($(37
*
 
 	

  S	*
 
 SM
 D>
 /0
@#I# I

# 
;U 

S 
 
Rl 
 +C + +rR   r|  c            	       L    e Zd Zdededej                  j                  f fdZe	d        Z
e	d        ZdefdZd	edef fd
Zeddej                  j                  dedee   fd       Zd Zdej                  j                  dd fdZddededd fdZe	defd       Z xZS )BufferShuffledExamplesIterabler0  buffer_sizer   c                 L    t         |           || _        || _        || _        y rM   )r   r   r0  r  r   )r   r0  r  r   r   s       rP   r   z'BufferShuffledExamplesIterable.__init__  s%    &&"rR   c                 .    | j                   j                  S rM   r4  r   s    rP   r   z'BufferShuffledExamplesIterable.is_typed  r5  rR   c                 .    | j                   j                  S rM   r7  r   s    rP   r   z'BufferShuffledExamplesIterable.features  r5  rR   rm   c                     | j                   j                         | _        | j                         | _        | j                  S rM   )r0  r   r   r   _original_state_dictr   s    rP   r   z/BufferShuffledExamplesIterable._init_state_dict  s4    ++<<>$(OO$5!rR   r   c                     | j                   r$|| j                  k7  rt        j                  d       t        |   |      S )NzLoading a state dict of a shuffle buffer of a dataset without the buffer content.The shuffle buffer will be refilled before starting to yield new examples.)r   r  rX  warningr   r   )r   r   r   s     rP   r   z.BufferShuffledExamplesIterable.load_state_dict  s>    T666a w&z22rR   r  c              #   X   K   	 d | j                  d||      D        E d {    $7 w)Nc              3   2   K   | ]  }t        |        y wrM   )r   )rX   r   s     rP   rZ   zFBufferShuffledExamplesIterable._iter_random_indices.<locals>.<genexpr>  s     ]1A]r   r   r  )r  )r  r  r  s      rP   _iter_random_indicesz3BufferShuffledExamplesIterable._iter_random_indices  s/     ]QJ[(\]]] ]s    *(*c              #   @  K   | j                   }t        | j                        }| j                  ||      }g }| j                  D ]9  }t        |      |k(  rt        |      }||    |||<   )|j                  |       ; |j                  |       |E d {    y 7 wrM   )	r  r   r   r  r0  r   r   rM  r  )r   r  r  r  
mem_bufferrO   r   s          rP   r   z'BufferShuffledExamplesIterable.__iter__  s     &&t~~&44S+F
!! 	%A:+-)* m# !
1!!!$	% 	Js   BBBBc                 d    t        | j                  j                  |      | j                  |      S )zFShuffle the wrapped examples iterable as well as the shuffling buffer.r  r   )r  r0  r   r  r   s     rP   r   z3BufferShuffledExamplesIterable.shuffle_data_sources%  s.    -11)<$JZJZfo
 	
rR   r   r   c                 ~    t        | j                  j                  |||      | j                  | j                        S )r   r   r  )r  r0  r   r  r   r   s       rP   r   z1BufferShuffledExamplesIterable.shard_data_sources+  s:    -//
Ej/Y((nn
 	
rR   c                 .    | j                   j                  S rM   rZ  r   s    rP   r   z)BufferShuffledExamplesIterable.num_shards3  r[  rR   )r  r   )r   r   r   r   r   r   r   r   r   r   r   r   r}   r   r   staticmethodr   r  r   r   r   r   r  r  s   @rP   r  r    s
   #$9 # #XZXaXaXkXk # ) ) ) ) $  
3$ 34 3 ^"))"5"5 ^C ^dlmpdq ^ ^"
bii.A.A 
Ff 

S 
 
Rr 
 +C + +rR   r  c            	            e Zd Z	 	 ddedededef fdZed        Zed        Z	de
fd	Zd
 Zed        Zdej                   j"                  dd fdZddededd fdZedefd       Z xZS )SkipExamplesIterabler0  n"block_sources_order_when_shufflingsplit_when_shardingc                 Z    t         |           || _        || _        || _        || _        y rM   r   r   r0  r  r  r  r   r0  r  r  r  r   s        rP   r   zSkipExamplesIterable.__init__9  /     	&2T/#6 rR   c                 .    | j                   j                  S rM   r4  r   s    rP   r   zSkipExamplesIterable.is_typedG  r5  rR   c                 .    | j                   j                  S rM   r7  r   s    rP   r   zSkipExamplesIterable.featuresK  r5  rR   rm   c                     d| j                   j                         | j                  j                  d| _        | j                  S )NF)skippedr9  rs   r>  r   s    rP   r   z%SkipExamplesIterable._init_state_dictO  s<    !%!1!1!B!B!DNN++

 rR   c              #      K   | j                   r| j                   d   rdn| j                  }| j                   rd| j                   d<   t        | j                  |d       E d {    y 7 w)Nr  r   T)r   r  r   r0  )r   r  s     rP   r   zSkipExamplesIterable.__iter__W  s\     %)%5%5$:J:J9:U[_[a[a*.DY'$**,A4HHHs   AA)!A'"A)c                 `    | |z  }| |z  }|g|z  }t        |      D ]  }||xx   dz  cc<    |S r  r   numr  quotient	remainderr  r   s         rP   split_numberz!SkipExamplesIterable.split_number]  E    !8!G	ay! 	A1INI	rR   r   c                     | j                   r| S t        | j                  j                  |      | j                  | j                   | j
                        S )zeMay not shuffle the wrapped examples iterable since it would skip examples from other shards instead.r  r  r  )r  r  r0  r   r  r  r   s     rP   r   z)SkipExamplesIterable.shuffle_data_sourcesf  J    22K'  55i@&&373Z3Z$($<$<	 rR   r   r   c                     | j                   r\t        | j                  j                  |||      | j	                  | j
                  |      |   | j                  | j                         S | S r   r   r  )r  r  r0  r   r  r  r  r   s       rP   r   z'SkipExamplesIterable.shard_data_sourcesr  sf    ##'  33JR\3]##DFFJ7>373Z3Z$($<$<	  KrR   c                 .    | j                   j                  S rM   rZ  r   s    rP   r   zSkipExamplesIterable.num_shards~  r[  rR   TTr   r   r   r   r   r   r   r   r   r   r   r}   r   r   r  r  r   r   r   r   r   r   r  r  s   @rP   r  r  8  s    
 48$(7*7 7 -1	7
 "7 ) ) ) ) $  I  
bii.A.A 
F\ 

S 
 
Rh 
 +C + +rR   r  c                        e Zd ZdZdedee   f fdZdefdZ	d Z
dej                  j                  dd fd	Zdd
ededd fdZedefd       Z xZS )RepeatExamplesIterablezP
    Iterable that repeats the underlying iterable a given number of times.
    r0  	num_timesc                 >    t         |           || _        || _        y rM   )r   r   r0  r  )r   r0  r  r   s      rP   r   zRepeatExamplesIterable.__init__  s    
 	&"rR   rm   c                     d| j                   j                         | j                  j                  d| _        | j                  S )Nr   )repeat_indexr9  rs   r>  r   s    rP   r   z'RepeatExamplesIterable._init_state_dict  s<    !%!1!1!B!B!DNN++

 rR   c              #   N  K   | j                   r| j                   d   nd}	 | j                  |t        | j                  d      k\  ry | j                  E d {    |dz  }| j                   r6|| j                   d<   | j                  j	                         | j                   d<   7 Lw)Nr  r   r   r9  )r   r  maxr0  r   )r   r  s     rP   r   zRepeatExamplesIterable.__iter__  s     ;?;K;Kt''7QR~~)lc$..RS>T.T''''AL3?  08<8H8H8Y8Y8[  !45  (s   AB%B#AB%r   c                 b    t        | j                  j                  |      | j                        S )z-Shuffle the underlying iterable, then repeat.r  )r  r0  r   r  r   s     rP   r   z+RepeatExamplesIterable.shuffle_data_sources  s'    %d&6&6&K&KI&VbfbpbpqqrR   r   r   c                 h    t        | j                  j                  |||      | j                        S )zShard, then repeat shards.r   r  )r  r0  r   r  r   s       rP   r   z)RepeatExamplesIterable.shard_data_sources  s1    %//
Ej/Ynn
 	
rR   c                 .    | j                   j                  S rM   rZ  r   s    rP   r   z!RepeatExamplesIterable.num_shards  r[  rR   r   )r   r   r   r   r   r   r   r   r}   r   r   r   r   r   r   r   r   r   r  r  s   @rP   r  r    s    #*# C=# $  	\rbii.A.A rF^ r
S 
 
Rj 
 +C + +rR   r  c            	            e Zd Z	 	 ddedededef fdZed        Zed        Z	de
fd	Zd
 Zed        Zdej                   j"                  dd fdZddededd fdZedefd       Z xZS )TakeExamplesIterabler0  r  r  r  c                 Z    t         |           || _        || _        || _        || _        y rM   r  r  s        rP   r   zTakeExamplesIterable.__init__  r  rR   c                 .    | j                   j                  S rM   r4  r   s    rP   r   zTakeExamplesIterable.is_typed  r5  rR   c                 .    | j                   j                  S rM   r7  r   s    rP   r   zTakeExamplesIterable.features  r5  rR   rm   c                     d| j                   j                         | j                  j                  d| _        | j                  S )Nr   )	num_takenr9  rs   r>  r   s    rP   r   z%TakeExamplesIterable._init_state_dict  s<    !%!1!1!B!B!DNN++

 rR   c              #      K   | j                   r| j                   d   nd}t        | j                  | j                  |z
        D ])  }| j                   r| j                   dxx   dz  cc<   | + y w)Nr  r   r   )r   r   r0  r  )r   ex_iterable_num_takenr   s      rP   r   zTakeExamplesIterable.__iter__  sm     AEAQAQ 0 0 =WX!$"2"2DFF=R4RS 	K  -2-	s   A.A0c                 `    | |z  }| |z  }|g|z  }t        |      D ]  }||xx   dz  cc<    |S r  r  r  s         rP   r  z!TakeExamplesIterable.split_number  r  rR   r   c                     | j                   r| S t        | j                  j                  |      | j                  | j                   | j
                        S )zeMay not shuffle the wrapped examples iterable since it would take examples from other shards instead.r  )r  r  r0  r   r  r  r   s     rP   r   z)TakeExamplesIterable.shuffle_data_sources  r  rR   r   r   c                 d   | j                   r\t        | j                  j                  |||      | j	                  | j
                  |      |   | j                  | j                         S t        | j                  j                  |||      | j
                  | j                  | j                         S r  )r  r  r0  r   r  r  r  r   s       rP   r   z'TakeExamplesIterable.shard_data_sources  s    ##'  33JR\3]##DFFJ7>373Z3Z$($<$<	  (  33JR\3]&&373Z3Z$($<$<	 rR   c                 .    | j                   j                  S rM   rZ  r   s    rP   r   zTakeExamplesIterable.num_shards   r[  rR   r  r   r  r  s   @rP   r  r    s    
 48$(7*7 7 -1	7
 "7 ) ) ) ) $    
bii.A.A 
F\ 
S  Rh " +C + +rR   r  r   token_per_repo_idc                     t        |       } |D ]  }|| vsd | |<    |j                  |       }|j                  ||      }|S Nr  )r}   encode_exampledecode_example)rS   r   r  r  encoded_exampledecoded_examples         rP   _apply_feature_types_on_exampler    s\     7mG (g%#'GK ( --g6O--oQb-cOrR   c                     t        |       } t        | t        t        |                      }|D ]  }|| vsd g|z  | |<    |j	                  |       }|j                  ||      }|S r  )r}   r   r   r   encode_batchdecode_batch)rk   r   r  r   r  encoded_batchdecoded_batchs          rP   _apply_feature_types_on_batchr    s{     KEU4U,-.J 5e#"&*!4E+5 ))%0M))-K\)]MrR   c                   J    e Zd ZU ee   ed<   edefd       Zedefd       Z	y)r  r  rm   c                 H    t        t        | j                        t              S rM   )r   r3   r  r0   r   s    rP   r  zFormattingConfig.is_table(  s    -(8(89>JJrR   c                 H    t        t        | j                        t              S rM   )r   r3   r  r1   r   s    rP   	is_tensorzFormattingConfig.is_tensor,  s    -(8(89?KKrR   N)
r   r   r   r   r   __annotations__r   r   r  r  rN   rR   rP   r  r  $  sF    #K$ K K L4 L LrR   r  c                   (    e Zd Zdedee   dee   deee	ee
df   f   f fdZed        Zed        Zed	        Zd
efdZd Zd
eeeej,                  f      fdZdej2                  j4                  d
d fdZddeded
d fdZed
efd       Z xZS )FormattedExamplesIterabler0  r  r   r  Nc                 Z    t         |           || _        || _        || _        || _        y rM   )r   r   r0  r
  r  r  )r   r0  r  r   r  r   s        rP   r   z"FormattedExamplesIterable.__init__2  s.     	&!$!2rR   c                     | j                   j                  r/| j                  r| j                  j                  r| j                  S y y rM   )r0  r   r  r  r  r   s    rP   r   z$FormattedExamplesIterable.iter_arrow?  s9    &&4??C[C[### D\&rR   c                 N    | j                   j                  xs | j                  d uS rM   )r0  r   r
  r   s    rP   r   z"FormattedExamplesIterable.is_typedD  s"    ((FDNN$,FFrR   c                     | j                   S rM   r  r   s    rP   r   z"FormattedExamplesIterable.featuresH  r  rR   rm   c                 X    | j                   j                         | _        | j                  S rM   re  r   s    rP   r   z*FormattedExamplesIterable._init_state_dictL  rf  rR   c              #     K   | j                   r| j                   j                  r:t        | j                  j                  s| j
                  nd | j                        }nNt        | j                   j                  | j                  j                  s| j
                  nd | j                        }| j                  j                  r@| j                         D ],  \  }}|j                  |      }t        |      D ]  }||f 
 . y t        |t              r|j                  nd }| j                  D ]Y  \  }}| j                   r8| j                  j                  s"t#        || j                   | j                        }|r ||      }||f [ y w)N)r   r  r  )r  r  r/   r0  r   r
  r  r3   r  r   r  r  r   r   r1   rT  r   r  )r   r   r   rx   rk   rS   r!  s          rP   r   z"FormattedExamplesIterable.__iter__P  sZ    $//":":'/3/?/?/H/Hd"&"8"8I
 &++/3/?/?/H/Hd"&"8"8I
 &&!%!1!1!3 'X!..x81%8 'Gw,&'' i9 -- 
 !% 0 0 #W==)9)9)B)B=$BXBXG )'2G7l"#s   FF
c              #   &  K   | j                   s"| j                  j                         E d {    | j                  j                         D ]  \  }}t        |j                        }| j                   j
                  }| j                   D ]V  }||vst        j                  j                  t        j                         t        |      d g      }|j                  ||      }X |j                  |k7  rt        || j                         }||f  y 7 wrM   )r   r0  r  ra   r_  arrow_schemaro   	NullArrayfrom_buffersnullr   rq  rr   r;   )r   r   rx   columnsrr   r  rY   s          rP   r  z%FormattedExamplesIterable._iter_arrowr  s     }}''33555!--99; 
	 MC(//0G]]//F#}} Hg-,,33BGGIs8}tfUC'55k3GHH &(1(DMMJx-
	  6s   *DDA"DB Dr   c                     t        | j                  j                  |      | j                  | j                  | j
                        S )rf  r   r  r  )r  r0  r   r   r  r  r   s     rP   r   z.FormattedExamplesIterable.shuffle_data_sources  s:    (11)<]]"44	
 	
rR   r   r   c                     t        | j                  j                  |||      | j                  | j                  | j
                        S )r   r   r  )r  r0  r   r   r  r  r   s       rP   r   z,FormattedExamplesIterable.shard_data_sources  sA    (//
Ej/Y]]"44	
 	
rR   c                 .    | j                   j                  S rM   rZ  r   s    rP   r   z$FormattedExamplesIterable.num_shards  r[  rR   r   ) r   r   r   r   r   r  r$   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   ro   rp   r  r   r   r   r   r   r   r   r  r  s   @rP   r  r  1  s   3*3 -.3 8$	3
  U3d?%; ;<3 $ $ G G   $   #D XeCM&:;  
bii.A.A 
Fa 

S 
 
Rm 
 +C + +rR   r  c                   T    e Zd ZU ej                  j
                  ed<   dZee	   ed<   y)ShufflingConfigr   N_original_seed)
r   r   r   r   r   r   r  r  r   r   rN   rR   rP   r  r    s     yy"""$(NHSM(rR   r  c                   "    e Zd ZU eed<   eed<   y)DistributedConfigrank
world_sizeN)r   r   r   r   r  rN   rR   rP   r  r    s    
IOrR   r  c                     t         j                  rfddl}|j                  j                  j
                  | j                  vr5| xj                  |j                  j                  j
                  fz  c_        yyy)zNAdd torch.utils.data.IterableDataset as a parent class if 'torch' is availabler   N)r   TORCH_AVAILABLEtorch.utils.datautilsdataIterableDataset	__bases__)clstorchs     rP   ._maybe_add_torch_iterable_dataset_parent_classr    sV    ;;++3==@MMekk..>>@@M A rR   valueztorch.Tensorc                     t         j                  rJdd l}t        | |j                        r| j                         S  |j                  |       j                         S | S rk  )r   r   r  r   Tensorshare_memory_tensor)r	  r  s     rP   *_maybe_share_with_torch_persistent_workersr    sJ    eU\\*&&((5<<&4466rR   c                   H    e Zd ZdZded   defdZdee   fdZ	dedd fdZ
y	)
IterableColumnaQ  
    An iterable for a specific column of an [`IterableDataset`].

    Example:

    Iterate on the texts of the "text" column of a dataset:

    ```python
    for text in dataset["text"]:
        ...
    ```

    It also works with nested columns:

    ```python
    for source in dataset["metadata"]["source"]:
        ...
    ```
    source)r  r  r  c                      || _         || _        y rM   r  r  )r   r  r  s      rP   r   zIterableColumn.__init__  s    &rR   rm   c              #   P   K   | j                   D ]  }|| j                       y wrM   r  )r   rS   s     rP   r   zIterableColumn.__iter__  s*     {{ 	,G$**++	,s   $&c                     t        | |      S rM   r  r   r  s     rP   __getitem__zIterableColumn.__getitem__      dK00rR   N)r   r   r   r   r   r   r   r   r   r   r  rN   rR   rP   r  r    sE    ('u%HI 'X[ ',(3- ,1s 1/? 1rR   r  c                      e Zd ZdZ	 	 	 	 	 	 dzdedee   dee   dee   dee	   dee
   d	eeeeeedf   f      fd
Zedee   fd       Zedeee      fd       ZdefdZdeddfdZd Zd Zd Zd{dZedefd       Zd Zedefd       Zedefd       Zd Zd Z 	 d|dededefdZ!d Z"d}dedefdZ#d ede$fd!Z%e&dde'jP                  fd"e)d#ee*   d$ee   dedd f
d%       Z+e&	 	 d~d&d'dee   d#ee*   dd fd(       Z,e&d)edd fd*       Z-	 dd+ee   dd fd,Z.	 	 	 	 	 	 	 	 	 dd-ee)   d.ed/eeeee   f      d0edee   ded1eeeee   f      d#ee*   d2ee   dd fd3Z/	 	 	 	 	 	 dd-ee)   d/eeeee   f      d0edee   d2ee   dd fd4Z0	 dd"ee1jd                  jf                     d5edd fd6Z4d7efd8Z5d9edd fd:Z6d;ee   dd fd<Z7d9edd fd=Z8	 dd>ed?ed@edd fdAZ9dBedCeee1jt                  f   dd fdDZ;dEedFedd fdGZ<dHeeef   dd fdIZ=dJeeee   f   dd fdKZ>dJeeee   f   dd fdLZ?dCedMe@dd fdNZAd#e*dd fdOZBddPedQedd fdRZCdSedTedd fdUZDdV ZEd}dededd fdWZFddee   d0edeeeGe   f   fdXZHdefdYZI	 ddee   d0edeeJj                  eGeJj                     f   fdZZL	 	 	 	 ddee   d0ed[ee   d\eded]eGd]   f   f
d^ZM	 	 d~d_eeNeOf   dee   d`ee   defdaZP	 	 d~d_eeNeOf   dee   d`ee   defdbZQ	 ddBedceedddedff   dee   defdgZR	 	 d~d_eeNeOf   dee   d`ee   defdhZSdiedjedkedlededmee   dnee   doee   d>edpedeTeUeeV   eef      fdqZWdkedlededmee   dnee   doee   d>ee   dpedree   deUeeV   eeef   fdsZX	 	 	 	 	 	 	 	 	 	 	 	 	 ddkedteduee   dee   dlee   dvee   dwee   dxee   dmee   dnee   doee   d>ee   dpedree   deYfdyZZy)r  z A Dataset backed by an iterable.Nr0  infosplitr  	shufflingdistributedr  c                    |r(|j                   dkD  r|r|j                  t        d      ||j                         n	t	               }t        j                  | ||       t        j                  |      | _        || _        || _	        || _
        |xs i | _        t        d      | _        d | _        | j                          t!        | j"                         y )Nr   zThe dataset doesn't have a fixed random seed across nodes to shuffle and split the list of dataset shards by node. Please pass e.g. `seed=42` in `.shuffle()` to make all the nodes use the same seed. )r  r  r   )r  r  r   r   r4   r"   r   _ex_iterable_formatting
_shuffling_distributed_token_per_repo_idr  _epoch_starting_state_dict"_prepare_ex_iterable_for_iterationr  r   )r   r0  r  r  r  r  r  r  s           rP   r   zIterableDataset.__init__  s     ;11A5)	H`H`Hhg 
 #.tyy{KM!!$T? IIk2%#'EVE\Z\2\]^2_48!//16t~~FrR   rm   c                 H    | j                   dS t        | j                         S )af  Number of columns in the dataset.
        This can be None if the dataset has unknown features (e.g. after a map() operation).

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
        >>> ds.num_columns
        2
        ```
        N)r   r   r   s    rP   num_columnszIterableDataset.num_columns  s!     }},tD#dmm2DDrR   c                 H    | j                   dS t        | j                         S )a  Names of the columns in the dataset.
        This can be None if the dataset has unknown features (e.g. after a map() operation).

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation", streaming=True)
        >>> ds.column_names
        ['text', 'label']
        ```
        N)r   r_   r   s    rP   r_  zIterableDataset.column_names
  s!     }},tE$t}}2EErR   c                 @    t        j                  | j                        S )a  Get the current state_dict of the dataset.
        It corresponds to the state at the latest example it yielded.

        Resuming returns exactly where the checkpoint was saved except in two cases:

        1. examples from shuffle buffers are lost when resuming and the buffers are refilled with new data
        2. combinations of `.with_format(arrow)` and batched `.map()` may skip one batch.

        Returns:
            `dict`

        Example:

        ```py
        >>> from datasets import Dataset, concatenate_datasets
        >>> ds = Dataset.from_dict({"a": range(6)}).to_iterable_dataset(num_shards=3)
        >>> for idx, example in enumerate(ds):
        ...     print(example)
        ...     if idx == 2:
        ...         state_dict = ds.state_dict()
        ...         print("checkpoint")
        ...         break
        >>> ds.load_state_dict(state_dict)
        >>> print(f"restart from checkpoint")
        >>> for example in ds:
        ...     print(example)
        ```

        which returns:
        ```
        {'a': 0}
        {'a': 1}
        {'a': 2}
        checkpoint
        restart from checkpoint
        {'a': 3}
        {'a': 4}
        {'a': 5}
        ```

        ```py
        >>> from torchdata.stateful_dataloader import StatefulDataLoader
        >>> ds = load_dataset("deepmind/code_contests", streaming=True, split="train")
        >>> dataloader = StatefulDataLoader(ds, batch_size=32, num_workers=4)
        >>> # checkpoint
        >>> state_dict = dataloader.state_dict()  # uses ds.state_dict() under the hood
        >>> # resume from checkpoint
        >>> dataloader.load_state_dict(state_dict)  # uses ds.load_state_dict() under the hood
        ```
        )r   r   r   r   s    rP   r   zIterableDataset.state_dict  s    f }}T--..rR   r   c                     || _         y)a  Load the state_dict of the dataset.
        The iteration will restart at the next example from when the state was saved.

        Resuming returns exactly where the checkpoint was saved except in two cases:

        1. examples from shuffle buffers are lost when resuming and the buffers are refilled with new data
        2. combinations of `.with_format(arrow)` and batched `.map()` may skip one batch.

        Example:

        ```py
        >>> from datasets import Dataset, concatenate_datasets
        >>> ds = Dataset.from_dict({"a": range(6)}).to_iterable_dataset(num_shards=3)
        >>> for idx, example in enumerate(ds):
        ...     print(example)
        ...     if idx == 2:
        ...         state_dict = ds.state_dict()
        ...         print("checkpoint")
        ...         break
        >>> ds.load_state_dict(state_dict)
        >>> print(f"restart from checkpoint")
        >>> for example in ds:
        ...     print(example)
        ```

        which returns:
        ```
        {'a': 0}
        {'a': 1}
        {'a': 2}
        checkpoint
        restart from checkpoint
        {'a': 3}
        {'a': 4}
        {'a': 5}
        ```

        ```py
        >>> from torchdata.stateful_dataloader import StatefulDataLoader
        >>> ds = load_dataset("deepmind/code_contests", streaming=True, split="train")
        >>> dataloader = StatefulDataLoader(ds, batch_size=32, num_workers=4)
        >>> # checkpoint
        >>> state_dict = dataloader.state_dict()  # uses ds.state_dict() under the hood
        >>> # resume from checkpoint
        >>> dataloader.load_state_dict(state_dict)  # uses ds.load_state_dict() under the hood
        ```
        N)r&  )r   r   s     rP   r   zIterableDataset.load_state_dictO  s    ` %/!rR   c                     d| j                   j                  -t        | j                   j                  j                               nd d| j                   dS )Nz IterableDataset({
    features: Unknownz,
    num_shards: z
}))_infor   r_   r   r   r   s    rP   __repr__zIterableDataset.__repr__  st    3X\XbXbXkXkXwD9L9L9Q9Q9S4T  ~G  4H  H[  \`  \k  \k  [l  lq  r  	rrR   c                     | j                   S rM   )__dict__r   s    rP   __getstate__zIterableDataset.__getstate__  s    }}rR   c                 p    || _         t        | j                        | _        t        | j                         y rM   )r2  r  r%  r  r   )r   ds     rP   __setstate__zIterableDataset.__setstate__  s%    @M6t~~FrR   c                 J    t        t        | j                  |                  S )NrA  )r   r   )r   r  s     rP   _headzIterableDataset._head  s    Da0122rR   c                 ,    t        | j                        S rM   )r   r%  r   s    rP   epochzIterableDataset.epoch  s    4;;rR   c                 `   | j                   r%| j                  dk(  r| j                   j                  S | j                   rgt        | j                   j                        j	                  dd      | j                  z
  }|dk  rd|z   n|}t
        j                  j                  |      S t        d      )Nr   l            zThis dataset is not shuffled)	r"  r:  r   r   r  r   r   default_rngr^   )r   effective_seeds     rP   _effective_generatorz$IterableDataset._effective_generator  s    ??tzzQ??,,,__%doo&?&?@II!WUX\XbXbbN;IA;Mg7SaN99((88;<<rR   c                     | j                   r]| j                  j                  | j                   j                  z  dk(  r-| j                  j                  | j                   j                  z  S | j                  j                  S rk  )r#  r   r   r  r   s    rP   r   zIterableDataset.num_shards  se    !2!2!=!=@Q@Q@\@\!\`a!a$$//43D3D3O3OOO  +++rR   c                     | j                   S rM   r  r   s    rP   n_shardszIterableDataset.n_shards  s    rR   c           
   #     K   | j                         }t        j                  j                          dd l}|j
                  j                  j                         }| j                         r|j                  |j                  k  rzt        j                  d|j                   d|j                   d|j                  |j                  z
   d       t        j                  d|j                   d|j                   d       | j                  rd	| j                  j                   d
nd}|j!                  |j                  |j"                  d      }|rt        j%                  | d|j"                   dt'        |       d|j                   d       |j)                  |j                  |j"                  d      }|j+                         | j,                  d| _        | j0                  r:| j,                  | j0                  d   k(  r|j3                  | j0                  d          | j4                  r|j6                  s| j4                  j8                  rst;        | j4                  j<                  | j>                        }|j6                  r|j7                         }ntA        |d      }|D ]  \  }}	|jC                  |	        y |D ]	  \  }}
|
  t        j%                  | d|j"                   dt'        |       d|j                   d       y t        j%                  | d|j"                   d|j                   d|j                   d       y w)Nr   zToo many dataloader workers:  (max is dataset.num_shards=). Stopping z dataloader workers.zTo parallelize data loading, we give each process some shards (or data sources) to process. Therefore it's unnecessary to have a number of workers greater than dataset.num_shards=J. To enable more parallelism, please split the dataset in more files than r&  znode#  Fr   r   r   zdataloader worker#z, ': Starting to iterate over /z shards.r9  r:  r:  r9  r  r   rA  z, ': Finished iterating over z9, ': Stopping... Number of dataset shards < num_workers (<z).)"r'  fsspecasyn
reset_lockr  r  r  get_worker_info_is_main_processr   num_workersrX  r  r  r#  r  r   idrY  r   r   r   r:  r   r&  r   r!  r   r  r3   r  r   r   r  )r   r0  r  worker_info_log_prefixshards_indicesr   r   r   rx   rS   s              rP   _iter_pytorchzIterableDataset._iter_pytorch  s    ==? 	 kk&&668  "{'='=@W@W'WNN/0G0G/HHdepe{e{d| }'33k6L6LLMMac KKjju  kA  kA  jB B[[f[q[qZrrsu <@;L;Ld//445Q7RT$BB"..knnQV C 
 LL-1+..1AA_`cdr`s_ttu  wB  wM  wM  vN  NV  W &88&22+..UZ 9 K &1%A%A%C D ((TZZ4;T;TU\;]-]++D,E,EFY,Z[[%;%;t?O?O?X?X)$*:*:*F*FQUQ^Q^_	))*557H0KH%- 9MC#..x889$/ "LC!M" LL-1+..1AA^_bcq_r^sst  vA  vL  vL  uM  MU  V LL-1+..1AAz  |G  |R  |R  {S  ST  U`  Ul  Ul  Tm  mo  ps   M	Mc                     | j                   r| j                   j                  dkD  rydt        j                  v r:dd l}|j
                  j                  j                         }||j                  dkD  ryy)Nr   Fr  T)	r#  r  r  r  r  r  r  rO  rR  )r   r  rS  s      rP   rP  z IterableDataset._is_main_process  s_    !2!2!7!7!!;ckk!#++**::<K&;>>A+=rR   r   r   c           	         | j                   }| j                  r"|j                  s;| j                  j                  s%| j                  r'|j                  | j                  k7  rt        |||      }| j                  r |j                  | j                               }n|}| j                  r| j                  j                  }| j                  j                  }|j                  |z  dk(  r\| j                         r7|j                  |z  }|dkD  rdnd}t        j                  d| d| d| d	       |j!                  ||d
      }n_| j                         rAt        j                  d| d       t        j                  d| d|j                   d|        t#        |||      }| j                  s%| j                  rF|j                  | j                  k7  r-t%        || j                  | j                  | j&                        }|j)                         | j*                  d| _        | j.                  r:| j*                  | j.                  d   k(  r|j1                  | j.                  d          |S )Nr`  r   r   srG  z
Assigning z shardz (or data sourcez) of the dataset to each node.FrH  zAssigning 1 out of zS examples of the dataset to each node. The others are skipped during the iteration.zIt is more optimized to distribute the dataset shards (or data sources) across nodes. You can do that by using a dataset with number of shards that is a factor of world_size=z. The current dataset has z which is not a factor of r{  r  r   r  rJ  r:  r9  )r   r!  r   r  r   r/  r"  r   r>  r#  r  r  r   rP  rX  r  r   rq  r  r$  r   r:  r   r&  r   )r   r   r   r0  r  r  num_shards_per_nodeplurals           rP   r'  z2IterableDataset._prepare_ex_iterable_for_iteration  sK    ''''4+;+;+D+D+"6"6$--"G8
OK ??%::4;T;T;VWK%K$$))D**55J%%
2a7((**5*@*@J*N'$7!$;SFKK$%8$9xGWX^W__}~ *<<
Z^kp<q((*KK-j\  :M  N KKss}r~ 33>3I3I2JJdeodpr
 3;ZX\]+2F2F$--2W3++"&"9"9	K "-!=!=!?ZZ
 $$t7P7PQX7Y)Y''(A(ABU(VWrR   c              #     K   dt         j                  v rmdd l}|j                  j                  j                         }t        | |j                  j                  j                        r|| j                         E d {    y | j                         }| j                  r|j                  s| j                  j                  rst        | j                  j                  | j                        }|j                  r|j                         }nt!        |d      }|D ]  \  }}|j#                  |        y |D ]	  \  }}|  y 7 ŭw)Nr  r   r  r   rA  )r  r  r  r  r  rO  r   r  rV  r'  r!  r   r  r3   r  r   r   r  )	r   r  rS  r0  r   r   r   rx   rS   s	            rP   r   zIterableDataset.__iter__(	  s    ckk!#++**::<K$ 0 0 @ @AkF]--///==?!7!74;K;K;T;T%d&6&6&B&BT]][I%%&113,[QG!) 5X**8445' 	LCM	 0s   A:E<E=CEc              #     K   | j                   rJt        | j                   j                  | j                        }t	        |t
              r|j                  nd}nd}| j                  ||      }| j                   rk|j                  s| j                   j                  rI|j                  r|j                         }nt        |||      }|D ]  \  }}j                  |        yt        |      }|D ]V  \  }}	|	gt        ||dz
        D 	cg c]  \  }}	|		 c}	}z   }
|rt        |
      |k  r yt        |
      }|r ||      n| X yc c}	}w w)a  Iterate through the batches of size `batch_size`.

        Args:
            batch_size (:obj:`int`): size of each batch to yield.
            drop_last_batch (:obj:`bool`, default `False`): Whether a last batch smaller than the batch_size should be
                dropped
        r  Nr`  r   )r!  r3   r  r   r   r1   rT  r'  r   r  r   r  r   r   r   r   )r   r   r   r   r!  r0  r   r   rx   rS   rz   rk   s               rP   r   zIterableDataset.iterA	  s?     %d&6&6&B&BT]][I;EiQ`;a)77gkKK==et=u!7!74;K;K;T;T%%&113,[Zapq!) 7X,,X667$$ 	?LCyxQ[^_Q_@`#aWG#aaH3x=:#=&x0E(3+e$>	?#as   D
EE
:Er  c                     t        | |      S rM   r  r  s     rP   r  zIterableDataset.__getitem__d	  r  rR   r   r   
gen_kwargsc                 D    ddl m}  || ||d|      j                         S )a  Create an Iterable Dataset from a generator.

        Args:
            generator (`Callable`):
                A generator function that `yields` examples.
            features (`Features`, *optional*):
                Dataset features.
            gen_kwargs(`dict`, *optional*):
                Keyword arguments to be passed to the `generator` callable.
                You can define a sharded iterable dataset by passing the list of shards in `gen_kwargs`.
                This can be used to improve shuffling and when iterating over the dataset with multiple workers.
            split ([`NamedSplit`], defaults to `Split.TRAIN`):
                Split name to be assigned to the dataset.

                <Added version="2.21.0"/>
        Returns:
            `IterableDataset`

        Example:

        ```py
        >>> def gen():
        ...     yield {"text": "Good", "label": 0}
        ...     yield {"text": "Bad", "label": 1}
        ...
        >>> ds = IterableDataset.from_generator(gen)
        ```

        ```py
        >>> def gen(shards):
        ...     for shard in shards:
        ...         with open(shard) as f:
        ...             for line in f:
        ...                 yield {"line": line}
        ...
        >>> shards = [f"data{i}.txt" for i in range(32)]
        >>> ds = IterableDataset.from_generator(gen, gen_kwargs={"shards": shards})
        >>> ds = ds.shuffle(seed=42, buffer_size=10_000)  # shuffles the shards order + uses a shuffle buffer
        >>> from torch.utils.data import DataLoader
        >>> dataloader = DataLoader(ds.with_format("torch"), num_workers=4)  # give each worker a subset of 32/4=8 shards
        ```
        r   )GeneratorDatasetInputStreamT)r   r   r`  	streamingr  )io.generatorrb  read)r   r   r`  r  rb  s        rP   from_generatorzIterableDataset.from_generatorg	  s)    b 	>*(zUYaf

$&	rR   dfzpyspark.sql.DataFramec                     ddl m} t        j                  dk(  rt	        d       || f||dd|j                         S )a  Create an IterableDataset from Spark DataFrame. The dataset is streamed to the driver in batches.

        Args:
            df (`pyspark.sql.DataFrame`):
                The DataFrame containing the desired data.
            split (`NamedSplit`, *optional*):
                Split name to be assigned to the dataset.
            features (`Features`, *optional*):
                Dataset features.

        Returns:
            [`IterableDataset`]

        Example:

        ```py
        >>> df = spark.createDataFrame(
        >>>     data=[[1, "Elia"], [2, "Teo"], [3, "Fang"]],
        >>>     columns=["id", "name"],
        >>> )
        >>> ds = IterableDataset.from_spark(df)
        ```
        r   )SparkDatasetReaderwin32z@IterableDataset.from_spark is not currently supported on WindowsT)r  r   rc  )io.sparkri  r  platformOSErrorre  )rg  r  r   r   ri  s        rP   
from_sparkzIterableDataset.from_spark	  sP    < 	1<<7"\]]!
	

 
 $&	rR   filenamec                     t        |       }t        j                  |      }t        t        j
                  d| i      }t        |t        |            S )zInstantiate a IterableDataset from Arrow table at filename.

        Args:
            filename (`str`):
                File name of the dataset.

        Returns:
            [`IterableDataset`]
        ro  )r   r  )r0  r  )r=   r$   rw   r  r!    _generate_tables_from_cache_filer  r4   )ro  pa_table_schemainferred_featuresr0  s       rP   	from_filezIterableDataset.from_file	  sK     09$66G+G,T,T^hjr]st;[Rc=deerR   rs   c           
      0   t        |      }t        | j                  | j                  j	                         | j
                  t        |      t	        j                  | j                        t	        j                  | j                        | j                        S )at	  
        Return a dataset with the specified format.

        Args:

            type (`str`, *optional*):
                Either output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'jax', 'arrow', 'pandas', 'polars']`.
                `None` means it returns python objects (default).

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> from transformers import AutoTokenizer
        >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation", streaming=True)
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        >>> ds = ds.map(lambda x: tokenizer(x['text'], truncation=True, padding=True), batched=True)
        >>> ds = ds.with_format("torch")
        >>> next(iter(ds))
        {'text': 'compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .',
         'label': tensor(1),
         'input_ids': tensor([  101, 18027, 16310, 16001,  1103,  9321,   178, 11604,  7235,  6617,
                1742,  2165,  2820,  1206,  6588, 22572, 12937,  1811,  2153,  1105,
                1147, 12890, 19587,  6463,  1105, 15026,  1482,   119,   102,     0,
                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
                    0,     0,     0,     0]),
         'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
         'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}
        ```
        r  r0  r  r  r  r  r  r  )r2   r  r   r/  r   _splitr  r   r"  r#  r$  )r   rs   s     rP   with_formatzIterableDataset.with_format	  so    X *$/ ))"++'D9mmDOO4d&7&78"55
 	
rR   r   r  r  r  r  r  c
                    t        |t              r|g}t        |t              r|g}|t        }|	i }	|t        |      }| j                  }
|
j
                  r;| j                  j                  #| j                  j                  |
j                  k(  rdn| j                  j                  }| j                  r_| j                  j                  rIt        |
t        j                  | j                        || j                        }
t        |
|r|nd|      }
n| j                  r2| j                  j                  rt        | j                  |r|nd|      }
| j                  s|r6t        |
t        j                  | j                        || j                        }
t!        |
||||||||	| j                  |      }
| j"                  j                         }||_        t%        |
|| j&                  | j                  t        j                  | j(                        t        j                  | j*                        | j                        S )a  
        Apply a function to all the examples in the iterable dataset (individually or in batches) and update them.
        If your function returns a column that already exists, then it overwrites it.
        The function is applied on-the-fly on the examples when iterating over the dataset.

        You can specify whether the function should be batched or not with the `batched` parameter:

        - If batched is `False`, then the function takes 1 example in and should return 1 example.
          An example is a dictionary, e.g. `{"text": "Hello there !"}`.
        - If batched is `True` and `batch_size` is 1, then the function takes a batch of 1 example as input and can return a batch with 1 or more examples.
          A batch is a dictionary, e.g. a batch of 1 example is {"text": ["Hello there !"]}.
        - If batched is `True` and `batch_size` is `n` > 1, then the function takes a batch of `n` examples as input and can return a batch with `n` examples, or with an arbitrary number of examples.
          Note that the last batch may have less than `n` examples.
          A batch is a dictionary, e.g. a batch of `n` examples is `{"text": ["Hello there !"] * n}`.

        If the function is asynchronous, then `map` will run your function in parallel, with up to one thousand simulatenous calls.
        It is recommended to use a `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time.

        Args:
            function (`Callable`, *optional*, defaults to `None`):
                Function applied on-the-fly on the examples when you iterate on the dataset.
                It must have one of the following signatures:

                - `function(example: Dict[str, Any]) -> Dict[str, Any]` if `batched=False` and `with_indices=False`
                - `function(example: Dict[str, Any], idx: int) -> Dict[str, Any]` if `batched=False` and `with_indices=True`
                - `function(batch: Dict[str, List]) -> Dict[str, List]` if `batched=True` and `with_indices=False`
                - `function(batch: Dict[str, List], indices: List[int]) -> Dict[str, List]` if `batched=True` and `with_indices=True`

                For advanced usage, the function can also return a `pyarrow.Table`.
                If the function is asynchronous, then `map` will run your function in parallel.
                Moreover if your function returns nothing (`None`), then `map` will run your function and return the dataset unchanged.
                If no function is provided, default to identity function: `lambda x: x`.
            with_indices (`bool`, defaults to `False`):
                Provide example indices to `function`. Note that in this case the signature of `function` should be `def function(example, idx[, rank]): ...`.
            input_columns (`Optional[Union[str, List[str]]]`, defaults to `None`):
                The columns to be passed into `function`
                as positional arguments. If `None`, a dict mapping to all formatted columns is passed as one argument.
            batched (`bool`, defaults to `False`):
                Provide batch of examples to `function`.
            batch_size (`int`, *optional*, defaults to `1000`):
                Number of examples per batch provided to `function` if `batched=True`.
                `batch_size <= 0` or `batch_size == None` then provide the full dataset as a single batch to `function`.
            drop_last_batch (`bool`, defaults to `False`):
                Whether a last batch smaller than the batch_size should be
                dropped instead of being processed by the function.
            remove_columns (`[List[str]]`, *optional*, defaults to `None`):
                Remove a selection of columns while doing the mapping.
                Columns will be removed before updating the examples with the output of `function`, i.e. if `function` is adding
                columns with names in `remove_columns`, these columns will be kept.
            features (`[Features]`, *optional*, defaults to `None`):
                Feature types of the resulting dataset.
            fn_kwargs (`Dict`, *optional*, default `None`):
                Keyword arguments to be passed to `function`.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True)
        >>> def add_prefix(example):
        ...     example["text"] = "Review: " + example["text"]
        ...     return example
        >>> ds = ds.map(add_prefix)
        >>> list(ds.take(3))
        [{'label': 1,
         'text': 'Review: the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},
         {'label': 1,
         'text': 'Review: the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .'},
         {'label': 1, 'text': 'Review: effective but too-tepid biopic'}]
        ```
        NrZ  r   r`  )
r   r  r  r  r   r   r  r  r  r   rw  )r   r   rQ   r*   r   r   r/  r   r!  r  r  r   r   r$  r/  r   r  r  r  rx  r"  r#  )r   r   r  r  r  r   r   r  r   r  r0  input_featuresr  s                rP   mapzIterableDataset.map
  s   f mS)*OMnc*,-N$HI<XFH'' $$$***=*=*EI\I\`k`t`tIt $$ 	  0 0 9 93==)9)9:'"&"9"9	K 9g
1VeK D$5$5$@$@<%%*Q`o >7#}}T-=-=>+&*&=&=	 -%'!+)''
 yy~~ #++''mmDOO4d&7&78"55
 	
rR   c           
      0   t        |t              r|g}| j                  }| j                  j                  s| j
                  rEt        || j
                  |j                  rdn| j                  j                  | j                        }t        |||||||| j
                        }t        || j                  | j                  | j
                  t        j                  | j                        t        j                  | j                        | j                        S )a
  Apply a filter function to all the elements so that the dataset only includes examples according to the filter function.
        The filtering is done on-the-fly when iterating over the dataset.

        If the function is asynchronous, then `filter` will run your function in parallel, with up to one thousand simulatenous calls (configurable).
        It is recommended to use a `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time.

        Args:
            function (`Callable`):
                Callable with one of the following signatures:

                - `function(example: Dict[str, Any]) -> bool` if `with_indices=False, batched=False`
                - `function(example: Dict[str, Any], indices: int) -> bool` if `with_indices=True, batched=False`
                - `function(example: Dict[str, List]) -> List[bool]` if `with_indices=False, batched=True`
                - `function(example: Dict[str, List], indices: List[int]) -> List[bool]` if `with_indices=True, batched=True`

                If the function is asynchronous, then `filter` will run your function in parallel.
                If no function is provided, defaults to an always True function: `lambda x: True`.
            with_indices (`bool`, defaults to `False`):
                Provide example indices to `function`. Note that in this case the signature of `function` should be `def function(example, idx): ...`.
            input_columns (`str` or `List[str]`, *optional*):
                The columns to be passed into `function` as
                positional arguments. If `None`, a dict mapping to all formatted columns is passed as one argument.
            batched (`bool`, defaults to `False`):
                Provide batch of examples to `function`.
            batch_size (`int`, *optional*, default `1000`):
                Number of examples per batch provided to `function` if `batched=True`.
            fn_kwargs (`Dict`, *optional*, default `None`):
                Keyword arguments to be passed to `function`.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True)
        >>> ds = ds.filter(lambda x: x["label"] == 0)
        >>> list(ds.take(3))
        [{'label': 0, 'movie_review': 'simplistic , silly and tedious .'},
         {'label': 0,
         'movie_review': "it's so laddish and juvenile , only teenage boys could possibly find it funny ."},
         {'label': 0,
         'movie_review': 'exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable .'}]
        ```
        NrZ  r  rw  )r   r   r   r/  r   r!  r  r   r$  r|  r  rx  r   r   r"  r#  )r   r   r  r  r  r   r  r0  s           rP   r  zIterableDataset.filter
  s    h mS)*OM ''::$"2"23++!,!5!54::;N;N"&"9"9	K /%'!''	
 #++''mmDOO4d&7&78"55
 	
rR   r  c           
      l   | t         j                  j                  |      }nt        |      }t	        ||      }t        t        | j                  ||      | j                  j                         | j                  | j                  |t        j                  | j                        | j                        S )a^  
        Randomly shuffles the elements of this dataset.

        This dataset fills a buffer with `buffer_size` elements, then randomly samples elements from this buffer,
        replacing the selected elements with new elements. For perfect shuffling, a buffer size greater than or
        equal to the full size of the dataset is required.

        For instance, if your dataset contains 10,000 elements but `buffer_size` is set to 1000, then `shuffle` will
        initially select a random element from only the first 1000 elements in the buffer. Once an element is
        selected, its space in the buffer is replaced by the next (i.e. 1,001-st) element,
        maintaining the 1000 element buffer.

        If the dataset is made of several shards, it also does shuffle the order of the shards.
        However if the order has been fixed by using [`~datasets.IterableDataset.skip`] or [`~datasets.IterableDataset.take`]
        then the order of the shards is kept unchanged.

        Args:
            seed (`int`, *optional*, defaults to `None`):
                Random seed that will be used to shuffle the dataset.
                It is used to sample from the shuffle buffer and also to shuffle the data shards.
            generator (`numpy.random.Generator`, *optional*):
                Numpy random Generator to use to compute the permutation of the dataset rows.
                If `generator=None` (default), uses `np.random.default_rng` (the default BitGenerator (PCG64) of NumPy).
            buffer_size (`int`, defaults to `1000`):
                Size of the buffer.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True)
        >>> list(ds.take(3))
        [{'label': 1,
         'text': 'the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},
         {'label': 1,
         'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .'},
         {'label': 1, 'text': 'effective but too-tepid biopic'}]
        >>> shuffled_ds = ds.shuffle(seed=42)
        >>> list(shuffled_ds.take(3))
        [{'label': 1,
         'text': "a sports movie with action that's exciting on the field and a story you care about off it ."},
         {'label': 1,
         'text': 'at its best , the good girl is a refreshingly adult take on adultery . . .'},
         {'label': 1,
         'text': "sam jones became a very lucky filmmaker the day wilco got dropped from their record label , proving that one man's ruin may be another's fortune ."}]
        ```
        )r   r  r  rw  )r   r   r<  r   r  r  r  r   r/  r   rx  r!  r#  r$  )r   r  r   r  r  s        rP   r  zIterableDataset.shuffle  s    d 		--d3I +I#iM	6!!{i "++''d&7&78"55

 
	
rR   r:  c                 H    | xj                   || j                   z
  z  c_         y rM   )r%  )r   r:  s     rP   	set_epochzIterableDataset.set_epochD  s    ut{{**rR   r  c           
      h   t        | j                  || j                  du | j                  du       }t	        || j
                  j                         | j                  | j                  t        j                  | j                        t        j                  | j                        | j                        S )a  
        Create a new [`IterableDataset`] that skips the first `n` elements.

        Args:
            n (`int`):
                Number of elements to skip.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True)
        >>> list(ds.take(3))
        [{'label': 1,
         'text': 'the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},
         {'label': 1,
         'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .'},
         {'label': 1, 'text': 'effective but too-tepid biopic'}]
        >>> ds = ds.skip(1)
        >>> list(ds.take(3))
        [{'label': 1,
         'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .'},
         {'label': 1, 'text': 'effective but too-tepid biopic'},
         {'label': 1,
         'text': 'if you sometimes like to go to the movies to have fun , wasabi is a good place to start .'}]
        ```
        Nr  r  rw  )r  r   r"  r#  r  r/  r   rx  r!  r   r$  r   r  r0  s      rP   skipzIterableDataset.skipG  s    8 +/3$/F $ 1 1T 9	
 #"++''mmDOO4d&7&78"55
 	
rR   r  c           
         t        t        | j                  |      | j                  | j                  | j
                  t        j                  | j                        t        j                  | j                        | j                        S )a~  
        Create a new [`IterableDataset`] that repeats the underlying dataset `num_times` times.

        N.B. The effect of calling shuffle after repeat depends significantly on buffer size.
        With buffer_size 1, duplicate data is never seen in the same iteration, even after shuffling:
        ds.repeat(n).shuffle(seed=42, buffer_size=1) is equivalent to ds.shuffle(seed=42, buffer_size=1).repeat(n),
        and only shuffles shard orders within each iteration.
        With buffer size >= (num samples in the dataset * num_times), we get full shuffling of the repeated data, i.e. we can observe duplicates in
        the same iteration.

        Args:
            num_times (`int`) or (`None`):
                Number of times to repeat the dataset. If `None`, the dataset will be repeated indefinitely.

        Example:
        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train")
        >>> ds = ds.take(2).repeat(2)
        >>> list(ds)
        [{'label': 1,
         'text': 'the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},
         {'label': 1,
         'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .'},
         {'label': 1, 'text': 'effective but too-tepid biopic'},
         {'label': 1,
         'text': 'the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},
         {'label': 1,
         'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .'},
         {'label': 1, 'text': 'effective but too-tepid biopic'}]
        ```
        r  rw  )r  r  r   r/  rx  r!  r   r   r"  r#  r$  )r   r  s     rP   repeatzIterableDataset.repeats  sd    B .t/@/@IV++''mmDOO4d&7&78"55
 	
rR   c           
      h   t        | j                  || j                  du | j                  du       }t	        || j
                  j                         | j                  | j                  t        j                  | j                        t        j                  | j                        | j                        S )a  
        Create a new [`IterableDataset`] with only the first `n` elements.

        Args:
            n (`int`):
                Number of elements to take.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True)
        >>> small_ds = ds.take(2)
        >>> list(small_ds)
        [{'label': 1,
         'text': 'the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},
         {'label': 1,
         'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .'}]
        ```
        Nr  rw  )r  r   r"  r#  r  r/  r   rx  r!  r   r$  r  s      rP   takezIterableDataset.take  s    * +/3$/F $ 1 1T 9	
 #"++''mmDOO4d&7&78"55
 	
rR   r   r   r   c           
      B   | j                   j                  |||      }t        || j                  j	                         | j
                  | j                  t	        j                  | j                        t	        j                  | j                        | j                        S )a(  Return the `index`-nth shard from dataset split into `num_shards` pieces.

        This shards deterministically. `dataset.shard(n, i)` splits the dataset into contiguous chunks,
        so it can be easily concatenated back together after processing. If `dataset.num_shards % n == l`, then the
        first `l` datasets each have `(dataset.num_shards // n) + 1` shards, and the remaining datasets have `(dataset.num_shards // n)` shards.
        `datasets.concatenate_datasets([dset.shard(n, i) for i in range(n)])` returns a dataset with the same order as the original.
        In particular, `dataset.shard(dataset.num_shards, i)` returns a dataset with 1 shard.

        Note: n should be less or equal to the number of shards in the dataset `dataset.num_shards`.

        On the other hand, `dataset.shard(n, i, contiguous=False)` contains all the shards of the dataset whose index mod `n = i`.

        Be sure to shard before using any randomizing operator (such as `shuffle`).
        It is best if the shard operator is used early in the dataset pipeline.

        Args:
            num_shards (`int`):
                How many shards to split the dataset into.
            index (`int`):
                Which shard to select and return.
            contiguous: (`bool`, defaults to `True`):
                Whether to select contiguous blocks of indices for shards.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("amazon_polarity", split="train", streaming=True)
        >>> ds
        Dataset({
            features: ['label', 'title', 'content'],
            num_shards: 4
        })
        >>> ds.shard(num_shards=2, index=0)
        Dataset({
            features: ['label', 'title', 'content'],
            num_shards: 2
        })
        ```
        rH  rw  )r   r   r  r/  r   rx  r!  r   r"  r#  r$  )r   r   r   r   r0  s        rP   shardzIterableDataset.shard  s}    \ ''::jX]jt:u#"++''mmDOO4d&7&78"55
 	
rR   rg   rh   c                 H    | j                  t        t        ||      d      S )zAdd column to Dataset.

        Args:
            name (str): Column name.
            column (list or np.array): Column data to be added.

        Returns:
            `IterableDataset`
        )rg   rh   T)r  )r|  r   rj   )r   rg   rh   s      rP   
add_columnzIterableDataset.add_column  s!     xxDHW[x\\rR   rc   rd   c                 (    | j                  ||i      S )a  
        Rename a column in the dataset, and move the features associated to the original column under the new column
        name.

        Args:
            original_column_name (`str`):
                Name of the column to rename.
            new_column_name (`str`):
                New name for the column.

        Returns:
            `IterableDataset`: A copy of the dataset with a renamed column.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True)
        >>> next(iter(ds))
        {'label': 1,
         'text': 'the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}
        >>> ds = ds.rename_column("text", "movie_review")
        >>> next(iter(ds))
        {'label': 1,
         'movie_review': 'the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}
        ```
        )rename_columns)r   rc   rd   s      rP   rename_columnzIterableDataset.rename_column  s    8 ""$8/#JKKrR   rT   c           	         | j                   j                  r$| j                   j                  j                         nd}| j                  t	        t
        |      t        |            }|St        |j                         D ci c]  \  }}||j                         v r||   n||! c}}      |j                   _        |S c c}}w )aa  
        Rename several columns in the dataset, and move the features associated to the original columns under
        the new column names.

        Args:
            column_mapping (`Dict[str, str]`): A mapping of columns to rename to their new names

        Returns:
            `IterableDataset`: A copy of the dataset with renamed columns
        N)rT   r1  )
r/  r   r   r|  r   re   r_   r$   rb   r   )r   rT   original_featuresds_iterablerY   features         rP   r  zIterableDataset.rename_columns&  s     ;?**:M:MDJJ//446SWhh&~FW[\jWk  
 ()1 ):(?(?(A$W ,/.2E2E2G+GN3'SRYY*K& s   $B?
r_  c                 L   | j                   j                  r$| j                   j                  j                         nd}| j                  |      }|S|j                         |j                   _        |j	                         D ]!  \  }}||v s|j                   j                  |= # |S )a^  
        Remove one or several column(s) in the dataset and the features associated to them.
        The removal is done on-the-fly on the examples when iterating over the dataset.


        Args:
            column_names (`Union[str, List[str]]`):
                Name of the column(s) to remove.

        Returns:
            `IterableDataset`: A copy of the dataset object without the columns to remove.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True)
        >>> next(iter(ds))
        {'text': 'the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}
        >>> ds = ds.remove_columns("label")
        >>> next(iter(ds))
        {'text': 'the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}
        ```
        Nr1  )r/  r   r   r|  rb   )r   r_  r  r  rY   r   s         rP   r  zIterableDataset.remove_columns?  s    2 ;?**:M:MDJJ//446SWhhlh;():)?)?)AK&+113 8Q,&#))22378 rR   c           	         t        |t              r|g}| j                  rt        j                  | j                        }| j                  j
                  t        |      t        | j                  j
                  j                               z
  }|rFt        dt        |       dt        | j                  j
                  j                                d      t        |D ci c]  }||j
                  |    c}      |_        t        | j                  |      }t        || j                  | j                  | j                   | j"                  | j$                        S c c}w )aV  Select one or several column(s) in the dataset and the features
        associated to them. The selection is done on-the-fly on the examples
        when iterating over the dataset.


        Args:
            column_names (`Union[str, List[str]]`):
                Name of the column(s) to select.

        Returns:
            `IterableDataset`: A copy of the dataset object with selected columns.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True)
        >>> next(iter(ds))
        {'text': 'the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}
        >>> ds = ds.select_columns("text")
        >>> next(iter(ds))
        {'text': 'the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}
        ```
        zColumn name z- not in the dataset. Columns in the dataset: r&  rw  )r   r   r/  r   r   r   ra   r   r^   r_   r$   r^  r   r  rx  r!  r"  r#  r$  )r   r_  r  missing_columnsri  r0  s         rP   select_columnszIterableDataset.select_columnsb  s'   2 lC((>L::==,Dzz""."%l"3c$**:M:M:R:R:T6U"U"$&tO'<&= ><

 3 3 8 8 :;<A? 
 !)|)T!!T]]1-=*=)T U+D,=,=|L#++''oo))"55
 	
 *Us   Er  c           
      R   t        |      }| j                  j                         }||j                  |<   t	        | j
                  || j                  | j                  t        j                  | j                        t        j                  | j                        | j                        S )a=  Cast column to feature for decoding.

        Args:
            column (`str`):
                Column name.
            feature (`Feature`):
                Target feature.

        Returns:
            `IterableDataset`

        Example:

        ```py
        >>> from datasets import load_dataset, Audio
        >>> ds = load_dataset("PolyAI/minds14", name="en-US", split="train", streaming=True)
        >>> ds.features
        {'audio': Audio(sampling_rate=8000, mono=True, decode=True, id=None),
         'english_transcription': Value('string'),
         'intent_class': ClassLabel(num_classes=14, names=['abroad', 'address', 'app_error', 'atm_limit', 'balance', 'business_loan',  'card_issues', 'cash_deposit', 'direct_debit', 'freeze', 'high_value_payment', 'joint_account', 'latest_transactions', 'pay_bill']),
         'lang_id': ClassLabel(num_classes=14, names=['cs-CZ', 'de-DE', 'en-AU', 'en-GB', 'en-US', 'es-ES', 'fr-FR', 'it-IT', 'ko-KR',  'nl-NL', 'pl-PL', 'pt-PT', 'ru-RU', 'zh-CN']),
         'path': Value('string'),
         'transcription': Value('string')}
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=16000))
        >>> ds.features
        {'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None),
         'english_transcription': Value('string'),
         'intent_class': ClassLabel(num_classes=14, names=['abroad', 'address', 'app_error', 'atm_limit', 'balance', 'business_loan',  'card_issues', 'cash_deposit', 'direct_debit', 'freeze', 'high_value_payment', 'joint_account', 'latest_transactions', 'pay_bill']),
         'lang_id': ClassLabel(num_classes=14, names=['cs-CZ', 'de-DE', 'en-AU', 'en-GB', 'en-US', 'es-ES', 'fr-FR', 'it-IT', 'ko-KR',  'nl-NL', 'pl-PL', 'pt-PT', 'ru-RU', 'zh-CN']),
         'path': Value('string'),
         'transcription': Value('string')}
        ```
        rw  r*   r/  r   r   r  r   rx  r!  r   r"  r#  r$  )r   rh   r  r  s       rP   cast_columnzIterableDataset.cast_column  s    D 8@zz  'f))++''mmDOO4d&7&78"55
 	
rR   c           
      B   t        |      }| j                  j                         }||_        t	        | j
                  || j                  | j                  t        j                  | j                        t        j                  | j                        | j                        S )a  
        Cast the dataset to a new set of features.

        Args:
            features ([`Features`]):
                New features to cast the dataset to.
                The name of the fields in the features must match the current column names.
                The type of the data must also be convertible from one type to the other.
                For non-trivial conversion, e.g. `string` <-> `ClassLabel` you should use [`~Dataset.map`] to update the Dataset.

        Returns:
            `IterableDataset`: A copy of the dataset with casted features.

        Example:

        ```py
        >>> from datasets import load_dataset, ClassLabel, Value
        >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True)
        >>> ds.features
        {'label': ClassLabel(names=['neg', 'pos']),
         'text': Value('string')}
        >>> new_features = ds.features.copy()
        >>> new_features["label"] = ClassLabel(names=["bad", "good"])
        >>> new_features["text"] = Value("large_string")
        >>> ds = ds.cast(new_features)
        >>> ds.features
        {'label': ClassLabel(names=['bad', 'good']),
         'text': Value('large_string')}
        ```
        rw  r  r   r   r  s      rP   castzIterableDataset.cast  sz    D 9Bzz  ))++''mmDOO4d&7&78"55
 	
rR   enablenum_threadsc                    | j                   st        d      | }dt        fd}|r|dkD  r| j                   j                         }| j                   j                         }t	        |t        |d             t	        |t        |d             |j                  |      }t        j                  j                  |      }t        t        ||j                        }|j                  ||      }t        |j                  t              sJ d|z  |j                  _        |S |j                   j                         }	t	        |	t        ||             |j                  |	      }|S )	a
  
        Enable or disable the dataset features decoding for audio, image, video.

        When enabled (default), media types are decoded:

        * audio -> dict of "array" and "sampling_rate" and "path"
        * image -> PIL.Image
        * video -> torchvision.io.VideoReader

        You can enable multithreading using `num_threads`. This is especially useful to speed up remote
        data streaming. However it can be slower than `num_threads=0` for local data on fast disks.

        Disabling decoding is useful if you want to iterate on the paths or bytes of the media files
        without actually decoding their content. To disable decoding you can use `.decode(False)`, which
        is equivalent to calling `.cast()` or `.cast_column()` with all the Audio, Image and Video types
        set to `decode=False`.

        Args:
            enable (`bool`, defaults to `True`):
                Enable or disable features decoding.
            num_threads (`int`, defaults to `0`):
                Enable multithreading for features decoding.

        Returns:
            `IterableDataset`: A copy of the dataset with casted features.

        Examples:

        Disable decoding:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("sshh12/planet-textures", split="train", streaming=True)
        >>> next(iter(ds))
        {'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=2048x1024>,
        'text': 'A distant celestial object with an icy crust, displaying a light blue shade, covered with round pits and rugged terrains.'}
        >>> ds = ds.decode(False)
        >>> ds.features
        {'image': Image(mode=None, decode=False, id=None),
        'text': Value('string')}
        >>> next(iter(ds))
        {
          'image': {
            'path': 'hf://datasets/sshh12/planet-textures@69dc4cef7a5c4b2cfe387727ec8ea73d4bff7302/train/textures/0000.png',
            'bytes': None
          },
          'text': 'A distant celestial object with an icy crust, displaying a light blue shade, covered with round pits and rugged terrains.'
        }
        ```

        Speed up streaming with multithreading:

        ```py
        >>> import os
        >>> from datasets import load_dataset
        >>> from tqdm import tqdm
        >>> ds = load_dataset("sshh12/planet-textures", split="train", streaming=True)
        >>> num_threads = min(32, (os.cpu_count() or 1) + 4)
        >>> ds = ds.decode(num_threads=num_threads)
        >>> for _ in tqdm(ds):  # 20 times faster !
        ...     ...
        ```
        zFeatures decoding is only available for datasets with known features, but features are Unknown. Please set the datasets features with `ds = ds.cast(features)`.decodec                 ,    t        |d      r| |_        y y )Nr  )hasattrr  )r  r  s     rP   set_decodingz,IterableDataset.decode.<locals>.set_decoding:  s    w)!' *rR   r   FTr     )r   r^   r   r   r+   r   r  multiprocessingpool
ThreadPool_apply_asyncr  r|  r   r   r  r  )
r   r  r  dsr  disabled_decoding_featuresenabled_decoding_featuresr  funcr   s
             rP   r  zIterableDataset.decode  s9   @ }}R  	( 	( kAo)-););)=&(,(:(:(<%-w|U/KL,glD.IJ34B"''22;?D</H/W/WXD'@ABboo/EFFFNOR]oBOOK
 	 {{'')H8W\6:;"B	rR   rr  rs  c           
      6   t        | j                  ||      }t        || j                  j	                         | j
                  | j                  t	        j                  | j                        t	        j                  | j                        | j                        S )Nr{  rw  )rq  r   r  r/  r   rx  r!  r   r"  r#  r$  )r   rr  rs  r0  s       rP   _stepzIterableDataset._stepP  sp    *4+<+<4PVW#"++''mmDOO4d&7&78"55
 	
rR   c           
         | j                   | S | j                  j                  r| j                  j                   }n(t        | j	                  d       j                               }| j                  j                         }||_         t        | j                  || j                  | j                  t        j                  | j                        t        j                  | j                        | j                        S )Nrw  )r   r   r   ry   ry  r8  r  r   r  rx  r!  r   r"  r#  r$  r  s      rP   _resolve_featuresz!IterableDataset._resolve_features\  s    ==$K''((11H1$2B2B42H2N2N2PQHyy~~ ))++''mmDOO4d&7&78"55
 	
rR   c           
          d }| j                   rAt        | j                   j                         D ci c]  \  }}|t        |       c}}      }nd}| j	                  |d|||      S c c}}w )a  
        Group samples from the dataset into batches.

        Args:
            batch_size (`int`): The number of samples in each batch.
            drop_last_batch (`bool`, defaults to `False`): Whether to drop the last incomplete batch.

        Example:
        ```py
        >>> ds = load_dataset("some_dataset", streaming=True)
        >>> batched_ds = ds.batch(batch_size=32)
        ```
        c                 X    | j                         D ci c]	  \  }}||g c}}S c c}}w rM   )rb   )	unbatchedkvs      rP   batch_fnz'IterableDataset.batch.<locals>.batch_fn~  s'    '0'89tq!AsF999s   &NT)r  r   r   r   )r   r$   rb   r&   r|  )r   r   r   r  rY   r  r   s          rP   rk   zIterableDataset.batcho  sl    	: ==H[H[H] ^Wd7m!3 ^_HHxxdz?em  
 	
 !_s   A)
c              #   F  K   |rD| j                  d      j                  |      D ]  }t        |d      j                          ! yt	        j
                  t        | j                  d      j                  d                  }t        |d      j                         S w)a  Returns the dataset as a Python dict. Can also return a generator for large datasets.

        Args:
            batch_size (`int`, *optional*): The size (number of rows) of the batches if `batched` is `True`.
                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.

        Returns:
            `dict` or `Iterator[dict]`

        Example:

        ```py
        >>> ds.to_dict()
        ```
        arrowrA  unsetfingerprintr  N)ry  r   r!   to_dictro   rJ  r_   r   r   r  tables       rP   r  zIterableDataset.to_dict  s       ))'277:7N De9AACCD $$T$*:*:7*C*H*HTX*H*Y%Z[E5g6>>@@   BB!c                     t        j                  t        | j                  d      j	                  d                  }t        |d      j                         S )zReturns the dataset as a Python list.

        Returns:
            `list`

        Example:

        ```py
        >>> ds.to_list()
        ```
        r  r  rA  r  r  )ro   rJ  r_   ry  r   r!   to_list)r   r  s     rP   r  zIterableDataset.to_list  sI       d&6&6w&?&D&DPT&D&U!VWu'2::<<rR   c              #   F  K   |rD| j                  d      j                  |      D ]  }t        |d      j                          ! yt	        j
                  t        | j                  d      j                  d                  }t        |d      j                         S w)a  Returns the dataset as a `pandas.DataFrame`. Can also return a generator for large datasets.

        Args:
            batch_size (`int`, *optional*):
                The size (number of rows) of the batches if `batched` is `True`.
                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
            batched (`bool`):
                Set to `True` to return a generator that yields the dataset as batches
                of `batch_size` rows. Defaults to `False` (returns the whole datasets once).

        Returns:
            `pandas.DataFrame` or `Iterator[pandas.DataFrame]`

        Example:

        ```py
        >>> ds.to_pandas()
        ```
        r  rA  r  r  r  N)ry  r   r!   	to_pandasro   rJ  r_   r  s       rP   r  zIterableDataset.to_pandas  s     , ))'277:7N Fe9CCEEF $$T$*:*:7*C*H*HTX*H*Y%Z[E5g6@@BBr  schema_overridesrechunkzpl.DataFramec              #   R  K   |rG| j                  d      j                  |      D ]"  }t        |d      j                  ||       $ yt	        j
                  t        | j                  d      j                  d                  }t        |d      j                  ||      S w)a  Returns the dataset as a `polars.DataFrame`. Can also return a generator for large datasets.

        Args:
            batch_size (`int`, *optional*):
                The size (number of rows) of the batches if `batched` is `True`.
                Defaults to `genomicsml.datasets.config.DEFAULT_MAX_BATCH_SIZE`.
            batched (`bool`):
                Set to `True` to return a generator that yields the dataset as batches
                of `batch_size` rows. Defaults to `False` (returns the whole datasets once).
            schema_overrides (`dict`, *optional*):
                Support type specification or override of one or more columns; note that
                any dtypes inferred from the schema param will be overridden.
            rechunk (`bool`):
                Make sure that all data is in contiguous memory. Defaults to `True`.
        Returns:
            `polars.DataFrame` or `Iterator[polars.DataFrame]`

        Example:

        ```py
        >>> ds.to_polars()
        ```
        r  rA  r  r  )r  r  r  N)ry  r   r!   	to_polarsro   rJ  r_   )r   r   r  r  r  r  s         rP   r  zIterableDataset.to_polars  s     < ))'277:7N xe9CCUeovCwwx $$T$*:*:7*C*H*HTX*H*Y%Z[E5g6@@Rbls@tts   B%B'path_or_bufstorage_optionsc                     t        j                  t        | j                  d      j	                  d                  } t        |d      j                  |f||d|S )aL  Exports the dataset to csv.

        This iterates on the dataset and loads it completely in memory before writing it.

        Args:
            path_or_buf (`PathLike` or `FileOrBuffer`):
                Either a path to a file (e.g. `file.csv`), a remote URI (e.g. `hf://datasets/username/my_dataset_name/data.csv`),
                or a BinaryIO, where the dataset will be saved to in the specified format.
            batch_size (`int`, *optional*):
                Size of the batch to load in memory and write at once.
                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.
            **to_csv_kwargs (additional keyword arguments):
                Parameters to pass to pandas's [`pandas.DataFrame.to_csv`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html).
                The parameter `index` defaults to `False` if not specified.
                If you would like to write the index, pass `index=True` and also set a name for the index column by
                passing `index_label`.

        Returns:
            `int`: The number of characters or bytes written.

        Example:

        ```py
        >>> ds.to_csv("path/to/dataset/directory")
        ```
        r  r  rA  r  r  r   r  )ro   rJ  r_   ry  r   r!   to_csv)r   r  r   r  to_csv_kwargsr  s         rP   r  zIterableDataset.to_csv  sh    F   d&6&6w&?&D&DPT&D&U!VW9wu'299
!+
 	
 	
rR   c                     t        j                  t        | j                  d      j	                  d                  } t        |d      j                  |f||d|S )a*  Export the dataset to JSON Lines or JSON.

        This iterates on the dataset and loads it completely in memory before writing it.

        The default output format is [JSON Lines](https://jsonlines.org/).
        To export to [JSON](https://www.json.org), pass `lines=False` argument and the desired `orient`.

        Args:
            path_or_buf (`PathLike` or `FileOrBuffer`):
                Either a path to a file (e.g. `file.json`), a remote URI (e.g. `hf://datasets/username/my_dataset_name/data.json`),
                or a BinaryIO, where the dataset will be saved to in the specified format.
            batch_size (`int`, *optional*):
                Size of the batch to load in memory and write at once.
                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.
            **to_json_kwargs (additional keyword arguments):
                Parameters to pass to pandas's [`pandas.DataFrame.to_json`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_json.html).
                Default arguments are `lines=True` and `orient="records".
                The parameter `index` defaults to `False` if `orient` is `"split"` or `"table"`.
                If you would like to write the index, pass `index=True`.

        Returns:
            `int`: The number of characters or bytes written.

        Example:

        ```py
        >>> ds.to_json("path/to/dataset/directory/filename.jsonl")
        ```

        ```py
        >>> num_shards = dataset.num_shards
        >>> for index in range(num_shards):
        ...     shard = dataset.shard(index, num_shards)
        ...     shard.to_json(f"path/of/my/dataset/data-{index:05d}.jsonl")
        ```

        r  r  rA  r  r  r  )ro   rJ  r_   ry  r   r!   to_json)r   r  r   r  to_json_kwargsr  s         rP   r  zIterableDataset.to_json  sh    \   d&6&6w&?&D&DPT&D&U!VW:wu'2::
!+
 	
 	
rR   conzsqlalchemy.engine.Connectionzsqlalchemy.engine.Enginezsqlite3.Connectionc                     t        j                  t        | j                  d      j	                  d                  } t        |d      j                  ||fd|i|S )a  Exports the dataset to a SQL database.

        Args:
            name (`str`):
                Name of SQL table.
            con (`str` or `sqlite3.Connection` or `sqlalchemy.engine.Connection` or `sqlalchemy.engine.Connection`):
                A [URI string](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) or a SQLite3/SQLAlchemy connection object used to write to a database.
            batch_size (`int`, *optional*):
                Size of the batch to load in memory and write at once.
                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
            **sql_writer_kwargs (additional keyword arguments):
                Parameters to pass to pandas's [`pandas.DataFrame.to_sql`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html).
                The parameter `index` defaults to `False` if not specified.
                If you would like to write the index, pass `index=True` and also set a name for the index column by
                passing `index_label`.


        Returns:
            `int`: The number of records written.

        Example:

        ```py
        >>> # con provided as a connection URI string
        >>> ds.to_sql("data", "sqlite:///my_own_db.sql")
        >>> # con provided as a sqlite3 connection object
        >>> import sqlite3
        >>> con = sqlite3.connect("my_own_db.sql")
        >>> with con:
        ...     ds.to_sql("data", con)
        ```
        r  r  rA  r  r  r   )ro   rJ  r_   ry  r   r!   to_sql)r   rg   r  r   sql_writer_kwargsr  s         rP   r  zIterableDataset.to_sqlR  s\    N   d&6&6w&?&D&DPT&D&U!VW9wu'299$pPZp^opprR   c                    ddl m}  || j                        xs t        j                  }t        j                  t        | j                  d      j                  |                  } t        |d      j                  |fd|i|S )a  Exports the dataset to parquet

        Args:
            path_or_buf (`PathLike` or `FileOrBuffer`):
                Either a path to a file (e.g. `file.parquet`), a remote URI (e.g. `hf://datasets/username/my_dataset_name/data.parquet`),
                or a BinaryIO, where the dataset will be saved to in the specified format.
            batch_size (`int`, *optional*):
                Size of the batch to load in memory and write at once.
                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.

                <Added version="2.19.0"/>
            **parquet_writer_kwargs (additional keyword arguments):
                Parameters to pass to PyArrow's `pyarrow.parquet.ParquetWriter`.

        Returns:
            `int`: The number of characters or bytes written.

        Example:

        ```py
        >>> ds.to_parquet("path/to/dataset/directory")
        ```

        ```py
        >>> num_shards = dataset.num_shards
        >>> for index in range(num_shards):
        ...     shard = dataset.shard(index, num_shards)
        ...     shard.to_parquet(f"path/of/my/dataset/data-{index:05d}.parquet")
        ```

        r   )get_arrow_writer_batch_size_from_featuresr  rA  r  r  r  )arrow_writerr  r   r   DEFAULT_MAX_BATCH_SIZEro   rJ  r_   ry  r   r!   
to_parquet)r   r  r   r  parquet_writer_kwargsr  r  s          rP   r  zIterableDataset.to_parquet|  s    P 	L>t}}MnQWQnQn
  d&6&6w&?&D&DPZ&D&[!\]=wu'2==
)8
<Q
 	
rR   job_idnum_jobsrepo_iddata_dirtokenrevision	create_prembed_external_filesc           	   #   b   K   |	|z  }|	|z  }||z  t        ||      z   |z   ||k  rdndz    fdt        z
        D        }t        t        j                  |      }d}d}d}g }|D ]0  \  }}|
rTddlm} |j                  d      }|j                  t        t         j                        d ||j                        	      }| d
| d|dd|	dd}t               }|j                  |       t        j                   |      |j"                  z  }|t%        fdt        j&                        D              z  }|j)                         }|t+        |      z  }~t-        ||      }|j/                  ||gd||       |j1                  |       |ddf 3 |d|||ff yw)ar  Pushes the dataset shards as Parquet files to the hub.

        Returns:
            additions (`List[CommitOperation]`): list of the `CommitOperationAdd` of the uploaded shards
            uploaded_size (`int`): number of uploaded bytes to the repository
            dataset_nbytes (`int`): approximate size in bytes of the uploaded dataset after uncompression
        r   r   c              3   V   K   | ]   }|z   j                  z
  |d       f " yw)TrH  N)r  )rX   r   r   r   r   s     rP   rZ   zEIterableDataset._push_parquet_shards_to_hub_single.<locals>.<genexpr>  s2      
Z[UQY

cEkt
TU
s   &)endpointr  r  r  r  T)r  r   rI  -05d-of-z.parquetc              3   T   K   | ]  }j                  |      j                   ! y wrM   )	row_grouptotal_byte_size)rX   r   parquet_metadatas     rP   rZ   zEIterableDataset._push_parquet_shards_to_hub_single.<locals>.<genexpr>  s(      "BC **1-=="s   %(path_in_repopath_or_fileobjdataset)r  	additions	repo_typer  r  FN)r   r   r   r   HF_ENDPOINTr  r  ry  r|  r   r<   r$  r   r	   r  pqread_metadatanum_rowssumnum_row_groupsgetvaluer   r   preupload_lfs_filesrM  )r   r  r  r  r  r  r  r  r  r   r  r   r   index_shardsapiuploaded_sizedataset_nbytesnum_examplesr  r   r  r  shard_path_in_repobufferparquet_contentshard_additionr   r  r   s   `                         @@@rP   "_push_parquet_shards_to_hub_singlez2IterableDataset._push_parquet_shards_to_hub_single  s    , H$8#fs63//ck&3,QA6
_dehkpep_q
 V//u=.0	( 	#LE5#S))'2		/4CZCZ[ HX " 
 %-:QugQuSkjQTEUU]!^YFV$!//7,555Lc "GLM]MlMlGm"  N %oo/OS11M/=OapqN##)*#!# $  ^,%""=	#@ dYEEEs   F)F/num_procc
                 p   |rE| j                   j                  j                         D 
cg c]  \  }
}t        |d      s|
 c}}
ng }|xr t	        |      }|| j
                  }g }dx}}|	xs d}t        |      D cg c]"  }| j                  ||d      ||||||||||d$ }}d}||	|	dk\  rd	|	 d
ndz  }t        d||      }|	|	dk  rt        j                         n
t        |	      5 }|t        j                  di |d   nt        |t        j                  |      }|D ]2  \  }}}|s|j                  |       ||d   z  }||d   z  }||d   z  }4 | |j!                          |j#                          ddd       t%        d |D              }||||fS c c}}
w c c}w # 1 sw Y   ,xY w)a  Pushes the dataset shards as Parquet files to the hub.

        Returns:
            additions (`List[CommitOperation]`): list of the `CommitOperationAdd` of the uploaded shards
            uploaded_size (`int`): number of uploaded bytes to the repository
            dataset_nbytes (`int`): approximate size in bytes of the uploaded dataset after uncompression
            num_examples (`int`): number of examples of the uploaded dataset
        T)ignore_decode_attributeNr   r   rH  )r   r  r  r  r  r  r  r  r  r   r  zUploading the dataset shardsz (num_proc=)rG  z shards)unittotaldesc)kwargs_iterabler  c              3   H   K   | ]  }|j                   j                    y wrM   )upload_infor  )rX   additions     rP   rZ   z>IterableDataset._push_parquet_shards_to_hub.<locals>.<genexpr>F  s     P(H0055Ps    "rN   )r/  r   rb   r-   r   r   r   r  hf_tqdm
contextlibnullcontextr   r  r  rE   r  closer   r  )r   r  r  r  r  r  r  r   r  r  r  r  decodable_columnsr  r  r  r  r  r  r  pbarr  update_streamrF  contentr   s                             rP   _push_parquet_shards_to_hubz+IterableDataset._push_parquet_shards_to_hub  s   4 $  ::..446l41a:J1fj:kQl 	
  4O=N8OJ.0	())=q  /
  

hfQU
V $"$$&((<
 
  .X-AhRSm+hZq)Y[[

 *2)9X\Z##%tT\~ 	ae <  BBX_UVEWX'#FF$3  *7 /%gKK(+I"gaj0N GAJ.L/ 

		'	* PiPP-EEw m
.	 	s   F! F! 'F'1BF,,F5config_nameset_defaultcommit_messagecommit_descriptionprivatec                    	
 !" dt         j                        v rt        d      || j                  kD  rrt        j                  d| d j                   d| j                  z
   d       t        j                  d j                   d	 j                   d
        j                  }dk(  rt        d      # j                  t         j                        ndt        j                  t              st        dt         d d      t        t        j                  	      	 j                  d      j                   
'
j)                  d      sj+                  
	dd       s	dk7  rnd j-                  	
||||	      \  " !dt.        t         t0        t2           t         t4        t            f   f !
 	"fd}||nd}t7              t        j8                  kD  rOt        j                  dt        j8                   d       t;        j<                  t7              t        j8                  z        }t?        d|      D ]  }|t        j8                  z  |d z   t        j8                  z   }tA        tC        jD                  t?        d!      tC        jF                  d"            d #      D ]E  \  }}|d tI        jH                         z   z  }	 jK                  ||d$|d%d&|d%d'z   |d
|(      } n t        j                  d.|d z    d/||z
  d z
  rd0||z
  d z
   d1nd2z   d
z           g }n}tA        tC        jD                  t?        d!      tC        jF                  d"            d #      D ]  \  }}|d tI        jH                         z   z  } |       \  }}}}g }|r9|j]                  t_        t        j`                  |jc                  d3      4             |j]                  t_        t        jd                  t        |      jc                         4             	 jK                  ||z   |z   ||d
||5      } |S  S # t"        $ r% j%                  d|d      }|j&                  Y >w xY w# tL        $ r}|jN                  rtQ        |jN                  tR              r~|jN                  jT                  jV                  d)k(  r[tY        jZ                  |       t        j                  d* d+ d,| d-|jN                  jT                  jV                   d'	       Y d}~p d}~ww xY w# tL        $ r}|jN                  rtQ        |jN                  tR              r}|jN                  jT                  jV                  d6v r[tY        jZ                  |       t        j                  d7 d+ d,| d-|jN                  jT                  jV                   d'	       Y d}~c d}~ww xY w)8aG  Pushes the dataset to the hub as a Parquet dataset.
        The dataset is pushed using HTTP requests and does not need to have neither git or git-lfs installed.

        The resulting Parquet files are self-contained by default. If your dataset contains [`Image`], [`Audio`] or [`Video`]
        data, the Parquet files will store the bytes of your images or audio files.
        You can disable this by setting `embed_external_files` to `False`.

        Args:
            repo_id (`str`):
                The ID of the repository to push to in the following format: `<user>/<dataset_name>` or
                `<org>/<dataset_name>`. Also accepts `<dataset_name>`, which will default to the namespace
                of the logged-in user.
            config_name (`str`, defaults to "default"):
                The configuration name (or subset) of a dataset. Defaults to "default".
            set_default (`bool`, *optional*):
                Whether to set this configuration as the default one. Otherwise, the default configuration is the one
                named "default".
            split (`str`, *optional*):
                The name of the split that will be given to that dataset. Defaults to `self.split`.
            data_dir (`str`, *optional*):
                Directory name that will contain the uploaded data files. Defaults to the `config_name` if different
                from "default", else "data".
            commit_message (`str`, *optional*):
                Message to commit while pushing. Will default to `"Upload dataset"`.
            commit_description (`str`, *optional*):
                Description of the commit that will be created.
                Additionally, description of the PR if a PR is created (`create_pr` is True).
            private (`bool`, *optional*):
                Whether to make the repo private. If `None` (default), the repo will be public unless the
                organization's default is private. This value is ignored if the repo already exists.
            token (`str`, *optional*):
                An optional authentication token for the Hugging Face Hub. If no token is passed, will default
                to the token saved locally when logging in with `huggingface-cli login`. Will raise an error
                if no token is passed and the user is not logged-in.
            revision (`str`, *optional*):
                Branch to push the uploaded files to. Defaults to the `"main"` branch.
            create_pr (`bool`, *optional*, defaults to `False`):
                Whether to create a PR with the uploaded files or directly commit.
            num_shards (`int`, *optional*):
                Number of shards to write. Equals to this dataset's `.num_shards` by default.
            embed_external_files (`bool`, defaults to `True`):
                Whether to embed file bytes in the shards.
                In particular, this will do the following before the push for the fields of type:

                - [`Audio`] and [`Image`]: remove local path information and embed file content in the Parquet files.
            num_proc (`int`, *optional*, defaults to `None`):
                Number of processes when preparing and uploading the dataset.
                This is helpful if the dataset is made of many samples and transformations.
                Multiprocessing is disabled by default.

        Return:
            huggingface_hub.CommitInfo

        Example:

        ```python
        >>> dataset.push_to_hub("<organization>/<dataset_id>")
        >>> dataset_dict.push_to_hub("<organization>/<dataset_id>", private=True)
        >>> dataset.push_to_hub("<organization>/<dataset_id>", num_shards=1024)
        ```

        If your dataset has multiple splits (e.g. train/validation/test):

        ```python
        >>> train_dataset.push_to_hub("<organization>/<dataset_id>", split="train")
        >>> val_dataset.push_to_hub("<organization>/<dataset_id>", split="validation")
        >>> # later
        >>> dataset = load_dataset("<organization>/<dataset_id>")
        >>> train_dataset = dataset["train"]
        >>> val_dataset = dataset["validation"]
        ```

        If you want to add a new configuration (or subset) to a dataset (e.g. if the dataset has multiple tasks/versions/languages):

        ```python
        >>> english_dataset.push_to_hub("<organization>/<dataset_id>", "en")
        >>> french_dataset.push_to_hub("<organization>/<dataset_id>", "fr")
        >>> # later
        >>> english_dataset = load_dataset("<organization>/<dataset_id>", "en")
        >>> french_dataset = load_dataset("<organization>/<dataset_id>", "fr")
        ```
        zVideo(a]  push_to_hub is not implemented for video datasets, instead you should upload the video files using e.g. the huggingface_hub library and optionally upload a metadata.csv or metadata.jsonl file containing other information like video captions, features or labels. More information at https://huggingface.co/docs/datasets/main/en/video_load#videofolderNzToo many num_proc: rC  rD  z processes.zTo parallelize data loading, we give each process some shards (or data sources) to process. Therefore it's unnecessary to have a number of processes greater than dataset.num_shards=rE  r&  r  zN`config_name` cannot be 'data'. Please, choose another name for configuration.trainzSplit name should match 'z' but got 'z'.r  r  )r  T)r  r   exist_okzrefs/pr/)branchr  r  r#  default)	r  r  r  r  r  r   r  r  r  rm   c            	      ~   %j                  *d+      j                  } d\  }}g }d}g }$D cg c]  }|j                   }}%j                  *| d/d      D ]2  }t	        |t
              s|j                  t        j                  k(  rd}5|j                  t        j                  k(  rd}U|j                  j                  ' d. d      rC|j                  |vr5|j                  t        |j                  	             ||j                  z  }t        j                  |j                  t        j                   d
d            st#        t              }	t%        |j                  |	      }
|
J |
d   }||vs"|j                  |       5 d*v r*j'                  d      nd *f\  }},j(                  j+                         }d |_        0|_        (|_        0(z   |_        &|_        t7        .t9        .()|      i      |_        |r%j=                  *t        j                  d|       }t?        j@                  tC        |            }|jD                  }tG        jH                  |      }tK        jH                  |      }|r
&|v r|&   }nd }n|rd }tM               }tG               }%j=                  *t        j                  d|       }tO        |d      5 }tQ        j@                  |      }|r|jS                  &d       nd }|rtU        jV                  |      nd }d d d        nd }tM               }tG               }d }tX        j)                  d       |j:                  rlt[        |j:                        .gk7  rR,j\                  j^                  |j^                  k7  r/ta        d,j\                  j^                   d|j^                         .|j:                  v rZ|xj.                  |z  c_        |xj0                  |j:                  jS                  .t9                     jb                  xs dz  c_        d |_        |j.                  xs d0z   |_        |j0                  xs d(z   |_        |j.                  |j0                  z   |_        |j:                  je                  .d        t9        .(tg        ,      |      |j:                  .<   |}|s7|r5d|D cg c]  }|d| dd c}i}tG        d|i      ji                  |       &|v rb|&   }d|v rtk        |d         }ni }' d. dg|.<   d|jm                         D cg c]  \  }}|tg        |      dk(  r|d   n|d c}}i}nd.' d. ddgi}&|i}-rJ&dk7  rE|r>|jo                         } | dk(  rta        d      | r||    je                  d      }!||    || <   d|d<   |rt%j=                  *t        j                  d|       }tO        |d      5 }tQ        j@                  |      }d d d        tq        |      &<   tQ        jr                  |d      }"nd }"tK        &|i      ji                  |       tG        |      ji                  |       |t?        d| d      n|}#| ||#|"fS c c}w # 1 sw Y   dxY wc c}w c c}}w # 1 sw Y   xY w)Nr  )r  r  )FFr   T)r  r  r  r  	recursiverI  r  )r  z{split}*r  )	num_bytesr  dataset_nameutf-8)encodingz0Updating downloaded metadata with the new split.zVFeatures of the new split don't match the features of the existing splits on the hub: z != 
data_fileszdata/z-*)r  pathr%  r   zzThere exists a configuration named 'default'. To set a different configuration as default, rename the 'default' one first.   )indentz---
z
---
):	repo_infoshar  list_repo_treer   r   	rfilenamer   REPOCARD_FILENAMEDATASETDICT_INFOS_FILENAME
startswithrM  r   r  fnmatchr    replacerD   rF   r  r  r   download_checksumsdownload_sizedataset_sizesize_in_bytesr  r9   r:   splitshf_hub_downloadr   loadr   r  rA   from_dataset_card_datar5   r   openjsonr|   r4   	from_dictrX  r_   r/  r   r^   r)  rG  r   to_dataset_card_datar#   rb   get_default_config_namerC   dumps)1parent_commitrepo_with_dataset_cardrepo_with_dataset_infos	deletionsdeleted_sizerepo_splitsr  repo_files_to_add	repo_filepatternsplit_pattern_fields
repo_splitorganizationr*  info_to_dumpdataset_card_pathdataset_carddataset_card_datametadata_configsdataset_infosr1  dataset_infos_pathfdataset_infor   default_metadata_configs_to_dumpmetadata_configdata_files_to_dumprx  _patternmetadata_config_to_dumpconfigs_to_dumpcurrent_default_config_namer   new_dataset_infosnew_dataset_cardr  r  r  r  r  r  r  r  r   r  r  r  r   s1                                       rP   get_deletions_and_dataset_cardzCIterableDataset.push_to_hub.<locals>.get_deletions_and_dataset_card  s`   MM'YQYMZ^^M ?K;"$;57IL%'KGP Q8!6!6 Q Q //-9TYei 0  7	 ")X6&&&*B*BB-1*((F,M,MM.2+''22hZqq3IJ!++3DD$$%:	H[H[%\] INN2L__''NVVW`bef 44noG+9):M:Mw+W(/;;;!5g!>J!4#**:6174 @Cg~s);TXZaSb&L,99>>+L.2L+)6L&(6L%)6)GL&'2L$"+9lam#L &$'$7$7V55Ub %8 %!  +//5F0GH$0$5$5!#2#I#IJ[#\ 2B2Y2YZk2l [M%A -k :I $I(#$3$5!#2#4 %(%8%8V>>)^k &9 &" ,w? ^1*.))A,MKX=#4#4[$#G^bLGS 5 5l CY]I^ ^
  $$3$5!#2#4  	$NO##Y-=-=(>5'(Izz**i.@.@@(tuyuu  vI  vI  uJ  JN  OX  Oa  Oa  Nb  c  	 0 00!//<?/!..)2B2B2F2Fuik2Z2d2d2ihii.37I0/8/F/F/K!}.\I+.7.D.D.I^-[I*.7.E.E	H^H^.^I+$$((5.7c$i^j/I$$U+ $-L# \g"hSXUeE7"<M#N"h40  ,L MNccduv.."2;"??2):?<;X)Y&)+&08z5'-D,E"5) 
 1C0H0H0J#
 -FH &,36x=A3EHQK8#+' ,8EV^U__`af`ggiSj:k9l*m'*,CDO{i7#2B2Z2Z2\/2i?(>  3,-HIMMiXGWXsGt(CD59'	2&%(%8%8V>>)^k &9 &" ,w? 71*.))A,M7-3L-Ak*$(JJ}Q$G!$(!k<89NNO`aO,AABSTCOCWe$5#6g>?]i  !)-=?PPPk !R|^ ^D #i#87 7s*   ZAZZ(	"Z-Z3Z%3Z<zUpload datasetz)Number of files to upload is larger than z+. Splitting the push into multiple commits.r   r   r<     )r   z (part r  r  r  )
operationsr  r  r  r  r    z!Retrying intermediate commit for z, z (z/n with status_code zCommit #z
 completedz (still z to go)rG  r+  r  )rh  r  r  r  r  r  rH  )i  ri  zRetrying commit for )3r   r   r   r   rX  r  r  r^   r  rematchr6   r   r   r  r1  rR  r   create_repor  r7  create_branchr  r   r_   r   r   r   UPLOADS_MAX_NUMBER_PER_COMMITmathceilr   rK  r  chainr  r   create_commitr   __context__r   r   responsestatus_codetimesleeprM  r   r6  encoder5  )#r   r  r  r  r  r  r  r  r   r  r  r  r   r  r  repo_urlrf  num_commitsr   rh  retry
sleep_timecommit_infoerrlast_commit_additionsrH  rK  rV  rY  dataset_card_additionsr  r  r  r  r   s#   ``````   ``                   @@@@@rP   push_to_hubzIterableDataset.push_to_hubI  sa   H s4==))%Y  Ht$>NN%hZ/KDOOK\ ]$t67{D KKllpl{l{k| }[[_[j[jZkkln
 H& mnn ='+zz'=C

O7Exx	5)8;ugUWXYYV//u=		'mmGymADDG (;(;J(Ggheycgh&1Y&>{FHAEAaAa!!5 Bb B
>	=.,^	Qc4@U;VX[]efi]j6j0k ^	Q ^	Q ^	Q@ ,:+EK[y>F@@@KK;F<`<`;a  bM  N ))C	NV5Y5Y$YZK1k* $&<<<AImIm?m
 *39??59iN^N^_aNb3ckl)m %E:!fmmo"55J"&)&7&7#'1+9gaWDQ\]`Paab<c+c/A&/%-&/ '8 '. 78 q1ugZ0BMPQ/TUBU+/A"5!6g>[]_A$J %'!$-!!*9??59iFVFVWYFZ+[cd!e )	E:!fmmo--JDbDdAM9lM%'"&--&%+%F%F(5(<(<W(E #))"0H0HZ]^jZkZrZrZtu!//47MMPYY#1'9'%'"/ 0 	2 W)	V o ' 	'#	 ' H &&G	'h * "OO *3??I F # 8 8 D D K !JJz2"KK"CG9B{m[]^c]ddxy|  zI  zI  zR  zR  z^  z^  y_  _`  !a %!"d " OO"3??I>00<<
J JJz*KK.wir+bOcdgdsdsd|d|  eI  eI  dJ  JK  L sP   4Q" 1$R=U	"*RR	UBU UU		W;BW65W66W;)NNNNNN)   )r   Fr\  r=  rM   )	NFNFr  FNNN)NFNFr  N)NNr  r   )Tr   r   )NFNT)r%  NNNNNNNNFNTN)[r   r   r   r   r   r   r4   r7   r  r  r  r}   r   r   r   r   r   r   r)  r_   r_  r   r   r0  r3  r6  r8  r:  r>  r   rA  rV  rP  r'  r   r   r  r  r  r8   TRAINr   r$   rf  rn  rt  ry  r|  r  r   r   r   r  r  r  r  r  r  r   r  r  r  r  r  r%   r  r  r  r  r  rk   r   r  r  r  r  r  r  rK   r   r  r  r  r  r   r   r   r  r  r   r  rN   rR   rP   r  r    sC
   *
 '+&*15/337IMG*G {#G 
#	G
 -.G O,G /0G $DeCtO.D)D$EFG: EXc] E E FhtCy1 F F3/D 3/j0/$ 0/4 0/drG3  s    	= ,C , ,
 #  9v	 <A66486	6p2!?s !?T !?F1s 1~ 1  (,%)!KK	448$4 TN4 	4
 
4 4l  '+'+(#(
#( 8$(
 
( (T fC f$5 f f" #8
sm8
 
8
x (,"9=$( %:>'+$(V
8$V
 V
  c49n 56	V

 V
 SMV
 V
 !sDI~!67V
 8$V
 D>V
 
V
t (,9=$($(T
8$T
  c49n 56	T

 T
 SMT
 D>T
 
T
n ^bA
$,RYY-@-@$AA
WZA
	A
F+s +*
c *
/ *
X)
 )
2C )
V#
c #
/ #
R  	7
7
 7
 	7

 
7
r
]s 
]E$.,A 
]FW 
]L# L LPa L<T#s(^ @Q 2!5d3i+@ !EV !F1
5d3i+@ 1
EV 1
f-
# -
 -
@Q -
^-
-
 
-
^[T [s [CT [z

# 

s 

/@ 


&
 
d 
GX 
4A(3- A ARWX\^fgk^lXlRm A.= =  AFC"3-C9=C	r||Xbll33	4C> %)+/#uSM#u #u #4.	#u
 #u 
~x77	8#uP %)*.	)
8X-.)
 SM)
 "$	)
 
)
\ %)*.	4
8X-.4
 SM4
 "$	4
 
4
t %)	(q(q 368RThhi(q SM	(q 
(qZ %)*.	.
8X-..
 SM.
 "$	.
 
.
`EFEF EF 	EF
 EF EF }EF 3-EF D>EF EF #EF 
%/0#s:;	<EFNTFTF TF 	TF
 }TF 3-TF D>TF SMTF #TF 3-TF 
t&'c36	7TFr %&*#"&(,,0"&#"&$)$(%)"&!`` ` d^	`
 }` 3-` !` %SM` $` }` 3-` D>` SM` #`  3-!`" 
#`rR   r  dsetsr  r  axisc           
         | D cg c]  }|j                          } }|dk(  r$t        | D cg c]  }|j                   c}       n,t        | D cg c]  }|j                  D ]  }|  c}}       t	        d | D              rd}nt        d | D              rd}t        j                  d       ne| D ch c]  }|j                  j                   }}t        |      dk(  r|j                         }	t        |	      }nd}t        j                  d       t        t        | D cg c]  }|j                   c}      D 
ci c]  }
|
j                         D ]  \  }}||
  c}}}
      }
| D cg c]!  }t!        j"                  |j$                        # }}|dk(  rt'        |      }nt)        |      }|.t+        j,                  | D cg c]  }|j                   c}      }n|j!                         }|
|_        | D ci c]'  }|j.                  j                         D ]  \  }}||
 ) }}}}t1        |||||      S c c}w c c}w c c}}w c c}w c c}w c c}}}
w c c}w c c}w c c}}}w )	a  
    Converts a list of `IterableDataset` with the same schema into a single `IterableDataset`.
    Missing data are filled with None values.

    <Added version="2.4.0"/>

    Args:
        dsets (`List[datasets.IterableDataset]`): List of Datasets to concatenate.
        info (`DatasetInfo`, optional): Dataset information, like description, citation, etc.
        split (`NamedSplit`, optional): Name of the dataset split.
        axis (``{0, 1}``, default ``0``, meaning over rows):
            Axis to concatenate over, where ``0`` means over rows (vertically) and ``1`` means over columns
            (horizontally).

            *New in version 1.6.0*

    Example:

    ```py
    >>> ds3 = _concatenate_iterable_datasets([ds1, ds2])
    ```
    r   c              3   8   K   | ]  }|j                   d u   y wrM   r!  rX   dsets     rP   rZ   z1_concatenate_iterable_datasets.<locals>.<genexpr>  s     
64t#
6r  Nc              3   8   K   | ]  }|j                   d u   y wrM   r  r  s     rP   rZ   z1_concatenate_iterable_datasets.<locals>.<genexpr>  s     8$T%8r  zoSome of the datasets have disparate format or format not set. Resetting the format of the concatenated dataset.r   rv  )r0  r  r  r  r  )r  r)   r   r  r   r]   rX  r  r!  r  r   rG  r  r$   r(   rb   r   r   r   r  r  r4   
from_merger$  r  )r  r  r  r  r5  r  col_namer  format_type_setr  r   r  r  r  r0  r  r  r  r  s                      rP   _concatenate_iterable_datasetsr    sG   8 -22qQ  "2E2 qy)U*KT4==*KL%V$VHXVXVW 
6
66
	8%8	8
}	
 EJJD4++77JJ1$)--/K)kBJJKK B -.Ot}}.OPnn(]e]k]k]mnUYUVXYAnnH <AAaDMM!..1ALAqyHVJ<X |%%u&=!qvv&=>yy{DM7<vvGSZSmSmSsSsSuv%%vvv+ a 3 +LV K /Pn B '>
 ws4   III
6I%I"I2&I$I),I.datasetsr  r  r  r  c           
      j   | D cg c]  }|j                          } }t        | D cg c]  }|j                   c}       t        t	        | D cg c]  }|j                   c}      D 	
ci c]  }|j                         D ]  \  }	}
|	|

  c}
}	}      }| D cg c]!  }t        j                  |j                        # }}|t        ||      }n.t        j                  j                  |      }t        ||||      }|.t        j                  | D cg c]  }|j                    c}      }n|j                         }||_        | D ci c]'  }|j"                  j                         D ]  \  }}||
 ) }}}}t%        ||||      S c c}w c c}w c c}w c c}
}	}w c c}w c c}w c c}}}w )a  
    Interleave several iterable datasets (sources) into a single iterable dataset.
    The new iterable dataset alternates between the sources to yield examples.
    If `probabilities = None` (default) the iterable dataset will cycles through the sources in order for each next example in the iteration.
    If `probabilities` is not `None, the iterable dataset will sample a random source according to the provided probabilities for each next examples in the iteration.

    <Added version="2.4.0"/>

    Args:
        datasets (`List[IterableDataset]`): list of datasets to interleave
        probabilities (`List[float]`, optional, default None): If specified, the new iterable dataset samples
            examples from one source at a time according to these probabilities.
        seed (`int`, optional, default None): The random seed used to choose a source for each example.
        stopping_strategy (`str`, defaults to `first_exhausted`):
            Two strategies are proposed right now.
            By default, `first_exhausted` is an undersampling strategy, i.e the dataset construction is stopped as soon as one dataset has ran out of samples.
            If the strategy is `all_exhausted`,  we use an oversampling strategy, i.e the dataset construction is stopped as soon as every samples of every dataset has been added at least once.
            Note that if the strategy is `all_exhausted`, the interleaved dataset size can get enormous:
            - with no probabilities, the resulting dataset will have max_length_datasets*nb_dataset samples.
            - with given probabilities, the resulting dataset will have more samples if some datasets have really low probability of visiting.

    Output:
        `datasets.IterableDataset`
    r  r  )r0  r  r  r  )r  r)   r   r$   r(   rb   r   r   r   r  r   r   r<  r  r4   r  r  r$  r  )r  r  r  r  r  r  r5  r  r   r  r  r  r0  r   r  r  r  r  s                     rP   _interleave_iterable_datasetsr  A  s   @ 088!##%8H8 &&Jt}}&JK -.Rt}}.RSqq(`h`n`n`pqX\XY[\AqqH <DDaDMM!..1DLD 9,ZklII))$/	AI]^o

 |%%x&@!qvv&@Ayy{DM '/ "WE_E_EeEeEg3A7E  {U^oppC 9 'K
 /Sq E 'A
s(   FFF-"F&F$F),F.r  r  r  c           	      |   | j                   r5|| j                   j                  z  |z   }|| j                   j                  z  }t        ||      }t	        | j
                  | j                  j                         | j                  | j                  t        j                  | j                        || j                        S )a  
    Split an iterable dataset for the node at rank `rank` in a pool of nodes of size `world_size`.

    If the dataset has a number of shards that is a factor of `world_size` (i.e. if `dataset.num_shards % world_size == 0`),
    then the shards are evenly assigned across the nodes, which is the most optimized.
    Otherwise, each node keeps 1 example out of `world_size`, skipping the other examples.

    Args:
        dataset ([`IterableDataset`]):
            The iterable dataset to split by node.
        rank (`int`):
            Rank of the current node.
        world_size (`int`):
            Total number of nodes.

    Returns:
        [`IterableDataset`]: The iterable dataset to be used on the node at rank `rank`.
    )r  r  rw  )r#  r  r  r  r  r   r/  r   rx  r!  r   r"  r$  )r  r  r  r  s       rP   _split_by_node_iterable_datasetr    s    & G00555<'"6"6"A"AA
#*EK((]]!nn&&-- 2 23!44 rR   c                    K   | j                  ||f      }	 |j                         r|j                         S t        j                  d       d {    >7 wrk  )apply_asyncreadyr|   rC  rw  )r  r  rO   futures       rP   r  r    sN     dQD)F
<<>::<--"""	  #s   AAAArM   r\  )NNr   )NNNNr  )rC  r  r   r8  r?  r  rC  ro  multiprocessing.poolr  r   rj  r  rv  collectionsr   collections.abcr   r   r   dataclassesr   	functoolsr   ior	   r
   r   pathlibr   typingr   r   r   r   r   r   fsspec.asynrL  numpyr   pandasr  pyarrowro   pyarrow.parquetparquetr  huggingface_hubr   r   r   r   r   r   huggingface_hub.hf_apir   huggingface_hub.utilsr   r   multiprocessr   requestsr   rG  r   arrow_datasetr    r!   r"   r-  r#   r   r$   features.featuresr%   r&   r'   r(   r)   r*   r+   r,   r-   r  r.   r/   r0   r1   r2   r3   r  r4   r5   namingr6   r>  r7   r8   r9   r:   r  r;   r<   r=   r>   r  r?   r  utils.loggingr@   utils.metadatarA   utils.py_utilsrB   rC   rD   rE   rF   utils.shardingrG   rH   rI   rJ   utils.typingrK   sqlite3r  r  
sqlalchemyr  r   rX  r   r   r   rQ   r}   re   r_   rj   ry   r   r   r   r   rp   r   r   r   r   r  r%  r/  r^  rq  r  r  r  r  r  r  r  rn  ro  BooleanScalarrr  rx  rz  r|  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rN   rR   rP   <module>r     s             	 
   .  !   #  J J      v v + I    p p ) 
 
 
  0  ; ; a a " % + f f v v " 	H	CHo d38n 4 c  d4j 7d39o 7XhEW 7ck 7#d38n!5 #$sDy/ #>d39o >(4S>2J > "fuS$Y'(ff f eCM"#	f@Co CoL"<, "<J"
*: "
J@<1 @<FC
/D C
LE+%: E+P*+1 *+Z++0 ++\d
*? d
NH
9N H
V
d3i 
K
;P K
\P
2U P
f
bhh 
}+2 }+@
(rxx 
(
dBHHboor7G7GG
H
( 
(4H 4U4>-B 4]` 4
44$)$.$94TW4J+5 J+ZD+%: D+NH+0 H+V.+2 .+bN+0 N+b%:>sE#tUY/DZ?Z:[	#8<S%TSWBX=X8Y	  	L 	L 	Ld+ 5 d+N ) ) )
   
A	eC<O6P 	UZ[^`n[nUo 	1 1BO$& O$hH #'"&	R R
;
R JR 	R
 Rn ,0"&"&EVAq?#AqDK(Aq 3-Aq ;
	Aq
 JAq ABAq AqH_ C UX ]l D#rR   