
    biV                    L   d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlZdd	lmZ dd
lmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z:m;Z; ddl<m=Z=m>Z>m?Z?m@Z@ ddlAmBZB ddlCmDZDmEZE ddlFmGZG ddlHmIZImJZJ ddlKmLZLmMZMmNZN ddlOmPZP ddlQmRZRmSZS ddlTmUZUmVZVmWZWmXZX ddlYmZZZ dd l[m\Z\ dd!l#m]Z] dd"l#m^Z_ dd#l`maZa dd$lbmcZc dd%ldmeZemfZfmgZgmhZh dd&limjZjmkZkmlZlmmZmmnZnmoZompZpmqZq dd'lrmsZsmtZt dd(lumvZv erdd)lwmxZx  e]j                  ez      Z{ G d* d+e|      Z}e G d, d-             Z~ G d. d/      Z G d0 d1e      Z G d2 d3e      Zy)4zDatasetBuilder base class.    N)IterableMapping)	dataclass)partial)Path)TYPE_CHECKINGOptionalUnion)patch)	url_to_fs)Pool)
thread_map   )configutils)Dataset)ArrowReaderReadInstruction)ArrowWriterParquetWriterSchemaInferenceError)DataFilesDictDataFilesPatternsDictsanitize_patterns)DatasetDictIterableDatasetDict)DownloadConfig)DownloadManagerDownloadMode)StreamingDownloadManagerxjoin)DatasetGenerationCastErrorDatasetGenerationErrorFileFormatErrorManualDownloadError)Features)is_remote_filesystemrename)Hasher)DatasetInfoPostProcessedInfo)ArrowExamplesIterableExamplesIterableIterableDataset)DuplicatedKeysError)"INVALID_WINDOWS_CHARACTERS_IN_PATHcamelcase_to_snakecase)Split	SplitDictSplitGenerator	SplitInfo)$extend_dataset_builder_for_streaming)	CastError)logging)tqdm)FileLock)is_remote_url)VerificationModeget_size_checksum_dictverify_checksumsverify_splits)classpropertyconvert_file_size_to_inthas_sufficient_disk_spaceiflatmap_unordered
map_nestedmemoizesize_strtemporary_assignment)_number_of_shards_in_gen_kwargs_split_gen_kwargs)tracked_list)DatasetModulec                       e Zd Zy)InvalidConfigNameN)__name__
__module____qualname__     K/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/datasets/builder.pyrM   rM   \   s    rR   rM   c                       e Zd ZU dZdZeed<    ej                  d      Z	e
eej                  ef      ed<   dZe
e   ed<   dZe
eeef      ed<   dZe
e   ed	<   d
 Zd Z	 ddede
e   defdZdededdfdZy)BuilderConfiga  Base class for `DatasetBuilder` data configuration.

    `DatasetBuilder` subclasses with data configuration options should subclass
    `BuilderConfig` and add their own properties.

    Attributes:
        name (`str`, defaults to `default`):
            The name of the configuration.
        version (`Version` or `str`, defaults to `0.0.0`):
            The version of the configuration.
        data_dir (`str`, *optional*):
            Path to the directory containing the source data.
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s).
        description (`str`, *optional*):
            A human description of the configuration.
    defaultname0.0.0versionNdata_dir
data_filesdescriptionc                     t         D ]0  }|| j                  v st        dt          d| j                   d       | j                  9t	        | j                  t
        t        f      st        d| j                         y y )Nz Bad characters from black list 'z' found in 'z\'. They could create issues when creating a directory for this config on Windows filesystem.z/Expected a DataFilesDict in data_files but got )r0   rW   rM   r[   
isinstancer   r   
ValueError)selfinvalid_chars     rS   __post_init__zBuilderConfig.__post_init__z   s    > 	Ltyy('67Y6ZZfgkgpgpfq rp q 	 ??&z$//M[pKq/rNtN_`aa 0s&rR   c                      t         j                  j                               t        j                  j                               k7  ryt         fd j                  j                         D              S )NFc              3   X   K   | ]!  }|t        |      f|t        |      fk(   # y wN)getattr).0kor`   s     rS   	<genexpr>z'BuilderConfig.__eq__.<locals>.<genexpr>   s-     ]1AwtQ'(Q1,>>]s   '*)set__dict__keysall)r`   ri   s   ``rS   __eq__zBuilderConfig.__eq__   sP     t}}!!#$AJJOO,=(>>]HZHZH\]]]rR   config_kwargscustom_featuresreturnc                    d}|j                         }|j                  dd       |j                  dd       d|v rA|d   |j                  dd       n)|d   }t        j                  j	                  |      }||d<   |rt        |      D ci c]  }|||   
 }}t        d |j                         D              rJdj                  d |j                         D              }t        |      dkD  r+t        j                  |      }nt        j                  |      }|>t               }|r|j                  |       |j                  |       |j                         }|rU| j                  d	z   |z   }t        |      t         j"                  kD  r%| j                  d	z   t        j                  |      z   }|S | j                  S c c}w )
a0  
        The config id is used to build the cache directory.
        By default it is equal to the config name.
        However the name of a config is not sufficient to have a unique identifier for the dataset being generated
        since it doesn't take into account:
        - the config kwargs that can be used to overwrite attributes
        - the custom features used to write the dataset
        - the data_files for json/text/csv/pandas datasets

        Therefore the config id is just the config name with an optional suffix based on these.
        NrW   rY   rZ   c              3   \   K   | ]$  }t        |t        t        t        t        f       & y wre   )r^   strboolintfloat)rg   vs     rS   rj   z1BuilderConfig.create_config_id.<locals>.<genexpr>   s      ka:a#tS%!89ks   *,,c              3      K   | ]>  \  }}t        |      d z   t        j                  j                  t        |            z    @ yw)=N)ru   urllibparse
quote_plus)rg   rh   ry   s      rS   rj   z1BuilderConfig.create_config_id.<locals>.<genexpr>   s9      "GKq!CFSL6<<#:#:3q6#BB"s   AA    -)copypopospathnormpathsortedrn   valuesjoinitemslenr)   hashupdate	hexdigestrW   r   %MAX_DATASET_CONFIG_ID_READABLE_LENGTH)	r`   rp   rq   suffixconfig_kwargs_to_add_to_suffixrZ   rh   m	config_ids	            rS   create_config_idzBuilderConfig.create_config_id   s   " !%)6););)=&&**648&**9d;
 77-j9A.22:tD :*E77++H5=E.z:) ?EEc>d.9:1!44.* . kCaChChCjkk "OmOsOsOu"  v;##[[)GHF%CD&A HH_%[[]F		C&0I9~ L LL IIOfkk&.AA	993.s   F<	base_pathdownload_configc                     t        | j                  t              rF| j                  rt	        || j                        n|}| j                  j                  ||      | _        y y re   )r^   r[   r   rZ   r!   resolve)r`   r   r   s      rS   _resolve_data_filesz!BuilderConfig._resolve_data_files   sF    doo'<=;?==i7iI"oo55iQDO >rR   re   )rN   rO   rP   __doc__rW   ru   __annotations__r   VersionrY   r	   r
   rZ   r[   r   r   r\   rb   ro   dictr&   r   r   r   rQ   rR   rS   rU   rU   `   s    $ D#3@5==3IGXeEMM3./0I"Hhsm"HLJ}.CCDEL!%K#%	b^ /3>> "(+> 
	>@RS R> RVZ RrR   rU   c                      e Zd ZdZdZeZg ZdZdZ		 	 	 	 	 	 	 	 	 	 	 	 	 dGde
e   de
e   de
e   de
e   de
e   de
e   d	e
e   d
e
eeef      de
e   de
eeeeef      de
e   de
e   de
e   fdZd Zd Zede
e   fd       Zde
e   fdZddde
e   fdZ	 dHdeeef   fdZee e       deeef   fd                     Z ed        Z!dIdZ"dJdefdZ#d Z$e%jL                  defd       Z'ed         Z(d!ed"efd#Z)	 	 	 	 	 	 	 	 	 	 dKd$e
e   d%e
e*   d&e
ee+ef      d'e
ee,ef      d(e
e-   de
e   d)ed*e
eeef      d+e
e   de
e   fd,Z.d- Z/d. Z0d/ Z1defd0Z2d1 Z3d2 Z4	 	 	 	 dLd4e
eee5ee   ee5   f      d'e
ee,ef      dee6e7f   fd5Z8	 dMd4eee9e5f   d6ed'e,d7efd8Z:e5jv                  d3fd4ee9e5f   d7ede6fd9Z<d4ee9e5f   defd:Z=	 	 dHd4e
e   de
e   deeee>f   e>f   fd;Z?de>fd<Z@d=e6d>eAeef   de
e6   fd?ZBd4edeeef   fd@ZCd4edAed(e-de
e   fdBZDe%jL                  d(ee-eEf   fdC       ZFe%jL                  	 	 	 dNdDeGd)ed*e
eeef      d+e
e   fdE       ZHdDeGdeIfdFZJy)ODatasetBuildera'  Abstract base class for all datasets.

    `DatasetBuilder` has 3 key methods:

        - [`DatasetBuilder.info`]: Documents the dataset, including feature
          names, types, shapes, version, splits, citation, etc.
        - [`DatasetBuilder.download_and_prepare`]: Downloads the source data
          and writes it to disk.
        - [`DatasetBuilder.as_dataset`]: Generates a [`Dataset`].

    Some `DatasetBuilder`s expose multiple variants of the
    dataset by defining a [`BuilderConfig`] subclass and accepting a
    config object (or name) on construction. Configurable datasets expose a
    pre-defined set of configurations in [`DatasetBuilder.builder_configs`].

    Args:
        cache_dir (`str`, *optional*):
            Directory to cache data. Defaults to `"~/.cache/huggingface/datasets"`.
        dataset_name (`str`, *optional*):
            Name of the dataset, if different from the builder name. Useful for packaged builders
            like csv, imagefolder, audiofolder, etc. to reflect the difference between datasets
            that use the same packaged builder.
        config_name (`str`, *optional*):
            Name of the dataset configuration.
            It affects the data generated on disk. Different configurations will have their own subdirectories and
            versions.
            If not provided, the default configuration is used (if it exists).

            <Added version="2.3.0">

            Parameter `name` was renamed to `config_name`.

            </Added>
        hash (`str`, *optional*):
            Hash specific to the dataset builder code. Used to update the caching directory when the
            dataset builder code is updated (to avoid reusing old data).
            The typical caching directory (defined in `self._relative_data_dir`) is `name/version/hash/`.
        base_path (`str`, *optional*):
            Base path for relative paths that are used to download files.
            This can be a remote URL.
        features ([`Features`], *optional*):
            Features types to use with this dataset.
            It can be used to change the [`Features`] types of a dataset, for example.
        token (`str` or `bool`, *optional*):
            String or boolean to use as Bearer token for remote files on the
            Datasets Hub. If `True`, will get token from `"~/.huggingface"`.
        repo_id (`str`, *optional*):
            ID of the dataset repository.
            Used to distinguish builders with the same name but not coming from the same namespace, for example "rajpurkar/squad"
            and "lhoestq/squad" repo IDs. In the latter, the builder name would be "lhoestq___squad".
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s).
            For builders like "csv" or "json" that need the user to specify data files. They can be either
            local or remote files. For convenience, you can use a `DataFilesDict`.
        data_dir (`str`, *optional*):
            Path to directory containing source data file(s).
            Use only if `data_files` is not passed, in which case it is equivalent to passing
            `os.path.join(data_dir, "**")` as `data_files`.
            For builders that require manual download, it must be the path to the local directory containing the
            manually downloaded data.
        storage_options (`dict`, *optional*):
            Key/value pairs to be passed on to the dataset file-system backend, if any.
        writer_batch_size (`int`, *optional*):
            Batch size used by the ArrowWriter.
            It defines the number of samples that are kept in memory before writing them
            and also the length of the arrow chunks.
            None means that the ArrowWriter will use its default value.
        **config_kwargs (additional keyword arguments): Keyword arguments to be passed to the corresponding builder
            configuration class, set on the class attribute [`DatasetBuilder.BUILDER_CONFIG_CLASS`]. The builder
            configuration class is [`BuilderConfig`] or a subclass of it.
    N	cache_dirdataset_nameconfig_namer   r   infofeaturestokenrepo_idr[   rZ   storage_optionswriter_batch_sizec                 H
   t        | j                  j                  d      d         | _        || _        || _        || _        |	| _        |xs i | _        |rt        |      n| j                  | _	        |xs | j                  | _        |
Ft        |
t              s6t        j                  t        |
      |t!        || j                              }
dt#        j$                  | j&                  j(                        j*                  v r|||d<   |
|
|d<   |||d<   || _         | j.                  d||d|\  | _        | _        || j5                         }| j                  |_        | j                  |_	        | j0                  j                  |_        | j0                  j:                  |_        || _        ||| j<                  _        tA        |xs t0        jB                        | _"        tG        | jD                        r| jD                  n(tH        jJ                  jM                  | jD                        | _"        |r.tO        jP                  | jD                  t0        jR                        ntA        t0        jT                        | _+        tG        | jV                        r| jV                  n(tH        jJ                  jM                  | jV                        | _+        d | _,        | j[                         | _.        tG        | jD                        stI        j^                  | jD                  d	
       tH        jJ                  jQ                  | jD                  ta        | j\                        jc                         je                  dd      dz         }tg        |      5  tH        jJ                  ji                  | j\                        r	tk        tI        jl                  | j\                              dkD  rtH        jJ                  ji                  tH        jJ                  jQ                  | j\                  t0        jn                              rtp        js                  d       tu        jv                  | j\                        | _        nOtp        jy                  d| j\                   d| j                   d       tI        jz                  | j\                         d d d        | j\                  | _>        t        j                  d      | _A        d | _B        d| _C        d | _D        t        |        y # 1 sw Y   UxY w)N.r   r   r   r   r   r[   rZ   )r   rq   Texist_ok/_z.lockr   z<Overwrite dataset info from restored data version if exists.zOld caching folder z for dataset z- exists but no data were found. Removing it. fileFrQ   )Fr1   rO   splitrW   r   r   r   r   r   r   DEFAULT_WRITER_BATCH_SIZE_writer_batch_sizer^   r   from_patternsr   r   inspect	signatureBUILDER_CONFIG_CLASS__init__
parametersrp   _create_builder_configr   r   _infobuilder_namer   rY   r   r   ru   HF_DATASETS_CACHE_cache_dir_rootr;   r   r   
expanduser	posixpathr   DOWNLOADED_DATASETS_DIRDOWNLOADED_DATASETS_PATH_cache_downloaded_dir_legacy_relative_data_dir_build_cache_dir
_cache_dirmakedirsr   as_posixreplacer:   existsr   listdirDATASET_INFO_FILENAMEloggerdebugr*   from_directorywarningrmdir_output_dirfsspec
filesystem_fs
dl_manager_record_infos_file_formatr6   )r`   r   r   r   r   r   r   r   r   r   r[   rZ   r   r   rp   	lock_paths                   rS   r   zDatasetBuilder.__init__-  s   $ 00E0Ec0J20NO	#'	"
.4"DP2<@VZV_V_"3"Ut7U7U!*Z*O&44!*-# .UDL`L` aJ **4+D+D+M+MNYYY^f^r(0M*%!*4M,'(0M*%*&Ad&A&A '
#$'
 '
#T^ <::<D II --;;++{{**	!)DII  #9#H0H0HI$1$2F2F$GD  RWWM_M_`d`t`tMu 	
  NN4//1O1OPV445 	" T778 &&##D$>$>? 	" *.&//1T112KK,,t<$$d4??&;&D&D&F&N&NsTW&X[b&bI )$ 
277>>$//22::doo67!;77>>"'',,tHdHd*ef"LL)gh(3(B(B4??(SDI1$//1B-PTPaPaOb  cP  Q 1
2  ??.4.?.?.G  # ! 	-T25
2 
2s   D4TT!c                     | j                   S re   )rl   r`   s    rS   __getstate__zDatasetBuilder.__getstate__  s    }}rR   c                 (    || _         t        |        y re   )rl   r6   )r`   ds     rS   __setstate__zDatasetBuilder.__setstate__  s    ,T2rR   rr   c                      y re   rQ   r   s    rS   manual_download_instructionsz+DatasetBuilder.manual_download_instructions      rR   c                 2   | j                   j                  d      r{t        | j                        sd| j                  j
                  dk(  rIddlm} | j                  r<| j                  j                  d      dkD  r| j                  j                  d      d   nd}| j                  | j                  j                  dd      n| j                  }|| j                  t        | j                  j
                        d z   }|j                  | j
                  d	      d   }t!        j"                  || j                  n| d
| j                   |d|      }t!        j"                  | j                  |      }t$        j&                  j)                  |      r|S yyyy)z]Check for the old cache directory template {cache_dir}/{namespace}___{builder_name} from 2.13	datasets.rV   r   )_PACKAGED_DATASETS_MODULESr   r   Nz--missing___rX   )rO   
startswithr;   r   r   rW   packaged_modulesr   r   countr   r   r   r   r   getr   r   r   r   isdir)r`   r   	namespacer   r   r   legacy_relative_data_dirlegacy_cache_dirs           rS   _check_legacy_cachez"DatasetBuilder._check_legacy_cache  sZ    OO&&{3!$"6"67  I-D6:llt||GYGYZ]G^abGb**3/2hlI=A\\=U$,,..sD9[_[l[lK#dnnS9I9I5J5L&MMI-11$))YGJD'0~~%.%6!!ykTM^M^L_<`	($  )~~d.B.BD\]ww}}-.// / . 8 4rR   dataset_modulerK   c                 p   | j                   j                  d      rt        | j                        st	        | j
                        ddhz
  sddlm} ddlm	} dt        dt        d	t        fd
}| j                  r<| j                  j                  d      dkD  r| j                  j                  d      d   nd}t        j                   |dd      5  | j"                  j$                  dz   t'        j(                  d| j"                  j*                  i      z   }ddd       |j-                  | j$                  d      }|j.                  j0                  r`| j"                  j$                  |j.                  j0                  v r4 |||j.                  j0                  | j"                  j$                           }t3        j4                  || j6                  n| d| j6                   d|      }t3        j4                  | j                  |      }	t8        j:                  j=                  |	      r|S yyyy# 1 sw Y   xY w)zxCheck for the old cache directory template {cache_dir}/{namespace}___{dataset_name}/{config_name}-xxx from 2.14 and 2.15r   r[   rZ   r   )&_PACKAGED_DATASETS_MODULES_2_15_HASHES)Picklerr   config_parametersrr   c                     h d}t        |j                               D ci c]  \  }}||vr|| }}}t               }|j                  |        |j                  |       |j	                         S c c}}w )z
                Used to update hash of packaged modules which is used for creating unique cache directories to reflect
                different config parameters which are passed in metadata from readme.
                >   rY   r   r\   )r   r   r)   r   r   )r   r   params_to_excludeparamvalueparams_to_add_to_hashr   s          rS   "update_hash_with_config_parameterszODatasetBuilder._check_legacy_cache2.<locals>.update_hash_with_config_parameters  s}    
 %N! )//@/F/F/H(I)$u$55 5L)% )
 H./{{}$)s   A1r   r   N_legacy_no_dict_keys_sortingTr   r   r   rX   )rO   r   r;   r   rk   rp   r   r   utils._dillr   ru   r   r   r   r   r   objectr   rW   r)   r   r[   r   builder_configs_parametersmetadata_configsr   r   r   r   r   r   )
r`   r   r   r   r   r   r   r   r   r   s
             rS   _check_legacy_cache2z#DatasetBuilder._check_legacy_cache2  s    OO&&{3!$"6"67++,j/IIP,% %QU %Z] %  7;llt||GYGYZ]G^abGb**3/2hlIg'EtL i KK,,s2V[[,PTP[P[PfPfAg5hh	i9==diiSD99JJKK$$(Q(Q(b(bb9.CCTTUYU`U`UeUef (1~~%.%6!!ykTM^M^L_<`	($  )~~d.B.BD\]ww}}-.// /O J 8 40i is   AH++H5c           	      6   d}|4| j                   r'| j                  U| j                  j                  | j                        }t        j                  d| j                   d|j                          nt        | j                         dkD  rp|sd| j                  xs | j                   d| j                   d   j                   d}t        d	t        | j                  j                                d
| dz         | j                   d   }t        j                  d| j                   d|j                          t        |t              r[| j                  j                  |      }|>| j                   r2t        d| dt        | j                  j                                      |sc|||d<   n| j                  r|s| j                  |d<   d|vr't        | d      r| j                   r| j                   |d<    | j"                  di |}n_|rt%        j&                  |      n|}|j)                         D ]3  \  }}|	t        ||      st        d| d| d      t+        |||       5 |j                  st        d|j                         |j-                  | j.                  t1        | j2                  | j4                               |j7                  ||      }|| j                  vxr |dk7  }	|	rt        j                  d|        ||fS |j                  | j                  v rK|| j                  |j                     k7  r/t        dt        | j                  j                                      |j8                  st        d|j                   d      ||fS )a  Create and validate BuilderConfig object as well as a unique config id for this config.
        Raises ValueError if there are multiple builder configs and config_name and DEFAULT_CONFIG_NAME are None.
        config_kwargs override the defaults kwargs in config
        Nz$No config specified, defaulting to: r   r   zload_dataset('z', 'r   z')zEConfig name is missing.
Please pick one among the available configs: z
Example of usage:
	``z6No config specified, defaulting to the single config: zBuilderConfig 'z' not found. Available: rW   rY   VERSIONzBuilderConfig z doesn't have a 'z' key.z$BuilderConfig must have a name, got r   r   )rq   rV   z Using custom data configuration zvCannot name a custom BuilderConfig the same as an available BuilderConfig. Change the name. Available BuilderConfigs: z must have a versionrQ   )BUILDER_CONFIGSDEFAULT_CONFIG_NAMEbuilder_configsr   r   r   r   rW   r   r   r_   listrm   r^   ru   hasattrr  r   r   deepcopyr   setattrr   r   r   r   r   r   rY   )
r`   r   rq   rp   builder_configexample_of_usagekeyr   r   	is_customs
             rS   r   z%DatasetBuilder._create_builder_config  s     4#7#7''3!%!5!5!9!9$:R:R!SB4CTCTBUUVWeWjWjVklmt++,q0(,T\\-NT=N=N,OtTXThThijTkTpTpSqqst ) )NNRSWSgSgSlSlSnNoMpr 89I8J!LM  &*%9%9!%<NKKPQUQbQbPccdesexexdyz
 k3'!1155kBN%$*>*> %k]2J4PTPdPdPiPiPkKlJmn 
 &(3f%))-(,(@(@f%-'$	2Jt||+/<<i(6T66GGN ?LT]]>:Q_N+113 8
U$">37(>.9IIZ[^Z__e)fggNC7	8 ""CNDWDWCXYZZ 	**nn*TMaMab 	+ 	
 #33+ 4 
	 d&:&::V	Y@V	KK:9+FG y(( ##t';';;"d&:&:>;N;N&OO QQUVZVjVjVoVoVqQrPsu  ")) >.2E2E1FFZ![\\y((rR   c                    | j                   D ci c]  }|j                  | }}t        |      t        | j                         k7  r1| j                   D cg c]  }|j                   }}t        d|       |S c c}w c c}w )z@Dictionary of pre-defined configurations for this builder class.z5Names in BUILDER_CONFIGS must not be duplicated. Got )r  rW   r   r_   )clsr   configsnamess       rS   r	  zDatasetBuilder.builder_configsR  s}    
 695H5HI66;;&IIw<3s2233/2/B/BCVV[[CECTUZT[\]]	 JCs   A9A>c                     | j                   S re   )r   r   s    rS   r   zDatasetBuilder.cache_dir]  s    rR   c                     | j                  |      xs | j                         xs d | _        | j                         | _        | j                  | _        y re   )r  r   r   r   r   r   )r`   r   s     rS   !_use_legacy_cache_dir_if_possiblez0DatasetBuilder._use_legacy_cache_dir_if_possiblea  sL     %%n5[9Q9Q9S[W[ 	& //1??rR   c                 J   | j                   |r|r| j                   S | j                  r<| j                  j                  d      dkD  r| j                  j                  d      d   nd}|| j                  n| d| j                   }t        j                  || j                        }|r3t        j                  |t        | j                  j                              }|rF| j                  r:t        | j                  t              r t        j                  || j                        }|S )a  Relative path of this dataset in cache_dir:
        Will be:
            self.dataset_name/self.config.version/self.hash/
        or if a repo_id with a namespace has been specified:
            self.namespace___self.dataset_name/self.config.version/self.hash/
        If any of these element is missing or if ``with_version=False`` the corresponding subfolders are dropped.
        Nr   r   r   )r   r   r   r   r   r   r   r   ru   r   rY   r   r^   )r`   with_version	with_hashr   builder_data_dirs        rS   _relative_data_dirz!DatasetBuilder._relative_data_diri  s     ))5,911126,,4<<CUCUVYCZ]^C^DLL&&s+A.dh	090A4,,)TWX\XiXiWjGk$>>*:DNNK(~~.>DKKDWDW@XYz$))S'A(~~.>		JrR   c           
         t        j                  | j                  | j                  d            t        j                  | j                  | j                  d            }fd}t	              s |       }|r|d   d   }|| j
                  j                  k7  r^dt        |       d| j                   d| j                   d	t        | j
                  j                         d
	}t        j                  |       |S )z2Return the data directory for the current version.F)r  Tc                     t         j                  j                        sg S g } t        j                        D ])  }	 | j	                  t        j                  |      |f       + | j                  d       | S # t        $ r Y Kw xY w)z"Returns previous versions on disk.T)reverse)	r   r   r   r   appendr   r   r_   sort)version_dirnamesdir_namer  s     rS   _other_versions_on_diskz@DatasetBuilder._build_cache_dir.<locals>._other_versions_on_disk  s    77>>"23	!JJ'78 $++U]]8-Dh,OP
 !!$!/## " s   &A::	BBr   zFound a different version z of dataset z in cache_dir z". Using currently defined version r   )r   r   r   r  r;   r   rY   ru   r   r   r   )r`   version_data_dirr%  version_dirsother_versionwarn_msgr  s         @rS   r   zDatasetBuilder._build_cache_dir}  s    $>>$*>*>@W@Wej@W@kl$>>$*>*>@W@Wei@W@jk	$ -.24L ,Q 2 DKK$7$774S5G4HUYUfUfTg h%%)%9%9$::\t{{2234A7 
 NN8,rR   c                     t         )a	  Construct the DatasetInfo object. See `DatasetInfo` for details.

        Warning: This function is only called once and the result is cached for all
        following .info() calls.

        Returns:
            info: (DatasetInfo) The dataset information
        NotImplementedErrorr   s    rS   r   zDatasetBuilder._info  s
     "!rR   c                     t         j                  j                  t        j                  t        j
                  |                   S )z8Return the path of the module of this class or subclass.)r   r   dirnamer   getfile	getmodule)r  s    rS   get_imported_module_dirz&DatasetBuilder.get_imported_module_dir  s+     wwww/@/@/EFGGrR   srcdstc                 2    t        | j                  ||       y re   )r(   r   )r`   r2  r3  s      rS   _renamezDatasetBuilder._rename  s    txxc"rR   
output_dirr   download_modeverification_moder   file_formatmax_shard_sizenum_procc                 0    ||n j                   }t        |fi |
xs i \  }}| _        t         j                        s|n j                  j	                  |       _        t        |xs t        j                        }t        |xs t        j                        }||n j                  }||dvrt        d| d      | _         j                  j                   j
                        dk(  r3t        d j
                   d j
                   j                  z    d      ||Rt!         j"                  |t        j$                  k(  |t        j$                  k(  d	|	 j&                   j(                  
      }t+         j                  | j,                  j.                  | j0                  xs |t        j2                  k(        }t         j                         | _        r?t7         j
                        j8                  j;                  dd        j
                  dz   }rt=              nt?        j@                         5   j                  jC                  tE        jF                   j
                  t,        jH                              }|rs|t        j                  k(  r`tJ        jM                  d j                   d j
                   d        jO                          _&         jQ                  |       	 ddd       ytJ        jM                  d j                   d j
                   d       rtS         jL                  jT                  xs dt7         j
                        j8                        stW        dtY         jL                  jT                  xs d       dtY         jL                  jZ                  xs d       dtY         jL                  j\                  xs d       dtY         jL                  j^                  xs d       d	      t>        j`                   fd       } jL                  jT                  rtJ        jM                  d j                   d j,                  jb                   dtY         jL                  jZ                         dtY         jL                  j\                         dtY         jL                  j^                         dtY         jL                  jT                         d j
                   d       npr% j                  j                   j
                        n j
                  }tJ        jM                  d j                   d j,                  jb                   d| d        je                  |        | j
                        5 }tg         d |      5  d!|i}|||d"<   |	|	|d#<     jh                  d)||d$|| tk        d%  jL                  jl                  jo                         D               jL                  _.        |jq                          jL                  _9         jL                  jZ                  < jL                  j\                   jL                  jZ                  z    jL                  _*         ju                          ddd       ddd        jQ                  |       tJ        jM                  d& j                   d' j
                   d(       ddd       y# 1 sw Y   [xY w# 1 sw Y   _xY w# 1 sw Y   yxY w)*a[  Downloads and prepares dataset for reading.

        Args:
            output_dir (`str`, *optional*):
                Output directory for the dataset.
                Default to this builder's `cache_dir`, which is inside `~/.cache/huggingface/datasets` by default.

                <Added version="2.5.0"/>
            download_config (`DownloadConfig`, *optional*):
                Specific download configuration parameters.
            download_mode ([`DownloadMode`] or `str`, *optional*):
                Select the download/generate mode, default to `REUSE_DATASET_IF_EXISTS`.
            verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
                Verification mode determining the checks to run on the downloaded/processed dataset information (checksums/size/splits/...).

                <Added version="2.9.1"/>
            dl_manager (`DownloadManager`, *optional*):
                Specific `DownloadManger` to use.
            base_path (`str`, *optional*):
                Base path for relative paths that are used to download files. This can be a remote url.
                If not specified, the value of the `base_path` attribute (`self.base_path`) will be used instead.
            file_format (`str`, *optional*):
                Format of the data files in which the dataset will be written.
                Supported formats: "arrow", "parquet". Default to "arrow" format.
                If the format is "parquet", then image and audio data are embedded into the Parquet files instead of pointing to local files.

                <Added version="2.5.0"/>
            max_shard_size (`Union[str, int]`, *optional*):
                Maximum number of bytes written per shard, default is "500MB".
                The size is based on uncompressed data size, so in practice your shard files may be smaller than
                `max_shard_size` thanks to Parquet compression for example.

                <Added version="2.5.0"/>
            num_proc (`int`, *optional*, defaults to `None`):
                Number of processes when downloading and generating the dataset locally.
                Multiprocessing is disabled by default.

                <Added version="2.7.0"/>
            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the caching file-system backend, if any.

                <Added version="2.5.0"/>
            **download_and_prepare_kwargs (additional keyword arguments): Keyword arguments.

        Example:

        Download and prepare the dataset as Arrow files that can be loaded as a Dataset using `builder.as_dataset()`:

        ```py
        >>> from datasets import load_dataset_builder
        >>> builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
        >>> builder.download_and_prepare()
        ```

        Download and prepare the dataset as sharded Parquet files locally:

        ```py
        >>> from datasets import load_dataset_builder
        >>> builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
        >>> builder.download_and_prepare("./output_dir", file_format="parquet")
        ```

        Download and prepare the dataset as sharded Parquet files in a cloud storage:

        ```py
        >>> from datasets import load_dataset_builder
        >>> storage_options = {"key": aws_access_key_id, "secret": aws_secret_access_key}
        >>> builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
        >>> builder.download_and_prepare("s3://my-bucket/my_rotten_tomatoes", storage_options=storage_options, file_format="parquet")
        ```
        N)arrowparquetzUnsupported file_format: z. Expected 'arrow' or 'parquet' z7Unable to download and prepare the dataset at the root z'. Please specify a subdirectory, e.g. ''F)r   force_downloadforce_extractuse_etagr;  r   r   )r   r   rZ   r   record_checksumsT)parentsr   z_builder.lockzFound cached dataset z ()zGenerating dataset r   )	directoryzNot enough disk space. Needed: z (download: z, generated: z, post-processed: c              3   
  K   s"j                   j                  | d       |  y| dz   }t        j                  |d       	 | t        j                  j	                  |       rt        j                  |        t        j                  ||        t        j                  j                  |      rt        j                  |       yy# t        j                  j                  |      rt        j                  |       w w xY ww)z4Create temporary dir for dirname and rename on exit.Tr   z.incompleteN)	r   r   r   r   r   shutilrmtreemover   )r.  tmp_diris_localr`   s     rS   incomplete_dirz;DatasetBuilder.download_and_prepare.<locals>.incomplete_dirQ  s       HH%%g%=!M%5GKK$73%77==1"MM'2GW577>>'2"MM'2 3277>>'2"MM'2 3s   ADAC	 6D	7D  Dz"Downloading and preparing dataset r   z	, total: z) to z... to r   r9  r:  r;  )r   r8  c              3   4   K   | ]  }|j                     y wre   )	num_bytes)rg   r   s     rS   rj   z6DatasetBuilder.download_and_prepare.<locals>.<genexpr>  s     0hU0hs   Dataset z downloaded and prepared to z(. Subsequent calls will reuse this data.rQ   );r   r   r   r'   unstrip_protocolr   r   REUSE_DATASET_IF_EXISTSr<   BASIC_CHECKSr   r_   r   _strip_protocolRuntimeErrorr   r   r   FORCE_REDOWNLOADr   r   r   r   rZ   r   
ALL_CHECKSr   r   parentmkdirr:   
contextlibnullcontextr   r   r   r   r   r   
_load_info"download_post_processing_resourcesrB   size_in_bytesOSErrorrF   download_sizedataset_sizepost_processing_sizecontextmanagerrW   _check_manual_downloadrG   _download_and_preparesumsplitsr   get_recorded_sizes_checksumsdownload_checksums
_save_info)r`   r6  r   r7  r8  r   r   r9  r:  r;  r   download_and_prepare_kwargsfsr   data_existsrN  _desttmp_output_dirprepare_split_kwargsrM  s   `                  @rS   download_and_preparez#DatasetBuilder.download_and_prepare  s   j $.#9Zt
":I/2GRIJ-A$((-K:QUQYQYQjQjkuQv$]%Zl6Z6Z[,->-_BRB_B_`!*!6IDNN	"{:N'N8Edeff'88##D$4$45; I$JZJZI[ \88<8H8H4K\K\8\7]]^` 
 &"0"88#0L4Q4Q#Q"/<3P3P"P"%**$($8$8# )!.. /--#"&"4"4"h8IM]MhMh8hJ ,DHH55$ !!"))//t/L((?:I %-Xi *2H2H2J U	((//)..9I9I6KgKg*hiK}0T0TT3D4E4E3FbIYIYHZZ[\] !OO-	77
CU	 U	 KK-d.?.?-@4CSCSBTTUVW0II++0qDAQAQ<R<Y<Y "9(499CZCZC_^_:`9aamnvw{  xA  xA  xO  xO  xT  ST  oU  nV  Vc  dl  mq  mv  mv  mC  mC  mH  GH  dI  cJ  J\  ]e  fj  fo  fo  fD  fD  fI  HI  ]J  \K  KL  M  &&3 '3* yy&&89J9J8K1T[[M]M]L^ _""*499+B+B"C!DMRZ[_[d[d[q[qRrQs t''/		0N0N'O&P Q&tyy'>'>?@dFVFVEWWZ\ GO001A1ABTXTdTd@ARAR@SSTUYU`U`UeUeTffjkpjqqtuv''
3   0 01 &^ *$~N &,9;+G(%1AO,-=>+;C,Z8.D.. #-*; / 6	 .10hdiiN^N^NeNeNg0h-hDII*3=3Z3Z3\DII0yy..:26))2H2H499KbKb2b		/OO%%&&0 33J?KK4,,--I$JZJZI[ \9 :eU	 U	v& && &qU	 U	sF   B4\J\\ ,C.[4\ "A	\4[=9\  \		\\c                    | j                   x|j                  kt        t        j                  d| j
                   d| j                  j                   d| j                    d| j                  xs | j
                   d	            y y )Nz                     The dataset z with config zp requires manual data.
                    Please follow the manual download instructions:
                     za
                    Manual data can be loaded with:
                     datasets.load_dataset("z$", data_dir="<path/to/manual/data>"))	r   
manual_dirr%   textwrapdedentr   r   rW   r   r`   r   s     rS   rf  z%DatasetBuilder._check_manual_download  s    ,,8Z=R=R=Z%!!%!2!2 3=AQAQ@R S778 9--1\\-NT=N=N,OOsw	 	 >[8rR   c                 F   t        | j                        }| j                  |      } | j                  |fi |}|t        j
                  k(  r;|j                  r/t        | j                  j                  |j                         d       |D ]  }t        |j                  j                        j                         dk(  rt        d      t         j                  d|j                  j                   d       |j#                  |j                         	  | j$                  |fi | |j1                           |t        j2                  k(  s|t        j
                  k(  r t5        | j                  j6                  |       || j                  _        |j8                  | j                  _        y
# t&        $ r1}t'        d| j(                  xs dz   d	z   t        |      z         d
d
}~wt*        $ r5}t+        |j,                  |j.                  d| j                         d
d
}~ww xY w)a  Downloads and prepares dataset for reading.

        This is the internal implementation to overwrite called when user calls
        `download_and_prepare`. It should download all required data and generate
        the pre-processed datasets files.

        Args:
            dl_manager ([`DownloadManager`]):
                `DownloadManager` used to download and cache data.
            verification_mode ([`VerificationMode`]):
                if `ALL_CHECKS`, perform all the verifications including checksums.
                if `BASIC_CHECKS`, do not perform checksums, only perform split tests.
                if `NO_CHECKS`, do not perform any verification.
            prepare_split_kwargs: Additional options, such as `file_format`, `max_shard_size`
        )r   zdataset source filesrn   z{`all` is a special split keyword corresponding to the union of all splits, so cannot be used as key in ._split_generator().Generating  splitzCannot find data file. r?  z
Original error:
Nz;To avoid duplicate keys, please fix the dataset splits for )fix_msg)r3   r   _make_split_generators_kwargs_split_generatorsr<   rY  rD  r>   r   rk  rj  ru   
split_inforW   lowerr_   r   add_prepare_splitra  r   r/   r  duplicate_key_indicesmanage_extracted_filesrU  r?   ri  downloaded_sizerb  )	r`   r   r8  rr  
split_dictsplit_generators_kwargssplit_generatorssplit_generatores	            rS   rg  z$DatasetBuilder._download_and_prepare  s   " D,=,=>
"&"D"DEY"Z1411*X@WX  0 ; ;;
@[@[		,,j.U.U.WYo
  0 	0O?--22399;uD +  KK+o&@&@&E&E%FfMNNN?556###OL7KL --/9	0<  0 = ==ARVfVqVqAq$))**J7 &		","<"<		+  -88>B@+, !f
  ' )EE++YZ^ZcZcYde 	s$   F**	H 3,GH +0HH c                 P   | j                   j                  xs g D ]  }| j                  |      j                         D ]  \  }}t	        | j
                        rt        d| j
                         t        j                  |v rt        d|       t        j                  j                  | j                  |      }t        j                  j                  |      r| j                  |||      }|st        j                  d| d|        t!        j"                  ||         y )Nz/Post processing is not supported on filesystem +Resources shouldn't be in a sub-directory: z$Downloaded post-processing resource z as )r   ri  _post_processing_resourcesr   r'   r   r,  r   sepr_   r   r   r   r   #_download_post_processing_resourcesr   rI  rK  )r`   r   r   resource_nameresource_file_nameresource_pathdownloaded_resource_paths          rS   r_  z1DatasetBuilder.download_post_processing_resources  s   YY%%+ 	ME595T5TUZ5[5a5a5c M11/9-0_`d`h`h_i.jkk66//$'RSeRf%ghh "T-=-=?Q Rww~~m4/3/W/W}j0, 0&J=/Y]^p]q$rs$<mLM	MrR   c                 l    t        j                  | j                  | j                  j                        S )Nr   )r*   r   r   r   r   r   s    rS   r^  zDatasetBuilder._load_info  s%    ))$*:*:DHHLdLdeerR   c                 *   t        | j                        st        | j                  dz         nt	        j
                         }|5  | j                  j                  | j                  | j                  j                         d d d        y # 1 sw Y   y xY w)Nz
_info.lockr  )	r'   r   r:   r   r\  r]  r   write_to_directoryr   )r`   	file_locks     rS   rl  zDatasetBuilder._save_info  s{     (1 T%%45'') 	
  	eII(()9)9488KcKc(d	e 	e 	es   <B		Bc                     ~i S )zFGet kwargs for `self._split_generators()` from `prepare_split_kwargs`.rQ   )r`   rr  s     rS   r}  z,DatasetBuilder._make_split_generators_kwargs  s
     	rR   Fr   c                    | j                   | j                   dk7  rt        d      t        | j                        r,t	        dt        | j                        j                   d      t        j                  j                  | j                        s&t        d| j                   d| j                   d      t        j                  d|xs% d	j                  | j                   j"                         d
| j                          |$| j                   j"                  D ci c]  }|| }}t%        |xs t$        j&                        }t)        t+        | j,                  |||      |dd      }t/        |t0              rt3        |      }|S c c}w )a  Return a Dataset for the specified split.

        Args:
            split (`datasets.Split`):
                Which subset of the data to return.
            run_post_process (`bool`, defaults to `True`):
                Whether to run post-processing dataset transforms and/or add
                indexes.
            verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
                Verification mode determining the checks to run on the
                downloaded/processed dataset information (checksums/size/splits/...).

                <Added version="2.9.1"/>
            in_memory (`bool`, defaults to `False`):
                Whether to copy the data in-memory.

        Returns:
            datasets.Dataset

        Example:

        ```py
        >>> from datasets import load_dataset_builder
        >>> builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')
        >>> builder.download_and_prepare()
        >>> ds = builder.as_dataset(split='train')
        >>> ds
        Dataset({
            features: ['text', 'label'],
            num_rows: 8530
        })
        ```
        r=  zELoading a dataset not written in the "arrow" format is not supported.zLoading a dataset cached in a z is not supported.rR  z: could not find data in z. Please make sure to call builder.download_and_prepare(), or use datasets.load_dataset() before trying to access the Dataset object.zConstructing Dataset for split z, z, from )run_post_processr8  	in_memoryT)	map_tupledisable_tqdm)r   r$   r'   r   r,  typerN   r   r   r   r   FileNotFoundErrorr   r   r   r   r   ri  r<   rU  rD   r   _build_single_datasetr^   r   r   )r`   r   r  r8  r  sdatasetss          rS   
as_datasetzDatasetBuilder.as_dataset  sv   P (T->->'-I!"ijj)%(FtDHH~G^G^F__q&rssww~~d../#4,,--FtGWGWFX YV V  	6u7[		$))JZJZ@[6\\cdhdtdtcuvw =#'99#3#34aQT4E4,->-_BRB_B_` **!1"3#	 

 h%"8,H% 5s   
F	r  r  c           	      $	   t        |t              sNt        |      }|dk(  r3dj                  | j                  j
                  j                               }t        |      }| j                  ||      }|r| j                  |      j                         D ]"  }t        j                  |v st        d|        | j                  |      j                         D ci c]0  \  }}|t        j                  j                  | j                   |      2 }}}| j#                  ||      }	|	b|	}i }
d}|j                         D ]  \  }}t%        |      }||
|<    |t&        j(                  k(  rw|ru| j                  j*                   | j                  j*                  j,                  d}n/| j                  j*                  j,                  j/                  |      }t1        ||
d       | j                  j*                  t3               | j                  _        | j                  j*                  j,                  i | j                  j*                  _        |
| j                  j*                  j,                  t        |      <   t5        d | j                  j*                  j,                  j                         D              | j                  _        | j                  j8                  i| j                  j:                  S| j                  j8                  | j                  j:                  z   | j                  j6                  z   | j                  _        | j?                          | j                  j*                  |j@                  _        | j                  j6                  |j@                  _        | j                  j<                  |j@                  _        | j                  j*                  jB                  | j                  j*                  jB                  jD                  |jB                  jD                  k7  r9t        d	| j                  j*                  jB                   d
|jB                         | j                  j*                  jB                  |j                  _!        |S c c}}w )zas_dataset for a single split.rn   +)r   r  r  NFzpost processing resourcesc              3   P   K   | ]  }|j                         D ]	  }|d        yw)rQ  N)r   )rg   split_checksums_dictschecksums_dicts      rS   rj   z7DatasetBuilder._build_single_dataset.<locals>.<genexpr>|  s;      5-*?*F*F*H5 ' #;/5/5s   $&z:Post-processed features info don't match the dataset:
Got
z
but expected something like
)#r^   r   ru   r   r   ri  rm   r2   _as_datasetr  r   r   r  r_   r   r   r   _post_processr=   r<   rY  post_processedresources_checksumsr   r>   r+   rh  rd  rc  rb  r`  rl  r   r   r  )r`   r   r  r8  r  dsr  r  resources_pathsr  recorded_checksumsrD  r  size_checksumexpected_checksumss                  rS   r  z$DatasetBuilder._build_single_datasetN  s    %1JE~!1!1!6!6!89%LE   
 &*&E&Ee&L&S&S&U i"66//$'RSeRf%ghhi
 :>9X9XY^9_9e9e9g5M#5 rww||D,<,<>PQQO  "//ODN)#%'"#( 4C4I4I4K F0M=$:=$IM8E&}5F %(8(C(CCHXyy//7499;S;S;g;g;o-1*-1YY-E-E-Y-Y-]-]^c-d*$%79KMhi99++3/@/BDII,99++??GCEDII,,@K]		((<<SZH14 5151I1I1]1]1d1d1f5 2		.
 99))5$)):Q:Q:]		..1H1HH499KiKii II+ !*.))*B*B'04		0N0N-)-)@)@&99++44@yy//88==AQAQQ(Z[_[d[d[s[s[|[|Z}  ~]  ^`  ^i  ^i  ]j  k  ,099+C+C+L+L(	Ys   5Rc                 n   | j                   j                  | j                        }| j                  }| j	                         r| j
                  }t        || j                        j                  ||| j                  j                  j                         |      }| j                  |      }t        dd|i|S )a  Constructs a `Dataset`.

        This is the internal implementation to overwrite called when user calls
        `as_dataset`. It should read the pre-processed datasets files and generate
        the `Dataset` object.

        Args:
            split (`datasets.Split`):
                which subset of the data to read.
            in_memory (`bool`, defaults to `False`):
                Whether to copy the data in-memory.

        Returns:
            `Dataset`
        )rW   instructionssplit_infosr  fingerprintrQ   )r   rV  r   r   r   rW   r   r   readri  r   _get_dataset_fingerprintr   )r`   r   r  r   r   dataset_kwargsr  s          rS   r  zDatasetBuilder._as_dataset  s      HH,,T-=-=>	((##%99L$Y		:??		((//1	 @ 
 33E:A;A.AArR   c                     t               }|j                  t        | j                               j	                                |j                  t        |             |j                         }|S )zThe dataset fingerprint is the hash of the relative directory dataset_name/config_name/version/hash, as well as the split specs.)r)   r   r   r  r   ru   r   )r`   r   hasherr  s       rS   r  z'DatasetBuilder._get_dataset_fingerprint  sP    d42245>>@Ac%j!&&(rR   c                 r   t        | j                        r,t        dt        | j                        j                   d      t        |xs | j                  t        | j                  | j                        | j                  | j                  j                        }| j                  |       | j                  |      D ci c]  }|j                  | }}||}n$||v r||   }nt!        d| dt#        |             t%        | j&                  |d      }t)        |t*              rt-        |      }|S c c}w )	Nz(Loading a streaming dataset cached in a z is not supported yet.r   )r   r   r   rZ   zBad split: z. Available splits: T)r  )r'   r   r,  r  rN   r    r   r   r   r   r   r   rZ   rf  r~  rW   r_   r
  rD   _as_streaming_dataset_singler^   r   r   )r`   r   r   r   sgsplits_generatorssplits_generatorr  s           rS   as_streaming_datasetz#DatasetBuilder.as_streaming_dataset  s-   
  )%:4>;R;R:SSij  .14>>*TMaMab**[[))	

 	##J/373I3I*3UVRRWWb[VV=0''07{5'1EdK\F]E^_`` --

 h%*84H# Ws    D4c                     | j                  |      }| j                  r| j                  | j                  ini }t        || j                  |j
                  |      S )N)r   r   token_per_repo_id) _get_examples_iterable_for_splitr   r   r.   r   rW   )r`   r  ex_iterabler  s       rS   r  z+DatasetBuilder._as_streaming_dataset_single  sR     ;;<LM:>,,T\\4::6Bdii/?/D/DXi
 	
rR   datasetr  c                      y)z%Run dataset transforms or add indexesNrQ   )r`   r  r  s      rS   r  zDatasetBuilder._post_process  r   rR   c                     i S )z+Mapping resource_name -> resource_file_namerQ   )r`   r   s     rS   r  z)DatasetBuilder._post_processing_resources  s    	rR   r  c                      y)zPDownload the resource using the download manager and return the downloaded path.NrQ   )r`   r   r  r   s       rS   r  z2DatasetBuilder._download_post_processing_resources  s     rR   c                     t               )a  Specify feature dictionary generators and dataset splits.

        This function returns a list of `SplitGenerator`s defining how to generate
        data and what splits to use.

        Example:

            return [
                    datasets.SplitGenerator(
                            name=datasets.Split.TRAIN,
                            gen_kwargs={'file': 'train_data.zip'},
                    ),
                    datasets.SplitGenerator(
                            name=datasets.Split.TEST,
                            gen_kwargs={'file': 'test_data.zip'},
                    ),
            ]

        The above code will first call `_generate_examples(file='train_data.zip')`
        to write the train data, then `_generate_examples(file='test_data.zip')` to
        write the test data.

        Datasets are typically split into different subsets to be used at various
        stages of training and evaluation.

        Note that for datasets without a `VALIDATION` split, you can use a
        fraction of the `TRAIN` data for evaluation as you iterate on your model
        so as not to overfit to the `TEST` data.

        For downloads and extractions, use the given `download_manager`.
        Note that the `DownloadManager` caches downloads, so it is fine to have each
        generator attempt to download the source data.

        A good practice is to download all data in this function, and then
        distribute the relevant parts to each split with the `gen_kwargs` argument

        Args:
            dl_manager (`Union[DownloadManager, StreamingDownloadManager]`):
                Download manager to download the data

        Returns:
            `list<SplitGenerator>`.
        r+  rx  s     rS   r~  z DatasetBuilder._split_generators  s    Z "##rR   r  c                     t               )a  Generate the examples and record them on disk.

        Args:
            split_generator (`SplitGenerator`):
                Split generator to process
            file_format (`str`, *optional*):
                format of the data files in which the dataset will be written.
                Supported formats: "arrow", "parquet". Default to "arrow" format.
            max_shard_size (`Union[str, int]`, *optional*):
                Maximum number of bytes written per shard, default is "500MB".
                The size is based on uncompressed data size, so in practice your shard files may be smaller than
                `max_shard_size` thanks to Parquet compression for example.
            num_proc (`int`, *optional*, defaults to `None`):
                Number of processes when downloading and generating the dataset locally.
                Multiprocessing is disabled by default.

                <Added version="2.7.0"/>
            **kwargs: Additional kwargs forwarded from _download_and_prepare
        r+  )r`   r  r9  r:  r;  kwargss         rS   r  zDatasetBuilder._prepare_split$      8 "##rR   c                     t               )zGenerate the examples on the fly.

        Args:
            split_generator (`SplitGenerator`):
                Split generator to process
        r+  r`   r  s     rS   r  z/DatasetBuilder._get_examples_iterable_for_splitB  s     "##rR   )NNNNNNNNNNNNN)NN)r   rK   )TT)
NNNNNNr=  NNN)NTNF)Fr=  NN)KrN   rO   rP   r   r  rU   r   r  r  r   r	   ru   r*   r&   r
   rv   r
  r   r   rw   r   r   r   propertyr   r   r  tupler   r@   classmethodrE   r	  r   r  r  r   abcabstractmethodr   r1  r5  r   r   r<   r   rs  rf  rg  r_  r^  rl  r}  r2   r   r   r  r   r  TRAINr  r  r.   r  r  r   r  r  r  r    r~  r4   r  r-   r  rQ   rR   rS   r   r      s   FR G ) O  !% $(&*%)"#'&*'+,0!%FJ"&*.+/q3C=q3 smq3 c]	q3
 smq3 C=q3 {#q3 8$q3 dCi()q3 #q3 U3dM#ABCq3 3-q3 "$q3 $C=q3f3 hsm  0Xc] 0.-0? -0xPS} -0` 15X)	}c!	"X)t YS-%7 8      + s  (  D 		"{ 	" 	" H H#3 #S #
 %)48<@DH04#'"48"&*._SM_ ".1_  lC&7 89	_
 $E*:C*?$@A_ _-_ C=_ _ !sCx1_ 3-_ "$_B?=BM fK fe FJDHIc5$s)T%[@ABI $E*:C*?$@A	I 
w#	$I`  CS/501C C ,	C
 CJ BG`e B'=!> BY] Bjq B:eOU4J.K PS   $#'"}" C=" 
tC()?:	;	"H	
 
	
W wsCx?P U]^eUf  S#X ),:I	# 	,$E/C[2[,\ ,$ ,$\ 	 #48"&$'$ $ !sCx1	$
 3-$ $:$ $Sc $rR   r   c                        e Zd ZdZej
                  d        Z	 	 	 ddedede	e
   de	ee
ef      fdZded	ed
ede
dedede
deee
eee
ef   f      fdZ fdZdedefdZ xZS )GeneratorBasedBuilderaw  Base class for datasets with data generation based on dict generators.

    `GeneratorBasedBuilder` is a convenience class that abstracts away much
    of the data writing and reading of `DatasetBuilder`. It expects subclasses to
    implement generators of feature dictionaries across the dataset splits
    (`_split_generators`). See the method docstrings for details.
    c                     t               )ag  Default function generating examples for each `SplitGenerator`.

        This function preprocess the examples from the raw data to the preprocessed
        dataset files.
        This function is called once for each `SplitGenerator` defined in
        `_split_generators`. The examples yielded here will be written on
        disk.

        Args:
            **kwargs (additional keyword arguments):
                Arguments forwarded from the SplitGenerator.gen_kwargs

        Yields:
            key: `str` or `int`, a unique deterministic example identification key.
                * Unique: An error will be raised if two examples are yield with the
                    same key.
                * Deterministic: When generating the dataset twice, the same example
                    should have the same key.
                Good keys can be the image id, or line number if examples are extracted
                from a text file.
                The key will be hashed and sorted to shuffle examples deterministically,
                such as generating the dataset multiple times keep examples in the
                same order.
            example: `dict<str feature_name, feature_value>`, a feature dictionary
                ready to be encoded and written to disk. The example will be
                encoded with `self.info.features.encode_example({...})`.
        r+  r`   r  s     rS   _generate_examplesz(GeneratorBasedBuilder._generate_examplesU  s    : "##rR   r  check_duplicate_keysr;  r:  c                 f   !"# t        |xs t        j                        } j                  j                  $ j                  j                  |j
                     }n|j                  }d} j                   d|j
                   | d| }t        j                   j                  |      !|r{|dkD  rvt        |j                        }	|	dk  r)t        j                  d| d|j
                   d       d}n3|	|k  r.t        j                  d| d|	 d	|j
                   d
|	 d	       |	}t        d|j                   d|j
                   d      }
!||||d}||dk(  rid }|j                  }d}|
5    j"                  d$||d|D ]  \  }}}|r|}|
j%                  |        	 d d d        |J d       d |D        \  }}}"}nt'        t)        |j                  |            D cg c]  \  }}||d| }}}t+        |      }d g|z  }d g|z  }d g|z  }d g|z  "d g|z  }t-        |      5 }|
5  t/        | j"                  |      D ]1  \  }}}|r|\  ||<   ||<   ||<   "|<   ||<   !|
j%                  |       3 	 d d d        d d d        d |vsJ d| d       t1        "      #t1        |      }t1        |      }|d   }||j                  _        ||j                  _        t        j5                  d# d       #dkD  rdt6        t8           f! "#fd}t'        "      D cg c]  \  }}t;        |      D ]  }||f  }}}}t=        ||dd       |D  cg c]  }|D ]  } |   c} }|j                  _        nKd\  }} jA                  !jC                  d |d!      jC                  d"|d!      !jC                  |d#              j                  jD                  | j                  _"        y y # 1 sw Y   WxY wc c}}w # 1 sw Y   xY w# 1 sw Y   xY wc c}}}w c c} }w )%N-JJJJJ-SSSSS-of-NNNNNr   r   r   Setting num_proc from  back to 1 for the @ split to disable multiprocessing as it only contains one shard.rO  	 for the  split as it only contains  shards.	 examplesrz  r{  unittotaldesc)fpathr9  r:  r  r  r   
gen_kwargsjob_id-Failed to retrieve results from prepare_splitc              3   "   K   | ]  }|g 	 y wre   rQ   rg   items     rS   rj   z7GeneratorBasedBuilder._prepare_split.<locals>.<genexpr>        hh   max_num_jobskwargs_iterable;Failed to retrieve results from prepare_split: result list G still contains None - at least one worker failed to return its results	Renaming shard_and_jobc                     | \  }}t        d |       |z   }j                  j                  d|d      j                  d|d      j                  d|d      j                  dd             y NSSSSS05dJJJJJzJJJJJ-SSSSSNNNNNrh  r5  r   )r  shard_idr  global_shard_idr  r`   shards_per_jobtotal_shardss       rS   _rename_shardz;GeneratorBasedBuilder._prepare_split.<locals>._rename_shard  s}    #0 &"%nWf&=">"IMM'hs^=EEgRXY\Q]_MM-OC3HJRRSZ_klo^prrR   T@   disablemax_workersr   r   r  r  r   r?  rQ   )#rA   r   MAX_SHARD_SIZEr   ri  rW   r  r   r   r   r   rH   r  r   r   hf_tqdmnum_examples_prepare_split_singler   	enumeraterI   r   r   rC   rh  rQ  r   r  rw   ranger   shard_lengthsr5  r   r   )$r`   r  r  r9  r;  r:  r  SUFFIXfnamenum_input_shardspbar_prepare_split_argsresultr  r  donecontentexamples_per_jobbytes_per_jobfeatures_per_jobshard_lengths_per_jobkwargs_per_jobnum_jobspooltotal_num_examplestotal_num_bytesr   r  
num_shardsr  shards_and_jobsr  shard_lengthr  r  r  s$   `                                @@@rS   r  z$GeneratorBasedBuilder._prepare_splitt  s"    2.2YFDYDYZ99'))/*>*>?J(33J($$%Q';';&<VHAk]St//71>?Y?YZ1$,XJ6I*//IZ  [[  \ !H,,XJd;K:LIV`VeVeUf  gB  CS  BT  T\  ] ,))z/v6
 &,$$8
 x1}F(33JF --GT-G-G .)&.<O. -)FD' !(G,-- %V'VV%h#)hdm-=~Od +4%o&@&@xX+&FJ  *VS?RSN  >*H $v0!FX-M $v0"Vh.N%)FX$5!h 14 11Cd88.2 1-g   !( 0 8 -f 5 0 8 .v 6 5f = !KK0111$ // MN^M_  `g  h/ >* !12m,#A&2D""//>"", 	yh78!U3Z   +4N*C &FJ %j 1  6""O 
 }otQST 3H8!.\i8LX88O&&4
  $HfLLg(39AA'fUX\[fb)
 99%!)DII &s- -1 11 1V8sC   .5O9(P0P3APP+P&!P-9PP	PP#r  r  r9  r  r  rr   c           
   #   (  K    | j                   di |}|dk(  rt        nt        }	|dk(  }
g }d\  }}d}d}	  |	| j                  j                  |j                  d|d      j                  d|d      | j                  |j                  || j                  j                  |
      }	 t        j                         }|D ]Z  \  }}||j                  |kD  r|j                         \  }}|j                          |j                  |       ||z  }||z  }|dz  } |	|j                  |j                  d|d      j                  d|d      | j                  |j                  || j                  j                  |
      }| j                  j                  %| j                  j                  j!                  |      n|}|j#                  ||       |dz  }t        j                         |t$        j&                  z   kD  s>t        j                         }|d	|f d}] 	 |d	|f |dz   }|j                         \  }}|j                          |j                  |       ||z  }||z  }	 |d|||j                  ||ff y # |d	|f |dz   }|j                         \  }}|j                          |j                  |       ||z  }||z  }w xY w# t(        $ r9}t+        |t,              r|j.                  |j.                  }t1        d
      |d }~ww xY ww)Nr>  r  r   r  r  r   )r   r   r   	hash_saltcheck_duplicatesr   embed_local_filesr   F.An error occurred while generating the datasetTrQ   )r  r   r   r   r   r   r   rW   r   r   time
_num_bytesfinalizecloser!  	_featuresencode_examplewriter   PBAR_REFRESH_TIME_INTERVAL	Exceptionr^   r   __context__r#   )r`   r  r  r9  r:  r  r  r  	generatorwriter_classr+  r  r#  r$  r  num_examples_progress_updatewriter_timer  recordr  rQ  exampler%  r  s                            rS   r  z+GeneratorBasedBuilder._prepare_split_single  s     ,D++9j9	(3y(@}k'94.2+O'($0	b!++]]7xn>FFwSYZ]R^`"&"9"9$//!5 $ 8 8"3F!-		#, 9KC%1f6G6G.6X28//2C/i%,,\:*l:*'94 A!-%+%5%5!&w8C.!J!R!RSZ_efi^j!l.2.E.E&0oo-A,0HH,D,D.?" LP99K]K]Kidii00??GouGLL#.0A50yy{UV-N-N%NN $		$e-III784/92 e%AAA%\
*0//*;'i$$\2"l2"9, d/&BRBRT^`mnnn e%AAA%\
*0//*;'i$$\2"l2"9, 	b!12q}}7PMM()YZ`aa		bsJ   4LA/K 'EI> ; I> A
K &L>AK

K 	L4L

LLc                 x    t        |   ||fd|t        j                  k(  xs |t        j                  k(  i| y )Nr  )superrg  r<   rU  rY  )r`   r   r8  prepare_splits_kwargs	__class__s       rS   rg  z+GeneratorBasedBuilder._download_and_prepareH  sK    %	
 "36F6S6S!S "@ $4$?$??		

 $	
rR   c                 B    t        | j                  |j                        S re   )r-   r  r  r  s     rS   r  z6GeneratorBasedBuilder._get_examples_iterable_for_splitQ  s     7 79S9STTrR   r  )rN   rO   rP   r   r  r  r  r4   rv   r	   rw   r
   ru   r  r   r5   r   r  r  rg  r-   r  __classcell__)rA  s   @rS   r  r  L  s    	$ $D "&48L*'L* #L*
 3-L* !sCx1L*\DoDo Do 	Do
 Do Do #Do Do 
%T5e#445	6DoL
U USc UrR   r  c                       e Zd ZdZej
                  d        Z	 	 	 ddedede	e
   de	eee
f      fdZd	ed
edede
de
deee
eee
ef   f      fdZdedefdZy)ArrowBasedBuilderzaBase class for datasets with data generation based on Arrow loading functions (CSV/JSON/Parquet).c                     t               )a  Default function generating examples for each `SplitGenerator`.

        This function preprocess the examples from the raw data to the preprocessed
        dataset files.
        This function is called once for each `SplitGenerator` defined in
        `_split_generators`. The examples yielded here will be written on
        disk.

        Args:
            **kwargs (additional keyword arguments):
                Arguments forwarded from the SplitGenerator.gen_kwargs

        Yields:
            key: `str` or `int`, a unique deterministic example identification key.
                * Unique: An error will be raised if two examples are yield with the
                    same key.
                * Deterministic: When generating the dataset twice, the same example
                    should have the same key.
                Good keys can be the image id, or line number if examples are extracted
                from a text file.
                The key will be hashed and sorted to shuffle examples deterministically,
                such as generating the dataset multiple times keep examples in the
                same order.
            example: `pyarrow.Table`, a feature table
                ready to be encoded and written to disk.
        r+  r  s     rS   _generate_tablesz"ArrowBasedBuilder._generate_tablesX  r  rR   Nr  r9  r;  r:  c                 V    !" t        |xs t        j                        }	  j                  j                  |j
                     }d} j                   d|j
                   | d| }t        j                   j                  |       |r{|dkD  rvt        |j                        }|dk  r)t        j                  d| d|j
                   d       d}n3||k  r.t        j                  d| d| d	|j
                   d
| d	       |}t!        d|j"                  d|j
                   d      }	 ||d}
||dk(  rid }|j                  }d}|	5    j$                  d$||d|
D ]  \  }}}|r|}|	j'                  |        	 d d d        |J d       d |D        \  }}}!}nt)        t+        |j                  |            D cg c]  \  }}||d|
 }}}t-        |      }d g|z  }d g|z  }d g|z  }d g|z  !d g|z  }t/        |      5 }|	5  t1        | j$                  |      D ]1  \  }}}|r|\  ||<   ||<   ||<   !|<   ||<   !|	j'                  |       3 	 d d d        d d d        d |vsJ d| d       t3        !      "t3        |      }t3        |      }|d   }||j                  _        ||j                  _        t        j7                  d" d       "dkD  rdt8        t:           f  !"fd}t)        !      D cg c]  \  }}t=        |      D ]  }||f  }}}}t?        ||dd       |D cg c]  }|D ]  }|  c}}|j                  _         nKd\  }} jC                   jE                  d |d!      jE                  d"|d!       jE                  |d#              j                  jF                  | j                  _#        y y # t        $ r |j                  }Y w xY w# 1 sw Y   sxY wc c}}w # 1 sw Y   xY w# 1 sw Y   xY wc c}}}w c c}}w )%Nr  r   r   r   r  r  r  rO  r  r  r  r  rz  r{  r  )r  r9  r:  r   r  r  c              3   "   K   | ]  }|g 	 y wre   rQ   r  s     rS   rj   z3ArrowBasedBuilder._prepare_split.<locals>.<genexpr>  r  r  r  r  r  r  r  shard_id_and_jobc                     | \  }}t        d |       |z   }j                  j                  d|d      j                  d|d      j                  d|d      j                  dd             y r  r  )rJ  r  r  r  r  r`   r  r  s       rS   r  z7ArrowBasedBuilder._prepare_split.<locals>._rename_shard  s}    #3 &"%nWf&=">"IMM'hs^=EEgRXY\Q]_MM-OC3HJRRSZ_klo^prrR   Tr  r	  r  r  r  r   r?  rQ   )$rA   r   r  r   ri  rW   r5  r  r   r   r   r   rH   r  r   r   r  r  r  r   r  rI   r   r   rC   rh  rQ  r   r  rw   r  r   r  r5  r   r   )#r`   r  r9  r;  r:  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r!  r"  r#  r$  r   r  r%  r  shard_ids_and_jobsr  r'  r  r  r  s#   `                               @@@rS   r  z ArrowBasedBuilder._prepare_splitv  s'    2.2YFDYDYZ	4))/*>*>?J )$$%Q';';&<VHAk]St//71>?Y?YZ1$,XJ6I*//IZ  [[  \ !H,,XJd;K:LIV`VeVeUf  gB  CS  BT  T\  ] ,))z/v6
 &,
 x1}F(33JF --GT-G-G .)&.<O. -)FD' !(G,-- %V'VV%h#)hdm-=~Od +4%o&@&@xX+&FJ  *VS?RSN  >*H $v0!FX-M $v0"Vh.N%)FX$5!h 14 11Cd88.2 1-g   !( 0 8 -f 5 0 8 .v 6 5f = !KK0111$ // MN^M_  `g  h/ >* !12m,#A&2D""//>"", 	yh78!c
   +4N*C" "&FJ %j 1"  6"""" "
 }&8$TVW 3H8!.\i8LX88O&&4
  $HfLLg(39AA'fUX\[fb)
 99%!)DII &{  	4(33J	4H- -1 11 1V"8sT   #O 
5O1O>PAPPP=P%O.-O.1O;P		PPr  r  r  rr   c           	   #     K   |j                         D ci c]#  \  }}|t        |t              rt        |      n|% }}} | j                  di |}|dk(  rt
        nt        }	|dk(  }
g }d\  }}d}d}	  |	| j                  j                  |j                  d|d      j                  d|d      | j                  | j                  j                  |
      }	 t        j                         }|D ]  \  }}||j                  |kD  r|j                         \  }}|j!                          |j#                  |       ||z  }||z  }|dz  } |	|j$                  |j                  d|d      j                  d|d      | j                  | j                  j                  |
      }	 |j'                  |       |t3        |      z  }t        j                         |t4        j6                  z   kD  st        j                         }|d
|f d} 	 |d
|f |dz   }|j                         \  }}|j!                          |j#                  |       ||z  }||z  }	 |d|||j$                  ||ff y c c}}w # t(        $ r<}t+        j,                  || j                  j.                  || j0                  	      d }~ww xY w# |d
|f |dz   }|j                         \  }}|j!                          |j#                  |       ||z  }||z  }w xY w# t8        $ rJ}t        |t:              r|j<                  |j<                  }t        |t>              r t?        d      |d }~ww xY ww)Nr>  r  r   r  r  r   )r   r   r   r   r+  r   )
cast_errorr   r  r   Fr,  TrQ   ) r   r^   r
  rJ   rG  r   r   r   r   r   r   r   r   r-  r.  r/  r0  r!  r1  write_tabler7   r"   from_cast_errorr   r   r   r   r4  r5  r   r6  r#   )r`   r  r  r9  r:  r  rh   ry   r7  r8  r+  r  r#  r$  r  r9  r:  r;  r   tabler  rQ  rN  r%  r  s                            rS   r  z'ArrowBasedBuilder._prepare_split_single  so     T^ScScSef41aaJq$,?aQFf
f)D))7J7	(3y(@}k'94.2+O'($5	b!++]]7xn>FFwSYZ]R^`"&"9"9 $ 8 8"3F&-		 ) 9HAu%1f6G6G.6X28//2C/i%,,\:*l:*'94 A!-%+%5%5!&w8C.!J!R!RSZ_efi^j!l.2.E.E,0HH,D,D.?"**51 1CJ>0yy{UV-N-N%NN $		$e-III78499< e%AAA%\
*0//*;'i$$\2"l2"9, d/&BRBRT^`mnnnA gH % 8HH'1)-)?)?'1"&**	  e%AAA%\
*0//*;'i$$\2"l2"9, 	b!12q}}7PMM!34()YZ`aa	bsz   M"(I/6M"5A#L C
J= $I556J= , J= A
L M"5	J:>7J55J::J= =AL		L 	MAMMM"c                 D    t        | j                  |j                        S )N)r  )r,   rG  r  r  s     rS   r  z2ArrowBasedBuilder._get_examples_iterable_for_splitF  s    $T%:%:?C]C]^^rR   r  )rN   rO   rP   r   r  r  rG  r4   ru   r	   rw   r
   r  r   r   r  rv   r  r-   r  rQ   rR   rS   rE  rE  U  s    k$ $@ #"&48I*'I* I* 3-	I*
 !sCx1I*VCoCo'*Co9<CoNQCo[^Co	%T5e#445	6CoJ_ _Sc _rR   rE  )r   r  r\  r   r   r   r   rI  rv  r-  r}   collections.abcr   r   dataclassesr   	functoolsr   pathlibr   typingr   r	   r
   unittest.mockr   r   fsspec.corer   multiprocessr   tqdm.contrib.concurrentr   r?  r   r   arrow_datasetr   arrow_readerr   r   arrow_writerr   r   r   r[   r   r   r   dataset_dictr   r   download.download_configr   download.download_managerr   r   #download.streaming_download_managerr    r!   
exceptionsr"   r#   r$   r%   r   r&   filesystemsr'   r(   r  r)   r   r*   r+   iterable_datasetr,   r-   r.   keyhashr/   namingr0   r1   ri  r2   r3   r4   r5   	streamingr6   rQ  r7   r8   r9   r  utils._filelockr:   utils.file_utilsr;   utils.info_utilsr<   r=   r>   r?   utils.py_utilsr@   rA   rB   rC   rD   rE   rF   rG   utils.shardingrH   rI   utils.trackrJ   loadrK   
get_loggerrN   r   r_   rM   rU   r   r  rE  rQ   rR   rS   <module>rq     s=    ! 
    	      - !   1 1   !  .  " K J O O : 4 D P p p    0 V V ( N ? ? ;   " % + g g	 	 	 O % # 
		H	%	
 	 nR nR nRbw$ w$t#FUN FURr_ r_rR   