
    bi%{                     (   d dl Z d dlZd dlmZ d dlmZ d dlmZmZ d dl	m
Z
mZmZ d dlZd dlmZ d dlmZ d dlmZ d d	lmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZ  d
dl!m"Z"m#Z#m$Z$m%Z%m&Z& d
dl'm(Z( ee)e*e*f   e)e*   e)d   f   Z+ e*ejX                        Z- ej\                  e/      Z0 G d de*      Z1 G d de2      Z3dZ4ejX                  ddgejj                  g dejl                  g diZ7dZ8ejr                   ejt                  d      k  r	dd gZ;g d!Z<n0ejr                   ejt                  d"      k  r	d#d gZ;g d$Z<nd%d&gZ;g d'Z<ejX                  ejj                  ejl                  gZ=e=D  ci c]0  } | e7|    D cg c]  }e;D ]  }|j}                  |e8(        c}}2 c}}} Z?e=D  ci c]0  } | e7|    D cg c]  }e<D ]  }|j}                  |e8(        c}}2 c}}} Z@ejX                  d)giZAe4gZBe@e?eAgZCd*ZDg d+ZEd,e*d-eFfd.ZGd/eeHeIe*f   d-eHe*eeIe*   d0f   f   fd1ZJd2e*d,e*d-eFfd3ZKd2e*d,e*d-eFfd4ZLd5e
e*geIe*   f   d-eHe*eIe*   f   fd6ZM	 	 dHd,e*d7e*d8eeIe*      d9ee   d-eIe*   f
d:ZNdId7e*d9ee   d-eHe*eIe*   f   fd;ZO	 dId<e*d9ee   d-e+fd=ZP	 	 dHd>eIe*   d9ee   d?eeQ   d-eIe+   fd@ZR G dA d0eIe*         ZS G dB dCeHe*eSf         ZT G dD dEeIe*         ZU G dF dGeHe*eUf         ZVyc c}}w c c}}} w c c}}w c c}}} w )J    N)partial)	has_magic)PathPurePath)CallableOptionalUnion)	url_to_fs)HfFileSystem)version)
thread_map   )config)DownloadConfig)	_split_re)Split)logging)tqdm)!_prepare_path_and_storage_optionsis_local_pathis_relative_path	xbasenamexjoin)string_to_dict c                       e Zd Zy)UrlN__name__
__module____qualname__r       N/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/datasets/data_files.pyr   r   !       r"   r   c                       e Zd Zy)EmptyDatasetErrorNr   r   r"   r#   r&   r&   %   r$   r"   r&   zFdata/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*traintraining)
validationvaliddevval)testtestingeval
evaluationz-._ 0-9z2023.9.0z**[{sep}/]{keyword}[{sep}]*z{keyword}[{sep}]*)z{keyword}/**z{keyword}[{sep}]*/**z**[{sep}/]{keyword}/**z**[{sep}/]{keyword}[{sep}]*/**z	2023.12.0z**/*[{sep}/]{keyword}[{sep}]*)z{keyword}/**/*z{keyword}[{sep}]*/**/*z**/*[{sep}/]{keyword}/**/*z"**/*[{sep}/]{keyword}[{sep}]*/**/*z**/{keyword}[{sep}]*z**/*[{sep}]{keyword}[{sep}]*)z**/{keyword}/**z**/{keyword}[{sep}]*/**z**/*[{sep}]{keyword}/**z**/*[{sep}]{keyword}[{sep}]*/**)keywordsepz**z*[])z	README.mdzconfig.jsonzdataset_info.jsonzdataset_infos.jsonzdummy_data.zipzdataset_dict.jsonpatternreturnc                 4     t         fdt        D              S )Nc              3   &   K   | ]  }|v  
 y wNr   ).0wildcard_characterr3   s     r#   	<genexpr>z%contains_wildcards.<locals>.<genexpr>s   s     [1C!W,[s   )anyWILDCARD_CHARACTERS)r3   s   `r#   contains_wildcardsr=   r   s    [GZ[[[r"   patternsDataFilesListc           	         t        | t              r@| j                         D ci c]$  \  }}t        |      t        |t              r|n|g& c}}S t        | t              r	t
        | giS t        | t              rt        d | D              r| D ]W  }t        |t              r8t        |      dk(  r*d|v r&t        |j                  d      t        t        f      rKt        d|        | D cg c]  }|d   	 }}t        t        |            t        |      k7  rt        d|       | D ci c]-  }t        |d         t        |d   t              r|d   n|d   g/ c}S t
        | iS t        t	        |             S c c}}w c c}w c c}w )a/  
    Take the data_files patterns from the user, and format them into a dictionary.
    Each key is the name of the split, and each value is a list of data files patterns (paths or urls).
    The default split is "train".

    Returns:
        patterns: dictionary of split_name -> list of patterns
    c              3   <   K   | ]  }t        |t                y wr7   )
isinstancedict)r8   r3   s     r#   r:   z$sanitize_patterns.<locals>.<genexpr>   s     AWz'4(As      splitpathz]Expected each split to have a 'path' key which can be a string or a list of strings, but got z*Some splits are duplicated in data_files: )rB   rC   itemsstrlistSANITIZED_DEFAULT_SPLITr;   lenget
ValueErrorsetsanitize_patterns)r>   keyvaluer3   splitss        r#   rO   rO   v   s    (D!ZbZhZhZjkJCQVC:eT#:%Gkk	Hc	"'(44	Hd	#AAA# 	w-G)7*"7;;v#6dD$wx  xA  B 	 7??7gg&?F?3v;3v;. #MfX!VWW  ( GG$%*WV_VZ:[wvbijpbqarr 
 ,X66 h003 l @s   )E=,F,2Fmatched_rel_pathc                 <   t        |       j                  j                  D cg c]  }|j                  d      s| }}t        |      j                  j                  D cg c]  }|j                  d      s| }}t	        |      t	        |      k7  S c c}w c c}w )u  
    When a path matches a pattern, we additionally check if it's inside a special directory
    we ignore by default (if it starts with a double underscore).

    Users can still explicitly request a filepath inside such a directory if "__pycache__" is
    mentioned explicitly in the requested pattern.

    Some examples:

    base directory:

        ./
        └── __pycache__
            └── b.txt

    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "**")
    True
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "*/b.txt")
    True
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__pycache__/*")
    False
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__*/*")
    False
    __)r   parentparts
startswithrK   )rS   r3   partdata_dirs_to_ignore_in_pathdata_dirs_to_ignore_in_patterns        r#   "_is_inside_unrequested_special_dirr\      s    8 5==M4N4U4U4[4["uD_c_n_nos_t4"u"u7?7H7O7O7U7U%otY]YhYhimYnd%o"%o*+s3Q/RRR #v%os   BB B7Bc                 T   t        |       j                  D cg c]&  }|j                  d      st        |      dhk(  r%|( }}t        |      j                  D cg c]&  }|j                  d      st        |      dhk(  r%|( }}t	        |      t	        |      k7  S c c}w c c}w )u9  
    When a path matches a pattern, we additionally check if it's a hidden file or if it's inside
    a hidden directory we ignore by default, i.e. if the file name or a parent directory name starts with a dot.

    Users can still explicitly request a filepath that is hidden or is inside a hidden directory
    if the hidden part is mentioned explicitly in the requested pattern.

    Some examples:

    base directory:

        ./
        └── .hidden_file.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", ".*")
    False

    base directory:

        ./
        └── .hidden_dir
            └── a.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".*/*")
    False
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".hidden_dir/*")
    False

    base directory:

        ./
        └── .hidden_dir
            └── .hidden_file.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/*")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/.*")
    False
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/*")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*")
    False
    .)r   rW   rX   rN   rK   )rS   r3   rY   hidden_directories_in_pathhidden_directories_in_patterns        r#   ?_is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dirra      s    l ""2399"T__S=QZ]^bZchkglZl" " "'*00%DOOC4HQTUYQZ_b^cQc%! % )*c2O.PPP"%s!   B B B B%3B%B%pattern_resolverc                    t         D ]
  }|j                  dd      }	  | |      }t        |      dkD  s.t	               }|D ]8  }t        t        |      t        |            }|J |j                  |d          : t        d |D              rt        dt         d| d      t        D cg c]  }||v st        |       c}t        |t        D ch c]  }t        |       c}z
        z   }|D ci c]  }||j                  |	      g c}c S  t        D ]e  }	g }
|	j!                         D ]6  \  }}|D ],  }	  | |      }t        |      dkD  s|
j#                  |        6 8 |
sQ|
D ci c]  }||	|   
 c}c S  t        d
 d|        # t        $ r Y w xY wc c}w c c}w c c}w # t        $ r Y w xY wc c}w )a+  
    Get the default pattern from a directory or repository by testing all the supported patterns.
    The first patterns to return a non-empty list of data files is returned.

    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
    z{split}*r   rE   c              3   R   K   | ]  }t        j                  t        |        ! y wr7   )rematchr   )r8   rE   s     r#   r:   z+_get_data_files_patterns.<locals>.<genexpr>  s     Ferxx	511Fs   %'zSplit name should match 'z'' but got 'z'.)rE   zCouldn't resolve pattern z with resolver )ALL_SPLIT_PATTERNSreplaceFileNotFoundErrorrK   rN   r   r   addr;   rM   r   DEFAULT_SPLITSrH   sortedformatALL_DEFAULT_PATTERNSrG   append)rb   split_patternr3   
data_filesrR   pp_partsrE   sorted_splitspatterns_dictnon_empty_splitsr>   s               r#   _get_data_files_patternsrx      s    , [''	37	)'2J z?Q"uF -(1y7OP***

77+,-
 FvFF #<YK|TZS[[]!^__5CWEuPVSZWZ`.A#e*AA[ M MZZ5EM00u0=>>ZZ%[( . O,224 	OE8# !1'!:J z?Q&$++E2	 =MNEE=//NNO 7yP`Oab
cc; ! 		 XAZ )  OsA   F,	F$6F$F)4F.<F30G	F! F!3	F?	>F?		base_pathallowed_extensionsdownload_configc           	      ,   t        |       rt        ||       } nAt        |       r4t        j                  j                  |       d   t        j                  z   }nd}t        | |      \  } }t        | fi |\  }}t        t              t        |       hz
  }t        |j                  t              r|j                  n|j                  d   }|dk7  r|dz   nd}	i }
|dk(  r+t        j                   t#        j$                  d      k\  rd|
d	<    |j&                  | fd
di|
j)                         D cg c]  \  }}|d   dk(  sM|j+                  d      ryt        j                  j-                  t        j                  j/                  |            r=t        |      |vr0t1        ||      s$t3        ||      s|j5                  |	      r|n|	|z    }}}|D cg c]3  }t7        fdt        |      j9                  d      dd D              r|5 }}t;        |      t;        |      k  r>t=        t        |      t        |      z
        }t>        jA                  d|  d|        n|}|s$d|  d}|dt=               z  }tC        |      |S c c}}w c c}w )a  
    Resolve the paths and URLs of the data files from the pattern passed by the user.

    You can use patterns to resolve multiple local files. Here are a few examples:
    - *.csv to match all the CSV files at the first level
    - **.csv to match all the CSV files at any level
    - data/* to match all the files inside "data"
    - data/** to match all the files inside "data" and its subdirectories

    The patterns are resolved using the fsspec glob. In fsspec>=2023.12.0 this is equivalent to
    Python's glob.glob, Path.glob, Path.match and fnmatch where ** is unsupported with a prefix/suffix
    other than a forward slash /.

    More generally:
    - '*' matches any character except a forward-slash (to match just the file or directory name)
    - '**' matches any character including a forward-slash /

    Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested.
    The same applies to special directories that start with a double underscore like "__pycache__".
    You can still include one if the pattern explicitly mentions it:
    - to include a hidden file: "*/.hidden.txt" or "*/.*"
    - to include a hidden directory: ".hidden/*" or ".*/*"
    - to include a special directory: "__special__/*" or "__*/*"

    Example::

        >>> from datasets.data_files import resolve_pattern
        >>> base_path = "."
        >>> resolve_pattern("docs/**/*.py", base_path)
        [/Users/mariosasko/Desktop/projects/datasets/docs/source/_config.py']

    Args:
        pattern (str): Unix pattern or paths or URLs of the data files to resolve.
            The paths can be absolute or relative to base_path.
            Remote filesystems using fsspec are supported, e.g. with the hf:// protocol.
        base_path (str): Base path to use when resolving relative paths.
        allowed_extensions (Optional[list], optional): White-list of file extensions to use. Defaults to None (all extensions).
            For example: allowed_extensions=[".csv", ".json", ".txt", ".parquet"]
        download_config ([`DownloadConfig`], *optional*): Specific download configuration parameters.
    Returns:
        List[str]: List of paths or URLs to the local or remote files that match the patterns.
    r    r{   filez://hfz0.20.0Fexpand_infodetailTtypeislinkNc              3   ,   K   | ]  }d |z   v   yw)r^   Nr   )r8   suffixrz   s     r#   r:   z"resolve_pattern.<locals>.<genexpr>r  s     g&3<#55gs   r^   r   z Some files matched the pattern 'z-' but don't have valid data file extensions: zUnable to find ''z with any supported extension )"r   r   r   osrF   
splitdriver2   r   r
   rN   FILES_TO_IGNOREr   rB   protocolrH   r   HF_HUB_VERSIONr   parseglobrG   rL   isfilerealpathr\   ra   rX   r;   rE   rK   rI   loggerinforj   )r3   ry   rz   r{   storage_optionsfs
fs_patternfiles_to_ignorer   protocol_prefixglob_kwargsfilepathr   matched_pathsoutinvalid_matched_files	error_msgs     `              r#   resolve_patternr   '  s   `  	7+	w	GG&&w/2RVV;		@ZijG_w:/:NB
/*i.@-AAO(c:r{{AH*2f*<h&"OK4F11W]]85LL%*M" &bgggJdJkJPPRHdLF"txx'9bggnnRWWM]M]^fMg>hx 728ZHOPXZde ''8oPX>XXM  % *
gIhDWD]D]^aDbcdceDfgg 
 

 s8c-(($(]);c#h)F$G!KK27);hi~h  A &wiq1	)9$?Q:R9STTI	**J5
s   /BJ8Jc                 x    t        t        | |      }	 t        |      S # t        $ r t	        d|  d      dw xY w)uA
  
    Get the default pattern from a directory testing all the supported patterns.
    The first patterns to return a non-empty list of data files is returned.

    Some examples of supported patterns:

    Input:

        my_dataset_repository/
        ├── README.md
        └── dataset.csv

    Output:

        {'train': ['**']}

    Input:

        my_dataset_repository/
        ├── README.md
        ├── train.csv
        └── test.csv

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train.csv
            └── test.csv

        my_dataset_repository/
        ├── README.md
        ├── train_0.csv
        ├── train_1.csv
        ├── train_2.csv
        ├── train_3.csv
        ├── test_0.csv
        └── test_1.csv

    Output:

        {'train': ['**/train[-._ 0-9]*', '**/*[-._ 0-9]train[-._ 0-9]*', '**/training[-._ 0-9]*', '**/*[-._ 0-9]training[-._ 0-9]*'],
         'test': ['**/test[-._ 0-9]*', '**/*[-._ 0-9]test[-._ 0-9]*', '**/testing[-._ 0-9]*', '**/*[-._ 0-9]testing[-._ 0-9]*', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train/
            │   ├── shard_0.csv
            │   ├── shard_1.csv
            │   ├── shard_2.csv
            │   └── shard_3.csv
            └── test/
                ├── shard_0.csv
                └── shard_1.csv

    Output:

        {'train': ['**/train/**', '**/train[-._ 0-9]*/**', '**/*[-._ 0-9]train/**', '**/*[-._ 0-9]train[-._ 0-9]*/**', ...],
         'test': ['**/test/**', '**/test[-._ 0-9]*/**', '**/*[-._ 0-9]test/**', '**/*[-._ 0-9]test[-._ 0-9]*/**', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train-00000-of-00003.csv
            ├── train-00001-of-00003.csv
            ├── train-00002-of-00003.csv
            ├── test-00000-of-00001.csv
            ├── random-00000-of-00003.csv
            ├── random-00001-of-00003.csv
            └── random-00002-of-00003.csv

    Output:

        {'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']}

    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
    )ry   r{   zThe directory at z doesn't contain any data filesN)r   r   rx   rj   r&   )ry   r{   resolvers      r#   get_data_patternsr     sN    h )_]Hj'11 j"3I;>] ^_eiijs   
 9	data_filec                 L   t        | |      \  } }t        | fi |^}}t        |t              r)|j	                  |       }|j
                  |j                  fS | j                  t        j                        rt        t        j                  |j                        }d| t        t        j                        dz   d  j                  ddd      z   } |j	                  |       }|j
                  |j                  fS |j                  |       }dD ]  }||v st        ||         fc S  y)	Nr~   )endpointtokenzhf://r   z	/resolve/@)ETagetagmtimer   )r   r
   rB   r   resolve_pathrepo_idrevisionrX   r   HF_ENDPOINTr   rK   ri   r   rH   )	r   r{   r   r   _resolved_pathhffsr   rP   s	            r#   _get_single_origin_metadatar     s    "C9^m!nIy4O4FB"l#	2$$m&<&<<<			f00	1V%7%7?T?TUiF,>,>(?!(C(EFNN{\_abcc	)))4$$m&<&<<<779D( %$;S	N$$% r"   rr   max_workersc           
          ||nt         j                  }t        t        t        |      | |t
        dt        |       dk  xs d       S )Nr~   zResolving data files   )r   
tqdm_classdescdisable)r   &HF_DATASETS_MULTITHREADING_MAX_WORKERSr   r   r   hf_tqdmrK   )rr   r{   r   s      r#   _get_origin_metadatar     sK    
 "-!8+f>k>kK+_M#J2%- r"   c                       e Zd ZdZdee   dee   ddf fdZddZe		 	 	 ddee   d	e
j                  j                  d
ee   deee      dee   dd fd       Ze		 	 	 ddee   d
ee   deee      dee   dd f
d       Ze		 	 	 ddee   d
ee   deee      dee   dd f
d       Zddddeee      deee      dd fdZ xZS )r?   a  
    List of data files (absolute local paths or URLs).
    It has two construction methods given the user's data files patterns:
    - ``from_hf_repo``: resolve patterns inside a dataset repository
    - ``from_local_or_remote``: resolve patterns from a local path

    Moreover, DataFilesList has an additional attribute ``origin_metadata``.
    It can store:
    - the last modified time of local files
    - ETag of remote files
    - commit sha of a dataset repository

    Thanks to this additional attribute, it is possible to hash the list
    and get a different hash if and only if at least one file changed.
    This is useful for caching Dataset objects that are obtained from a list of data files.
    rr   origin_metadatar4   Nc                 2    t         |   |       || _        y r7   )super__init__r   )selfrr   r   	__class__s      r#   r   zDataFilesList.__init__  s    $.r"   c                 P    t        g | || j                  |j                  z         S r7   )r?   r   r   others     r#   __add__zDataFilesList.__add__  s(    _t_e_d.B.BUEZEZ.Z[[r"   r>   dataset_infory   rz   r{   c                     d|j                    d|j                   d|xs d j                  d      }| j                  ||||      S )Nzhf://datasets/r   /r}   ry   rz   r{   )idsharstripfrom_patterns)clsr>   r   ry   rz   r{   s         r#   from_hf_repozDataFilesList.from_hf_repo  s]     %\__$5Q|7G7G6H)/WYIZ[bbcfg	  	>Pbq ! 
 	
r"   c                     ||n%t               j                         j                         }| j                  ||||      S Nr   )r   resolveas_posixr   )r   r>   ry   rz   r{   s        r#   from_local_or_remotez"DataFilesList.from_local_or_remote,  sE     "+!6IDFNN<L<U<U<W	  	>Pbq ! 
 	
r"   c           	         ||n%t               j                         j                         }g }|D ]!  }	 |j                  t	        ||||             # t        ||      } | ||      S # t
        $ r t        |      s Y Qw xY wNr   r~   )r   r   r   extendr   rj   r   r   )r   r>   ry   rz   r{   rr   r3   r   s           r#   r   zDataFilesList.from_patterns9  s     "+!6IDFNN<L<U<U<W	
 	G!!#"++=(7		 /z?[://	 %  ) *s   A))B B
extensions
file_namesr   r   c          	         g }|r@dj                  d |D              }|j                  t        j                  d| d             |r@dj                  d |D              }|j                  t        j                  d| d             |r;t	        | D cg c]  t        fd|D              s c}| j                  	      S t	        t        |       | j                  	      S c c}w )
N|c              3   F   K   | ]  }t        j                  |        y wr7   rf   escape)r8   exts     r#   r:   z'DataFilesList.filter.<locals>.<genexpr>X  s     "Hc299S>"H   !z.*(z	)(\..+)?$c              3   F   K   | ]  }t        j                  |        y wr7   r   )r8   fns     r#   r:   z'DataFilesList.filter.<locals>.<genexpr>[  s     !EB"))B-!Er   z.*[\/]?(z)$c              3   @   K   | ]  }|j                          y wr7   )rg   )r8   r3   r   s     r#   r:   z'DataFilesList.filter.<locals>.<genexpr>_  s     7iU\i8P7is   )r   )joinrp   rf   compiler?   r;   r   rI   )r   r   r   r>   ext_pattern
fn_patternr   s         `r#   filterzDataFilesList.filterS  s     (("HZ"HHKOOBJJ[M'DEF!E*!EEJOOBJJ(:,b'ABC ,0jyC7i`h7i4ij $ 4 4 
 !dT=Q=QRR	 ks   C%.C%)r   r?   r4   r?   NNN)r   r    r!   __doc__rI   rH   SingleOriginMetadatar   r   classmethodhuggingface_hubhf_apiDatasetInfor   r   r   r   r   r   __classcell__r   s   @r#   r?   r?     s   "/49 /tDX?Y /^b /\ 
 $(2648
s)
 &,,88
 C=	

 %T#Y/
 ".1
 

 
  $(2648

s)

 C=

 %T#Y/	


 ".1

 


 

  $(26480s)0 C=0 %T#Y/	0
 ".10 
0 04 48[_S%d3i0SEMdSViEXS	Sr"   c                      e Zd ZdZe	 	 	 ddeeeee   e	f   f   de
e   de
ee      de
e   dd f
d       Ze	 	 	 ddeeeee   e	f   f   d	ej                  j                  de
e   de
ee      de
e   dd fd
       Ze	 	 	 ddeeeee   e	f   f   de
e   de
ee      de
e   dd f
d       Zdddde
ee      de
ee      dd fdZy)DataFilesDicta  
    Dict of split_name -> list of data files (absolute local paths or URLs).
    It has two construction methods given the user's data files patterns :
    - ``from_hf_repo``: resolve patterns inside a dataset repository
    - ``from_local_or_remote``: resolve patterns from a local path

    Moreover, each list is a DataFilesList. It is possible to hash the dictionary
    and get a different hash if and only if at least one file changed.
    For more info, see [`DataFilesList`].

    This is useful for caching Dataset objects that are obtained from a list of data files.

    Changing the order of the keys of this dictionary also doesn't change its hash.
    Nr>   ry   rz   r{   r4   c                      |        }|j                         D ]3  \  }}t        |t              r|nt        j                  ||||      ||<   5 |S r   )rG   rB   r?   r   r   r>   ry   rz   r{   r   rP   patterns_for_keys           r#   r   z"DataFilesDict.from_local_or_remotev  sh     e%-^^%5 
	!C! .> !"77$''9$3	 8  H
	 
r"   r   c                      |        }|j                         D ]4  \  }}t        |t              r|nt        j                  |||||      ||<   6 |S )N)r   ry   rz   r{   )rG   rB   r?   r   )	r   r>   r   ry   rz   r{   r   rP   r   s	            r#   r   zDataFilesDict.from_hf_repo  sk     e%-^^%5 	!C! .> !"//$!-''9$3 0  H	 
r"   c                      |        }|j                         D ]3  \  }}t        |t              r|nt        j                  ||||      ||<   5 |S r   )rG   rB   r?   r   r   s           r#   r   zDataFilesDict.from_patterns  sh     e%-^^%5 
	!C! .> !"00$''9$3	 1  H
	 
r"   r   r   r   c                     t        |              }| j                         D ]  \  }}|j                  ||      ||<    |S )Nr   )r   rG   r   )r   r   r   r   rP   data_files_lists         r#   r   zDataFilesDict.filter  sL     d4jl$(JJL 	\ C&--PZ-[CH	\
r"   r   )r   r    r!   r   r   rC   rH   r	   rI   r?   r   r   r   r   r   r   r   r   r   r   r"   r#   r   r   f  s     $(2648sE$s)]":;;< C= %T#Y/	
 ".1 
 * 
 $(2648sE$s)]":;;< &,,88 C=	
 %T#Y/ ".1 
 .  $(2648sE$s)]":;;< C= %T#Y/	
 ".1 
 , 48[_%d3i0EMdSViEX	r"   r   c                        e Zd ZdZdee   deeee         f fdZd Ze		 ddee   deee      dd fd       Z
	 dded	ee   dd
fdZdee   dd fdZ xZS )DataFilesPatternsListz
    List of data files patterns (absolute local paths or URLs).
    For each pattern there should also be a list of allowed extensions
    to keep, or a None ot keep all the files for the pattern.
    r>   rz   c                 2    t         |   |       || _        y r7   )r   r   rz   )r   r>   rz   r   s      r#   r   zDataFilesPatternsList.__init__  s    
 	""4r"   c                 P    t        g | || j                  |j                  z         S r7   )r?   rz   r   s     r#   r   zDataFilesPatternsList.__add__  s(    _t_e_d.E.EH`H`.`aar"   r4   c                 .     | ||gt        |      z        S r7   )rK   )r   r>   rz   s      r#   r   z#DataFilesPatternsList.from_patterns  s     801CMABBr"   ry   r{   r?   c           	      <   ||n%t               j                         j                         }g }t        | | j                        D ]$  \  }}	 |j                  t        ||||             & t        ||      }t        ||      S # t        $ r t        |      s Y Ww xY wr   )r   r   r   ziprz   r   r   rj   r   r   r?   )r   ry   r{   rr   r3   rz   r   s          r#   r   zDataFilesPatternsList.resolve  s    
 "+!6IDFNN<L<U<U<W	
+.tT5L5L+M 	'G'!!#"++=(7		 /z?[Z99	 %  ) *s   
BBBr   c                 Z    t        | | j                  D cg c]  }||z   	 c}      S c c}w r7   )r   rz   )r   r   rz   s      r#   filter_extensionsz'DataFilesPatternsList.filter_extensions  s/    $TMdMde7I%
2e
 	
es   (
r7   )r   r    r!   r   rI   rH   r   r   r   r   r   r   r   r  r   r   s   @r#   r   r     s    5s)5 !$s)!455b LPCCyC6>tCy6IC	 C C 59:: ".1: 
	:.
DI 
:Q 
r"   r   c                       e Zd ZdZe	 ddeeee   f   deee      dd fd       Z		 ddedee
   dd	fd
Zdee   dd fdZy)DataFilesPatternsDictz[
    Dict of split_name -> list of data files patterns (absolute local paths or URLs).
    Nr>   rz   r4   c                      |        }|j                         D ]1  \  }}t        |t              r|nt        j                  ||      ||<   3 |S )N)rz   )rG   rB   r   r   )r   r>   rz   r   rP   r   s         r#   r   z#DataFilesPatternsDict.from_patterns  sc     e%-^^%5 	!C! .0EF !*88$'9 9  H	 
r"   ry   r{   r   c                 t    t               }| j                         D ]  \  }}|j                  ||      ||<    |S r7   )r   rG   r   )r   ry   r{   r   rP   data_files_patterns_lists         r#   r   zDataFilesPatternsDict.resolve  sC    
 o-1ZZ\ 	T)C)/77	?SCH	T
r"   r   c                 ~     t        |              }| j                         D ]  \  }}|j                  |      ||<    |S r7   )r   rG   r  )r   r   r   rP   r	  s        r#   r  z'DataFilesPatternsDict.filter_extensions  sE    d4jl-1ZZ\ 	N)C)/AA*MCH	N
r"   r7   )r   r    r!   r   r   rC   rH   rI   r   r   r   r   r  r   r"   r#   r  r    s     W[CcN+AI$s)AT	  $ 59 ".1 
	DI :Q r"   r  )NNr7   )Wr   rf   	functoolsr   r   r   pathlibr   r   typingr   r   r	   r   fsspec.corer
   r   	packagingr   tqdm.contrib.concurrentr   r}   r   downloadr   namingr   rR   r   utilsr   r   r   utils.file_utilsr   r   r   r   r   utils.py_utilsr   tuplerH   r   TRAINrJ   
get_loggerr   r   r   rj   r&   SPLIT_PATTERN_SHARDED
VALIDATIONTESTSPLIT_KEYWORDSNON_WORDS_CHARSFSSPEC_VERSIONr   "KEYWORDS_IN_FILENAME_BASE_PATTERNS"KEYWORDS_IN_DIR_NAME_BASE_PATTERNSrl   rn   "DEFAULT_PATTERNS_SPLIT_IN_FILENAME"DEFAULT_PATTERNS_SPLIT_IN_DIR_NAMEDEFAULT_PATTERNS_ALLrh   ro   r<   r   boolr=   rC   rI   rO   r\   ra   rx   r   r   r   intr   r?   r   r   r  )rE   r1   r3   s   000r#   <module>r&     s   	 	   " , ,  ! (  .  $    " r r * U38_eCj%)CD  ekk*  
		H	%	# 		) 	 a  
KK':&	;	JJ9
 	=7==44*GI\)]&*& ]W]];77*IK^)_&*& +AB`)a&*& ++u//<  & & 	 
%e,9  	wO<< & "  & & 	 
%e,9  	wO<< & " 
KK$  ,, && 
  \ \ \#1dD#o 6 #14U4PS9VeKeEf@f;g #1LS Ss St SB;QVY ;Qdg ;Qlp ;Q|)dxtCy8H/I )ddSVX\]`XaSaNb )d^ /304	YYY !c+Y n-	Y
 
#YYxXj Xjx7O Xj[_`ceijmen`n[o Xjz 15n- 0 15!%S	n- # 

	"^SDI ^SBZDm+, Zz2
DI 2
j#D&;!;< #Y&&s0   9L !K:(L 9L!L(L:L L