
    bi+b                        d Z ddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
mZmZ ddlZddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZmZmZ ddlmZ ddlm Z! e
rddl"m#Z# ddl$m%Z%m&Z&  ejN                  e(      Z)dZ* ejV                  dedd  dejX                        Z- ejV                  d      Z. G d de/      Z0 G d de/      Z1 ed       G d d             Z2	 	 d4de3de4d   dee3d f   d!ee3   d"ee3   d#e2fd$Z5 G d% d&      Z6 G d' d(e6      Z7 G d) d*e6      Z8 ed       G d+ d,             Z9 ed       G d- d.             Z:d/ Z;d0 Z<d1 Z=d2 Z> G d3 d       Z?y)5zArrow ArrowReader.    N)	dataclass)partial)TYPE_CHECKINGOptionalUnion)
thread_map   )DownloadConfig)	_split_refilenames_for_dataset_split)InMemoryTableMemoryMappedTableTableconcat_tables)logging)tqdm)DatasetInfo)Split	SplitInfoz=https://storage.googleapis.com/huggingface-nlp/cache/datasetsz
^
 (?P<split>z)
 (\[
    ((?P<from>-?[\d_]+)
     (?P<from_pct>%)?)?
    :
    ((?P<to>-?[\d_]+)
     (?P<to_pct>%)?)?
 \])?(\((?P<rounding>[^\)]*)\))?
$
z\s*\+\s*c                       e Zd ZdZy)DatasetNotOnHfGcsErrorz?When you can't get the dataset from the Hf google cloud storageN__name__
__module____qualname____doc__     P/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/datasets/arrow_reader.pyr   r   A   s    Ir   r   c                       e Zd ZdZy)MissingFilesOnHfGcsErrorz9When some files are missing on the Hf oogle cloud storageNr   r   r   r    r"   r"   G   s    Cr   r"   T)frozenc                   ,    e Zd ZU dZeed<   ee   ed<   y)FileInstructionsa}  The file instructions associated with a split ReadInstruction.

    Attributes:
        num_examples: `int`, The total number of examples
        file_instructions: List[dict(filename, skip, take)], the files information.
            The filenames contains the relative path, not absolute.
            skip/take indicates which example read in the file: `ds.slice(skip, take)`
    num_examplesfile_instructionsN)r   r   r   r   int__annotations__listdictr   r   r    r%   r%   M   s     Dz!r   r%   namesplit_infosr   instructionReadInstructionfiletype_suffixprefix_pathreturnc                 L   t        | t              s!t        dt        |       j                         | st        d      |D ci c]  }|j                  |j                   }}|D ci c]  }|j                  |j                   }}|D ci c]4  }|j                  t        || |j                  |||j                           6 }}t        |t              st        j                  |      }|j                  |      }	g }
d}|	D ]  }||j                     }||j                     }||j                     }|j                  dn|j                  }|j                  |n|j                  }|-|D ]'  }||z
  }|dk(  r||z  }|
j!                  |||d       ) d}d}t#        ||      D ]^  \  }}||z  }||k  rJ||kD  rE||kD  r||z
  nd}||k  r||z
  |z
  nd}|dk(  r6|
j!                  |||d       ||dk(  r||z
  n|z  }||z  }`  t%        ||
      S c c}w c c}w c c}w )a  Returns instructions of the split dict.

    Args:
        name (`str`): Name of the dataset.
        split_infos (`list` of `[SplitInfo]`): Dataset splits information.
        instruction ([`ReadInstruction`] or `str`): Reading instruction for a dataset.
        filetype_suffix (`str`, *optional*): Suffix of dataset files, e.g. 'arrow' or 'parquet'.
        prefix_path (`str`, *optional*): Prefix of dataset files, e.g. directory name.

    Returns:
        [`FileInstructions`]
    zExpected str 'name', but got: zExpected non-empty str 'name')pathdataset_namesplitr0   shard_lengthsr   )filenameskiptaker   )r&   r'   )
isinstancestr	TypeErrortyper   
ValueErrorr,   r&   r7   r   r/   	from_specto_absolute	splitnamefrom_toappendzipr%   )r,   r-   r.   r0   r1   infoname2lenname2shard_lengthsname2filenamesabsolute_instructionsr'   r&   	abs_instrsplit_length	filenamesr7   rC   rD   r8   r:   index_start	index_endshard_lengthr9   s                           r    make_file_instructionsrR   \   s   & dC 8d9L9L8MNOO8999DE		4,,,EHEDOPD$))T%7%77PP  	  			.))+,TYY7
 	
	N 	 k?3%//<'33H= L* ,		 3 34"9#6#67	*9+>+>?__,)//&\\1\y|| % ^Ez19$!((hW[)\]^ KI*-i*G 	,&,\)	9$k)927+2E5;.1D689n2+d2"Dqy %,,(DZ^-_` 42:L4$74OL|+	,!,4 !+ ] FP	s   H'H9H!c                   t    e Zd ZdZdeded   fdZddefdZddefdZ	d	 Z
	 dd
Z	 	 ddee   ded   fdZy)
BaseReaderz@
    Build a Dataset object out of Instruction instance(s).
    r4   rG   r   c                 .    || _         || _        d| _        y)zInitializes ArrowReader.

        Args:
            path (str): path where tfrecords are stored.
            info (DatasetInfo): info about the dataset.
        N)_path_info_filetype_suffix)selfr4   rG   s      r    __init__zBaseReader.__init__   s     
.2
/3r   r2   c                     t         )=Returns a Dataset instance from given (filename, skip, take).)NotImplementedError)rY   filename_skip_take	in_memorys      r    _get_table_from_filenamez#BaseReader._get_table_from_filename   s    !!r   c           	         t        |      dk(  st        d |D              st        d      t        j                  |      }|D ]2  }t
        j                  j                  | j                  |d         |d<   4 t        t        | j                  |      |t        dt        |      dk  xs d	      }|D cg c]  }t        |      dkD  s| }}|s-| j                  | j                  j                  t        d
      |xsI t        j                   g t#        j$                  | j                  j                  j&                              g}t        |      dk7  rt)        |      }|S |d   }|S c c}w )a  Returns Dataset for given file instructions.

        Args:
            files: List[dict(filename, skip, take)], the files information.
                The filenames contain the absolute path, not relative.
                skip/take indicates which example read in the file: `ds.slice(skip, take)`
            in_memory (bool, default False): Whether to copy the data in-memory.
        r   c              3   <   K   | ]  }t        |t                y wN)r;   r+   ).0fs     r    	<genexpr>z)BaseReader._read_files.<locals>.<genexpr>   s     %IajD&9%Is   z&please provide valid file informationsr8   r_   zLoading dataset shards   N)
tqdm_classdescdisablezqTried to read an empty table. Please specify at least info.features to create an empty table with the right type.)schemar	   )lenallr?   copydeepcopyosr4   joinrV   r   r   r`   hf_tqdmrW   featuresr   from_batchesparl   r>   r   )rY   filesr_   re   	pa_tablestpa_tables          r    _read_fileszBaseReader._read_files   sT    u:?#%I5%I"IEFFe$ 	DAGGLLQz]CAjM	D D11YG)J"$,
	 !*81SVaZQ8	8djj0DJJ4G4G4O D  m-"<"<R		RVR\R\ReReRjRjHk"l!m	/29~/B=+ IRRS 9s   5E-	E-c                 f    t        |||| j                  | j                        }|j                  }|S )z?Return list of dict {'filename': str, 'skip': int, 'take': int})r0   r1   )rR   rX   rV   r'   )rY   r,   r.   r-   r'   rw   s         r    get_file_instructionsz BaseReader.get_file_instructions   s7    2+{D<Q<Q_c_i_i
 "33r   c                 v    | j                  |||      }|sd| d}t        |      | j                  |||      S )a  Returns Dataset instance(s).

        Args:
            name (str): name of the dataset.
            instructions (ReadInstruction): instructions to read.
                Instruction can be string and will then be passed to the Instruction
                constructor as it.
            split_infos (list of SplitInfo proto): the available splits for dataset.
            in_memory (bool, default False): Whether to copy the data in-memory.

        Returns:
             kwargs to build a single Dataset instance.
        zInstruction "z" corresponds to no data!)rw   original_instructionsr_   )r}   r?   
read_files)rY   r,   instructionsr-   r_   rw   msgs          r    readzBaseReader.read   sJ    * **4{K!,/HICS/!U,Zcddr   Nrw   r   )Nr/   r   c                     | j                  ||      }|ddlm}  |t        |            }nd}|| j                  |d}|S )aJ  Returns single Dataset instance for the set of file instructions.

        Args:
            files: List[dict(filename, skip, take)], the files information.
                The filenames contains the relative path, not absolute.
                skip/take indicates which example read in the file: `ds.skip().take()`
            original_instructions: store the original instructions used to build the dataset split in the dataset.
            in_memory (bool, default False): Whether to copy the data in-memory.

        Returns:
            kwargs to build a Dataset instance.
        rg   Nr	   )r   )arrow_tablerG   r6   )r{   splitsr   r<   rW   )rY   rw   r   r_   rz   r   r6   dataset_kwargss           r    r   zBaseReader.read_files   sN    & ##EY#? ,%#345EE)14::PUVr   F)NF)r   r   r   r   r<   r   rZ   r   r`   r{   r}   r   r*   r+   r   r   r   r   r    rT   rT      ss    	4S 	4(? 	4"u "U @ e< JN	Dz  %%EFr   rT   c                   X     e Zd ZdZdeded   f fdZd	defdZe	d	defd       Z
 xZS )
ArrowReaderz
    Build a Dataset object out of Instruction instance(s).
    This Reader uses either memory mapping or file descriptors (in-memory) on arrow files.
    r4   rG   r   c                 4    t         |   ||       d| _        y)zInitializes ArrowReader.

        Args:
            path (str): path where Arrow files are stored.
            info (DatasetInfo): info about the dataset.
        arrowNsuperrZ   rX   rY   r4   rG   	__class__s      r    rZ   zArrowReader.__init__#  s     	t$ 'r   r2   c                     |d   d|v r|d   ndd|v r|d   nd}}}t         j                  ||      }|dk(  rt        |      |z
  }|'|%|dk(  r|t        |      k(  s|j                  ||      }|S )r\   r8   r9   Nr:   rg   r   r   )r   
read_tablerm   slice)rY   r^   r_   r8   r9   r:   tables          r    r`   z$ArrowReader._get_table_from_filename-  s     z**04F*Fv&D*04F*Fv&D $
 &&x9&E2:u:$D 0$!)PSTYPZHZKKd+Er   c                 @    |rt         nt        }|j                  |       S )z
        Read table from file.

        Args:
            filename (str): File name of the table.
            in_memory (bool, default=False): Whether to copy the data in-memory.

        Returns:
            pyarrow.Table
        )r   r   	from_file)r8   r_   	table_clss      r    r   zArrowReader.read_table<  s     &/M4E	""8,,r   r   )r   r   r   r   r<   r   rZ   r   r`   staticmethodr   __classcell__r   s   @r    r   r     sG    
(S ((? (u  - - -r   r   c                   8     e Zd ZdZdeded   f fdZd Z xZS )ParquetReaderzv
    Build a Dataset object out of Instruction instance(s).
    This Reader uses memory mapping on parquet files.
    r4   rG   r   c                 4    t         |   ||       d| _        y)zInitializes ParquetReader.

        Args:
            path (str): path where tfrecords are stored.
            info (DatasetInfo): info about the dataset.
        parquetNr   r   s      r    rZ   zParquetReader.__init__R  s     	t$ )r   c                     |d   d|v r|d   ndd|v r|d   nd}}}t        j                  |d      }|'|%|dk(  r|t        |      k(  s|j                  ||      }|S )r\   r8   r9   Nr:   T)
memory_mapr   )pqr   rm   r   )rY   r^   kwargsr8   r9   r:   rz   s          r    r`   z&ParquetReader._get_table_from_filename\  s     z**04F*Fv&D*04F*Fv&D $ ==d; 0$!)PST\P]H]~~dD1Hr   )	r   r   r   r   r<   r   rZ   r`   r   r   s   @r    r   r   L  s%    
*S *(? *r   r   c                   0    e Zd ZU dZeed<   eed<   eed<   y)_AbsoluteInstructionz?A machine friendly slice: defined absolute positive boundaries.rB   rC   rD   N)r   r   r   r   r<   r)   r(   r   r   r    r   r   k  s    INJGr   r   c                   r    e Zd ZU dZeed<   dZee   ed<   dZ	ee   ed<   dZ
ee   ed<   dZee   ed<   d Zy)	_RelativeInstructionzHRepresents a single parsed slicing instruction, can use % and negatives.rB   NrC   rD   unitroundingc                 H   | j                   | j                   dvrt        d      | j                  | j                  dvrt        d      | j                   dk7  r| j                  t        d      | j                   dk(  r/| j                  #t	        | j                        dkD  rt        d      | j                   dk(  r/| j
                  #t	        | j
                        dkD  rt        d      | j                  | j                   dk(  rd	n| j                  | j                  d
<   y )N)%abszunit must be either % or abs)closestpct1_dropremainderz5rounding must be either closest or pct1_dropremainderr   zAIt is forbidden to specify rounding if not using percent slicing.d   z2Percent slice boundaries must be > -100 and < 100.r   r   )r   r?   r   rC   r   rD   __dict__rY   s    r    __post_init__z"_RelativeInstruction.__post_init__~  s    99 TYYl%B;<<==$>_)_TUU99 9`aa99

 63tzz?S;PQRR99 3DGGs8JQRR151F499X[K[Iaeananj!r   )r   r   r   r   r<   r)   rC   r   r(   rD   r   r   r   r   r   r    r   r   t  sH    RNE8C=BD(3-"Hhsm"or   r   c           
         t         j                  |       }|st        d|        |j                  d      s|j                  d      rdnd}t	        |j                  d      |j                  d      |j                  d      rt        |j                  d            nd	|j                  d
      r t        |j                  d
            |      S d	|      S )z)Returns ReadInstruction for given string.z!Unrecognized instruction format: from_pctto_pctr   r   r6   r   fromNrD   )
split_namer   rC   rD   r   )_SUB_SPEC_REmatchr?   groupr/   r(   )specresr   s      r    _str_to_read_instructionr     s    


T
"C<TFCDD))J'399X+>3ED99W%:&(+		&(9c#))F#$t#&99T?3syy  9= r   c                 \    |dk  rd}t        |      | t        j                  |dz        z  S )Nr   zUsing "pct1_dropremainder" rounding on a split with less than 100 elements is forbidden: it always results in an empty dataset.      Y@)r?   mathtrunc)boundaryr&   r   s      r    _pct_to_abs_pct1r     s:    cL 	 odjj!5666r   c                 6    t        t        | |z  dz              S )Nr   )r(   round)r   r&   s     r    _pct_to_abs_closestr     s    uX,u4566r   c                    | j                   dk(  rt        nt        }| j                  }||vrt	        d| dt        |       d      ||   }| j                  }| j                  }| j                  dk(  r|dn |||      }||n |||      }n|dn|}||n|}|dk  rt        ||z   d      }|dk  rt        ||z   d      }t        ||      }t        ||      }t        |||      S )zReturns _AbsoluteInstruction instance for given RelativeInstruction.

    Args:
        rel_instr: RelativeInstruction instance.
        name2len: dict {split_name: num_examples}.
    r   zUnknown split "z". Should be one of .r   r   )r   r   r   rB   r?   r*   rC   rD   r   maxminr   )	rel_instrrH   
pct_to_absr6   r&   rC   rD   s          r    _rel_to_abs_instrr     s    )2(:(:i(G$M]JEH?5'1Ed8nEUUVWXXE?LOOE	B~~]
5,(GZ\ZL-I]Z\RqyL5(!,	Av"A&|$E	R	Bub11r   c                   \    e Zd ZdZd Zed        ZddZed        Zd Z	d Z
d	 Zd
 Zd Zy)r/   a  Reading instruction for a dataset.

    Examples::

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%]')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec('test[:33%]'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction('test', to=33, unit='%'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction(
          'test', from_=0, to=33, unit='%'))

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%]+train[1:-1]')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec(
          'test[:33%]+train[1:-1]'))
      ds = datasets.load_dataset('mnist', split=(
          datasets.ReadInstruction('test', to=33, unit='%') +
          datasets.ReadInstruction('train', from_=1, to=-1, unit='abs')))

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%](pct1_dropremainder)')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec(
          'test[:33%](pct1_dropremainder)'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction(
          'test', from_=0, to=33, unit='%', rounding="pct1_dropremainder"))

      # 10-fold validation:
      tests = datasets.load_dataset(
          'mnist',
          [datasets.ReadInstruction('train', from_=k, to=k+10, unit='%')
          for k in range(0, 100, 10)])
      trains = datasets.load_dataset(
          'mnist',
          [datasets.ReadInstruction('train', to=k, unit='%') + datasets.ReadInstruction('train', from_=k+10, unit='%')
          for k in range(0, 100, 10)])

    c                     || _         y rc   _relative_instructions)rY   relative_instructionss     r    _initzReadInstruction._init  s
    &;#r   c                 J    | j                  |       }|j                  |       |S )zCReturns ReadInstruction obj initialized with relative_instructions.)__new__r   )clsr   results      r    ,_read_instruction_from_relative_instructionsz<ReadInstruction._read_instruction_from_relative_instructions  s$     S!*+r   Nc           	      B    | j                  t        |||||      g       y)a  Initialize ReadInstruction.

        Args:
            split_name (str): name of the split to read. Eg: 'train'.
            rounding (str, optional): The rounding behaviour to use when percent slicing is
                used. Ignored when slicing with absolute indices.
                Possible values:
                 - 'closest' (default): The specified percentages are rounded to the
                     closest value. Use this if you want specified percents to be as
                     much exact as possible.
                 - 'pct1_dropremainder': the specified percentages are treated as
                     multiple of 1%. Use this option if you want consistency. Eg:
                         len(5%) == 5 * len(1%).
                     Using this option, one might not be able to use the full set of
                     examples, if the number of those is not a multiple of 100.
            from_ (int):
            to (int): alternative way of specifying slicing boundaries. If any of
                {from_, to, unit} argument is used, slicing cannot be specified as
                string.
            unit (str): optional, one of:
                '%': to set the slicing unit as percents of the split size.
                'abs': to set the slicing unit as absolute numbers.
        N)r   r   )rY   r   r   rC   rD   r   s         r    rZ   zReadInstruction.__init__  s!    6 	

(UBhOPQr   c                     t        |      }t        j                  |      }|st        d|       t	        |d         }t        d |dd D        |      S )aM  Creates a `ReadInstruction` instance out of a string spec.

        Args:
            spec (`str`):
                Split(s) + optional slice(s) to read + optional rounding
                if percents are used as the slicing unit. A slice can be specified,
                using absolute numbers (`int`) or percentages (`int`).

        Examples:

            ```
            test: test split.
            test + validation: test split + validation split.
            test[10:]: test split, minus its first 10 records.
            test[:10%]: first 10% records of test split.
            test[:20%](pct1_dropremainder): first 10% records, rounded with the pct1_dropremainder rounding.
            test[:-5%]+train[40%:60%]: first 95% of test + middle 20% of train.
            ```

        Returns:
            ReadInstruction instance.
        z&No instructions could be built out of r   c              3   2   K   | ]  }t        |        y wrc   )r   )rd   subs     r    rf   z,ReadInstruction.from_spec.<locals>.<genexpr>5  s     Fc,S1Fs   r	   N)r<   _ADDITION_SEP_REr6   r?   r   sum)r   r   subsr.   s       r    r@   zReadInstruction.from_spec  sZ    0 4y%%d+EdVLMM.tAw7FT!"XFTTr   c                    g }| j                   D ]  }|j                  }|j                  |j                  |j                  }|j                  }|j                  }|j
                  }|dk(  r|nd}|t        |      |z   nd}|t        |      |z   nd}d| d| d}|dk(  r||dk7  rd| dnd}	|||	z   z  }|j                  |        d	j                  |      S )
Nr    [:]r   ()+)	r   rB   rC   rD   r   r   r<   rE   rr   )
rY   rel_instr_specsr   rel_instr_specrC   rD   r   r   	slice_strrounding_strs
             r    to_speczReadInstruction.to_spec7  s    44 	3I&00N*ill.F!\\ ~~$--#s{t-2->E
T)B')~SWt^2wat1-	'+s{x7KPX\ePeazOkm  )l"::"">2	3  xx((r   c                 D   t        |t              sd}t        |      | j                  }|j                  }|d   j                  dk7  rF|d   j                  dk7  r4| j                  d   j
                  |d   j
                  k7  rt        d      | j                  ||z         S )zEReturns a new ReadInstruction obj, result of appending other to self.zAReadInstruction can only be added to another ReadInstruction obj.r   r   zPIt is forbidden to sum ReadInstruction instances with different rounding values.)r;   r/   r=   r   r   r   r?   r   )rY   otherr   self_ris	other_riss        r    __add__zReadInstruction.__add__K  s    %1UCC. ..00	QK%!!!U*++A.779Q<;P;PPopp@@IAUVVr   c                 "    | j                         S rc   )r   r   s    r    __str__zReadInstruction.__str__Z  s    ||~r   c                 "    d| j                    dS )NzReadInstruction(r   r   r   s    r    __repr__zReadInstruction.__repr__]  s    !$"="=!>a@@r   c                 T    | j                   D cg c]  }t        ||       c}S c c}w )aZ  Translate instruction into a list of absolute instructions.

        Those absolute instructions are then to be added together.

        Args:
            name2len (`dict`):
                Associating split names to number of examples.

        Returns:
            list of _AbsoluteInstruction instances (corresponds to the + in spec).
        )r   r   )rY   rH   r   s      r    rA   zReadInstruction.to_absolute`  s(     IMHcHcd9!)X6ddds   %)NNNN)r   r   r   r   r   classmethodr   rZ   r@   r   r   r   r   rA   r   r   r    r/   r/     sY    $L<  R: U U<)(WAer   )NN)@r   ro   r   rq   redataclassesr   	functoolsr   typingr   r   r   pyarrowrv   pyarrow.parquetr   r   tqdm.contrib.concurrentr   download.download_configr
   namingr   r   r   r   r   r   r   utilsr   r   rs   rG   r   r   r   r   
get_loggerr   loggerHF_GCP_BASE_URLcompileXr   r   ConnectionErrorr   r"   r%   r<   r*   rR   rT   r   r   r   r   r   r   r   r   r/   r   r   r    <module>r     s       	 	 !  1 1   . 4 : I I  " !( 
		H	%Qrzz
aO 	 DD  2::k* 	_ 		 	 $" " "$ &*!%H
Hk"H s--.H c]	H
 #H HVs sl,-* ,-^J > $   $o o o0772:de der   