
    bi                        d dl Z d dlmZ d dlmZmZ d dlZd dlm	Z
 d dlmZ d dlZd dlmZ ej"                  j$                  j'                  e      Ze G d dej,                               Z G d dej0                        Zy)	    N)	dataclass)OptionalUnion)
table_castc                        e Zd ZU dZdZee   ed<   dZee	e
      ed<   dZeej                     ed<   dZeeej"                  e	e   e	e	e      f      ed<    fdZ xZS )ParquetConfigzBuilderConfig for Parquet.N
batch_sizecolumnsfeaturesfiltersc                 "    t         |           y N)super__post_init__)self	__class__s    d/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/datasets/packaged_modules/parquet/parquet.pyr   zParquetConfig.__post_init__   s        )__name__
__module____qualname____doc__r	   r   int__annotations__r
   liststrr   datasetsFeaturesr   r   ds
Expressiontupler   __classcell__)r   s   @r   r   r      sr    $ $J$#'GXd3i ',0Hhx(()0NRGXeBMM4;T%[8IIJKR   r   r   c                   Z    e Zd ZeZd Zd Zdej                  dej                  fdZ	d Z
y)Parquetc                    | j                   j                  | j                   j                  xt        | j                   j                        t        | j                   j                        k7  r9t	        d| j                   j                   d| j                   j                         t        j                  | j                   j                        S )NzIThe columns and features argument must contain the same columns, but got z and )r   )configr
   r   set
ValueErrorr   DatasetInfo)r   s    r   _infozParquet._info    s    KK+$$0DKK''(C0D0D,EE[;;&&'uT[[-A-A,BC  ##T[[-A-ABBr   c                    | j                   j                  s"t        d| j                   j                         d|j                  _        |j                  | j                   j                        }g }|j                         D ]  \  }}t        |t              r|g}|D cg c]  }|j                  |       }}| j                  j                  {t        j                  j                  |      D ]Y  }t        |d      5 }t         j"                  j%                  t'        j(                  |            | j                  _        ddd        n |j+                  t!        j,                  |d|i              | j                   j.                  t1        | j                   j.                        t1        | j                  j                        k7  rst!        j"                  | j                  j                  j                         D 	ci c]!  \  }}	|| j                   j.                  v s||	# c}	}      | j                  _        |S c c}w # 1 sw Y   xY wc c}	}w )z-We handle string, list and dicts in datafilesz=At least one data file must be specified, but got data_files=TNrbfiles)name
gen_kwargs)r&   
data_filesr(   download_configextract_on_the_flydownload_and_extractitems
isinstancer   
iter_filesinfor   	itertoolschainfrom_iterableopenr   r   from_arrow_schemapqread_schemaappendSplitGeneratorr
   r'   )
r   
dl_managerr0   splits
split_namer-   filefcolfeats
             r   _split_generatorszParquet._split_generators,   s   {{%%\]a]h]h]s]s\tuvv8<
""544T[[5K5KL
!+!1!1!3 	aJ%%=BCTZ**40CECyy!!)%OO99%@ DdD) dQ-5->->-P-PQSQ_Q_`aQb-c		*d MM(11zwX]N^_`	a ;;*s4;;3F3F/G3tyyOaOaKb/b!)!2!2,0II,>,>,D,D,FeysD#QUQ\Q\QdQdJdde"DII  Dd d fs   H=<AI> I
I
Ipa_tablereturnc                     | j                   j                  *t        || j                   j                  j                        }|S r   )r7   r   r   arrow_schema)r   rI   s     r   _cast_tablezParquet._cast_tableE   s5    99) "(DII,>,>,K,KLHr   c              #     K   | j                   j                  | j                   j                  t        d | j                  j                  j
                  D              t        | j                   j                        k7  r:t        d| j                   j                   d| j                  j                   d      t        | j                   j                  t              r)t        j                  | j                   j                        n| j                   j                  }t        t        j                  j                  |            D ]  \  }}t!        |d      5 }t#        j$                         j'                  |      }|j(                  r| j                   j*                  xs |j(                  d   j,                  }	 t        |j/                  || j                   j                  |dd            D ]?  \  }}	t0        j2                  j5                  |	g      }
| d| | j7                  |
      f A 	 d d d         y # t        $ r-}t8        j;                  d	| d
t=        |       d|         d }~ww xY w# 1 sw Y   8xY ww)Nc              3   4   K   | ]  }|j                     y wr   )r.   ).0fields     r   	<genexpr>z+Parquet._generate_tables.<locals>.<genexpr>N   s     NUejjNs   z)Tried to load parquet data with columns 'z' with mismatching features ''r,   r   )r	   r
   filterbatch_readaheadfragment_readahead_zFailed to read file 'z' with error z: )r&   r   r
   sortedr7   rL   r(   r5   r   r   r=   filters_to_expression	enumerater8   r9   r:   r;   r   ParquetFileFormatmake_fragment
row_groupsr	   num_rows
to_batchespaTablefrom_batchesrM   loggererrortype)r   r-   filter_exprfile_idxrD   rE   parquet_fragmentr	   	batch_idxrecord_batchrI   es               r   _generate_tableszParquet._generate_tablesL   sH    ;;+0C0C0ONdii.@.@.M.MNNRXY]YdYdYlYlRmm ?@S@S?TTqrvr{r{  sE  sE  rF  FG  H 
 $++--t4 $$T[[%8%89$$ 	
 (	(E(Ee(LM 	NHddD! Q#%#7#7#9#G#G#J #..!%!7!7!b;K;V;VWX;Y;b;bJ7@,77+5(,(;(;'20134 8 8 X3I| (*xx'<'<l^'LH &.Ja	{";T=M=Mh=W"WWX 	( & '<TF-PTUVPWyXZ[\Z]%^_' sD   EI?A!I28A5H9-I2.I?9	I/(I**I//I22I<	7I?N)r   r   r   r   BUILDER_CONFIG_CLASSr*   rH   r`   ra   rM   rl    r   r   r$   r$      s3    (
C2BHH  !r   r$   )r8   dataclassesr   typingr   r   pyarrowr`   pyarrow.datasetdatasetr   pyarrow.parquetparquetr=   r   datasets.tabler   utilslogging
get_loggerr   rc   BuilderConfigr   ArrowBasedBuilderr$   rn   r   r   <module>r|      sn     ! "     % 
			*	*8	4 	 H** 	  	 Ph(( Pr   