
    bi                         d dl Z d dlZd dlmZmZmZ d dlZd dlmZ	 ddl
mZmZmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ  G d de      Z  G d d      Z!y)    N)BinaryIOOptionalUnion   )DatasetFeatures
NamedSplitconfig)$get_writer_batch_size_from_data_size#get_writer_batch_size_from_features)query_table)_PACKAGED_DATASETS_MODULES)Parquet)tqdm)NestedDataStructureLikePathLike   )AbstractDatasetReaderc                   h     e Zd Z	 	 	 	 	 	 d
dee   dee   dee   dede	de	dee
   f fdZd	 Z xZS )ParquetDatasetReaderpath_or_pathssplitfeatures	cache_dirkeep_in_memory	streamingnum_procc           
          t        
|   |f||||||d| t        |t              r|n| j                  |i}t
        d   d   }	t        d||||	d|| _        y )N)r   r   r   r   r   r   parquetr   )r   
data_filesr   hash )super__init__
isinstancedictr   r   r   builder)selfr   r   r   r   r   r   r   kwargsr!   	__class__s             N/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/datasets/io/parquet.pyr$   zParquetDatasetReader.__init__   s     			
)		
 		
 *4M4)Htzz[hNi))4Q7 
$	

 
    c                 6   | j                   r(| j                  j                  | j                        }|S d }d }d }d }| j                  j	                  ||||| j
                         | j                  j                  | j                  || j                        }|S )N)r   )download_configdownload_modeverification_mode	base_pathr   )r   r0   	in_memory)r   r'   as_streaming_datasetr   download_and_preparer   
as_datasetr   )r(   datasetr.   r/   r0   r1   s         r+   readzParquetDatasetReader.read2   s    >>ll77djj7IG$  #O M $ILL-- /+"3# .  ll--jj4EQUQdQd . G r,   )NNNFFN)__name__
__module____qualname__r   r   r   r	   r   strboolintr$   r7   __classcell__)r*   s   @r+   r   r      sx     '+'+$"&
.x8
 
#
 8$	

 
 
 
 3-
>r,   r   c                   v    e Zd Z	 	 	 	 ddedeeef   dee   dee	   dee
e	f   de
fdZd	efd
Zdeded	efdZy)ParquetDatasetWriterNr6   path_or_buf
batch_sizestorage_optionsuse_content_defined_chunkingwrite_page_indexc                    || _         || _        |xs: t        |j                        xs# t	        t        |      |j                               | _        |xs i | _        || _	        |du rt        j                  }|| _        || _        y )NT)r6   rA   r   r   r   len_estimate_nbytesrB   rC   parquet_writer_kwargsr
   DEFAULT_CDC_OPTIONSrD   rE   )r(   r6   rA   rB   rC   rD   rE   rI   s           r+   r$   zParquetDatasetWriter.__init__K   s     & ^273C3CD^3CL'BZBZB\] 	
  /4"%:"'4/+1+E+E(,H) 0r,   returnc                    t        | j                  t        t        t        j
                  f      rct        j                  | j                  dfi | j                  xs i 5 } | j                  d|| j                  d| j                  }d d d        |S  | j                  d| j                  | j                  d| j                  }|S # 1 sw Y   S xY w)Nwb)file_objrB   r"   )r%   rA   r;   bytesosr   fsspecopenrC   _writerB   rI   )r(   bufferwrittens      r+   writezParquetDatasetWriter.writec   s    d&&eR[[(ABT--tT8L8L8RPRT X^%$++ ## 00  "dkk ))?? ,,G
  s   *CCrN   c           	         d}|j                  dd      }| j                  j                  j                  }t	        j
                  |f|| j                  | j                  d|}t        t        dt        | j                        |      dd      D ]e  }t        | j                  j                  t        |||z         | j                  j                        }	|j                  |	       ||	j                   z  }g | j                  d	ur0|j#                  d
t%        j&                  | j                        i       |j)                          |S )zWrites the pyarrow table as Parquet to a binary file handle.

        Caller is responsible for opening and closing the handle.
        r   rA   N)schemarD   rE   baz"Creating parquet from Arrow format)unitdesc)tablekeyindicesFcontent_defined_chunking)popr6   r   arrow_schemapqParquetWriterrD   rE   hf_tqdmrangerG   r   _dataslice_indiceswrite_tablenbytesadd_key_value_metadatajsondumpsclose)
r(   rN   rB   rI   rU   _rX   writeroffsetbatchs
             r+   rS   zParquetDatasetWriter._writes   s-   
 !%%mT:&&33!!
)-)J)J!22	

 $
 !S&
35
 	$F
  ll((&&:"56--E
 u%u||#G	$ ,,E9))+EtzzRVRsRsGt*uvr,   )NNTT)r8   r9   r:   r   r   r   r   r   r=   r&   r<   r$   rV   rS   r"   r,   r+   r@   r@   J   s    
 %)*.:>!%11 8X-.1 SM	1
 "$1 ',D$J&71 10s  #x #S #VY #r,   r@   )"rl   rP   typingr   r   r   rQ   pyarrow.parquetr   rb    r   r   r	   r
   arrow_writerr   r   
formattingr   packaged_modulesr    packaged_modules.parquet.parquetr   utilsr   rd   utils.typingr   r   abcr   r   r@   r"   r,   r+   <module>r}      sL     	 , ,   4 4 d $ 9 6 # < &50 5pL Lr,   