
    biI                        d Z ddlZddlZddlmZ ddlmZmZmZ ddl	Z	ddl
ZddlZddlmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZ dd	lmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& dd
l'm(Z( ddl)m*Z* ddl+m,Z,m-Z- ddl.m/Z/m0Z0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7m8Z8  e4jr                  e:      Z;e<Z=dee   dee>   fdZ?dee   dee>   fdZ@de>de>de>fdZA G d deB      ZC G d d      ZD G d deD      ZE G d d      ZF G d d eF      ZGy)!z$To write records into Parquet files.    N)Iterable)AnyOptionalUnion)	url_to_fs   )config)AudioFeaturesImagePdfValueVideo)
FeatureTypeList_ArrayXDExtensionType_visitcast_to_python_objectsgenerate_from_arrow_typeget_nested_type%list_of_np_array_to_pyarrow_listarraynumpy_to_pyarrow_listarrayto_pyarrow_listarray)is_remote_filesystem)DatasetInfo)DuplicatedKeysError	KeyHasher)
array_castcast_array_to_featureembed_table_storage
table_cast)logging)asdictconvert_file_size_to_intfirst_non_null_non_empty_valuefeaturesreturnc                     | syt         j                  dt        ddffd}t        | |       t         j                  u rdS S )aN  
    Get the writer_batch_size that defines the maximum record batch size in the arrow files based on configuration values.
    The default value is 100 for image/audio datasets and 10 for videos.
    This allows to avoid overflows in arrow buffers.

    Args:
        features (`datasets.Features` or `None`):
            Dataset Features from `datasets`.
    Returns:
        writer_batch_size (`Optional[int]`):
            Writer batch size to pass to a dataset builder.
            If `None`, then it will use the `datasets` default, i.e. `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
    Nfeaturer'   c                     t        | t              r+t        j                  t	        t        j                        y t        | t
              r+t        j                  t	        t        j                        y t        | t              r+t        j                  t	        t        j                        y t        | t              r<| j                  dk(  r,t        j                  t	        t        j                        y y y y Nbinary)
isinstancer   r	   *ARROW_RECORD_BATCH_SIZE_FOR_IMAGE_DATASETSminr
   *ARROW_RECORD_BATCH_SIZE_FOR_AUDIO_DATASETSr   *ARROW_RECORD_BATCH_SIZE_FOR_VIDEO_DATASETSr   dtype+ARROW_RECORD_BATCH_SIZE_FOR_BINARY_DATASETSr)   
batch_sizes    P/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/datasets/arrow_writer.pyset_batch_sizezAget_arrow_writer_batch_size_from_features.<locals>.set_batch_sizeI   s    gu%&*[*[*gZ)Z)Z[J'F,],],iZ)Z)Z[J'F,],],iZ)Z)Z[Jw&)BBNZ)[)[\J O * '    npinfr   r   r&   r7   r5   s     @r6   )get_arrow_writer_batch_size_from_featuresr=   6   sL     J] ] ] 8^$'47Z7r8   c                     | syt         j                  dt        ddffd}t        | |       t         j                  u rdS S )a  
    Get the writer_batch_size that defines the maximum row group size in the parquet files based on configuration values.
    By default these are not set, but it can be helpful to hard set those values in some cases.
    This allows to optimize random access to parquet file, since accessing 1 row requires
    to read its entire row group.

    Args:
        features (`datasets.Features` or `None`):
            Dataset Features from `datasets`.
    Returns:
        writer_batch_size (`Optional[int]`):
            Writer batch size to pass to a parquet writer.
            If `None`, then it will use the `datasets` default, i.e. aiming for row groups of 100MB.
    Nr)   r'   c                     t        | t              r+t        j                  t	        t        j                        y t        | t
              r+t        j                  t	        t        j                        y t        | t              r+t        j                  t	        t        j                        y t        | t              r<| j                  dk(  r,t        j                  t	        t        j                        y y y y r+   )r-   r   r	   )PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETSr/   r
   )PARQUET_ROW_GROUP_SIZE_FOR_AUDIO_DATASETSr   )PARQUET_ROW_GROUP_SIZE_FOR_VIDEO_DATASETSr   r2   *PARQUET_ROW_GROUP_SIZE_FOR_BINARY_DATASETSr4   s    r6   r7   z;get_writer_batch_size_from_features.<locals>.set_batch_sizeq   s    gu%&*Z*Z*fZ)Y)YZJ'F,\,\,hZ)Y)YZJ'F,\,\,hZ)Y)YZJw&)AAMZ)Z)Z[J N * 'r8   r9   r<   s     @r6   #get_writer_batch_size_from_featuresrD   ]   sL     J\ \ \ 8^$'47Z7r8   num_rows	num_bytesc                 T    t        d| t        t        j                        z  |z        S )ab  
    Get the writer_batch_size that defines the maximum row group size in the parquet files.
    The default in `datasets` is aiming for row groups of maximum 100MB uncompressed.
    This allows to optimize random access to parquet file, since accessing 1 row requires
    to read its entire row group.

    This can be improved to get optimized size for querying/iterating
    but at least it matches the dataset viewer expectations on HF.

    Args:
        num_rows (`int`):
            Number of rows in the dataset.
        num_bytes (`int`):
            Number of bytes in the dataset.
            For dataset with external files to embed (image, audio, videos), this can also be an
            estimate from `dataset._estimate_nbytes()`.
    Returns:
        writer_batch_size (`Optional[int]`):
            Writer batch size to pass to a parquet writer.
    
   )maxr$   r	   MAX_ROW_GROUP_SIZE)rE   rF   s     r6   $get_writer_batch_size_from_data_sizerK      s(    * r86v7P7PQQU^^__r8   c                       e Zd Zy)SchemaInferenceErrorN)__name__
__module____qualname__ r8   r6   rM   rM      s    r8   rM   c            
           e Zd ZdZ	 	 	 ddedee   dee   dee   fdZdefd	Ze	dede
eee   f   fd
       Zddeej                     fdZy)TypedSequencea  
    This data container generalizes the typing when instantiating pyarrow arrays, tables or batches.

    More specifically it adds several features:
    - Support extension types like ``datasets.features.Array2DExtensionType``:
        By default pyarrow arrays don't return extension arrays. One has to call
        ``pa.ExtensionArray.from_storage(type, pa.array(data, type.storage_type))``
        in order to get an extension array.
    - Support for ``try_type`` parameter that can be used instead of ``type``:
        When an array is transformed, we like to keep the same type as before if possible.
        For example when calling :func:`datasets.Dataset.map`, we don't want to change the type
        of each column by default.
    - Better error message when a pyarrow array overflows.

    Example::

        from datasets.features import Array2D, Array2DExtensionType, Value
        from datasets.arrow_writer import TypedSequence
        import pyarrow as pa

        arr = pa.array(TypedSequence([1, 2, 3], type=Value("int32")))
        assert arr.type == pa.int32()

        arr = pa.array(TypedSequence([1, 2, 3], try_type=Value("int32")))
        assert arr.type == pa.int32()

        arr = pa.array(TypedSequence(["foo", "bar"], try_type=Value("int32")))
        assert arr.type == pa.string()

        arr = pa.array(TypedSequence([[[1, 2, 3]]], type=Array2D((1, 3), "int64")))
        assert arr.type == Array2DExtensionType((1, 3), "int64")

        table = pa.Table.from_pydict({
            "image": TypedSequence([[[1, 2, 3]]], type=Array2D((1, 3), "int64"))
        })
        assert table["image"].type == Array2DExtensionType((1, 3), "int64")

    Ndatatypetry_typeoptimized_int_typec                     ||t        d      || _        || _        || _        || _        | j                  d u| _        |d uxr
 |d u xr |d u | _        d | _        y )Nz)You cannot specify both type and try_type)
ValueErrorrT   rU   rV   rW   trying_typetrying_int_optimization_inferred_type)selfrT   rU   rV   rW   s        r6   __init__zTypedSequence.__init__   st      4HII		 "4==4'9'E'k$RV,'k[cgk[k$"r8   r'   c                     | j                   -t        t        j                  |       j                        | _         | j                   S )a  Return the inferred feature type.
        This is done by converting the sequence to an Arrow array, and getting the corresponding
        feature type.

        Since building the Arrow array can be expensive, the value of the inferred type is cached
        as soon as pa.array is called on the typed sequence.

        Returns:
            FeatureType: inferred feature type of the sequence.
        )r\   r   paarrayrU   r]   s    r6   get_inferred_typezTypedSequence.get_inferred_type   s7     &":288D>;N;N"OD"""r8   c           
      r   t         j                  rdt        j                  v rddl}t        |       \  }}t        ||j                  j                        r5| D cg c]  }|t               j                  |      nd! c}t               fS t        |t              rqt        |d   |j                  j                        rN| D cg c]-  }|'|D cg c]  }t               j                  |       c}nd/ c}}t        t                     fS t         j                  rdt        j                  v rddl}t        |       \  }}t        ||j                  j                        r5| D cg c]  }|t               j                  |      nd! c}t               fS t        |t              rqt        |d   |j                  j                        rN| D cg c]-  }|'|D cg c]  }t               j                  |       c}nd/ c}}t        t                     fS | dfS c c}w c c}w c c}}w c c}w c c}w c c}}w )a  Implement type inference for custom objects like PIL.Image.Image -> Image type.

        This function is only used for custom python objects that can't be directly passed to build
        an Arrow array. In such cases is infers the feature type to use, and it encodes the data so
        that they can be passed to an Arrow array.

        Args:
            data (Iterable): array of data to infer the type, e.g. a list of PIL images.

        Returns:
            Tuple[Iterable, Optional[FeatureType]]: a tuple with:
                - the (possibly encoded) array, if the inferred feature type requires encoding
                - the inferred feature type if the array is made of supported custom objects like
                    PIL images, else None.
        PILr   N
pdfplumber)r	   PIL_AVAILABLEsysmodules	PIL.Imager%   r-   r   encode_examplelistr   PDFPLUMBER_AVAILABLErf   pdfPDFr   )rT   re   non_null_idxnon_null_valuevaluexrf   s          r6   _infer_custom_type_and_encodez+TypedSequence._infer_custom_type_and_encode   s   " ES[[$8+I$+O(L..#))//:bfgY^9J..u5PTTginippp.$/J~a?PRUR[R[RaRa4bkobg%BS>1UW++A.>Y]]=! ! &&<3;;+F+I$+O(L..*..*<*<=`deW\u7H,,U3dRegjglll.$/J~a?PR\R`R`RdRd4eim`e@Qe<SU))!,<W[[;  Tz h ? f =s<   $HH# H-H##$H)H3 H.7H3H#.H3c           	         |t        d      ~| j                  }| j                  %| j                  | j	                  |      \  }| _        | j
                  %| j                  r| j                  n| j                  }n| j
                  }|t        |      nd}| j                  t        | j                        nd}d}	 t        |t              r,t        ||      }t        j                  j                  ||      S t        |t        j                         rt#        |      }nft        |t$              r4|r2t        t'        |      d   t        j                         rt)        |      }n"d}t        j*                  t-        |d            }| j.                  r]t        j0                  j3                  |j                        r|j5                  |      }|S t        j0                  j7                  |j                        rt        j0                  j3                  |j                  j8                        r!t;        |t        j<                  |            }|S t        j0                  j7                  |j                  j8                        rot        j0                  j3                  |j                  j8                  j8                        r2t;        |t        j<                  t        j<                  |                  }|S |%t?        ||| j                   | j                         }|S # t@        t        jB                  jD                  t        jB                  jF                  f$ r}| j                  s%t        |t        jB                  jF                        r | j                  r	 t        |t        j                         rt#        |      cY d}~S t        |t$              r$|r"tI        d |D              rt)        |      cY d}~S d}t        j*                  t-        |d            cY d}~S # t        jB                  jD                  $ r}d	tK        |      v rtM        d
tO        |       d| d      d| j.                  r_dtK        |      v rRt        jP                  |jS                               jT                  }	tV        jY                  d|	 d       cY d}~cY d}~S |rKdtK        |      v r>t        j*                  t-        |dd            }|t?        ||dd      }|cY d}~cY d}~S  d}~ww xY wd	tK        |      v rtM        d
tO        |       d| d      d| j.                  rZdtK        |      v rMt        jP                  |jS                               jT                  }	tV        jY                  d|	 d       cY d}~S |rFdtK        |      v r9t        j*                  t-        |dd            }|t?        ||dd      }|cY d}~S  d}~ww xY w)z=This function is called when calling pa.array(typed_sequence)NzMTypedSequence is supposed to be used with pa.array(typed_sequence, type=None)Fr   T)only_1d_for_numpy)allow_primitive_to_strallow_decimal_to_strc              3   P   K   | ]  }t        |t        j                           y wN)r-   r:   ndarray).0rr   s     r6   	<genexpr>z0TypedSequence.__arrow_array__.<locals>.<genexpr>T  s     @qchESUS]S]A^@qs   $&overflowz There was an overflow with type zE. Try to reduce writer_batch_size to have batches smaller than 2GB.
()znot in rangezFailed to cast a sequence to z. Falling back to int64.zCould not convert)rv   optimize_list_casting)-rY   rT   rU   rV   rt   r\   rZ   r   rW   r-   r   r   r`   ExtensionArrayfrom_storager:   r{   r   rl   r%   r   ra   r   r[   typesis_int64castis_list
value_typer   list_r   	TypeErrorlibArrowInvalidArrowNotImplementedErroranystrOverflowErrortype_r2   to_pandas_dtypenameloggerinfo)
r]   rU   rT   pa_typeoptimized_int_pa_typetrying_cast_to_python_objectsstorageouteoptimized_int_pa_type_strs
             r6   __arrow_array__zTypedSequence.__arrow_array__  s,    lmmyy99!6(,(J(J4(P%D$%&$($4$44==$))D&&D+/+;/$'8<8O8O8[OD334ae 	 ).%V	'#89.tW=((55gwGG $

+06D$'DZ@^_c@def@gikisis5t;DA04-hh5ddST++88$$SXX.((#89C J XX%%chh/xx(()<)<=(bhh7L.MN J ))#((*=*=>288CTCTUXU]U]UhUhUsUsCt(bhhrxx@U7V.WX J ! ,$:J:J6Jeieueuau JFFFF++
 6	 ##
1bff6U6U(V!$

39$??#D$/DS@qlp@q=qDTJJ8<5!xx(>tW[(\]]vv** !SV++>uT{m  LR  ST  RU  UV  W#$ 55.CPQF:R46HH=R=b=b=d4e4j4j1;<U;VVno  #

6;NRUVWRX;X hh244glm  +"7 #T$]a#C  #

+, s1v%#6uT{m  DJ  KL  JM  MN  O --.CF2J,.HH5J5Z5Z5\,],b,b);<U;VVnop
.3F#a&3Phh5ddjopq#/TRVmqrC
m6	s   >;L :CL A<L B#L 0(L =W>>W9$P9W>?.P-W>3!PW>T7B	T TW9W>ATTW9W>TTBW9*W>0AW92W>8W99W>)NNNrz   )rN   rO   rP   __doc__r   r   r   r^   rc   staticmethodtuplert   r`   DataTyper   rQ   r8   r6   rS   rS      s    %T '+*.48## {## ;'	#
 %[1#*#; # $H $xR]I^?^9_ $ $LiHR[[$9 ir8   rS   c            
       R     e Zd Z	 	 	 	 ddee   dee   dee   dee   f fdZ xZS )OptimizedTypedSequencerU   rV   colrW   c                     t        d      t        d      t        d      t        d      d}|||j                  |d       }t        |   ||||       y )Nint8int32)attention_maskspecial_tokens_mask	input_idstoken_type_ids)rU   rV   rW   )r   getsuperr^   )r]   rT   rU   rV   r   rW   optimized_int_type_by_col	__class__s          r6   r^   zOptimizedTypedSequence.__init__  sc     $Fm#(=w#	%
! <H,!:!>!>sD!ID8Pbcr8   )NNNN)rN   rO   rP   r   r   r   r^   __classcell__r   s   @r6   r   r     s\     '+*.!48d {#d ;'	d
 c]d %[1d dr8   r   c                   B   e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 d+deej                     dee   dee   deej                     dee   dee
   d	ee   d
ee   dedededededee   fdZd Zd Zd Zd Zdej                  fdZdej                  fdZed        Zed,dedee   deeef   fd       Zd Zd Z	 	 d-deeef   d eeee
ef      dee
   fd!Zd" Zd,d#ej@                  dee
   fd$Z!	 	 d.d%eee"f   dee
   d&ee   fd'Z#d,d(ej@                  dee
   fd)Z$d/d*Z%y)0ArrowWriterz,Shuffles and writes Examples to Arrow files.Nschemar&   pathstreamfingerprintwriter_batch_size	hash_saltcheck_duplicatesdisable_nullableupdate_featureswith_metadataunitembed_local_filesstorage_optionsc                 V   ||t        d      ||| _        d | _        n<|,|| _        t        j                  | j                        | _        nd | _        d | _        |t        |      | _        nt        d      | _        || _        |	| _        |zt        |fi |xs i \  }}|| _
        t        | j                        s|n| j                  j                  |      | _        | j                  j                  |d      | _        d| _        nd | _
        d | _        || _        d| _        || _        |	| _        |xs' t'        | j                        xs t(        j*                  | _        |
| _        || _        || _        || _        d| _        d| _        g | _        g | _        d | _        g | _         y )Nz1At least one of path and stream must be provided. wbTFr   )!rY   	_features_schemar   from_arrow_schemar   _hasher_check_duplicates_disable_nullabler   _fsr   unstrip_protocol_pathopenr   _closable_streamr   r   r=   r	   DEFAULT_MAX_BATCH_SIZEr   r   r   r   r   _num_examples
_num_bytescurrent_examplescurrent_rows	pa_writerhkey_record)r]   r   r&   r   r   r   r   r   r   r   r   r   r   r   r   fss                   r6   r^   zArrowWriter.__init__  s   " <FNPQQ%DNDL&,DL%77EDN!DNDL $Y/DL$R=DL!1!1> A/*?RAHB24DH%9$((%CIbIbcgIhDJ((--d3DK$(D!DHDJ DK$)D!& 0 -8H-,, 	
  /*	!2BD,.?Cr8   c                 r    | j                   t        | j                        z   t        | j                        z   S )z/Return the number of writed and staged examples)r   lenr   r   rb   s    r6   __len__zArrowWriter.__len__  s.    !!C(=(=$>>TEVEVAWWWr8   c                     | S rz   rQ   rb   s    r6   	__enter__zArrowWriter.__enter__  s    r8   c                 $    | j                          y rz   )close)r]   exc_typeexc_valexc_tbs       r6   __exit__zArrowWriter.__exit__  s    

r8   c                     | j                   r	 | j                   j                          | j                  r2| j                  j
                  s| j                  j                          y y y # t        $ r Y Jw xY wrz   )r   r   	Exceptionr   r   closedrb   s    r6   r   zArrowWriter.close  sa    >>$$&   ););KK *<   s   A' '	A32A3inferred_schemac                    | j                   }| j                  }t        j                  |      }| j                  || j                  r~| j                  j
                  D ci c]  }|j                  | }}|j
                  D ].  }|j                  }||v s|||   k(  s| j                  |   ||<   0 |}|}n|}|j                  }| j                  rt        j                   d |D              }| j                  r9|j                  | j                  t        |      | j                              }||fS |j                  i       }||fS c c}w )Nc              3   t   K   | ]0  }t        j                  |j                  |j                  d        2 ywF)nullableNr`   fieldr   rU   r|   r   s     r6   r}   z,ArrowWriter._build_schema.<locals>.<genexpr>  s)     dTYrxx

EJJOOd   68)r&   )r   r   r   r   r   rU   r   arrow_schemar   r`   r   _build_metadatar   r   )	r]   r   r   r&   inferred_featuresr   fieldsinferred_fieldr   s	            r6   _build_schemazArrowWriter._build_schema  sD   >>$66G>>%##9=9L9LM%**e+MM&7&<&< KN)..Dv~)VD\96:nnT6J-d3	K
 -$3(H 1 > >F  YYd]cddF))$*>*>{T\?]_c_o_o*pqF x ))"-Fx' Ns   Ec                     | j                  |      \  | _        | _        t        j                  | j
                  | j                        | _        y rz   )r   r   r   r`   RecordBatchStreamWriterr   r   r]   r   s     r6   _build_writerzArrowWriter._build_writer  s7    '+'9'9/'J$dn33DKKNr8   c                     | j                   | j                   n6| j                  )t        j                  | j                  j                        nd }| j
                  r|t        j                  d |D              }||S g S )Nc              3   t   K   | ]0  }t        j                  |j                  |j                  d        2 ywr   r   r   s     r6   r}   z%ArrowWriter.schema.<locals>.<genexpr>  s)     fUZUZZ% P Pfr   )r   r   r`   r   rU   r   )r]   r   s     r6   r   zArrowWriter.schema  ss     ||' LL48NN4N"))DNN//0TX 	
 !!g&9iif^effG!-w525r8   r   r'   c                     dg}t        |       }i }|D ci c]  }|||   
 c}|d<   |||d<   dt        j                  |      iS c c}w )Nr&   r   r   huggingface)r#   jsondumps)r   r   	info_keysinfo_as_dictmetadatakeys         r6   r   zArrowWriter._build_metadata  s`    L	d|>GHsCc!22H"&1H]#tzz(344 Is   Ac           	         | j                   sy| j                  rt        | j                  j                        }| j                   d   d   j	                         }| j                  j                  D cg c]	  }||v s| }}|D cg c]	  }||vs| }}||z   }nt        | j                   d   d         }i }|D ]  t        fd| j                   D              rw| j                   D cg c]
  }|d       }	}|	D 
cg c]2  }
t        |
t        j                        r|
j                  n|
gD ]  }| 4 }	}
}t        j                  |	      |<   | j                   D cg c]S  }t        |d      t        j                  t        j                  f      r|d      j                         d   n|d      U c}|<    | j                  |       g | _         yc c}w c c}w c c}w c c}}
w c c}w )ziWrite stored examples from the write-pool of examples. It makes a table out of the examples and write it.Nr   c              3   ~   K   | ]4  }t        |d       t        j                  t        j                  f       6 yw)r   N)r-   r`   ArrayChunkedArray)r|   rowr   s     r6   r}   z5ArrowWriter.write_examples_on_file.<locals>.<genexpr>8  s-     iC:c!fSkBHHboo+FGis   :=)batch_examples)r   r   setnameskeysrl   allr-   r`   r  chunksconcat_arraysr  	to_pylistwrite_batch)r]   schema_colsexamples_colsr   common_cols
extra_colscolsr  r  arraysra   chunks      `        r6   write_examples_on_filez"ArrowWriter.write_examples_on_file&  s   $$;;dkk//0K 11!4Q7<<>M*.++*;*;T3sm?S3TKT)6Q##[:P#QJQ+D--a034D 	C iSWShShii151F1FG##a&+GG "(2<UBOO2T%,,[`Za   
 ')&6&6v&>s#  $44' 3=SVC[288UWUdUdJe2fCF3K))+A.lopqlrsvlww's#	" 	7 "1 UQ H's+   2	G<G	G"G"+G'7G,"AG2c                     | j                   syt        j                  | j                         }| j                  |       g | _         y)zwWrite stored rows from the write-pool of rows. It concatenates the single-row tables and it writes the resulting table.N)r   r`   concat_tableswrite_table)r]   tables     r6   write_rows_on_filezArrowWriter.write_rows_on_fileH  s:        !2!23r8   exampler   c                    | j                   rV| j                  j                  |      }| j                  j	                  ||f       | j
                  j	                  ||f       n| j                  j	                  |df       || j                  }|Mt        | j                        |k\  r4| j                   r| j                          g | _        | j                          yyy)zAdd a given (Example,Key) pair to the write-pool of examples which is written to file.

        Args:
            example: the Example to add.
            key: Optional, a unique identifier(str, int or bytes) associated with each example
        r   N)
r   r   hashr   appendr   r   r   check_duplicate_keysr  )r]   r  r   r   r  s        r6   writezArrowWriter.writeP  s     !!<<$$S)D!!(('49##T3K0 !!(('27$ $ 6 6(S1F1F-GK\-\%%))+#% '') .](r8   c           
         t               }| j                  D ]m  \  }}||v rSt        | j                        D cg c]%  \  }\  }}||k(  rt        | j                  |z         ' }}}}t        ||      |j                  |       o yc c}}}w )z+Raises error if duplicates found in a batchN)r  r   	enumerater   r   r   add)r]   
tmp_recordr  r   indexduplicate_hash_duplicate_key_indicess           r6   r   z ArrowWriter.check_duplicate_keysq  s    U
)) 
	%ID#z! 7@@P@P6Q) )22%- **U23)% ) *#/DEEt$
	%)s   *B
r  c                     t        |      dk7  rt        dt        |       d      | j                  j                  |       || j                  }|*t        | j                        |k\  r| j                          yyy)zAdd a given single-row Table to the write-pool of rows which is written to file.

        Args:
            row: the row to add.
        r   z>Only single-row pyarrow tables are allowed but got table with z rows.N)r   rY   r   r  r   r  )r]   r  r   s      r6   	write_rowzArrowWriter.write_row  s     s8q=]^abe^f]ggmnoo  %$ $ 6 6(S1B1B-CGX-X##% .Y(r8   r  try_original_typec                    |r/t        t        t        |j                                           dk(  ry| j                  | j
                  rdn| j                  }| j                  | j
                  r| j                  nd}g }t               }| j                  rqt        | j                  j                        }|j                         }	| j                  j                  D 
cg c]	  }
|
|	v s|
 }}
|	D 
cg c]	  }
|
|vs|
 }}
||z   }nt        |      }|D ]  }
||
   }|r||
   nd}t        |t        j                  t        j                   f      r:|t#        ||      n|}|j%                  |       t'        |j(                        ||
<   u||
|v r|r||
   nd}t+        ||||
      }|j%                  t        j,                  |             |j/                         ||
<    | j                  |j0                  n| j                  }t        j2                  j5                  ||      }| j7                  ||       yc c}
w c c}
w )ag  Write a batch of Example to file.
        Ignores the batch if it appears to be empty,
        preventing a potential schema update of unknown types.

        Args:
            batch_examples: the batch of examples to add.
            try_original_type: use `try_type` when instantiating OptimizedTypedSequence if `True`, otherwise `try_type = None`.
        r   N)rU   rV   r   )r   )r   nextitervaluesr   r   r   r   r   r  r  r	  rl   r-   r`   r  r  r   r  r   rU   r   ra   rc   r   Tablefrom_arraysr  )r]   r  r   r,  r&   try_featuresr  r   r  
batch_colsr   r  r  r  
col_valuescol_typera   col_try_typetyped_sequencer   pa_tables                        r6   r  zArrowWriter.write_batch  s    c$tN,A,A,C'D"EF!K>>1d6J6J4PTP^P^)-)?DDXDXt~~^b$J;;dkk//0K',,.J*.++*;*;Q3sj?P3QKQ)3N#s+7M#NJN+D'D 	LC',J(0x}dH*rxx&ABGOG[-j(Cake$)A*//)R!#& $/C<4GL] !% 
 "8
\hnq!rbhh~67)7)I)I)K!#&	L  48>>3I"//t{{88''v'>#45/ RNs   	H8'H81	H=;H=r9  c                    || j                   }| j                  | j                  |j                         |j	                         }t        || j                        }| j                  rt        |      }| xj                  |j                  z  c_	        | xj                  |j                  z  c_        | j                  j                  ||       y)zUWrite a Table to file.

        Args:
            example: the Table to add.
        N)r   )r   r   r   r   combine_chunksr!   r   r   r    r   nbytesr   rE   r  )r]   r9  r   s      r6   r  zArrowWriter.write_table  s     $ $ 6 6>>!x?**,h5!!*84H8??*h///""8->?r8   c                    | j                          | j                  r| j                          g | _        | j	                          | j
                  '| j                  r| j                  | j                         | j
                  >| j
                  j                          d | _        |rB| j                  j                          n'|r| j                  j                          t        d      t        j                  d| j                   d| j                   d| j                   d| j                   r| j                   nd d	       | j                  | j                  fS )Nz@Please pass `features` or at least one example when writing datazDone writing  z in z bytes r   .)r  r   r   r   r  r   r   r   r   r   rM   r   debugr   r   r   r   )r]   close_streams     r6   finalizezArrowWriter.finalize  s   !!!%%'!D##%>>!dkkt{{+>>%NN  "!DN!!#!!#&'ijjD../q4?PPWfjfpfpX\XbXbvxWyyz{	
 !!4??22r8   )NNNNNNNFFFTexamplesFNrz   )NNNT)T)&rN   rO   rP   r   r   r`   Schemar   r   
NativeFileintbooldictr^   r   r   r   r   r   r   propertyr   r   r   r   r  r  r   r   bytesr!  r   r1  r+  rl   r  r  rB  rQ   r8   r6   r   r     sR   6 '+'+"*.%)+/#'+0!& %""'*.C#C 8$C sm	C
 'C c]C $C=C C=C #4.C C C C C  C "$CJX  RYY  6ORYY O 6 6 5k 5 5QUVY[^V^Q_ 5 5 #D 15+/	*c3h* eCeO,-* $C=	*B%&RXX &(3- &" ,0,0	/6S$Y/6 $C=/6 $D>	/6b@BHH @# @$3r8   r   c                   F     e Zd Zddd fd
Zdej
                  fdZ xZS )ParquetWriterTuse_content_defined_chunkingwrite_page_indexc                h    t        |   |i | |du rt        j                  }|| _        || _        y rD  )r   r^   r	   DEFAULT_CDC_OPTIONSrO  rP  )r]   rO  rP  argskwargsr   s        r6   r^   zParquetWriter.__init__  s9    $)&)'4/+1+E+E(,H) 0r8   r   c                 ^   | j                  |      \  | _        | _        t        j                  | j
                  | j                  | j                  | j                        | _        | j                  dur;| j                  j                  dt        j                  | j                        i       y y )NrN  Fcontent_defined_chunking)r   r   r   pqrM  r   rO  rP  r   add_key_value_metadatar   r   r   s     r6   r   zParquetWriter._build_writer  s    '+'9'9/'J$dn))KKLL)-)J)J!22	
 ,,E9NN11+TZZ8Y8Y-Z[ :r8   )rN   rO   rP   r^   r`   rE  r   r   r   s   @r6   rM  rM    s    ;?RV 1RYY r8   rM  )Hr   r   rh   collections.abcr   typingr   r   r   fsspecnumpyr:   pyarrowr`   pyarrow.parquetparquetrW  fsspec.corer   r   r	   r&   r
   r   r   r   r   r   features.featuresr   r   r   r   r   r   r   r   r   r   filesystemsr   r   r   keyhashr   r   r  r   r   r    r!   utilsr"   utils.py_utilsr#   r$   r%   
get_loggerrN   r   rU   r   rG  r=   rD   rK   rY   rM   rS   r   r   rM  rQ   r8   r6   <module>rg     s   +  
 $ ' '     !  ? ?   .  3 U U  \ \ 
		H	%$88J $8xX[} $8N%8(82D %8RU %8P`3 `3 `3 `0	: 	\ \~d] d,R3 R3j
K r8   