
    bih              
          d dl Z d dlZd dlmZmZmZ d dlmZ d dlm	Z	m
Z
mZmZmZmZ d dlZd dlZd dlZddlmZ ddlmZmZmZmZ ddlmZ dd	lmZ  ed
      Z  ed      Z! ed      Z" ed      Z#de$de%fdZ&de	fdZ'dedee(e)e$e*ef   dedej:                  fdZ+dedee(e)e$e*ef   dej:                  fdZ,dejZ                  de%fdZ. G d dee!e"e#f         Z/de0e*e1e    f   de0e*e f   fdZ2 G d de/ej:                  ejZ                  ej:                  f         Z3 G d de/e0e1e0f         Z4 G d  d!e/e0ejj                  e0f         Z6 G d" d#e/ejn                  ejp                  ejn                  f         Z9 G d$ d%      Z: G d& d'      Z; G d( d)e      Z< G d* d+e<      Z= G d, d-e<      Z> G d. d/ee!e"e#f         Z? G d0 d1e?e!e"e#f         Z@ G d2 d3e?e!e"e#f         ZA G d4 d5eAej:                  ejZ                  ej:                  f         ZB G d6 d7e?ee1ef         ZC G d8 d9eAejn                  ejp                  ejn                  f         ZD G d: d;e?e0e"e0f         ZEde*d<e1e*   ddfd=ZFdee(e)e$ef   d>e(ddfd?ZGdee(e)e$e*ef   de*fd@ZH	 dEdedee(e)e$e*ef   dee   dej:                  fdAZI	 	 dFdedee(e)e$e*ef   dBe?dCee1   fdDZJy)G    N)IterableMappingMutableMapping)partial)AnyCallableGenericOptionalTypeVarUnion   )Features)_ArrayXDExtensionType_is_zero_copy_onlydecode_nested_examplepandas_types_mapper)Table)no_op_if_value_is_nullT	RowFormatColumnFormatBatchFormatkeyreturnc                 V    | j                   dk(  xr | j                  | j                  k\  S )N   )stepstopstartr   s    Y/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/datasets/formatting/formatting.py_is_range_contiguousr"   (   s"    88q=2SXX22    c                 8    t        d|  dt        |        d      )NzWrong key type: 'z' of type 'z6'. Expected one of int, slice, range, str or Iterable.)	TypeErrortyper    s    r!   _raise_bad_key_typer'   ,   s&    

C5DI;6lm r#   tableindicesc           
         t        |t              rK|j                  ||j                  z  d      j	                  d      d   j                         }t        | |      S t        |t              r!t        |j                  |j                         }t        |t              rt        |      rw|j                  dk\  rht        | |j                  |j                  |j                  |j                  z
        j	                  d      D cg c]  }|j                          c}      S 	 t        |t              r;| j                  |g      } t        | |j	                  d      j                               S t        |t               rJt        | |D cg c]4  }|j                  |d      j	                  d      d   j                         6 c}      S t#        |       yc c}w c c}w )aE  
    Query a pyarrow Table to extract the subtable that correspond to the given key.
    The :obj:`indices` parameter corresponds to the indices mapping in case we cant to take into
    account a shuffling or an indices selection for example.
    The indices table must contain one column named "indices" of type uint64.
    r   r   N)
isinstanceint
fast_slicenum_rowscolumnas_py_query_tablesliceranger)   r"   r   r   strselect	to_pylistr   r'   )r(   r   r)   is       r!   !_query_table_with_indices_mappingr8   2   s|    #s  w'7'7!7;BB1EaHNNPE3''#uS[[!1!123#u$a7+=+=ciiTWT]T]I]+^+e+efg+hia	i  #scU#E7>>!#4#>#>#@AA#x E[^#_VWG$6$6q!$<$C$CA$Fq$I$O$O$Q#_`` j $`s   G
9G
c                 *   t        |t              r| j                  || j                  z  d      S t        |t              r!t        |j                  | j                         }t        |t
              rNt        |      rB|j                  dk\  r3| j                  |j                  |j                  |j                  z
        S 	 t        |t              r9| j                  j                  | j                  D cg c]
  }||k7  s	| c}      S t        |t              rlt        j                   |t        j"                        }t%        |      dk(  r| j                  j	                  dd      S | j'                  || j                  z        S t)        |       yc c}w )zY
    Query a pyarrow Table to extract the subtable that correspond to the given key.
    r   r   N)r+   r,   r-   r.   r2   r3   r)   r"   r   r   r4   r(   dropcolumn_namesr   npfromiterint64lenfast_gatherr'   )r(   r   r/   s      r!   r1   r1   P   s-    #senn 4a88#uS[[01#u$a##CIIsxx#))/CDD#s{{e6H6H ZFFVYM Z[[#x kk#rxx(s8q=;;$$Q**  u~~!566 ![s   2
F=Fpa_arrayc                      | j                   dkD  S Nr   )
null_count)rA   s    r!   _is_array_with_nullsrE   i   s    ""r#   c                   |    e Zd ZdZdej
                  defdZdej
                  defdZ	dej
                  de
fdZy)BaseArrowExtractorz
    Arrow extractor are used to extract data from pyarrow tables.
    It makes it possible to extract rows, columns and batches.
    These three extractions types have to be implemented.
    pa_tabler   c                     t         NNotImplementedErrorselfrH   s     r!   extract_rowzBaseArrowExtractor.extract_rowt       !!r#   c                     t         rJ   rK   rM   s     r!   extract_columnz!BaseArrowExtractor.extract_columnw   rP   r#   c                     t         rJ   rK   rM   s     r!   extract_batchz BaseArrowExtractor.extract_batchz   rP   r#   N)__name__
__module____qualname____doc__par   r   rO   r   rR   r   rT    r#   r!   rG   rG   m   sL    "BHH " ""rxx "L ""bhh "; "r#   rG   py_dictc                 \    | j                         D ci c]  \  }}||d    c}}S c c}}w )z:Return the first element of a batch (dict) as a row (dict)r   )items)r[   r   arrays      r!   _unnestr_   ~   s(    ,3MMO<jc5CqM<<<s   (c                       e Zd Zdej                  dej                  fdZdej                  dej                  fdZdej                  dej                  fdZy)SimpleArrowExtractorrH   r   c                     |S rJ   rZ   rM   s     r!   rO   z SimpleArrowExtractor.extract_row       r#   c                 $    |j                  d      S rC   )r/   rM   s     r!   rR   z#SimpleArrowExtractor.extract_column   s    q!!r#   c                     |S rJ   rZ   rM   s     r!   rT   z"SimpleArrowExtractor.extract_batch   rc   r#   N)	rU   rV   rW   rY   r   rO   ArrayrR   rT   rZ   r#   r!   ra   ra      sS    BHH  "rxx "BHH "bhh 288 r#   ra   c                   x    e Zd Zdej                  defdZdej                  defdZdej                  defdZ	y)PythonArrowExtractorrH   r   c                 4    t        |j                               S rJ   )r_   	to_pydictrM   s     r!   rO   z PythonArrowExtractor.extract_row   s    x))+,,r#   c                 @    |j                  d      j                         S rC   )r/   r6   rM   s     r!   rR   z#PythonArrowExtractor.extract_column   s    q!++--r#   c                 "    |j                         S rJ   )rj   rM   s     r!   rT   z"PythonArrowExtractor.extract_batch   s    !!##r#   N)
rU   rV   rW   rY   r   dictrO   listrR   rT   rZ   r#   r!   rh   rh      sG    -BHH - -.rxx .D .$bhh $4 $r#   rh   c                       e Zd Zd Zdej
                  defdZdej
                  dej                  fdZ
dej
                  defdZdej                  dej                  fdZy	)
NumpyArrowExtractorc                     || _         y rJ   )np_array_kwargs)rN   rr   s     r!   __init__zNumpyArrowExtractor.__init__   s
    .r#   rH   r   c                 6    t        | j                  |            S rJ   )r_   rT   rM   s     r!   rO   zNumpyArrowExtractor.extract_row   s    t))(344r#   c                 D    | j                  ||j                  d            S rC   )_arrow_array_to_numpyr;   rM   s     r!   rR   z"NumpyArrowExtractor.extract_column   s#    ))(83H3H3K*LMMr#   c                 f    |j                   D ci c]  }|| j                  ||          c}S c c}w rJ   )r;   rv   )rN   rH   cols      r!   rT   z!NumpyArrowExtractor.extract_batch   s0    JRJ_J_`3T//>>```s   .rA   c                    t        |t        j                        rt        |j                  t              rUt        |j                  j                  d      }|j                  D cg c]  }|j                  |      D ]  }|  c}}nt        |j                        xr t        d |j                  D              }|j                  D cg c]  }|j                  |      D ]  }|  c}}nt        |j                  t              r4t        |j                  j                  d      }|j                  |      nCt        |j                        xr t        |       }|j                  |      j                         t              dkD  r{t        fdD              rgt        j                  j!                  t        j"                        dk\  rt        j$                  t&              S t        j(                  d	t&        
      S t        j                  j!                  t        j"                        dk\  rt        j$                        S t        j(                  d	      S c c}}w c c}}w )NT)unnest)zero_copy_onlyc              3   4   K   | ]  }t        |         y wrJ   )rE   ).0chunks     r!   	<genexpr>z<NumpyArrowExtractor._arrow_array_to_numpy.<locals>.<genexpr>   s       K8=,U33Ks   r   c              3   
  K   | ]z  }t        |t        j                        xr1 |j                  t        k(  xs |j
                  d    j
                  k7  xs' t        |t              xr t        j                  |       | yw)r   N)r+   r<   ndarraydtypeobjectshapefloatisnan)r}   xr^   s     r!   r   z<NumpyArrowExtractor._arrow_array_to_numpy.<locals>.<genexpr>   sp        Arzz*_60A0^QWWPUVWPXP^P^E^ :q%(8RXXa[:s   B Bz2.0.0b1)r   F)copyr   r   )r+   rY   ChunkedArrayr&   r   r   storage_dtypechunksto_numpyallrE   tolistr?   anyr<   libNumpyVersion__version__asarrayr   r^   )rN   rA   r{   r~   rowr^   s        @r!   rv   z)NumpyArrowExtractor._arrow_array_to_numpy   s   h0(--)>?!3HMM4O4OX\!]%-__!^l@m9<C "4HMM!B "s KAIK H &.__!^l@m9<C (--)>?!3HMM4O4OX\!]&//~/N!3HMM!B!iK_`hKiGi&//~/NUUWu:>   
 66&&r~~6)C::e6::xxE@@66r~~.);::e$$88E..?s   & I( I.N)rU   rV   rW   rs   rY   r   rm   rO   r<   r   rR   rT   rf   rv   rZ   r#   r!   rp   rp      sq    /5BHH 5 5Nrxx NBJJ Nabhh a4 a$/bhh $/2:: $/r#   rp   c                       e Zd Zdej                  dej                  fdZdej                  dej                  fdZ	dej                  dej                  fdZ
y)PandasArrowExtractorrH   r   c                 N    |j                  d      j                  t              S )Nr   )lengthtypes_mapper)r2   	to_pandasr   rM   s     r!   rO   z PandasArrowExtractor.extract_row   s"    ~~Q~'11?R1SSr#   c                 n    |j                  dg      j                  t              |j                  d      S )Nr   r   )r5   r   r   r;   rM   s     r!   rR   z#PandasArrowExtractor.extract_column   s4    s#--;N-OPXPePefgPhiir#   c                 .    |j                  t              S )Nr   )r   r   rM   s     r!   rT   z"PandasArrowExtractor.extract_batch   s    !!/B!CCr#   N)rU   rV   rW   rY   r   pd	DataFramerO   SeriesrR   rT   rZ   r#   r!   r   r      s\    TBHH T Tjrxx jBII jDbhh D2<< Dr#   r   c                   v    e Zd Z	 ddee   deeeeeedf   f      fdZ	dedefdZ
ded	edefd
ZdedefdZy)PythonFeaturesDecoderNfeaturestoken_per_repo_idc                      || _         || _        y rJ   r   r   rN   r   r   s      r!   rs   zPythonFeaturesDecoder.__init__   s     !!2r#   r   r   c                 l    | j                   r'| j                   j                  || j                        S |S N)r   )r   decode_exampler   )rN   r   s     r!   
decode_rowz PythonFeaturesDecoder.decode_row   s0    ^b^k^kt}}++C4CYCY+Ztqttr#   r/   column_namec                 n    | j                   r(| j                   j                  ||| j                        S |S r   )r   decode_columnr   )rN   r/   r   s      r!   r   z#PythonFeaturesDecoder.decode_column   s;     }} MM''tOeOe'f	
 	
r#   batchc                 l    | j                   r'| j                   j                  || j                        S |S r   )r   decode_batchr   rN   r   s     r!   r   z"PythonFeaturesDecoder.decode_batch   s0    ^b^k^kt}}))%4CYCY)Zvqvvr#   rJ   )rU   rV   rW   r
   r   rm   r4   r   boolrs   r   rn   r   r   rZ   r#   r!   r   r      s    mq3 *3?GSRWX[]acgXgRhMhHi?j3ud ut u
D 
s 
t 
w$ w4 wr#   r   c                       e Zd Zdee   fdZdej                  dej                  fdZdej                  de
dej                  fdZd	ej                  dej                  fd
Zy)PandasFeaturesDecoderr   c                     || _         y rJ   r   )rN   r   s     r!   rs   zPandasFeaturesDecoder.__init__   s	     r#   r   r   c                 D   | j                   r^| j                   j                         D ci c]8  \  }}| j                   j                  |   r|t        t	        t
        |            : c}}ni }|r+|j                  |      |t        |j                               <   |S c c}}w rJ   )	r   r]   _column_requires_decodingr   r   r   	transformrn   keys)rN   r   r   featuredecodes        r!   r   z PandasFeaturesDecoder.decode_row   s     }} -1MM,?,?,A(K==::;G 3G<QSZ4[\\  	 '*}}V'<CV[[]#$
s   =Br/   r   c                     | j                   rM|| j                   v r?| j                   j                  |   r&t        t        t        | j                   |               nd }|r|j                  |      }|S rJ   )r   r   r   r   r   r   )rN   r/   r   r   s       r!   r   z#PandasFeaturesDecoder.decode_column   sf     }}!=$--BiBijuBv #7+@$--P[B\#]^ 	
 %%f-Fr#   r   c                 $    | j                  |      S rJ   )r   r   s     r!   r   z"PandasFeaturesDecoder.decode_batch	  s    u%%r#   N)rU   rV   rW   r
   r   rs   r   r   r   r   r4   r   r   rZ   r#   r!   r   r      sk    !(!3 !bll r|| BII C BII &",, &2<< &r#   r   c                       e Zd ZdZdej
                  ddfdZd Zd Zd Z	dd
Z
d Zd Zd Zd Zd Zd Zd Zd Zedd       Zd Zd Zy	)LazyDictzeA dictionary backed by Arrow data. The values are formatted on-the-fly when accessing the dictionary.rH   	formatter	Formatterc                     || _         || _        t        j                  |j                        | _        t        | j
                  j                               | _        y rJ   )	rH   r   rm   fromkeysr;   datasetr   keys_to_format)rN   rH   r   s      r!   rs   zLazyDict.__init__  s>     "MM("7"78	!$)).."23r#   c                 ,    t        | j                        S rJ   )r?   r   rN   s    r!   __len__zLazyDict.__len__  s    499~r#   c                     | j                   |   }|| j                  v r;| j                  |      }|| j                   |<   | j                  j                  |       |S rJ   )r   r   formatremoverN   r   values      r!   __getitem__zLazyDict.__getitem__  sQ    		#$%%%KK$E"DIIcN&&s+r#   c                 t    || j                   v r| j                   j                  |       || j                  |<   y rJ   r   r   r   r   s      r!   __setitem__zLazyDict.__setitem__"  s1    $%%%&&s+		#r#   Nc                 p    || j                   v r| j                   j                  |       | j                  |= y rJ   r   rN   r   s     r!   __delitem__zLazyDict.__delitem__'  s/    $%%%&&s+IIcNr#   c                 ,    t        | j                        S rJ   )iterr   r   s    r!   __iter__zLazyDict.__iter__,  s    DIIr#   c                     || j                   v S rJ   )r   r   s     r!   __contains__zLazyDict.__contains__/  s    diir#   c                 L    | j                          t        | j                        S rJ   )_format_allreprr   r   s    r!   __repr__zLazyDict.__repr__2  s    DIIr#   c                    t        |t              r}| j                         }|j                         }|j                          |xj                  |j
                  j                         z  c_        |j
                  |j
                  z  |_        |S t        |t              rI| j                         }|xj                  |j                         z  c_        |j
                  |z  |_        |S t        S rJ   	r+   r   r   r   r   r   r   rm   NotImplementedrN   otherinsts      r!   __or__zLazyDict.__or__6  s    eX&99;DJJLE5::??#44		EJJ.DIKeT"99;D5::</		E)DIKr#   c                    t        |t              r}| j                         }|j                         }|j                          |xj                  |j
                  j                         z  c_        |j
                  |j
                  z  |_        |S t        |t              rI| j                         }|xj                  |j                         z  c_        ||j
                  z  |_        |S t        S rJ   r   r   s      r!   __ror__zLazyDict.__ror__E  s    eX&99;DJJLE5::??#44

TYY.DIKeT"99;D5::</		)DIKr#   c                 r   t        |t              rn|j                         }|j                          | xj                  |j
                  j                         z  c_        | xj
                  |j
                  z  c_        | S | xj                  |j                         z  c_        | xj
                  |z  c_        | S rJ   )r+   r   r   r   r   r   r   )rN   r   s     r!   __ior__zLazyDict.__ior__T  s    eX&JJLE5::??#44II#I  5::</IIIr#   c                 B   | j                   j                  | j                         }|j                  j                  | j                         | j                  d   j	                         |j                  d<   | j                  d   j	                         |j                  d<   |S )Nr   r   )	__class____new____dict__updater   )rN   r   s     r!   __copy__zLazyDict.__copy___  sw    ~~%%dnn5T]]+ $f 5 : : <f*.--8H*I*N*N*P&'r#   c                 ,    dd l }|j                  |       S rC   r   )rN   r   s     r!   r   zLazyDict.copyh  s    yyr#   c                     t         rJ   rK   )clsiterabler   s      r!   r   zLazyDict.fromkeysm  s    !!r#   c                     t         rJ   rK   r   s     r!   r   zLazyDict.formatq  rP   r#   c                     | j                   D ]   }| j                  |      | j                  |<   " | j                   j                          y rJ   )r   r   r   clearr   s     r!   r   zLazyDict._format_allt  s?    && 	.C![[-DIIcN	.!!#r#   )r   NrJ   )rU   rV   rW   rX   rY   r   rs   r   r   r   r   r   r   r   r   r   r   r   r   classmethodr   r   r   rZ   r#   r!   r   r     sv    o4 4k 4

 	
 " ""$r#   r   c                       e Zd Zd Zy)LazyRowc                 r    | j                   j                  | j                  j                  |g            d   S rC   r   format_columnrH   r5   r   s     r!   r   zLazyRow.format{  s-    ~~++DMM,@,@#,GHKKr#   NrU   rV   rW   r   rZ   r#   r!   r   r   z  s    Lr#   r   c                       e Zd Zd Zy)	LazyBatchc                 l    | j                   j                  | j                  j                  |g            S rJ   r   r   s     r!   r   zLazyBatch.format  s(    ~~++DMM,@,@#,GHHr#   Nr   rZ   r#   r!   r  r    s    Ir#   r  c                       e Zd ZdZeZeZeZ	e
Z	 	 ddee   deeeeeedf   f      fdZdej(                  dedeeeef   fd	Zdej(                  defd
Zdej(                  defdZdej(                  defdZy)r   z
    A formatter is an object that extracts and formats data from pyarrow tables.
    It defines the formatting for rows, columns and batches.
    Nr   r   c                     || _         || _        t        | j                   | j                        | _        t	        | j                         | _        y rJ   )r   r   r   python_features_decoderr   pandas_features_decoderr   s      r!   rs   zFormatter.__init__  s=    
 !!2'<T]]DLbLb'c$'<T]]'K$r#   rH   
query_typer   c                     |dk(  r| j                  |      S |dk(  r| j                  |      S |dk(  r| j                  |      S y Nr   r/   r   )
format_rowr   format_batch)rN   rH   r  s      r!   __call__zFormatter.__call__  sP    ??8,,8#%%h//7"$$X.. #r#   c                     t         rJ   rK   rM   s     r!   r  zFormatter.format_row  rP   r#   c                     t         rJ   rK   rM   s     r!   r   zFormatter.format_column  rP   r#   c                     t         rJ   rK   rM   s     r!   r  zFormatter.format_batch  rP   r#   NN)rU   rV   rW   rX   ra   simple_arrow_extractorrh   python_arrow_extractorrp   numpy_arrow_extractorr   pandas_arrow_extractorr
   r   rm   r4   r   r   rs   rY   r   r   r   r   r  r  r   r  rZ   r#   r!   r   r     s    
 21/1 (,IML8$L $DeCtO.D)D$EFL/ /s /uYP\^iEi?j /"288 "	 ""bhh "< ""RXX "+ "r#   r   c                       e Zd ZdefdZy)TensorFormatterdata_structc                     t         rJ   rK   )rN   r  s     r!   recursive_tensorizez#TensorFormatter.recursive_tensorize  rP   r#   N)rU   rV   rW   rm   r  rZ   r#   r!   r  r    s    "t "r#   r  c                   "    e Zd ZU eed<   eed<   y)TableFormatter
table_typecolumn_typeN)rU   rV   rW   r4   __annotations__rZ   r#   r!   r  r    s    Or#   r  c                       e Zd ZdZdZdej                  dej                  fdZdej                  dej                  fdZ	dej                  dej                  fdZ
y)	ArrowFormatterzarrow tablezarrow arrayrH   r   c                 @    | j                         j                  |      S rJ   )r  rO   rM   s     r!   r  zArrowFormatter.format_row  s    **,88BBr#   c                 @    | j                         j                  |      S rJ   )r  rR   rM   s     r!   r   zArrowFormatter.format_column  s    **,;;HEEr#   c                 @    | j                         j                  |      S rJ   )r  rT   rM   s     r!   r  zArrowFormatter.format_batch  s    **,::8DDr#   N)rU   rV   rW   r  r  rY   r   r  rf   r   r  rZ   r#   r!   r!  r!    sf    JKC288 C CFbhh F288 FERXX E"(( Er#   r!  c                        e Zd Zd fd	Zdej
                  defdZdej
                  defdZ	dej
                  defdZ
 xZS )PythonFormatterc                 4    t         |   ||       || _        y rJ   )superrs   lazy)rN   r   r)  r   r   s       r!   rs   zPythonFormatter.__init__  s    #45	r#   rH   r   c                     | j                   rt        ||       S | j                         j                  |      }| j                  j                  |      }|S rJ   )r)  r   r  rO   r  r   rN   rH   r   s      r!   r  zPythonFormatter.format_row  sJ    998T**))+77A**55c:
r#   c                     | j                         j                  |      }| j                  j                  ||j                  d         }|S rC   )r  rR   r  r   r;   rN   rH   r/   s      r!   r   zPythonFormatter.format_column  D    ,,.==hG--;;FHDYDYZ[D\]r#   c                     | j                   rt        ||       S | j                         j                  |      }| j                  j                  |      }|S rJ   )r)  r  r  rT   r  r   rN   rH   r   s      r!   r  zPythonFormatter.format_batch  sJ    99Xt,,++-;;HE,,99%@r#   )NFN)rU   rV   rW   rs   rY   r   r   r  rn   r   r  __classcell__r   s   @r!   r&  r&    sM    288  bhh 4 
RXX ' r#   r&  c                       e Zd ZdZdZdej                  dej                  fdZ	dej                  dej                  fdZdej                  dej                  fdZy)	PandasFormatterzpandas dataframezpandas seriesrH   r   c                 z    | j                         j                  |      }| j                  j                  |      }|S rJ   )r  rO   r  r   r+  s      r!   r  zPandasFormatter.format_row  s6    ))+77A**55c:
r#   c                     | j                         j                  |      }| j                  j                  ||j                  d         }|S rC   )r  rR   r  r   r;   r-  s      r!   r   zPandasFormatter.format_column  r.  r#   c                 z    | j                         j                  |      }| j                  j                  |      }|S rJ   )r  rT   r  r   r+  s      r!   r  zPandasFormatter.format_batch  s6    ))+99(C**77<
r#   N)rU   rV   rW   r  r  rY   r   r   r   r  r   r   r  rZ   r#   r!   r4  r4    s]    #J!K288  
bhh 299 
RXX ",, r#   r4  c                        e Zd ZdZd	deegef   f fdZdej                  defdZ	dej                  de
fdZdej                  defdZ xZS )
CustomFormattera  
    A user-defined custom formatter function defined by a ``transform``.
    The transform must take as input a batch of data extracted for an arrow table using the python extractor,
    and return a batch.
    If the output batch is not a dict, then output_all_columns won't work.
    If the output batch has several fields, then querying a single column won't work since we don't know which field
    to return.
    r   c                 6    t         |   ||       || _        y )Nr   )r(  rs   r   )rN   r   r   r   kwargsr   s        r!   rs   zCustomFormatter.__init__  s    (>OP"r#   rH   r   c                 |    | j                  |      }	 t        |      S # t        $ r}t        d|       |d }~ww xY w)Nz]Custom formatting function must return a dict of sequences to be able to pick a row, but got )r  r_   	Exceptionr%   rN   rH   formatted_batchexcs       r!   r  zCustomFormatter.format_row  sU    ++H5	?++ 	op  pA  B	s   
 	;6;c                 B   | j                  |      }t        |d      rBt        |j                               dkD  r4t	        dt        |j                                d      t	        d|       	 ||j                  d      S # t        $ r}t	        d|       |d }~ww xY w)Nr   r   zTried to query a column but the custom formatting function returns too many columns. Only one column was expected but got columns .zPCustom formatting function must return a dict to be able to pick a row, but got r   )r  hasattrr?   r   r%   rn   r;   r=  r>  s       r!   r   zCustomFormatter.format_column  s    ++H5?F+?'')*Q.DDHI]I]I_D`Caabd 
 bcrbst 	"8#8#8#;<< 	bcrbst	s   /B 	B
BBc                     | j                         j                  |      }| j                  j                  |      }| j	                  |      S rJ   )r  rT   r  r   r   r0  s      r!   r  zCustomFormatter.format_batch  s@    ++-;;HE,,99%@~~e$$r#   r  )rU   rV   rW   rX   r   rm   rs   rY   r   r  r   r   r  r1  r2  s   @r!   r9  r9    sd    #(D64<"8 #288  bhh < &%RXX %$ %r#   r9  columnsc                 .    | |vrt        d|  d|       y )NzColumn z5 not in the dataset. Current columns in the dataset: )KeyError)r   rE  s     r!   _check_valid_column_keyrH     s*    
'%Z[bZcdee r#   sizec                    t        | t              r$| dk  r| |z   dk  s| |k\  rt        d|  d|       y t        | t              ry t        | t              r<t        |       dkD  r-t        t        |       |       t        t        |       |       y y t        | t              rNt        |       dkD  r?t        t        t        |             |       t        t        t        |             |       y y t        |        y )Nr   zInvalid key: z is out of bounds for size )rI  )r+   r,   
IndexErrorr2   r3   r?   _check_valid_index_keymaxminr   r'   )r   rI  s     r!   rL  rL  %  s    #s!Gd
QC4K}SE1LTFSTT	C		C	s8a<"3s8$7"3s8$7  
C	"s8a<"3s3x=t<"3s3x=t<  	C r#   c                     t        | t        j                        ryt        | t              ryt        | t        t
        t        f      ryt        |        y r
  )r+   numbersIntegralr4   r2   r3   r   r'   r    s    r!   key_to_query_typerR  8  s>    #w''(	C		C%1	2r#   c                    t        |t        t        t        t        t
        f      s	 t        j                  |      }t        |t              rt        || j                         n&||j                  n| j                  }t        ||       |t        | |      }|S t        | ||      }|S # t        $ r t        |       Y w xY w)a1  
    Query a Table to extract the subtable that correspond to the given key.

    Args:
        table (``datasets.table.Table``): The input Table to query from
        key (``Union[int, slice, range, str, Iterable]``): The key can be of different types:
            - an integer i: the subtable containing only the i-th row
            - a slice [i:j:k]: the subtable containing the rows that correspond to this slice
            - a range(i, j, k): the subtable containing the rows that correspond to this range
            - a string c: the subtable containing all the rows but only the column c
            - an iterable l: the subtable that is the concatenation of all the i-th rows for all i in the iterable
        indices (Optional ``datasets.table.Table``): If not None, it is used to re-map the given key to the table rows.
            The indices table must contain one column named "indices" of type uint64.
            This is used in case of shuffling or rows selection.


    Returns:
        ``pyarrow.Table``: the result of the query on the input table
    )r)   )r+   r,   r2   r3   r4   r   operatorindexr%   r'   rH  r;   r.   rL  r1   r8   )r(   r   r)   rI  pa_subtables        r!   query_tablerW  B  s    2 cCsH=>	%..%C #sU%7%78#*#6wENNsD)"5#.  8sGT  	%$	%s   B) )C ?C r   format_columnsc                    t        | t              r| j                  }n| }t        |      }t	        |j
                        }
 |||      S |dk(  r|v r	 |||      S  |||      S |j                  fd|j                  D              } |||      }	|r_t        |	t              rA|j                  fd|j                  D              }
 ||
|      }|	j                  |       |	S t        d|	       |	S )a  
    Format a Table depending on the key that was used and a Formatter object.

    Args:
        table (``datasets.table.Table``): The input Table to format
        key (``Union[int, slice, range, str, Iterable]``): Depending on the key that was used, the formatter formats
            the table as either a row, a column or a batch.
        formatter (``datasets.formatting.formatting.Formatter``): Any subclass of a Formatter such as
            PythonFormatter, NumpyFormatter, etc.
        format_columns (:obj:`List[str]`, optional): if not None, it defines the columns that will be formatted using the
            given formatter. Other columns are discarded (unless ``output_all_columns`` is True)
        output_all_columns (:obj:`bool`, defaults to False). If True, the formatted output is completed using the columns
            that are not in the ``format_columns`` list. For these columns, the PythonFormatter is used.


    Returns:
        A row, column or batch formatted object defined by the Formatter:
        - the PythonFormatter returns a dictionary for a row or a batch, and a list for a column.
        - the NumpyFormatter returns a dictionary for a row or a batch, and a np.array for a column.
        - the PandasFormatter returns a pd.DataFrame for a row or a batch, and a pd.Series for a column.
        - the TorchFormatter returns a dictionary for a row or a batch, and a torch.Tensor for a column.
        - the TFFormatter returns a dictionary for a row or a batch, and a tf.Tensor for a column.
    r   )r  r/   c              3   ,   K   | ]  }|vs|  y wrJ   rZ   r}   rx   rX  s     r!   r   zformat_table.<locals>.<genexpr>  s     *m3SV^lSl3*m   	c              3   ,   K   | ]  }|v s|  y wrJ   rZ   r[  s     r!   r   zformat_table.<locals>.<genexpr>  s      @C><QC@r\  z\Custom formatting function must return a dict to work with output_all_columns=True, but got )r+   r   r(   rR  r&  r   r:   r;   r   r   r%   )r(   r   r   rX  output_all_columnsrH   r  python_formatterpa_table_to_formatformatted_outputpa_table_with_remaining_columnsremaining_columns_dicts      `        r!   format_tablerd  m  s&   < %;;"3'J&	0B0BCj99	x	. Xz22#HDD%]]*m(:O:O*mm$%7JO*N;2:-- @#+#8#8@ 3/ *::Yfp)q& ''(>?
    r  tD  sE  F   r#   rJ   )NF)KrP  rT  collections.abcr   r   r   	functoolsr   typingr   r   r	   r
   r   r   numpyr<   pandasr   pyarrowrY   r   r   features.featuresr   r   r   r   r(   r   utils.py_utilsr   r   r   r   r   r3   r   r"   r'   r,   r2   r4   r8   r1   rf   rE   rG   rm   rn   r_   ra   rh   r   rp   r   r   r   r   r   r   r   r  r   r  r  r!  r&  r4  r9  rH  rL  rR  rW  rd  rZ   r#   r!   <module>rm     s     = =  D C     u u  3 CLK 	~&m$3e 3 3S S%X=>INXX< E#ueS(*J$K PRPXPX 2#288 # #"L+!EF ""=T#tAw,' =DaL =
-bhh"((.JK $-dD$.>? $1/,T2::t-CD 1/hD-bllBIIr||.ST Dw w*& &@j$~ j$ZLh L
I I
$"	<<= $"N"i	< DE "
Yy,CD 
E^BHHbhh$@A Eiw 67 2nR\\299bll%JK (-%ilD 89 -%`f ftCy fT f
!c5%&A B !# !RV !&5eUC!AB s   $((	sE5#x/	0( e_( XX	(^ &*9 9 	sE5#x/	09  9  TN	9 r#   