
    bi*                      d dl mZ d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlm Z  d dl!m"Z" d dl#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, d dl-m.Z.m/Z/m0Z0 d dl1m2Z2m3Z3 d dl4m5Z5 d dl6m7Z7m8Z8 d dl9m:Z:m;Z;  ejx                          ejz                          ej|                          ej~                          ej                          ej                          ej                          ej                          ej                          ej                          ej                          ej                          ej                          ej                          ej                          ej                          ej                          ej                          ej                          ej                          ej                          ej                          ej                          ej                         iZT e7j                  ej                        d        ZWd ZXd ZY	 	 d$dZZd Z[ G d d      Z\d Z]d Z^d Z_d Z`d Zad Zbd%d Zcd&d!Zd G d" d#e$      Zey)'    )annotationsN)defaultdict)datetime)reduce)expand_paths_if_neededstringify_path)ArrowFSWrapper)datasetfs)flatten)PANDAS_GE_220)pyarrow_schema_dispatch)	Engine_get_aggregation_depth_infer_split_row_groups_normalize_index_columns_process_open_file_options_row_groups_to_parts_set_gather_statistics_set_metadata_task_size_sort_and_analyze_paths)_get_pyarrow_dtypes_is_local_fs_open_input_files)clear_known_categoriespyarrow_strings_enabled)Delayed)normalize_tokentokenize)
getargspecnatural_sort_keyc                "    | j                         S N)
__reduce__)objs    Z/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/dask/dataframe/io/parquet/arrow.pytokenize_arrowfsr(   :   s    >>    c                >    t        | t              r| j                  S | S )z5Return the wrapped filesystem if fs is ArrowFSWrapper)
isinstancer	   r   r   s    r'   _wrapped_fsr,   D   s    r>2255::r)   c                    	 | j                  |       y# t        $ r }dt        |      v rt        d      ||d}~ww xY w)zmAppend row-group metadata and include a helpful
    error message if an inconsistent schema is detected.
    zrequires equal schemaszSchemas are inconsistent, try using `to_parquet(..., schema="infer")`, or pass an explicit pyarrow schema. Such as `to_parquet(..., schema={"column1": pa.string()})`N)append_row_groupsRuntimeErrorstr)metadatamderrs      r'   _append_row_groupsr4   I   sP    ""2& 	#s3x/E
  I	s    	=8=c
                   |j                  |d       |r|j                  d       || j                  j                     }|rt	        |      ng }d}|r|j                  |d       d}|D cg c]  }||   	 }}|j                  |d      }|j                  j                  |      }t        |      dk(  r|st        d      | j                  }| j                  j                  D ]'  }||v s|j                  |j                  |            }) g }t        |      d	k(  r|d   n|}|j                  |dd
      }|D ]  \  }}t        |t              s|f}|j                  j!                  t#        ||      D cg c]  \  }}t%        ||       c}}      } ||||      }|j                  j!                  ||g      }|j                  |d       |j                  j!                  ||g      }|j'                  |d      5 }t)        j*                  ||fd|	r|ndi|
 ddd       |	s|d   j-                  |j                  j!                  ||g               |S c c}w c c}}w # 1 sw Y   KxY w)zWrite table to a partitioned dataset with pyarrow.

    Logic copied from pyarrow.parquet.
    (arrow/python/pyarrow/parquet.py::write_to_dataset)

    TODO: Remove this in favor of pyarrow's `write_to_dataset`
          once ARROW-8244 is addressed.
    Texist_okinplaceFcolumns)axisr   z.No data left to save outside partition columns   )dropnaobservedpreserve_indexschemawbmetadata_collectorN)mkdirsreset_indexrA   nameslist	set_indexdropr:   len
ValueErrorremoveget_field_indexgroupbyr+   tuplesepjoinzip_hive_dirnameopenpqwrite_tableset_file_path)tabledf	root_pathfilenamepartition_colsr   pandas_to_arrow_tabler@   
index_colsreturn_metadatakwargscolpartition_keysdata_df	data_cols	subschemamd_listgbkeyssubgroupnamevalsubdirsubtableprefix	full_pathfs                              r'   _write_partitionedrr   [   si   * IIi$I'
t$	ELL	B%/j!RJN
Z.)78#bg8N8ggn9g5G

/I
9~:IJJI||!! I. !(()B)B3)GHII G*-n*=*B^A&N		FB Gh$&7D7:>47PQ)$]4%Q
 )^I
 i01
		&4	(FFKK 23	WWY% 	NN />74 		 BK%%bffkk682D&EF)G, NI 9$ R	 	s   -I ;I%I++I4	c                    | r;|9t        t        |       j                  |j                              t        |       k(  S | ryy)z[Simple utility to check if all `index` columns are included
    in the known `schema`.
    TF)rK   setintersectionrG   )indexrA   s     r'   _index_in_schemarw      s<     #3u:**6<<89SZGG	r)   c                      e Zd ZdZd Zd Zy)PartitionObjag  Simple class providing a `name` and `keys` attribute
    for a single partition column.

    This class was originally designed as a mechanism to build a
    duck-typed version of pyarrow's deprecated `ParquetPartitions`
    class. Now that `ArrowLegacyEngine` is deprecated, this class
    can be modified/removed, but it is still used as a convenience.
    c                f    || _         t        j                  |j                         d      | _        y )NF)copy)rk   pdIndexsort_valuesri   )selfrk   ri   s      r'   __init__zPartitionObj.__init__   s$    	HHT--/e<	r)   c                B    t        | j                  | j                        S r$   )r    rk   ri   )r   s    r'   __dask_tokenize__zPartitionObj.__dask_tokenize__   s    		499--r)   N)__name__
__module____qualname____doc__r   r    r)   r'   ry   ry      s    =.r)   ry   c                |    | j                   j                  | j                  | j                  | j                  |      S )z*Create new fragment with row-group subset.)
row_groups)formatmake_fragmentpath
filesystempartition_expression)old_fragr   s     r'   _frag_subsetr      s:    ??((%%	 )  r)   c                    | j                   duxr d| j                   v }|r1t        j                  | j                   d   j                  d            S i S )z)Get pandas-specific metadata from schema.N   pandasutf8)r1   jsonloadsdecode)rA   has_pandas_metadatas     r'   _get_pandas_metadatar      sL     !//5V)v:Vzz&//)4;;FCDD	r)   c           	     &   |j                  di       j                         }t        |j                  di       fi t	        |      rdddn||dgk(  r|n|gddd\  }}	|j                  d	d      du }
d
|j                  d
|
      i}t        | gf||d|	d   5 }|dgk(  r3 t        j                  |fi |j                  d|ddd|cddd       S  t        j                  |fi |j                  |f|ddd|cddd       S # 1 sw Y   yxY w)zRead arrow table from file path.

    Used by `ArrowDatasetEngine._read_table` when no filters
    are specified (otherwise fragments are converted directly
    into tables).
    readopen_file_optionsFnone)allow_precachedefault_cacheNpyarrow)r:   r   default_enginer   method
pre_buffer)r   precache_optionsr   T)r:   use_threadsuse_pandas_metadatar   )
getr{   r   popr   r   rV   ParquetFiler   read_row_groups)r   r   r   r:   rA   filtersra   read_kwargsr   r   pre_buffer_defaultr   fils                r'   _read_table_from_pathr      su   " **VR(--/K*D+R0+ B #(!' #,64&,@jzl"+!'	+''( *--h=E>P QRJ		
)
 	

 
	 
 $92>>#4499 !$( 	  E2>>#44DD!$(	
   s   0D*DDc                    t        t         j                  j                  t	        j
                  t        t         j                  j                        dz
        D cg c]/  }t         j                  j                  |   j                  d      1 c}d                   fd}t        ||      D ci c]  \  }}|	|| c}}S c c}w c c}}w )aq  Custom version of pyarrow's RowGroupInfo.statistics method
    (https://github.com/apache/arrow/blob/master/python/pyarrow/_dataset.pyx)

    We use column names to specify the specific subset of columns
    that we need statistics for.  This is more optimal than the
    upstream `RowGroupInfo.statistics` method, which will return
    statistics for all columns.
    r<   r   )initialc                0   j                   j                  |          }|j                  }||j                  sy|j                  }j
                  j                  |      }|dk  ry|j                  |j                  |j                  |j                  dfS )N)NNr   )minmax
null_count)
r1   column
statisticshas_min_maxpath_in_schemarA   rN   r   r   r   )column_namerb   statsrk   field_index	row_grouprow_group_schemas        r'   
name_statsz&_get_rg_statistics.<locals>.name_stats.  s      ''(8(EF= 1 1!!&&66t<?!!9999**$
 
 	
r)   )dictrS   rA   rG   	itertools
accumulaterangerK   r   types
num_fieldsmap)r   	col_namesir   rk   r   r   s   `     @r'   _get_rg_statisticsr     s     ""   #3y'7'7'='=#>#BC  	((..q1<<a@
 
	

& (+:y'A#eUEVe 98s   4C8
CCc                   |r.t        |t              D ch c]  }t        |      s|d    c}n	t               }| r.t        | t              D ch c]  }t        |      s|d    c}n	t               }t	        ||z
        S c c}w c c}w )N	containerr   )r   rH   rK   rt   bool)r   rc   vr]   filtered_colss        r'   _need_filteringr   E  s      ~>I!#a&1IU  GNww$7B!3q61BSVSX  .// 	J
 	Cs   BBB!Bc                B    t        j                  |      rdn|}|  d| S )N__HIVE_DEFAULT_PARTITION__=)r|   isna)rk   rl   s     r'   rT   rT   V  s&     +-''#,
&CCV1SE?r)   c                ^    dt        | t              rt        j                  di | i|S | i|S )Npartitioningr   )r+   r   pa_dsr   )r   ra   s     r'   _process_kwargsr   ^  sQ    
 	,- ..   	  r)   c                   t        | t        j                        r| S | Dt        |       dk(  st	        d | D              rt        d      t        | d   d   t              r| g} fd}g }| D ]J  }|D cg c]  \  }}} ||||       }	}}}|j                  t        t        j                  |	             L t        t        j                  |      S c c}}}w )Nr   c              3  8   K   | ]  }t        |      d k(    yw)r   N)rK   ).0rq   s     r'   	<genexpr>z)_filters_to_expression.<locals>.<genexpr>v  s     #AACFaK#As   zMalformed filtersc                &   t        j                  |       }|rS|t        j                  u rA|dk(  r|j	                        S |dk(  r|j	                         S t        d| ||f d      |dk(  s|dk(  r||k(  }nx|dk(  r||k7  }nm|d	k(  r||k  }nb|d
k(  r||kD  }nW|dk(  r||k  }nL|dk(  r||k\  }nA|dk(  r|j                  |      }n*|dk(  r|j                  |       }nt        d| ||f d      s|dv r|j	                        |z  S |S )Nis)nan_is_nullzis not"zO" is not a supported predicate Please use "is" or "is not" for null comparison.r   z==!=<>z<=z>=innot inz(" is not a valid operator in predicates.)r   r   )r   fieldnpnanis_nullrL   isin)rb   oprl   r   exprr   propagate_nulls        r'   convert_single_predicatez8_filters_to_expression.<locals>.convert_single_predicate~  sZ   C  ;;3"&&=Tz}}}==x+>>> b#' (G H 
 9d
C<D4ZC<D3Y3;D3Y3;D4ZC<D4ZC<D4Z::c?D8^JJsO#DS"cN##KL 
 "(8"8==[=9D@@r)   )r+   r   
ExpressionrK   anyrL   r0   appendr   operatorand_or_)
r   r   r   r   disjunction_membersconjunctionrb   r   rl   conjunction_memberss
    ``       r'   _filters_to_expressionr   l  s     '5++,w<1#A#A A011gajmS) iG'R  OGR
 
7CsB$S"c2
 
 	""6(--9L#MNO (,, 344
s   8C
c                  r   e Zd Zed        Zed        Ze	 	 	 	 	 dd       Ze	 	 	 	 	 	 dd       Ze	 d	 	 	 dd       Ze	 	 	 	 	 	 dd       Z	edd	       Z
ed
        Zed        Zed        Zed        Ze	 	 	 	 dd       Zed        Zedddd       Ze	 	 d	 	 	 dd       Zed        Zed        Zy)ArrowDatasetEnginec                   ||j                  dd      }nd|v rt        d      |}t        |t        j                        s|dv r&t        |t
        t        t        f      r&|st        d      |D cg c]  }t        |       }}nt        |      g}|dv rNt        j                  j                  |d         d   }|r'd|v ri nd|j                  i} t        |      di ||}t        |      }	|d   j                  d      r(t        |t        j                        rdd	lm}
  |
       }n|	}t!        |d
d|	d       }|	|D cg c]  }|j#                  |       c}|d|j$                  ifS t'        j(                  |||||      S c c}w c c}w )Nr   fsspeczXCannot specify a filesystem argument if the 'filesystem' dataset option is also defined.)arrowr   zempty urlpath sequencer   regionzC:)LocalFileSystemrbr<   open_file_funcr   )r   rL   r+   pa_fs
FileSystemrH   rP   rt   r   from_urir   typer	   
startswithr   fsspec.implementations.localr   _strip_protocolopen_input_filer   extract_filesystem)clsurlpathr   dataset_optionsr   storage_optionsr   ur   	fsspec_fsr   fs_strippathss                r'   r	  z%ArrowDatasetEngine.extract_filesystem  s     $$\8<B. C  B b%**+r5I/I'D%#56$%=>>6=>>!,>>)'23))%%..wqz:1=" '/9"))?T  "b>F>o>B&r*Iqz$$T*z"e>S>S/T I*,$*7D!YME6;<))!,<!2#5#56	  ((
 	
? ?2 =s   2FF
c                    | t         k(  S r$   )r   )r
  s    r'   multi_supportz ArrowDatasetEngine.multi_support  s    (((r)   Nc
                >   t        |t              r|D ]  }||vs|j                  |        |j                         }t        |t        t        f      sB|r@|r>|j
                  D ])  }||v r|j                  |       |j                  |       + |xs d}t        |t              s|g}g }t        |      dkD  }|D ]{  }t        |t              r|}d}d}n|\  }}}t        |t              rt        |      }t        |t              s|g} | j                  |||||	|||fi |
}|sk|j                  |       } |rt        j                  |      } | j                  |fd|i|
}|rt        |t              r|D ]  }t        |j                        s||j                     j                  j                  dk7  s@t!        j"                  t!        j$                  |j                  ||j                     j&                        |j(                        ||j                  <    t+        |j(                  j,                        j/                  t+        |            }|s*|r|j1                  dd	       n|j1                  dd	       nyt+        |j(                  j,                        t+        |      k7  r|r|j1                  dd	       n8|r6d}t        t+        |      t+        |j(                  j,                        z
        }|t        |         }|r|j3                  |      }|S )
z!Read in a single output partitionNr<   dtype_backendcategory
categoriesvaluesrv   FT)rJ   r9   )r+   rH   r   r{   rP   partition_namesrM   rK   r0   _read_tablepaconcat_tables_arrow_table_to_pandasri   rk   dtyper|   SeriesCategoricalr  rv   rt   rG   issubsetrF   rI   )r
  r   piecesr:   rv   r  r  
partitionsr   rA   ra   levelcolumns_and_parts	part_nametables
multi_readpiecepath_or_fragr   rc   arrow_tablerZ   	partitionindex_in_columns_and_partss                           r'   read_partitionz!ArrowDatasetEngine.read_partition  s     eT" * 'NN5)* $LLN*tUm4 Z!+!;!; <I G+y1)00;	<
 "/T &$'XF[1_
 	+E%%$ 	!% =B9y. )U+ O	i.&K	 *#//
 
K k*=	+@ **62K (S''
3@
DJ
 *Z6' 	y~~&2inn+=+C+C+H+HJ+V *,'0~~#%inn#5#<#< !hh*By~~& &)%8%A%A!"&
" ) E48 D$7288>>"c%j05O E48+$(->)?#bhhnnBU)U$V!&'(e$B	r)   Fc
           	     	  " |dk(  st        |t              rt        |	r|j                  j	                  |	      n|j                        j                         }t        |t              rit        j                  |      }|j                  D ]E  }|j                  |      }|j                  |      }|j                  ||j                  |            }G |}|j                  |d       |r|d}d }d "d}d}|rt        j                  |t        |      d      }t!        |j"                        }|dkD  rR	 |j%                  |j&                  j)                  |dg      d	
      5 }t+        j,                  |      }d d d        |"d}nd}|rn"k"j                  j5                         }|j                  }|j6                  d uxr d|j6                  v }|rOt9        j:                  |j6                  d   j=                  d            }|d   D cg c]  }|d   dk(  r|d    }}nd }t?        ||      }t        |      t        |j@                        t        |      z
  k7  r.tC        djE                  |tG        |j@                                    tI        jJ                  |      jL                  |   jO                         ||   jP                  jO                         k7  rUtC        djE                  t        |jS                               t        |jP                  jS                               z              |d   |vrd}|sd }"fdtU        "jV                        D        }|jY                  |d         }|D ]l  }|j[                  |      }|j\                  s!||j\                  j^                  }:|j\                  j^                  |kD  r|j\                  j^                  }jd } n |d   } || d   |k  rtC        d      ||	d}!||||!fS # 1 sw Y   xY w# t.        $ ro 	 |j%                  t1        |j"                  t2              d   d	
      5 }t+        j,                  |      "d d d        n# 1 sw Y   nxY wn# t.        $ r Y nw xY wY  w xY wc c}w )NinferTr6   r   Fparquet)r   r   	_metadatar   )modekeyrD   r   r   r:   pandas_typecategoricalrk   z5Appended columns not the same.
Previous: {} | New: {}zAppended dtypes differ.
{}c              3  @   K   | ]  }j                  |        y wr$   )r   )r   r   tail_metadatas     r'   r   z6ArrowDatasetEngine.initialize_write.<locals>.<genexpr>  s$       "++A.s   	divisionszThe divisions of the appended dataframe overlap with previously written divisions. If this is desired, set ``ignore_divisions=True`` to append anyway.
- End of last written partition: {old_end}
- Start of first new partition: {divisions[0]})rA   r_   )0r+   r   r   _meta_nonemptyrI   remove_metadatar  rA   rG   rN   rt   r   rE   r   r
   r,   rK   filesrU   rQ   rR   rV   read_metadataOSErrorsortedr"   to_arrow_schemar1   r   r   r   r   r:   rL   r   rH   r|   r!  loctolistdtypesitemsr   num_row_groupsrv   r   r   r   )#r
  rZ   r   r   r   partition_onignore_divisionsdivision_inforA   r_   ra   inferred_schemark   r   jfull_metadatai_offsetmetadata_file_existsdsr   arrow_schemarG   r   pandas_metadatacr  rF  old_endr   index_col_ir   r   r<  extra_write_kwargsr;  s#                                     @r'   initialize_writez#ArrowDatasetEngine.initialize_write  s    W
64 85 !!++J7&& o	  &$'6*"LL ND'77=A..t4A&5&9&9!V\\!_&MON %F 			$	&m+#$tB	RB288}H!|dK-@!AM >QT(*(8(8(=>$1M+/(  m/(//??AL &&E%%T1Xi<CXCX6X   #"&** )))4;;FC#
 -Y7'=8 fI
  "
(zBF5zS_s</@@@ --3VE4

;K-L  6"&&u-446"U):J:J:Q:Q:SS 188FLLN+c"))//2C.DD  V$E1#' #"=#?#?@
 $kk-*?@!+ "I&--k:F(("?&,&7&7&;&;G#..22W<&,&7&7&;&;G '+G!" *+6	&9Q<7+B$I  )/jI(<>PPPc> >  WW"2881AB2FT %  B ,.,<,<S,AMB B B # ,sl   1.P P 5P R P
P 	R/Q2Q%	Q2%Q.	*Q21R2	Q>;R=Q>>RRc           
        	 t         j                  j                  |d||      S # t         j                  $ r}| t         j                  j                  |      }t        j                  |j                  d      d      }t        j                  |j                  d      d      }t        d|d| d| d	      d d }~ww xY w)
Nr<   )nthreadsr@   rA   F)show_schema_metadataz    z=Failed to convert partition to expected pyarrow schema:
    `z`

Expected partition schema:
z

Received partition schema:
z

This error *may* be resolved by passing in schema information for
the mismatched column(s) using the `schema` keyword in `to_parquet`.)	r  Tablefrom_pandasArrowExceptionSchematextwrapindent	to_stringrL   )r
  rZ   r@   rA   exc	df_schemaexpectedactuals           r'   _pandas_to_arrow_tablez)ArrowDatasetEngine._pandas_to_arrow_table   s    	88''Q~f (      	~		--b1I  e <fH __###?F w / * / ( W
X 	s   "% CBB<<Cc           
        d }d}t        |	|
      r|j                  |	d       d}ng }	| j                  |||
      }|r9|j                  j                  }|j                  |       |j                  |      }|rRt        ||||||| j                  |f|	||d|}|r|d   }t        dt        |            D ]  }t        |||           nng }|j                  |j                  j                  ||g      d	      5 }t        j                  ||f||r|nd d
| d d d        |r|d   }|j!                  |       |rd|i}|r|j                  |d<   |gS g S # 1 sw Y   =xY w)NFTr8   r?   )r1   )r_   compressionr`   r   r<   rB   )ri  rC   metarA   )rw   rI   rg  rA   r1   updatereplace_schema_metadatarr   r   rK   r4   rU   rQ   rR   rV   rW   rX   )r
  rZ   r   r   r\   rI  r`   fmdri  r_   rA   headcustom_metadatara   _metar@   t_mdrg   r   r   ds                         r'   write_partitionz"ArrowDatasetEngine.write_partition   s   " J/LLTL2!NJ&&r.QW&X((##CJJ'))3)7A(** &' / G 
q#g,/ :A&ugaj9: GdH%56=  !,2Awt	
  
##H-Ahh(3JI' s   ;EEc                   |d   d   j                  dd       }|D cg c]  }|d   d   | }}|r+|s|j                  j                  |dg      }	t        t        j
                        j                  }
|j                         D ci c]  \  }}||
v s|| }}}|j                  |	d      5 }t	        j
                  ||fi | d d d        |j                  j                  |dg      }|r||}d}n|d   d   d   }d}t        |t        |            D ]  }t        |||   d   d           |j                  |d      5 }|j                  |       d d d        y y c c}w c c}}w # 1 sw Y   xY w# 1 sw Y   y xY w)Nr   rA   rj  _common_metadatarB   r4  r<   )r   rQ   rR   r!   rV   write_metadataargsrG  rU   r   rK   r4   write_metadata_file)r
  partsrj  r   r   r   ra   rA   pcommon_metadata_pathkeywordskr   kwargs_metar   metadata_pathrp  i_startr   s                      r'   rw  z!ArrowDatasetEngine.write_metadatai  s   q!40!>qQqT&\%=>>')vv{{D:L3M'N$%b&7&78==06P1!x-q!tPPWW148 BC%%fcA[AB FFKK{(;<M$*aF+7CJ/ ?"5%(1+f*=>?- /))#./ /'  ? QB B/ /s.   E E 	E%E%0E+E7+E47F c           
     X   d}d}|j                  di       }d|vrd|d<   d|vrt        j                         |d<   t        d%i |}d}t	        |      dk(  r|j                  |d         rt        ||      \  }}}|j                  j                  ||d   g      }|j                  j                  |d	g      }|
s5|j                  |      r$t        j                  |fd
t        |      i|}d}n|rt	        |      }|j                  |      D cg c]  }|j                  |      r| }}|r|g k(  rt        d| d      t	        |      dkD  rit        ||      \  }}}|j                  j                  |d	g      }d	|v r8|
s#t        j                  |fd
t        |      i|}d}|j                  d	       |}|!t        j                   |fd
t        |      i|}	 t#        |j%                               }|j&                  }|dk(  rB|r>|d}n;t-        |j.                  D cg c]  }|j0                   c}|t3        |	            }nd}|dk(  r|rd}nd}g g }}|j4                  r|j4                  j*                  rt7        |j4                  j*                  j8                        }t;        |      D ]x  \  }}|j4                  j<                  r|j4                  j<                  |   nd}|j?                  tA        ||tC        jD                  g d      n|jG                                      z tI        |	|      } i d|d|d|d|j*                  d|d|d|d|d|d|d|d|d|	d| d |d!|d"|d#|tK               d$|iS c c}w # t(        $ r d}|j*                  }Y w xY wc c}w )&zpyarrow.dataset version of _collect_dataset_info
        Use pyarrow.dataset API to construct a dictionary of all
        general information needed to read the dataset.
        Nr
   r   hiver   Fr<   r   r4  r   TzLNo files satisfy the `parquet_file_extension` criteria (files must end with z).r2  adaptiveobjectr   rQ  physical_schemahas_metadata_filerA   r   valid_pathsgather_statisticsr  rv   r   split_row_groups	blocksizeaggregate_filesaggregation_depthr%  r  metadata_task_sizera   )r
   convert_stringr   )&r   r   ParquetFileFormatr   rK   isdirr   rQ   rR   existsparquet_datasetr,   findendswithrL   rM   r
   nextget_fragmentsr  StopIterationrA   r   r   total_byte_sizer   r   rH   rG   	enumeratedictionariesr   ry   r|   r!  	to_pandasr   r   )!r
  r  r   r  rv   r  r   r  r  r  ignore_metadata_filer  parquet_file_extensionra   rQ  r  _dataset_kwargs_processed_dataset_kwargsr  basefns	meta_pathlen0r   	file_fragr  rgpartition_objr  r   rk   
dictionaryr  s!                                    r'   _collect_dataset_infoz(ArrowDatasetEngine._collect_dataset_info  s   .  !**Y30.4ON+?*(-(?(?(AOH%$3$Fo$F! "u:?rxxa1  7ubAE4FFKKs1v/EUK$89I'BIIi,@***2 0
 %)!' 5z !#}}%;<  
 ERK$00F/GrK 
 Z!^6ubAE4T;$78Ic! ,..!#.r? 4B
 )-% 

;'! :&r? ,B	(R--/0I'77OL w&$',$'>6?6J6JK++K!_-($ $) z)#' #(  *,R??r55"2??#9#9#?#?@O$_5 4 33 OO003 
 $$   *1 IIb9!+!5!5!7	$ 3?OT
"

  !2
 bii	

 "
 ;
  !2
 *
 U
 w
  0
 
 
  !2
 -
  !
" !"4#
$ *"9"; %
 	
GT  	(I iiO	(Z Ls   N%N	 N'	N$#N$c                2   |d   }|d   }|d   }|d   }|d   }|j                  d|      j                  }d}t        |      xs i }	|	r3|1g }|	d   D ]'  }
|
d	   d
k(  s|
d   |vs|j                  |
d          ) |d   j                  di       j	                         }|d   d   }|d   d   }| j                  |j                         ||||      }t        |j                  j                        }t        |j                        }|r|dgk7  r|j                  d       ||r|dgk7  s|	j                  dd      r|}|xs d}|r|dgk7  r|j                  |d       |rZ|D cg c]	  }||vs| }}|sg }g |d<   i |d<   ||d<   n2t        |      t        |      k7  rt        dj                  ||            t        |||z   ||      \  }}||z   }|rst!        |      j#                  |      s$t        dj                  |t        |                  t%        ||D cg c]  }||j                  j                  vs| c}|      }|r1|D ]+  }t        |j&                        st)        |t              r=|j*                  |d   k(  r+t-        j.                  g |j&                  |d         |_        g|j*                  |j                  j*                  k(  r<t-        j.                  g |j&                  |j                  j*                        |_        |j*                  |j                  v st-        j0                  t-        j2                  |j&                  g       |j                        ||j*                  <   . ||d<   ||d<   ||d<   |S c c}w c c}w )z|Use parquet schema and hive-partition information
        (stored in dataset_info) to construct DataFrame metadata.
        rA   rv   r  r%  r  r  Nr:   r8  r9  rk   ra   arrow_to_pandasr  r  )r  r  r  Tr8   index_columnsr   rc   zNo partition-columns should be written in the 
file unless they are ALL written in the file.
physical columns: {} | partitions: {}zAcategories not in available columns.
categories: {} | columns: {})colsr  r   )r  rk   r  r  r_   )r   rG   r   r   r{   r  empty_tablerH   rv   r:   rF   rI   rK   rL   r   r   rt   ru   r   ri   r+   rk   r|   CategoricalIndexr!  r"  )r
  dataset_inforA   rv   r  r  r%  physical_column_namesr:   rS  rb   r  r  r  rj  index_namescolumn_namesr_   r{  _partitionsall_columnsrT  r.  s                          r'   _create_dd_metaz"ArrowDatasetEngine._create_dd_meta\  s    h'W%!,/
$\2!"34
 , 0 01BF K Q Q /v6<"!
*95 7CM*m;F:5"))#f+6	7 'x0445FKPPR%h/0@A$X.?)) +') * 
 4::++,DLL);4&0T*
 M v%"&&=  E [b
*.NN:tN4 &0SA=R4R1SKS
-/\*13-.2<./[!S_4 <<BF-z=  %=\J.{%
!k "L0z?//< 3396*d;FW3X  *!+IAq

8H8H/HaI+D * 	9>>*eT*y~~q/I!#!4!4y~~E!H"DJ ^^tzz6!#!4!4y~~DJJOO"DJ ^^t||3+-99)..L"jj,D(( !&W%/\"%/\" TB Js   &	N0NN
=N
c                   |d   }|d   }|d   }|d   }|d   }|d   }|d   }|d   }	|d	   }
|d
   }|d   }|d   }|d   }|d   }|d   }t        |d   |      }t               }|Xt        |t              D ]D  }|\  }}}|dk(  r&t	        |t        t        t
        f      st        d      |j                  |       F i }|rt        |	      dk(  r|	ng }t        |
j                        D ]  \  }}||v s||v s||v r|||<    t        |||||t        |            }||||
d|}|du r5|s3|s1t        |j                  t              D cg c]	  }d|ddfi c}g |fS d}|t        |      }||||||
|||||d   d}|s|dk(  s|t        |j                        kD  r;t        d |j!                  |      D        d       }| j#                  ||      \  } }!n|r'|r%t        d |j!                  |      D        d        }"nLt        |j                  t              }"|r/|"D #cg c]$  }#|#j%                  |j&                        d!   |v r|#& }"}#g g }!} |"ri }$d"t)        |"|      z   }g }%t        t+        dt        |"      |            D ]3  \  }&}'|%j-                  ||&f       | j"                  |"|'|'|z    |f|$|%d!   <   5 d# }(|(|%f|$d$|z   <   t/        d$|z   |$      j1                         \  } }!| |!|fS c c}w c c}#w )%aI  pyarrow.dataset version of _construct_collection_plan
        Use dataset_info to construct the general plan for
        generating the output DataFrame collection.

        The "plan" is essentially a list (called `parts`) of
        information that is needed to produce each output partition.
        After this function is returned, the information in each
        element of `parts` will be used to produce a single Dask-
        DataFrame partition (unless some elements of `parts`
        are aggregated together in a follow-up step).

        This method also returns ``stats`` (which is a list of
        parquet-metadata statistics for each element of parts),
        and ``common_metadata`` (which is a dictionary of kwargs
        that should be passed to the ``read_partition`` call for
        every output partition).
        rQ  r   r   r  r  r  r  r_   rA   r  r%  r  r  r  ra   r  Nr   r   z2Value of 'in' filter must be a list, set or tuple.r<   )r%  r  r   rA   Fr6  r+  r
   )r   r  r  r   
ds_filtersrA   stat_col_indicesr  r  r%  r  r   c              3      K   | ]  }|  y wr$   r   r   frags     r'   r   z@ArrowDatasetEngine._construct_collection_plan.<locals>.<genexpr>]  s     ?$?   c                ,    t        | j                        S r$   r"   r   xs    r'   <lambda>z?ArrowDatasetEngine._construct_collection_plan.<locals>.<lambda>^  s    .qvv6 r)   c              3      K   | ]  }|  y wr$   r   r  s     r'   r   z@ArrowDatasetEngine._construct_collection_plan.<locals>.<genexpr>h  s     CdTCr  c                ,    t        | j                        S r$   r  r  s    r'   r  z?ArrowDatasetEngine._construct_collection_plan.<locals>.<lambda>i  s    "2166": r)   rD   zgather-pq-parts-c                @    g g }}| D ]  \  }}||z  }|s||z  } ||fS r$   r   )parts_and_statsrz  r   partstats        r'   _combine_partszEArrowDatasetEngine._construct_collection_plan.<locals>._combine_parts  s?    #%r5E&5 *
d!TME* !%<'r)   zfinal-)r   rt   r   rH   r+   rP   	TypeErroraddrK   r  rG   r   rB  r?  r"   r   r  _collect_file_partssplitrQ   r    r   r   r   compute))r
  r  rQ  r   r   r  r  r  r  r_   rA   r  r%  r  r  r  ra   r  filter_columnsfilterrb   r   rl   r  _index_colsr   rk   common_kwargsrp   r  dataset_info_kwargs
file_fragsrz  r   	all_filesfilefgather_parts_dskfinalize_listtask_ifile_ir  s)                                            r'   _construct_collection_planz-ArrowDatasetEngine._construct_collection_plan  s   * $$y)'(:;()<= -	()<=!,/
h'&'89!,/
!,/
()<="=1h' 5-.

 !'T: (%R:jsD%6H&I#L  ""3'( %63z?a;OjVX . 	+GAt{"dn&<?*)* &	+ 3 !
 %$	

 
 %/?7 &,BHH:J%K! y$56   
/8J  0!2$ 0!2"$%i0
  !Q&!CM1  ?""2"2:">?6J 22:?RSLE5
 :"Cb&6&6z&BC:	 #2881AB	 &/!! ;;rvv.r2kA !I ! r5E#% )HY@S,TT "&/!S^-?@' NFF "(($8//!&64F+FG+;$]2%67	( 6D]4S D1&x$8HIQQSue]**}z!s   K/$)K4c                6   |d   }|d   }|d   }|d   }|d   }t        |t              s|g}n|sg g fS t        |d   t              ra|s|s|du r|D cg c]	  }d|d d fi c}d fS t        t        j                  |fd	t        |      it        di |j                               }	n|}	|d
   }
|d   }|d   }|d   }|d   }|d   }t        t              }t        t              }t        t              }t        |      dk(  }i }i }|	D ][  }|j                  }t        j                  |j                        }|D cg c]  }|j                  ||j                     f c}||<   |j                  ||      D ]  }|j                  }|s|r*||j!                          |j                  }t#        |      s>d g||<   E|D ]  }||   j%                  |j&                         |s%t)        |t        |            }|r||j*                  |j,                  g d}n|j*                  |j,                  d}g }|j/                         D ]  } | |v r||    d   }!||    d   }"||    d   }#t        |!t0              rt3        j4                  |!      n|!}!t        |"t0              rt3        j4                  |"      n|"}"|j7                  |       }$|
s|r|du s|s|!|$r|!|$k  rd}i }i } nL|r|d   j%                  | |!|"|#d       n||!|"|#gz  }|"|| <   |r|d   j%                  d| i       |g dz  } |sl||   j%                  |       |r||   j%                  t9        |               ^ |sg g fS t;        |||||||| j<                  |||dd	      S c c}w c c}w ) Nr   r  r  r%  r  r   Fr+  r   r   r  rA   r  r  r  r<   )rA   )file_path_0num-rowsr  r:   )r  r  r   r   r   Tr:   )rk   r   r   r   rk   )NNN )r   rc   r  	data_path)make_part_kwargsr   )r+   rH   r0   r   r
   r,   r   r  r   intr   _get_partition_keysr   rk   split_by_row_groupr   ensure_complete_metadatarK   r   idr   num_rowsr  ri   r   r|   	Timestampr   rP   r   
_make_part)%r
  files_or_fragsr  r   r  r  r%  r  file_or_fragr  r   r  rA   r  r  r  file_row_groupsfile_row_group_statsfile_row_group_column_statssingle_rg_partshive_partition_keys	cmax_lastr  fpathraw_keys	hive_partr  row_group_infor   r   scstatsrk   cmincmaxr   lasts%                                        r'   r  z&ArrowDatasetEngine._collect_file_parts  s    !&./AB/0CD(6
-.?@ .$/,-Nr6M nQ'- %
8IU8R )7$ |T489   "*2 &88  -/J (J &i0(6
$X../AB/0CD'4	 &d+*40&1$&7#./14 	# b	YINNE 001O1OPHLV*?H)..!9:*& "44Z4O XY!%$(8 &-557)-~. .2VOE*!/ JYI#E*11),,?(%7%t,<'=&
 +/4,5,>,>3<3L3L+-	!A -6,>,>3<3L3L!A "$$4$9$9$; 3AD#z1'1$'7'>'1$'7'>-7-=l-K
 (2$'A %'LL$6)- !% (2$'A %'LL$6)- !%
 (1}}T':$+(16F$6N'8 (,| =B(9?A(<FH(C(-#2$%iL$7$7483737:D	)*%& %+tT:.F$FF26	$#2$%iL$7$7$G$*.@$@Fg3Ah -07>>qA#2 ;E B I I%PV- XUJYXYb	YJ r6M $ 'NN"5!+	
 	
QJ*s   N<"Nc                    |j                   j                  ||fD cg c]
  }|dk7  s	| c}      }|j                  |d      }	|r|	yd|||	fiS c c}w )z1Generate a partition-specific element of `parts`.r  Nr+  )rQ   rR   r   )
r
  r\   rg_listr   rc   r  r  r{  rp   pkeyss
             r'   r  zArrowDatasetEngine._make_partH  s`     FFKKY,A MqQ"W MN	""9d3U])We455 !Ns
   
AAc	                   t        |t        j                        r|}
nd}
|	j                  di       j                  dd      }|xr |du xs |xr t        |t        t
        f       }|st        ||      rt        j                  |fdt        |      it        di |	j                  di       }t        |j                               }t        |      dk(  sJ |dgk7  rt        |d   |      n|d   }
t        j                  |
j                        }|D cg c]  }|j                  ||j                     f }}|
r`g }|D ]6  }|!d|j                   v s|j#                  d       &|j#                  |       8 |
j%                  d|||rt'        |      nd	      }nt)        ||||||fi |	}|rIt        |t
              r8|D ci c]  \  }}||
 }}}|D ]  }|j                  |j*                  j                   vs'|j                  |j                  d      }t        |j,                        s3t/        j0                  t3        j4                  t        |      |            }nwt3        j4                  t        |      |j,                  j7                  |      d
      }t.        j8                  j;                  |t/        j0                  |j,                              }|j=                  |j                  |      }  |S c c}w c c}}w )zRead in a pyarrow tableNr
   r   r   r<   r   __index_level_0__F)r   rA   r:   r  i4r  r   )r+   r   ParquetFileFragmentr   r0   rH   r   r
   r,   r   r  rK   r   r  r   rk   rG   r   to_tabler   r   rA   ri   r  arrayr   fullget_locDictionaryArrayfrom_arraysappend_column)r
  r,  r   r   r:   rA   r   r%  rc   ra   r  r   missing_partitioning_inforQ  fragsr  r  r  rk   r-  r~  r   	keys_dictr.  catarrcat_inds                              r'   r  zArrowDatasetEngine._read_table\  s    lE$=$=>D D "::i488NL 6$ 6 P NL3+)N%N	 & )OG^,T]] *2 &B

9b(AB
 R--/05zQ& "dV+ !q:6q  !44T5N5NO &0"! ^^Xinn%=>" "
 D &<*fll:':;KK%& --!:A-g6t	 ( K 0 K *Z6,:;&1aA;I;' Q	>>););)A)AA $--	=Cy~~. hhrwws;/?'EF"$'',inn.D.DS.IQU# !00<<#RXXinn%= #.";";INNC"PKQ  g"D <s   "K;L r  r  c                  |j                  di       j                  d      }g d }|j                  |       |rFj                  t        j                         t	        j
                  d      ij                          t        rBj                  t        j                         t	        j
                  d      ij                          j                  t        j                         t	        j                  t        j                               ij                          j                  t        j                         t	        j                  t        j                               ij                          d }j                  |       |dk(  r j                  t        j                          n|dk(  rj                  |       fd}t              dkD  r|S y )	Nr  types_mapperc                    t         r,| t        j                         k(  rt        j                  d      S | t        j
                         k(  rt        j                  d      S t        j                  |       S )Nr   )r   r  large_stringr|   StringDtypestring
ArrowDtype)pyarrow_dtypes    r'   pyarrow_type_mapperzFArrowDatasetEngine._determine_type_mapper.<locals>.pyarrow_type_mapper  sP     "//2C!C~~i00		+~~i00}}]33r)   r   c                l    t         j                  j                  |       rt        j                  |       S y r$   )r  r   
is_decimalr|   r  )r  s    r'   _convert_decimal_typezHArrowDatasetEngine._determine_type_mapper.<locals>._convert_decimal_type  s&    88&&t,==..r)   numpy_nullablec                0    D ]  } ||       }||c S  y)zBTry all type mappers in order, starting from the user type mapper.Nr   )r  type_converterconverted_typetype_mapperss      r'   default_types_mapperzGArrowDatasetEngine._determine_type_mapper.<locals>.default_types_mapper  s(    ". *!/!>!-))*r)   r   )r   r   r  r  r|   r  r   r  date32r  date64PYARROW_NULLABLE_DTYPE_MAPPINGrK   )	r
  r  r  ra   user_mapperr  r  r"  r!  s	           @r'   _determine_type_mapperz)ArrowDatasetEngine._determine_type_mapper  sb    jj!2B7;;NK	4 ", bnnY.G H L LM##R__%6y8Q$R$V$VWbmmBIIK.H I M MNbmmBIIK.H I M MN
  56 ,, > B BCi' 34	* |q '' !r)   c                   |j                  di       }|j                  ddd        | j                  d||d|}|||d<    |j                  dd|i|}|r
t	        |j
                  t        j                        rt	        |j
                  t        j                        st        j                  j                  j                  |j
                  j                        r|j
                  j                  t        j                  d      t        j                  t        j                                fvr3|j
                  j#                  t        j                  d            |_        |S )	Nr  F)r   ignore_metadatar  r  r  r   r   )r   rk  r'  r  r+   rv   r|   r}   
MultiIndexapir   is_string_dtyper   r  r  r  r  astype)	r
  r-  r  r  r  ra   _kwargsr  ress	            r'   r  z)ArrowDatasetEngine._arrow_table_to_pandas  s    **.3uGH1s11 
')
 

 #&2GN##k##EzEWE 399bhh/syy"--8,,SYY__=		NN9-r}}RYY[/IJK 		((	)BCCI
r)   c                    |j                  |d      5 }t        j                  |      j                  }d d d        |rj	                  |       S # 1 sw Y   xY w)Nr   )rU   rV   r   r1   rX   )r
  r   r   	file_pathrq   rj  s         r'   collect_file_metadataz(ArrowDatasetEngine.collect_file_metadata  sQ    WWT4  	.A>>!$--D	.y)		. 	.s    AAc                   d }|D ]  }|rt        ||       |} |rW|j                  j                  |dg      }|j                  |d      5 }|st	        d      |j                  |       d d d        y |S # 1 sw Y   y xY w)Nr4  rB   zCannot write empty metadata!)r4   rQ   rR   rU   rL   ry  )r
  	meta_listr   out_pathrj  rp  r  r   s           r'   aggregate_metadataz%ArrowDatasetEngine.aggregate_metadata%  s     	E"4/		
 FFKK;(?@M- .$%CDD((-. K. s   A66A?)Nr   r   NN)FNFNr2  N)FN)rZ   pd.DataFramereturnpa.Table)NNNNFN)F)NNNN)NF)r-  r9  r8  r7  )r   r   r   classmethodr	  r  r0  rX  rg  rt  rw  r  r  r  r  r  r  r'  r  r2  r6  r   r)   r'   r   r     s   
 =
 =
~ ) )  { {z  ~Q ~Q@ <@	 >  F FP / /: R
 R
h ~ ~@ v+ v+p o
 o
b 
 6 6& i iV "51( 1(f 
  
 @    r)   r   )r   Tr$   )FT)f
__future__r   r   r   r   r`  collectionsr   r   	functoolsr   numpyr   pandasr|   r   r  pyarrow.parquetr3  rV   fsspec.corer   r   fsspec.implementations.arrowr	   r
   r   r   r  	dask.corer   dask.dataframe._compatr   dask.dataframe.backendsr   dask.dataframe.io.parquet.utilsr   r   r   r   r   r   r   r   r   dask.dataframe.io.utilsr   r   r   dask.dataframe.utilsr   r   dask.delayedr   dask.tokenizer   r    
dask.utilsr!   r"   int8	Int8Dtypeint16
Int16Dtypeint32
Int32Dtypeint64
Int64Dtypeuint8
UInt8Dtypeuint16UInt16Dtypeuint32UInt32Dtypeuint64UInt64Dtypebool_BooleanDtyper  r  float32Float32Dtypefloat64Float64Dtyper%  registerr  r(   r,   r4   rr   rw   ry   r   r   r   r   r   rT   r   r   r   r   r)   r'   <module>rc     s   "     #       ? 7 $   0 ;
 
 
 Y X P   3 3 BGGI|r||~BHHJBHHJBHHJBHHJBIIK!BIIK!BIIK!BHHJ!BIIK!BJJL/"//#BJJL/"//#"   %**+ ,;
6 EP
. .$=@,^0"D5X} }r)   