
    bi;1                     n   d dl Z d dlmZmZ d dlmZmZ d dlZd dl	Z
d dlZd dlmZmZmZmZmZmZmZmZmZmZ d dlmZ erd dlZej4                  j6                  j9                  e      ZddgZe G d d	ej@                               Z! G d
 dejD                        Z#dejH                  de%fdZ&defdZ'dejP                  de
jR                  fdZ*dejH                  de%fdZ+e G d d             Z,e G d d             Z-defdZ.de
jR                  fdZ/dejH                  de%fdZ0defdZ1dejP                  de
jd                  fdZ3defdZ4d Z5de6de7d e7de
jd                  fd!Z8d"ede7d e7fd#Z9d$ Z:d% Z;d&e7fd'Z<d(ejH                  defd)Z=d0d"efd*Z>d"ede7fd+Z?de%fd,Z@de%fd-ZAde%fd.ZBd/ ZCy)1    N)	dataclassfield)TYPE_CHECKINGOptional)
Array2DArray3DArray4DArray5DFeatures	LargeListListValue_ArrayXD_arrow_to_datasets_dtype)cast_table_to_featuresz.h5z.hdf5c                   N    e Zd ZU dZdZee   ed<   dZee	j                     ed<   y)
HDF5ConfigzBuilderConfig for HDF5.N
batch_sizefeatures)__name__
__module____qualname____doc__r   r   int__annotations__r   datasetsr        ^/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/datasets/packaged_modules/hdf5/hdf5.pyr   r       s(    ! $J$,0Hhx(()0r   r   c                   &    e Zd ZdZeZd Zd Zd Zy)HDF5zXArrowBasedBuilder that converts HDF5 files to Arrow tables using the HF extension types.c                 V    t        j                  | j                  j                        S )N)r   )r   DatasetInfoconfigr   selfs    r   _infoz
HDF5._info-   s    ##T[[-A-ABBr   c                    dd l }| j                  j                  s"t        d| j                  j                         d|j                  _        |j                  | j                  j                        }g }|j                         D ]  \  }}t        |t              r|g}|D cg c]  }|j                  |       }}| j                  j                  [t        j                  j                  |      D ]9  } |j                   |d      5 }	t#        |	      | j                  _        d d d         n |j%                  t'        j(                  |d|i              |S c c}w # 1 sw Y   =xY w)Nr   z=At least one data file must be specified, but got data_files=Trfiles)name
gen_kwargs)h5pyr$   
data_files
ValueErrordownload_configextract_on_the_flydownload_and_extractitems
isinstancestr
iter_filesinfor   	itertoolschainfrom_iterableFile_recursive_infer_featuresappendr   SplitGenerator)
r&   
dl_managerr-   r.   splits
split_namer*   file
first_fileh5s
             r   _split_generatorszHDF5._split_generators0   sI   {{%%\]a]h]h]s]s\tuvv8<
""544T[[5K5KL
!+!1!1!3 	aJ%%=BCTZ**40CECyy!!)"+//"?"?"F J":s3 Kr-Fr-J		*K MM(11zwX]N^_`	a  DK Ks   "EEE'c           
   #     K   dd l }| j                  j                  }t        t        j
                  j                  |            D ]5  \  }}	  |j                  |d      5 }| j                  j                  t        |      | j                  _	        t        || j                  j                        }|#t        j                  d| d       	 d d d        |xs | j                  xs |}t        d||      D ]x  }	t!        |	|z   |      }
t#        || j                  j                  |	|
      }|t        j                  d| d       P| d|	 t%        || j                  j                        f z 	 d d d        8 y # 1 sw Y   xY w# t&        $ r-}t        j)                  d| dt+        |       d|         d }~ww xY ww)	Nr   r)   zFile z contains no data, skipping..._zFailed to read file 'z' with error z: )r-   r$   r   	enumerater8   r9   r:   r;   r7   r   r<   _check_dataset_lengthsloggerwarning_writer_batch_sizerangemin_recursive_load_arraysr   r/   errortype)r&   r*   r-   batch_size_cfgfile_idxrB   rD   num_rowseffective_batchstartendpa_tablees                r   _generate_tableszHDF5._generate_tablesF   s    //'	(E(Ee(LM 	NHdTYYtS) jRyy))1-Fr-J		*5b$)):L:LMH'tf4R'ST j j '5&[8O8O&[S[O!&q(O!D j!%/"98D#9"dii>P>PRWY\#]#+"NNU4&8V+WX$!)
!E735KHVZV_V_VhVh5iiijj	j j   4TF-QyPRSTRUVWsV   A
GF A-E?FGBE?3F;G?F	F	G(F<<GGN)	r   r   r   r   r   BUILDER_CONFIG_CLASSr'   rE   rZ   r   r   r   r!   r!   (   s    b%C,r   r!   dtypereturnc                 l    | j                   dk(  ry| j                  t        | j                  d         S y)NcTr   F)kindsubdtype_is_complex_dtyper\   s    r   rb   rb   f   s2    zzS~~! !233r   c                    | j                   j                  | j                   j                  \  }}n| j                  dd  }| j                   }|t        j                  k(  rt        d      }nC|t        j                  k(  rt        d      }n$t        j                  d| d       t        d      }t        t        ||      t        ||      d      S )N   float32float64zFound complex dtype z0 that is not supported. Converting to float64...realimag)r\   ra   shapenp	complex64r   
complex128rJ   rK   r   _create_sized_feature_impl)dsetr\   
data_shape
value_types       r   _create_complex_featuresrs   n   s    zz& JJ//zZZ^


9%
	"--	9%
-eW4def9%
.z:F.z:F	
 r   arrc                 &   t         j                  j                  j                  | j                        t         j                  j                  j                  | j                        d}t
        j                  j                  |d   |d   gddg      S )Nrh   ri   rj   names)r   r   numpy_to_pyarrow_listarrayri   rj   paStructArrayfrom_arrays)rt   datas     r   _convert_complex_to_nestedr}      st    !!**EEchhO!!**EEchhOD >>%%tF|T&\&B6SYJZ%[[r   c                      | j                   dk(  S NV)r`   rc   s    r   _is_compound_dtyper      s    ::r   c                   @    e Zd ZU ded<   dZej                  ed<   d Zy)_CompoundGroupzh5py.Datasetrp   Nr|   c              #      K   | j                   j                  j                  D ]6  }| j                   j                  |   }|t        | j                  ||      f 8 y wN)rp   r\   rw   _CompoundFieldr|   )r&   
field_namefield_dtypes      r   r3   z_CompoundGroup.items   sO     ))//// 	QJ))//*5KnTYY
KPPP	Qs   AA)r   r   r   r   r|   rl   ndarrayr3   r   r   r   r   r      s    
D"**Qr   r   c                       e Zd ZU eej
                     ed<   eed<   ej                  ed<    e	d      Z
eedf   ed<   d Zd	 Zy
)r   r|   r+   r\   F)init.rk   c                     | j                   t        | j                         ndf| j                  j                  z   | _        y Nr   )r|   lenr\   rk   r%   s    r   __post_init__z_CompoundField.__post_init__   s.    (,		(=c$))n1FIYIYY
r   c                 :    | j                   |   | j                     S r   )r|   r+   )r&   keys     r   __getitem__z_CompoundField.__getitem__   s    yy~dii((r   N)r   r   r   r   rl   r   r   r5   r\   r   rk   tupler   r   r   r   r   r   r   r      sA    
2::

I88O".E5c?.Z)r   r   c                 .    t        |       }t        |      S r   )r   r<   )rp   
mock_groups     r   _create_compound_featuresr      s    %J$Z00r   c                 `    t        ||       }t        |      }t        ||dt        |             S )N)r|   r   )r   r   rO   r   )rt   rp   r   r   s       r   _convert_compound_to_nestedr      s-    3/J(.H!*h3s8DDr   c                 :    | j                   rd| j                   v ryy)NvlenTF)metadatarc   s    r   _is_vlen_dtyper      s    ~~&ENN2r   c                     | j                   j                  d   }|t        t        fv rt	        d      S t        |      }t        |      S )Nr   string)r\   r   r5   bytesr   _np_to_pa_to_hf_valuer   )rp   
vlen_dtypeinner_features      r   _create_vlen_featuresr      sA    $$V,Jc5\!X)*5Mr   c                 T    t         j                  j                  j                  |       S r   )r   r   rx   )rt   s    r   _convert_vlen_to_arrayr      s    %%@@EEr   c                     i }| j                         D ]C  \  }}t        |      rt        |      }|s|||<   %t        |      s1t	        |      }|s?|||<   E t        |      S r   )r3   	_is_groupr<   _is_dataset_infer_featurer   )h5_objfeatures_dictpathrp   r   s        r   r<   r<      sj    Mlln /
dT?06H&.d#%d+H&.d#/ M""r   c                 
   t        | j                        rt        |       S t        | j                        s| j                  j                  dk(  rt        |       S t        | j                        rt        |       S t        |       S r   )	rb   r\   rs   r   r`   r   r   r   _create_sized_feature)rp   s    r   r   r      sa    $'--	DJJ	'4::??c+A(..	

	#$T** &&r   r   rV   rW   c                 f   | || }t        | j                        rt        |      S t        | j                        rt	        |      S t        | j                        rt        ||       S | j                  j                  dk(  rt        d| d      t        d | j                  dd  D              rWt        j                  | j                        }t        j                  |D cg c]  }g  c}t        j                  |            S t        j                   j                   j#                  |      S c c}w )NOzObject dtype dataset 'z' is not supported. For variable-length data, please use h5py.vlen_dtype() when creating the HDF5 file. See: https://docs.h5py.org/en/stable/special.html#variable-length-stringsc              3   &   K   | ]	  }|d k(    ywr   Nr   .0dims     r   	<genexpr>z_load_array.<locals>.<genexpr>  s     2Csax2   re   )rQ   )r   r\   r   rb   r}   r   r   r`   r/   anyrk   ry   from_numpy_dtypearraylist_r   r   rx   )rp   r   rV   rW   rt   
inner_typerG   s          r   _load_arrayr      s    
uS/Cdjj!%c**	4::	&)#..	DJJ	'*355	C	$TF +X Y
 	
 24::ab>22,,TZZ8J88-AR-BHHZ4HII$$--HHMM .s    	D.r   c                    i }| j                         D ]`  \  }}||vrt        |      rt        |||   ||      }n1t        |      rt	        ||||      }nt        dt        |             |\|||<   b t        |       rt        j                  j                  |      S |rdg g }
}	}|j                         D ]S  \  }}t        |t        j                        rd}|j                         }|	j                  |       |
j                  |       U t        j                  j!                  |
|	      }|rt        j"                  |      S |S y )NzUnexpected type FTrv   )r3   r   rO   r   r   r/   rQ   _is_filery   Tablefrom_pydictr4   ChunkedArraycombine_chunksr=   rz   r{   chunked_array)r   r   rV   rW   
batch_dictr   rp   rt   should_chunkkeysvalueskvsarrs                 r   rO   rO     s?   Jlln #
dxT?(x~ucJCdD%5C/T
|<==?"Jt# xx##J//%*BFd$$& 	DAq!R__-#$$&KKNMM!	 ~~))&)=)5r%?4? r   c                 b    | j                   dd  }t        | j                        }t        ||      S )Nre   )rk   r   r\   ro   )rp   
dset_shapevalue_features      r   r   r   /  s,    ABJ)$**5M%j-@@r   c           	      .   |j                   }t        d | D              r*t        j                  d|  d| d| d       t	        |      S t        |       }|dk(  r|S |dk(  rt	        || d         S |d	k  r t        |      | |
      S t        d| d      )Nc              3   &   K   | ]	  }|d k(    ywr   r   r   s     r   r   z-_create_sized_feature_impl.<locals>.<genexpr>7  s     
*3!8
*r   z*HDF5 to Arrow: Found a dataset with shape z and dtype z\ that has a dimension with size 0. Shape information will be lost in the conversion to List(z).r   re   )length   )rk   r\   Arrayz.D not supported. Maximum 5 dimensions allowed.)r\   r   rJ   rK   r   r   _sized_arrayxd	TypeError)r   r   	dtype_strranks       r   ro   ro   5  s    ##I

*z
**8KPY{  [w  xE  wF  FH  I	
 M""z?Dqy	M*Q-88	#~d#*IFF%v%STUUr   r   c                 6    t         t        t        t        d|    S )N)         r   )r   r   r	   r
   )r   s    r   r   r   H  s    7w7;DAAr   numpy_dtypec                 R    t        t        t        j                  |                   S )Nrc   )r   r   ry   r   )r   s    r   r   r   L  s    /0C0CK0PQRRr   c                     | j                         D ]E  \  }}||vrt        |      rt        |||   | | d      }|/|c S t        |      s?| | c S  y )N/)prefix)r3   r   _first_datasetr   )r   r   r   r   rp   founds         r   r   r   P  sm    lln %
dxT?"4$6(4&PQ@RSE XdV$$%r   c           	          t        | |      }|y | |   j                  d   }| j                         D ]I  \  }}||vrt        |      s|j                  d   |k7  s*t	        d| d|j                  d    d|        |S )Nr   z	Dataset 'z' has length z but expected )r   rk   r3   r   r/   )r   r   
first_pathrT   r   rp   s         r   rI   rI   \  s    1Jj!''*Hlln i
dxtzz!}( 9TF-

1n]e\f!ghhi Or   c                 Z    dd l }t        | |j                        xs t        | t              S r   )r-   r4   Groupr   r   r-   s     r   r   r   k  s"    fdjj)OZ-OOr   c                 Z    dd l }t        | |j                        xs t        | t              S r   )r-   r4   Datasetr   r   s     r   r   r   q  s"    fdll+Qz&./QQr   c                 6    dd l }t        | |j                        S r   )r-   r4   r;   r   s     r   r   r   w  s    fdii((r   c                    t        | t              rt        d | j                  D              S t        | t              r&| j
                  dk(  xs t        | j                        S t        | t              rt        | j                        S y)Nc              3   &   K   | ]	  }|d k(    ywr   r   r   s     r   r   z'_has_zero_dimensions.<locals>.<genexpr>  s     53!85r   r   F)	r4   r   r   rk   r   r   _has_zero_dimensionsfeaturer   )r   s    r   r   r   }  se    '8$5w}}555	GT	"~~"K&:7??&KK	GY	'#GOO44r   ) )Dr8   dataclassesr   r   typingr   r   numpyrl   pyarrowry   r   datasets.features.featuresr   r   r	   r
   r   r   r   r   r   r   datasets.tabler   r-   utilslogging
get_loggerr   rJ   
EXTENSIONSBuilderConfigr   ArrowBasedBuilderr!   r\   boolrb   rs   r   rz   r}   r   r   r   r   r   r   r   r   r   r<   r   r5   r   r   rO   r   ro   r   r   r   rI   r   r   r   r   r   r   r   <module>r      sQ    ( *      2 				*	*8	4W
 1'' 1 168%% 6|RXX $ h 2\BJJ \2>> \bhh 4  Q Q Q 
) 
) 
)1x 1
Ebnn E"(( t 8 F

 Frxx F# #'NC N N# N"(( N4@X @c @ @HAV&B BSrxx SE S	%X 	%X # P PR4 R) )r   