
    bi(                     ,   d dl Z d dlZd dlZd dlmZ d dlmZmZ d dlZd dl	Z
d dlZd dlZd dlmZ d dlmZmZ ej&                  j(                  j+                  e      Z G d dej0                        Zd Zg d	Zee_        g d
Zee_        g dZee_        defdZdefdZ defdZ!defdZ"defdZ#defdZ$defdZ%i dededede&de&de&de&de&dejN                  dejN                  de de d e!d!e!d"e"d#e#d$e$d%e%iZ(e(e_(        y)&    N)islice)AnyCallable)cast_to_python_objects)-SINGLE_FILE_COMPRESSION_EXTENSION_TO_PROTOCOL	xbasenamec                       e Zd ZU dZee   ed<   ee   ed<   ee   ed<   eeee	ge	f   f   ed<   dZ
ed        Zdej                  fd	Zd
 Zd Zy)
WebDatasetd   IMAGE_EXTENSIONSAUDIO_EXTENSIONSVIDEO_EXTENSIONSDECODERS   c              #     K   i }t        j                  d      }t        j                         }|D ]x  \  }}t	        |      \  }}	||r6|d   |k7  r.|j                  d      |d<   |j                  d      |d<   | i }||d<   ||d<   |j                         ||	<   |	j                  d      d   j                         t        v r|j                  |||	          |j                  d|       }
t        j                  |
      5 }|j                         ||	<   d d d        |j                  |       t        |
      j                  d      d   j                         }n"|	j                  d      d   j                         }|| j                  v s^ | j                  |   ||	         ||	<   { |r| y y # 1 sw Y   xY ww)Nmemory__key____url__.z	memory://)fsspec
filesystemdatasetsStreamingDownloadManagerbase_plus_extpopreadsplitlowerr   write_bytesextractopendeleter   r   )clstar_pathtar_iteratorcurrent_examplefsstreaming_download_managerfilenamefexample_key
field_nameextracted_file_pathdata_extensions               j/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/datasets/packaged_modules/webdataset/webdataset.py_get_pipeline_from_tarz!WebDataset._get_pipeline_from_tar   s    (.(9(9((C%-%F%F%H"' 	hKHa&3H&=#K"?9#=#L-<-@-@-K	*-<-@-@-K	*%%"$)4OI&)1OI&*+&&(OJ'$R(..04aax)DE&@&H&H9U]T^I_&`#[[!45 ;23&&(OJ/;		(#!*+>!?!E!Ec!J2!N!T!T!V!+!1!1#!6r!:!@!@!B-.Jcll>.J?[eKf.g
+/	h0 !! ; ;s%   DG F4A5G &G 4F=	9G returnc                 *    t        j                         S )N)r   DatasetInfo)selfs    r0   _infozWebDataset._info;   s    ##%%    c           	         | j                   j                  s"t        d| j                   j                         |j                  | j                   j                        }g }|j	                         D ]_  \  }}t        |t              r|g}|D cg c]  }|j                  |       }}|j                  t        j                  |||d             a | j                  j                  s|| j                  d   d         }t        t        || j                               t#        fdD              rt        d      D 	cg c]-  }	t$        j&                  j)                  t+        |	gd            / }
}	t%        j,                  |
d	
      j.                  }t        j0                  j3                  |      }d   D ]  }|j5                  dd      d   j7                         }|| j8                  v rt        j:                         ||<   || j<                  v rt        j>                         ||<   || j@                  v st        jB                         ||<    || j                  _        |S c c}w c c}	w )z-We handle string, list and dicts in datafilesz=At least one data file must be specified, but got data_files=)	tar_pathstar_iterators)name
gen_kwargsr   c              3   f   K   | ](  }|j                         d    j                         k7   * yw)r   N)keys).0examplefirst_exampless     r0   	<genexpr>z/WebDataset._split_generators.<locals>.<genexpr>R   s*     \'7<<>^A%6%;%;%==\s   .1zThe TAR archives of the dataset should be in WebDataset format, but the files in the archive don't share the same prefix or the same types.T)only_1d_for_numpydefault)promote_optionsr      r   )"config
data_files
ValueErrordownloaditems
isinstancestriter_archiveappendr   SplitGeneratorinfofeaturesr1   listr   #NUM_EXAMPLES_FOR_FEATURES_INFERENCEanypaTablefrom_pylistr   concat_tablesschemaFeaturesfrom_arrow_schemarsplitr   r   Imager   Audior   Video)r5   
dl_managerrH   splits
split_namer9   r%   r:   pipeliner@   	pa_tablesinferred_arrow_schemarR   r-   	extensionrA   s                  @r0   _split_generatorszWebDataset._split_generators>   sA    {{%%\]a]h]h]s]s\tuvv(()?)?@
%/%5%5%7 	!J	)S)&K	OXY8Z44X>YMYMM''#iZg0h		 yy!!229Q<qAQRH!&43[3["\]N\^\\ b   . $$%;WIY]%^_I  %'$4$4YPY$Z$a$a!((::;PQH,Q/ 
<
&--c15b9??A	 5 55+3>>+;HZ( 5 55+3>>+;HZ( 5 55+3>>+;HZ(
< "*DIIG Zs   I(2I-c              #     K   | j                   j                  j                         D cg c]"  \  }}t        |t        j
                        s!|$ }}}| j                   j                  j                         D cg c]"  \  }}t        |t        j                        s!|$ }}}t        | j                   j                  j                               }t        t        ||            D ]k  \  }\  }	}
t        | j                  |	|
            D ]E  \  }}|D ]  }||vsd ||<    ||z   D ]  }||   	|d   dz   |z   ||   d||<    | d| |f G m y c c}}w c c}}w w)Nr   r   )pathbytes_)rQ   rR   rK   rL   r   r^   r_   rS   r>   	enumeratezipr1   )r5   r9   r:   r-   featureimage_field_namesaudio_field_namesall_field_namestar_idxr%   r&   example_idxr@   s                r0   _generate_exampleszWebDataset._generate_examplesm   s    26))2D2D2J2J2L
.:wPZ[bdldrdrPsJ
 
 37))2D2D2J2J2L
.:wPZ[bdldrdrPsJ
 
 tyy1166891:3y-;X1Y 	:-G-h(1$2M2MhXd2e(f 
:$W"1 3J!0.2
+3 #46G"G Jz*6$+I$6$<z$I%,Z%8/
+ !	;-0'99
:	:

s5   (E""EE*E";"EE"A8E"E"02E"N)__name__
__module____qualname__DEFAULT_WRITER_BATCH_SIZErS   rM   __annotations__dictr   r   rT   classmethodr1   r   r4   r6   rh   ru    r7   r0   r
   r
      sv     #3i3i3i3#,,--*+'" ">&x++ &-^:r7   r
   c                 x    t        j                  d|       }|sy|j                  d      |j                  d      fS )z>Split off all file extensions.

    Returns base, allext.
    z^((?:.*/|)[^.]+)[.]([^/]*)$)NNrF      )rematchgroup)rj   r   s     r0   r   r      s6    
 HH3T:E;;q>5;;q>))r7   )?blpbmpdibbufrcurpcxdcxddspsepsfitfitsfliflcftcftugbrgifgribh5hdfpngapngjp2j2kjpcjpfjpxj2cicnsicoimiimtiftiffjfifjpejpgjpegmpgmpegmsppcdpxrpbmpgmppmpnmpsdbwrgbrgbasgirastgaicbvdavstwebpwmfemfxbmxpm)aiffauavrcafflachtksvxmat4mat5mpc2koggpafpvfrawrf64sd2sdsircamvocw64wavnistwavexwveximp3opus)z.mkvz.mp4z.aviz.mpegz.movdatac                 $    | j                  d      S )Nzutf-8)decoder   s    r0   
text_loadsr     s    ;;wr7   c                 0    ddl m} |j                  |       S )NrF   )_tenbin) r   decode_buffer)r   r   s     r0   tenbin_loadsr     s      &&r7   c                 ,    dd l }|j                  |       S Nr   )msgpackunpackb)r   r   s     r0   msgpack_loadsr   !  s    ??4  r7   c                     dd l }t        j                  |       }|j                  j                  j                  |d      S )Nr   Fallow_pickle)numpy.lib.formatioBytesIOlibformat
read_array)r   numpystreams      r0   	npy_loadsr   '  s3    ZZF99&&vE&BBr7   c                 V    t        j                  t        j                  |       d      S )NFr   )nploadr   r   r   s    r0   	npz_loadsr   .  s    772::d#%88r7   c                 ,    dd l }|j                  |       S r   )cborloads)r   r   s     r0   
cbor_loadsr   2  s    ::dr7   c                 V    dd l }|j                  t        j                  |       d      S )Nr   T)weights_only)torchr   r   r   )r   r  s     r0   torch_loadsr  8  s!    ::bjj&T:::r7   txttext
transcriptr$   cls2indexinxidjsonjsntentbmpmsgnpynpzr   pth))r   r  r   	itertoolsr   typingr   r   r   r   r   pyarrowrV   r   datasets.features.featuresr   datasets.utils.file_utilsr   r   utilslogging
get_loggerrv   loggerGeneratorBasedBuilderr
   r   r   r   r   rk   r   r   r   r   r   r   r  intr   r   r}   r7   r0   <module>r     s   	  	        = ^ 
			*	*8	4m:// m:b*.@ B /
   : /
   /
  U  'u '! !CE C9E 9U ;e ;	:
J * 
3	
 C S 
3 	# DJJ 
4:: 
< 	, 	- 
= 
9  
9!" J#$ 
;%( 
 r7   