
    bi2                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
mZ d dlmZ d dlmZmZ ddlmZ ddlmZ dd	lmZ  ee      Z G d
 d      Z G d de
      Z G d dee
      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z  G d de      Z! G d de      Z" G d de      Z# G d  d!e      Z$ G d" d#      Z%y)$    N)ABCabstractmethod)Path)OptionalUnion   )config   )FileLock)
get_loggerc                   Z    e Zd Zddee   fdZdedefdZdededefd	Zdd
ededefdZ	y)ExtractManagerN	cache_dirc                     |r.t         j                  j                  |t        j                        nt        j
                  | _        t        | _        y N)	ospathjoinr	   EXTRACTED_DATASETS_DIREXTRACTED_DATASETS_PATHextract_dir	Extractor	extractor)selfr   s     Q/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/datasets/utils/extract.py__init__zExtractManager.__init__   s6    FOBGGLLF$A$ABU[UsUs 	 #    r   returnc                     ddl m} t        j                  j	                  |      }t        j                  j                  | j                   ||            S )Nr
   )hash_url_to_filename)
file_utilsr    r   r   abspathr   r   )r   r   r    abs_paths       r   _get_output_pathzExtractManager._get_output_path   s:    4 77??4(ww||D,,.B8.LMMr   output_pathforce_extractc                     |xsY t         j                  j                  |       xr7 t         j                  j                  |      xr t        j                  |       S r   )r   r   isfileisdirlistdir)r   r%   r&   s      r   _do_extractzExtractManager._do_extract%   sI     
{++lRWW]];5O5kTVT^T^_jTk0l	
r   
input_pathc                     | j                   j                  |      }|s|S | j                  |      }| j                  ||      r| j                   j	                  |||       |S r   )r   infer_extractor_formatr$   r+   extract)r   r,   r&   extractor_formatr%   s        r   r/   zExtractManager.extract*   s]    >>@@L++J7K7NN"":{<LMr   r   F)
__name__
__module____qualname__r   strr   r$   boolr+   r/    r   r   r   r      s\    #(3- #NS NS N
s 
4 
D 

# d s r   r   c                   v    e Zd Zeedeeef   defd              Z	e
edeeef   deeef   ddfd              Zy)BaseExtractorr   r   c                      y r   r7   clsr   kwargss      r   is_extractablezBaseExtractor.is_extractable5   s    GJr   r,   r%   Nc                      y r   r7   )r,   r%   s     r   r/   zBaseExtractor.extract9   s    VYr   )r2   r3   r4   classmethodr   r   r   r5   r6   r>   staticmethodr/   r7   r   r   r9   r9   4   sc    J%c	"2JJ  JYE$),Y5s;KYPTY  Yr   r9   c                   p    e Zd ZU g Zee   ed<   edee	e
f   defd       Zed	dee	e
f   dedefd       Zy)
MagicNumberBaseExtractormagic_numbersr   magic_number_lengthc                 h    t        | d      5 }|j                  |      cd d d        S # 1 sw Y   y xY w)Nrb)openread)r   rE   fs      r   read_magic_numberz*MagicNumberBaseExtractor.read_magic_numberA   s0    $ 	/66-.	/ 	/ 	/s   (1magic_numberr   c                     s/t        d | j                  D              }	 | j                  ||      t	        fd| j                  D              S # t        $ r Y yw xY w)Nc              3   2   K   | ]  }t        |        y wr   )len).0cls_magic_numbers     r   	<genexpr>z:MagicNumberBaseExtractor.is_extractable.<locals>.<genexpr>I   s     %f@Pc*:&;%fs   Fc              3   @   K   | ]  }j                  |        y wr   )
startswith)rP   rQ   rL   s     r   rR   z:MagicNumberBaseExtractor.is_extractable.<locals>.<genexpr>N   s     gAQ<**+;<gs   )maxrD   rK   OSErrorany)r<   r   rL   rE   s     ` r   r>   z'MagicNumberBaseExtractor.is_extractableF   sd    "%%fTWTeTe%f"f"44T;NO gUXUfUfggg  s   A 	AANr   )r2   r3   r4   rD   listbytes__annotations__rA   r   r   r5   intrK   r@   r6   r>   r7   r   r   rC   rC   >   sq    !#M4;#/dCi 0 /s / / h%c	"2 h% hRV h hr   rC   c                   r    e Zd Zedeeef   defd       Ze	d        Z
e	deeef   deeef   ddfd       Zy)	TarExtractorr   r   c                 ,    t        j                  |      S r   )tarfile
is_tarfiler;   s      r   r>   zTarExtractor.is_extractableR   s    !!$''r   c              #   >  K   dt         dt         fddt         dt         dt        ffddt         dt        ffd} |      }| D ]  } |j                  |      r$t        j	                  d|j                   d       :|j                         r9 |||      r0t        j	                  d|j                   d	|j                          |j                         r9 |||      r0t        j	                  d|j                   d
|j                          |  yw)a  
        Fix for CVE-2007-4559
        Desc:
            Directory traversal vulnerability in the (1) extract and (2) extractall functions in the tarfile
            module in Python allows user-assisted remote attackers to overwrite arbitrary files via a .. (dot dot)
            sequence in filenames in a TAR archive, a related issue to CVE-2001-1267.
        See: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2007-4559
        From: https://stackoverflow.com/a/10077309
        r   r   c                 z    t         j                  j                  t         j                  j                  |             S r   )r   r   realpathr"   )r   s    r   resolvedz*TarExtractor.safemembers.<locals>.resolvedb   s$    77##BGGOOD$9::r   basec                 p     t         j                  j                  ||             j                  |       S r   )r   r   r   rT   )r   rf   re   s     r   badpathz)TarExtractor.safemembers.<locals>.badpathe   s+    T4 89DDTJJJr   c                      t         j                  j                  |t         j                  j                  | j                                    } | j
                  |      S )N)rf   )r   r   r   dirnamenamelinkname)inforf   tiprh   re   s      r   badlinkz)TarExtractor.safemembers.<locals>.badlinki   s>    277<<bggoodii.HIJC4==s33r   zExtraction of z is blocked (illegal path)z is blocked: Symlink to z is blocked: Hard link to N)r5   r6   rk   loggererrorissymrl   islnk)membersr%   ro   rf   finforh   re   s        @@r   safememberszTarExtractor.safemembersV   s    	;3 	;3 	;	K# 	KS 	KT 	K	4 	4 	4
 $ 	Euzz4(~ejj\9STU75$#7~ejj\9QRWR`R`Qabc75$#7~ejj\9STYTbTbScde	s   DDr,   r%   Nc                     t        j                  |d       t        j                  |       }|j	                  |t
        j                  ||             |j                          y )NTexist_ok)rt   )r   makedirsr`   rH   
extractallr^   rv   close)r,   r%   tar_files      r   r/   zTarExtractor.extractz   sI    
K$/<<
+K1I1I(T_1`ar   )r2   r3   r4   r@   r   r   r5   r6   r>   rA   rv   r/   r7   r   r   r^   r^   Q   s|    (%c	"2 ( ( ( ! !F E$), 5s;K PT  r   r^   c                   D    e Zd ZdgZedeeef   deeef   ddfd       Zy)GzipExtractors   r,   r%   r   Nc                     t        j                  | d      5 }t        |d      5 }t        j                  ||       d d d        d d d        y # 1 sw Y   xY w# 1 sw Y   y xY wNrG   wb)gziprH   shutilcopyfileobj)r,   r%   	gzip_fileextracted_files       r   r/   zGzipExtractor.extract   s]    YYz4( 	>Ik4( >N""9n=>	> 	>> >	> 	>!   AAAA	AA!	r2   r3   r4   rD   rA   r   r   r5   r/   r7   r   r   r   r      sC     MM>E$), >5s;K >PT > >r   r   c                   |     e Zd Zg dZed
deeef   dede	f fd       Z
edeeef   deeef   ddfd	       Z xZS )ZipExtractor)s   PKs   PKs   PKr   rL   r   c                    t         |   ||      ry	 ddlm}m}m}m}m}m}m	}	m
}
m}m} t        |d      5 } |	|      }|r||   dk(  r||   dk(  r||   dk(  r
	 d d d        y||   ||   k(  ry|j                  ||          |j                         ||   k(  rO||   |
k\  rG|j!                  |
      }t#        |      |
k(  r(t%        j&                  ||      }||   |k(  r
	 d d d        yd d d        y# 1 sw Y   yxY w# t(        $ r Y yw xY w)NrL   Tr   )
_CD_SIGNATURE_ECD_DISK_NUMBER_ECD_DISK_START_ECD_ENTRIES_TOTAL_ECD_OFFSET	_ECD_SIZE_EndRecDatasizeCentralDirstringCentralDirstructCentralDirrG   F)superr>   zipfiler   r   r   r   r   r   r   r   r   r   rH   seektellrI   rO   structunpack	Exception)r<   r   rL   r   r   r   r   r   r   r   r   r   r   fpendrecdatacentdir	__class__s                    r   r>   zZipExtractor.is_extractable   s4   7!$\!B	   dD! 0R$R01Q66);LPQ;QV\]hVimnVn#	0 0
   01VO5LL{ 34779{(;;y@QUc@c#%77>#:D"4yN:*0--8H$*O#*=#9=M#M+/0 0 0 0  		sA   $C? $C3C? &A;C3!C? *C? 3C<8C? <C? ?	D
Dr,   r%   Nc                     t        j                  |d       t        j                  | d      5 }|j	                  |       |j                          d d d        y # 1 sw Y   y xY w)NTrx   r)r   rz   r   ZipFiler{   r|   )r,   r%   zip_files      r   r/   zZipExtractor.extract   sM    
K$/__Z- 	,NN	 	 	s   "AA"rX   )r2   r3   r4   rD   r@   r   r   r5   rZ   r6   r>   rA   r/   __classcell__)r   s   @r   r   r      sz    M "%c	"2 "% "RV " "H E$), 5s;K PT  r   r   c                   D    e Zd ZdgZedeeef   deeef   ddfd       Zy)XzExtractors   7zXZ r,   r%   r   Nc                     t        j                  |       5 }t        |d      5 }t        j                  ||       d d d        d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w)Nr   )lzmarH   r   r   r,   r%   compressed_filer   s       r   r/   zXzExtractor.extract   sd    YYz" 	Dok4( DN""?NCD	D 	DD D	D 	Ds!   AAAA	AA r   r7   r   r   r   r      sI    01MDE$), D5s;K DPT D Dr   r   c                   F    e Zd ZddgZedeeef   deeef   ddfd       Zy)RarExtractors   Rar! s   Rar! r,   r%   r   Nc                     t         j                  st        d      dd l}t	        j
                  |d       |j                  |       }|j                  |       |j                          y )NzPlease pip install rarfiler   Trx   )	r	   RARFILE_AVAILABLEImportErrorrarfiler   rz   RarFiler{   r|   )r,   r%   r   rfs       r   r/   zRarExtractor.extract   sK    '':;;
K$/__Z(
k"

r   r   r7   r   r   r   r      sG    (*ABME$), 5s;K PT  r   r   c                   D    e Zd ZdgZedeeef   deeef   ddfd       Zy)ZstdExtractors   (/r,   r%   r   Nc                 
   t         j                  st        d      dd l}|j	                         }t        | d      5 }t        |d      5 }|j                  ||       d d d        d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w)NzPlease pip install zstandardr   rG   r   )r	   ZSTANDARD_AVAILABLEr   	zstandardZstdDecompressorrH   copy_stream)r,   r%   zstddctxifhofhs         r   r/   zZstdExtractor.extract   sy    ))<== $$&*d# 	'sDd,C 	'sS#&	' 	' 	' 	' 	' 	's#   A9	A-A9-A6	2A99Br   r7   r   r   r   r      sD    ()M'E$), '5s;K 'PT ' 'r   r   c                   D    e Zd ZdgZedeeef   deeef   ddfd       Zy)Bzip2Extractors   BZhr,   r%   r   Nc                     t        j                  | d      5 }t        |d      5 }t        j                  ||       d d d        d d d        y # 1 sw Y   xY w# 1 sw Y   y xY wr   )bz2rH   r   r   r   s       r   r/   zBzip2Extractor.extract   sf    XXj$' 	D?k4( DN""?NCD	D 	DD D	D 	Dr   r   r7   r   r   r   r      sI    $%MDE$), D5s;K DPT D Dr   r   c                   D    e Zd ZdgZedeeef   deeef   ddfd       Zy)SevenZipExtractors   7z'r,   r%   r   Nc                     t         j                  st        d      dd l}t	        j
                  |d       |j                  | d      5 }|j                  |       d d d        y # 1 sw Y   y xY w)NzPlease pip install py7zrr   Trx   r   )r	   PY7ZR_AVAILABLEr   py7zrr   rz   SevenZipFiler{   )r,   r%   r   archives       r   r/   zSevenZipExtractor.extract   s]    %%899
K$/
C0 	,G{+	, 	, 	,s   	A$$A-r   r7   r   r   r   r      sD    01M,E$), ,5s;K ,PT , ,r   r   c                   D    e Zd ZdgZedeeef   deeef   ddfd       Zy)Lz4Extractors   "Mr,   r%   r   Nc                    t         j                  st        d      dd l}|j                  j                  | d      5 }t        |d      5 }t        j                  ||       d d d        d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w)NzPlease pip install lz4r   rG   r   )r	   LZ4_AVAILABLEr   	lz4.frameframerH   r   r   )r,   r%   lz4r   r   s        r   r/   zLz4Extractor.extract  s    ##677YY^^J- 	Dk4( DN""?NCD	D 	DD D	D 	Ds#   A=	A1 A=1A:	6A==Br   r7   r   r   r   r      sI    ()MDE$), D5s;K DPT D Dr   r   c            
           e Zd ZU eeeeeee	e
ed	Zeeee   f   ed<   ed        Zedeeef   defd       Zeddeeef   dedefd	       Zedeeef   dee   fd
       Zedeeef   deeef   deddfd       Zy)r   )	tarr   zipxzrarr   r   7zr   
extractorsc                 V    t        d | j                  j                         D              S )Nc              3   t   K   | ]0  }t        |t              r|j                  D ]  }t        |        2 y wr   )
issubclassrC   rD   rO   )rP   r   extractor_magic_numbers      r   rR   z9Extractor._get_magic_number_max_length.<locals>.<genexpr>  sD      
)%=>*3*A*A	
 ' &'
'
s   68)rU   r   values)r<   s    r   _get_magic_number_max_lengthz&Extractor._get_magic_number_max_length  s)     
 ^^224
 
 	
r   r   rE   c                 P    	 t         j                  | |      S # t        $ r Y yw xY w)N)rE   r   )rC   rK   rV   )r   rE   s     r   _read_magic_numberzExtractor._read_magic_number$  s0    	+==dXk=ll 		s    	%%return_extractorr   c                     t        j                  dt               | j                  |      }|r|sdS d| j                  |   fS |sdS dS )Nz{Method 'is_extractable' was deprecated in version 2.4.0 and will be removed in 3.0.0. Use 'infer_extractor_format' instead.)categoryTF)FN)warningswarnFutureWarningr.   r   )r<   r   r   r0   s       r   r>   zExtractor.is_extractable+  sU    4"	

 55d;/4]dCNNK[<\5]],u?-?r   c                     | j                         }| j                  ||      }| j                  j                         D ]  \  }}|j	                  ||      s|c S  y )Nr   )r   r   r   itemsr>   )r<   r   magic_number_max_lengthrL   r0   r   s         r   r.   z Extractor.infer_extractor_format7  s_    "%"B"B"D--d4KL+.>>+?+?+A 	('i''<'H''	(r   r,   r%   r0   Nc                 b   t        j                  t         j                  j                  |      d       t	        t        |      j                  d            }t        |      5  t        j                  |d       | j                  |   }|j                  ||      cd d d        S # 1 sw Y   y xY w)NTrx   z.lock)ignore_errors)r   rz   r   rj   r5   r   with_suffixr   r   rmtreer   r/   )r<   r,   r%   r0   	lock_pathr   s         r   r/   zExtractor.extract?  s     	BGGOOK04@[)55g>?	i  	>MM+T:'78I$$Z=	> 	> 	>s   #8B%%B.r1   )r2   r3   r4   r^   r   r   r   r   r   r   r   r   r   dictr5   typer9   r[   r@   r   rA   r   r   r\   r   r6   r>   r   r.   r/   r7   r   r   r   r     s.    
2JS$}--. 
 
 
 tSy!1    	@%c	"2 	@d 	@W[ 	@ 	@ (%c	*: (x} ( ( >$)$> 49%> 	>
 
> >r   r   )&r   r   r   r   r   r   r`   r   r   abcr   r   pathlibr   typingr   r    r	   	_filelockr   loggingr   r2   rp   r   r9   rC   r^   r   r   r   r   r   r   r   r   r   r7   r   r   <module>r      s    
   	      #  "    
H	 <ZC Zh}c h&.= .b>, >1+ 1hD* D+ ', 'D- D,0 ,D+ D?> ?>r   