
    bi1                     :   d Z ddlZddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
 ddlZddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZmZ ddlmZ ddlmZmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&  ee'      Z( G d dejR                        Z* G d d      Z+y)zDownload manager interface.    N)datetime)partial)OptionalUnion)	url_to_fs)
thread_map   )config)tqdm)ArchiveIterableFilesIterablecached_pathis_relative_path,stack_multiprocessing_download_progress_barsurl_or_path_join)get_size_checksum_dict)
get_loggerr   )NestedDataStructure
map_nested)tracked_str   )DownloadConfigc                       e Zd ZdZdZdZdZy)DownloadModea)  `Enum` for how to treat pre-existing downloads and data.

    The default mode is `REUSE_DATASET_IF_EXISTS`, which will reuse both
    raw downloads and the prepared dataset if they exist.

    The generations modes:

    |                                     | Downloads | Dataset |
    |-------------------------------------|-----------|---------|
    | `REUSE_DATASET_IF_EXISTS` (default) | Reuse     | Reuse   |
    | `REUSE_CACHE_IF_EXISTS`             | Reuse     | Fresh   |
    | `FORCE_REDOWNLOAD`                  | Fresh     | Fresh   |

    reuse_dataset_if_existsreuse_cache_if_existsforce_redownloadN)__name__
__module____qualname____doc__REUSE_DATASET_IF_EXISTSREUSE_CACHE_IF_EXISTSFORCE_REDOWNLOAD     ]/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/datasets/download/download_manager.pyr   r   2   s     83)r&   r   c            
          e Zd ZdZ	 	 	 	 	 ddee   dee   dee   dee   fdZed        Z	ed	        Z
d
edefdZd Zdee   dedee   fdZdededefdZdeeej&                  f   fdZdeeee   f   fdZd Zd Zd Zd Zd Zy)DownloadManagerFNdataset_namedata_dirdownload_config	base_pathc                     || _         || _        |xs t        j                  j	                  d      | _        i | _        || _        |xs
 t               | _	        i | _
        i | _        y)a4  Download manager constructor.

        Args:
            data_dir:
                can be used to specify a manual directory to get the files from.
            dataset_name (`str`):
                name of dataset this instance will be used for. If
                provided, downloads will contain which datasets they were used for.
            download_config (`DownloadConfig`):
                to specify the cache directory and other
                download options
            base_path (`str`):
                base path that is used when relative paths are used to
                download files. This can be a remote url.
            record_checksums (`bool`, defaults to `True`):
                Whether to record the checksums of the downloaded files. If None, the value is inferred from the builder.
        .N)_dataset_name	_data_dirospathabspath
_base_path_recorded_sizes_checksumsrecord_checksumsr   r,   downloaded_pathsextracted_paths)selfr*   r+   r,   r-   r7   s         r'   __init__zDownloadManager.__init__J   s\    2 *!#;rwws';Z\& 0.B.2B "!r&   c                     | j                   S N)r1   r:   s    r'   
manual_dirzDownloadManager.manual_dirm   s    ~~r&   c                 V    t        d | j                  j                         D              S )z+Returns the total size of downloaded files.c              3   &   K   | ]	  }|d      yw)	num_bytesNr%   ).0checksums_dicts     r'   	<genexpr>z2DownloadManager.downloaded_size.<locals>.<genexpr>t   s     m>>+.ms   )sumr6   valuesr>   s    r'   downloaded_sizezDownloadManager.downloaded_sizeq   s$     mTEcEcEjEjElmmmr&   url_or_urlsdownloaded_path_or_pathsc           	          d}t        t        t        |j                         |j                                     |d      D ]2  \  }}t	        || j
                        | j                  t        |      <   4 y)z)Record size/checksum of downloaded files.   zComputing checksums)delaydesc)record_checksumN)hf_tqdmlistzipflattenr   r7   r6   str)r:   rI   rJ   rM   urlr3   s         r'   _record_sizes_checksumsz'DownloadManager._record_sizes_checksumsv   sn     [((*,D,L,L,NOP&
 	IC 8Nd&;&;8D**3s84	r&   c           
      `   | j                   j                         }d|_        |j                  d|_        t	        | j
                  |      }t        j                         }t               5  t        ||d|j                  ddd      }ddd       t        j                         |z
  }t        j                  d	|j                         d
z   d       t        |      }t              }| j                  j!                  t#        t%        |j'                         |j'                                            t        j                         }| j)                  ||       t        j                         |z
  }t        j                  d|j                         d
z   d       |j*                  S # 1 sw Y   xY w)ay  Download given URL(s).

        By default, only one process is used for download. Pass customized `download_config.num_proc` to change this behavior.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL or `list` or `dict` of URLs to download. Each URL is a `str`.

        Returns:
            `str` or `list` or `dict`:
                The downloaded paths matching the given input `url_or_urls`.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        ```
        FNzDownloading datar,   TzDownloading data files)	map_tuplenum_procrN   batched
batch_sizezDownloading took <   z minzChecksum Computation took )r,   copyextract_compressed_filedownload_descr   _download_batchedr   nowr   r   r[   loggerinfototal_secondsr   r8   updatedictrR   rS   rV   data)r:   rI   r,   download_func
start_timerJ   durations          r'   downloadzDownloadManager.download   sp   & ..33527/((0,>O) 6 6X\\^
9; 		'1(11-($		 <<>J.'(>(>(@B(F'GtLM)+6#67O#P $$T#k.A.A.CE]EeEeEg*h%ij\\^
$$[2JK<<>J.01G1G1IR1O0PPTUV',,,+		 		s   *F##F-url_or_filenamesreturnc           	         t        |      dk\  r6|j                         }d|_        t        | j                  |      }t        |d         }t        |      rt        | j                  |      }t        |fi |j                  \  }}d}	 |j                  |      j                  dd      }|dk  rt        j                  nd}t!        |||j"                  xs dd	t$        j&                  j                  d
      dk(  r?t)        j*                         j,                  r!t)        j*                         j,                  d   nd |t.              S |D cg c]  }| j	                  ||       c}S # t        $ r Y w xY wc c}w )N   TrX   r   sizei  @r   Downloadingfiles8HF_DATASETS_STACK_MULTIPROCESSING_DOWNLOAD_PROGRESS_BARS1rY   )rN   unitpositionmax_workers
tqdm_class)lenr_   disable_tqdmr   _download_singlerT   r   r   r5   r   storage_optionsre   get	Exceptionr
   &HF_DATASETS_MULTITHREADING_MAX_WORKERSr   ra   r2   environmultiprocessingcurrent_process	_identityr   )	r:   rn   r,   rj   r3   fsrr   ry   url_or_filenames	            r'   rb   z!DownloadManager._download_batched   su   
  B&-224O+/O(#D$9$9?[M '*+D%'> I)H)HIHBDwwt}((3 BFAR==XY   $22Cm::>>"\]add#335?? )88:DDRH '  (8# %%o%W '  &s   	!E' 
E6'	E32E3r   c                     t        |      }t        |      rt        | j                  |      }t	        ||      }t        |      }|j                  |       |S )NrX   )rT   r   r   r5   r   r   
set_origin)r:   r   r,   outs       r'   r}   z DownloadManager._download_single   sK    o.O,.tPO/?K#'
r&   path_or_bufc                 n    t        |d      rt        j                  |      S t        j                  |      S )aK  Iterate over files within an archive.

        Args:
            path_or_buf (`str` or `io.BufferedReader`):
                Archive path or archive binary file object.

        Yields:
            `tuple[str, io.BufferedReader]`:
                2-tuple (path_within_archive, file_object).
                File object is opened in binary mode.

        Example:

        ```py
        >>> archive = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> files = dl_manager.iter_archive(archive)
        ```
        read)hasattrr   from_buffrom_urlpath)r:   r   s     r'   iter_archivezDownloadManager.iter_archive   s0    ( ;'"++K88"//<<r&   pathsc                 ,    t        j                  |      S )a  Iterate over file paths.

        Args:
            paths (`str` or `list` of `str`):
                Root paths.

        Yields:
            `str`: File path.

        Example:

        ```py
        >>> files = dl_manager.download_and_extract('https://huggingface.co/datasets/beans/resolve/main/data/train.zip')
        >>> files = dl_manager.iter_files(files)
        ```
        )r   from_urlpaths)r:   r   s     r'   
iter_fileszDownloadManager.iter_files  s    " **511r&   c           	      |   | j                   j                         }d|_        t        | j                  |      }t        |||j                  d      }t        |      }t        |      }| j                  j                  t        t        |j                         |j                                            |j                  S )a$  Extract given path(s).

        Args:
            path_or_paths (path or `list` or `dict`):
                Path of file to extract. Each path is a `str`.

        Returns:
            extracted_path(s): `str`, The extracted paths matching the given input
            path_or_paths.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> extracted_files = dl_manager.extract(downloaded_files)
        ```
        TrX   zExtracting data files)r[   rN   )r,   r_   r`   r   r}   r   r[   r   r9   rg   rh   rR   rS   ri   )r:   path_or_pathsr,   extract_funcr9   s        r'   extractzDownloadManager.extract  s    $ ..33526/t44oV$$--(	
 ,M:-o>##D]-B-B-DoF]F]F_)`$ab###r&   c                 B    | j                  | j                  |            S )a  Download and extract given `url_or_urls`.

        Is roughly equivalent to:

        ```
        extracted_paths = dl_manager.extract(dl_manager.download(url_or_urls))
        ```

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL or `list` or `dict` of URLs to download and extract. Each URL is a `str`.

        Returns:
            extracted_path(s): `str`, extracted paths of given URL(s).
        )r   rm   )r:   rI   s     r'   download_and_extractz$DownloadManager.download_and_extract6  s      ||DMM+677r&   c                 6    | j                   j                         S r=   )r6   r_   r>   s    r'   get_recorded_sizes_checksumsz,DownloadManager.get_recorded_sizes_checksumsH  s    --2244r&   c                 v   t        | j                  j                               t        | j                  j                               z
  }t	        | j                  j                               D ]L  \  }}||v st        j                  j                  |      s+t        j                  |       | j                  |= N y r=   )
setr9   rG   r8   rQ   itemsr2   r3   isfileremove)r:   paths_to_deletekeyr3   s       r'   delete_extracted_filesz&DownloadManager.delete_extracted_filesK  s    d2299;<s4CXCXC_C_Ca?bbd2288:; 	.IC&277>>$+?		$((-	.r&   c                 R    | j                   j                  r| j                          y y r=   )r,   delete_extractedr   r>   s    r'   manage_extracted_filesz&DownloadManager.manage_extracted_filesR  s"    00'') 1r&   )NNNNT)r   r   r    is_streamingr   rT   r   r;   propertyr?   rH   r   rV   rm   rQ   rb   r}   r   ioBufferedReaderr   r   r   r   r   r   r   r%   r&   r'   r)   r)   G   s.   L '+"&48#'!"sm!" 3-!" ".1	!"
 C=!"F   n n3F bu 0-d)s)) () 
c	)V n Y\ =c23D3D.D(E =22c49n 5 2&$@8$5.*r&   r)   ),r!   enumr   r   r2   r   	functoolsr   typingr   r   fsspecfsspec.corer   tqdm.contrib.concurrentr    r
   utilsr   rP   utils.file_utilsr   r   r   r   r   r   utils.info_utilsr   utils.loggingr   utils.py_utilsr   r   utils.trackr   r,   r   r   rd   Enumr   r)   r%   r&   r'   <module>r      sq     "  	  	   "  ! .  #  6 , < % + 
H	*499 **M* M*r&   