
    biq                         d dl Z d dlZd dlmZ d dlmZmZ ddlmZm	Z	m
Z
mZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z* dd	l+m,Z,  e(e-      Z. G d
 d      Z/y)    N)Iterable)OptionalUnion   )!SINGLE_FILE_COMPRESSION_PROTOCOLSArchiveIterableFilesIterable_get_extraction_protocol_get_path_extension!_prepare_path_and_storage_optionsis_relative_pathurl_or_path_join	xbasenamexdirname	xet_parsexexistsxgetsizexglob
xgzip_openxisdirxisfilexjoinxlistdirxnumpy_loadxopenxpandas_read_csvxpandas_read_excelxPathxpyarrow_parquet_read_tablexrelpathxsio_loadmatxsplit	xsplitextxwalkxxml_dom_minidom_parse)
get_logger)
map_nested   )DownloadConfigc            
           e Zd ZdZdZ	 	 	 	 ddee   dee   dee   dee   fdZe	d	        Z
d
 ZdedefdZd ZdedefdZd Zdeeej$                  f   dee   fdZdeeee   f   dee   fdZd Zd Zy)StreamingDownloadManagera  
    Download manager that uses the "::" separator to navigate through (possibly remote) compressed archives.
    Contrary to the regular `DownloadManager`, the `download` and `extract` methods don't actually download nor extract
    data, but they rather return the path or url that could be opened using the `xopen` function which extends the
    built-in `open` function to stream data from remote files.
    TNdataset_namedata_dirdownload_config	base_pathc                     || _         || _        |xs t        j                  j	                  d      | _        |xs
 t               | _        d | _        d| _	        y )N.F)
_dataset_name	_data_dirospathabspath
_base_pathr)   r.   downloaded_sizerecord_checksums)selfr,   r-   r.   r/   s        g/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/datasets/download/streaming_download_manager.py__init__z!StreamingDownloadManager.__init__9   sL     *!#;rwws';.B.2B# %    c                     | j                   S N)r3   r:   s    r;   
manual_dirz#StreamingDownloadManager.manual_dirG   s    ~~r=   c                 6    t        | j                  |d      }|S )aU  Normalize URL(s) of files to stream data from.
        This is the lazy version of `DownloadManager.download` for streaming.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL(s) of files to stream data from. Each url is a `str`.

        Returns:
            url(s): (`str` or `list` or `dict`), URL(s) to stream data from matching the given input url_or_urls.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        ```
        T	map_tuple)r'   _download_singler:   url_or_urlss     r;   downloadz!StreamingDownloadManager.downloadK   s    " !!6!6tTr=   urlpathreturnc                 ^    t        |      }t        |      rt        | j                  |      }|S r?   )strr   r   r7   )r:   rI   s     r;   rE   z)StreamingDownloadManager._download_single_   s(    g,G$&t@Gr=   c                 6    t        | j                  |d      }|S )a  Add extraction protocol for given url(s) for streaming.

        This is the lazy version of `DownloadManager.extract` for streaming.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL(s) of files to stream data from. Each url is a `str`.

        Returns:
            url(s): (`str` or `list` or `dict`), URL(s) to stream data from matching the given input `url_or_urls`.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> extracted_files = dl_manager.extract(downloaded_files)
        ```
        TrC   )r'   _extract)r:   rG   urlpathss      r;   extractz StreamingDownloadManager.extractf   s    & dmm[DIr=   c                    t        |      }t        || j                        }|j                  d      d   }t	        |      }|dv s|j                  d      rt        d| d      ||S |t        v rUt        j                  j                  |j                  d      d         }d|v r|d |j                  d       n|}| d	| d| S | d
| S )Nr.   z::r   )tgztar)z.tar.gzz.tar.bz2z.tar.xzz+Extraction protocol for TAR archives like 'z' is not implemented in streaming mode. Please use `dl_manager.iter_archive` instead.

Example usage:

	url = dl_manager.download(url)
	tar_archive_iterator = dl_manager.iter_archive(url)

	for filename, file in tar_archive_iterator:
		...r1   z://z://::)rL   r
   r.   splitr   endswithNotImplementedErrorr   r4   r5   basenamerindex)r:   rI   protocolr5   	extension
inner_files         r;   rN   z!StreamingDownloadManager._extract|   s    g,+GTEYEYZ}}T"1%'-	&$--8Z*[%=gY G   N::))'--*=a*@AJAD
AR$<j&7&7&<=XbJZs:,b	::ZuWI..r=   c                 B    | j                  | j                  |            S )a0  Prepare given `url_or_urls` for streaming (add extraction protocol).

        This is the lazy version of `DownloadManager.download_and_extract` for streaming.

        Is equivalent to:

        ```
        urls = dl_manager.extract(dl_manager.download(url_or_urls))
        ```

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL(s) to stream from data from. Each url is a `str`.

        Returns:
            url(s): (`str` or `list` or `dict`), URL(s) to stream data from matching the given input `url_or_urls`.
        )rP   rH   rF   s     r;   download_and_extractz-StreamingDownloadManager.download_and_extract   s    $ ||DMM+677r=   urlpath_or_bufc                     t        |d      rt        j                  |      S t        j                  || j                        S )aN  Iterate over files within an archive.

        Args:
            urlpath_or_buf (`str` or `io.BufferedReader`):
                Archive path or archive binary file object.

        Yields:
            `tuple[str, io.BufferedReader]`:
                2-tuple (path_within_archive, file_object).
                File object is opened in binary mode.

        Example:

        ```py
        >>> archive = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> files = dl_manager.iter_archive(archive)
        ```
        readrR   )hasattrr   from_buffrom_urlpathr.   )r:   r_   s     r;   iter_archivez%StreamingDownloadManager.iter_archive   s9    ( >6*"++N;;"//PTPdPdeer=   rO   c                 D    t        j                  || j                        S )a  Iterate over files.

        Args:
            urlpaths (`str` or `list` of `str`):
                Root paths.

        Yields:
            str: File URL path.

        Example:

        ```py
        >>> files = dl_manager.download_and_extract('https://huggingface.co/datasets/beans/resolve/main/data/train.zip')
        >>> files = dl_manager.iter_files(files)
        ```
        rR   )r	   from_urlpathsr.   )r:   rO   s     r;   
iter_filesz#StreamingDownloadManager.iter_files   s    " **8TEYEYZZr=   c                      y r?    r@   s    r;   manage_extracted_filesz/StreamingDownloadManager.manage_extracted_files       r=   c                      y r?   rj   r@   s    r;   get_recorded_sizes_checksumsz5StreamingDownloadManager.get_recorded_sizes_checksums   rl   r=   )NNNN)__name__
__module____qualname____doc__is_streamingr   rL   r)   r<   propertyrA   rH   rE   rP   rN   r^   r   ioBufferedReaderr   tuplere   listrh   rk   rn   rj   r=   r;   r+   r+   /   s     L '+"&48#'&sm& 3-& ".1	&
 C=&  (  ,/ / /68(f5b6G6G1G+H fXV[_ f2[5d3i#8 [Xc] [&r=   r+   )0ru   r4   collections.abcr   typingr   r   utils.file_utilsr   r   r	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   utils.loggingr&   utils.py_utilsr'   r.   r)   ro   loggerr+   rj   r=   r;   <module>r      sb    	 	 $ "                 B ' ' + 
H	l lr=   