
    biL                        d Z ddlZddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
mZmZ ddlZddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZ  ee       Z!e G d d             Z"e G d d             Z# G d de$      Z% G d de$      Z&e G d d             Z'e G d d             Z( G d de)e*e(f         Z+y)aw  DatasetInfo record information we know about a dataset.

This includes things that we know about the dataset statically, i.e.:
 - description
 - canonical location
 - does it have validation and tests splits
 - size
 - etc.

This also includes the things that can and should be computed once we've
processed the dataset as well:
 - number of examples (in each split)
 - etc.
    N)	dataclass)Path)ClassVarOptionalUnion)	url_to_fs)DatasetCardDatasetCardData   )config)Features)	SplitDict)Version)
get_logger)asdictunique_valuesc                   *    e Zd ZU dZeed<   dZeed<   y)SupervisedKeysData inputoutputN)__name__
__module____qualname__r   str__annotations__r        H/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/datasets/info.pyr   r   7   s    E3OFCr   r   c                   *    e Zd ZU dZeed<   dZeed<   y)DownloadChecksumsEntryDatar   keyvalueN)r   r   r   r"   r   r   r#   r   r   r   r!   r!   =   s    CME3Or   r!   c                       e Zd ZdZy)MissingCachedSizesConfigErrorz;The expected cached sizes of the download file are missing.Nr   r   r   __doc__r   r   r   r%   r%   C   s    Er   r%   c                       e Zd ZdZy)NonMatchingCachedSizesErrorz/The prepared split doesn't have expected sizes.Nr&   r   r   r   r)   r)   G   s    9r   r)   c                   V    e Zd ZU dZee   ed<   dZee   ed<   d Z	e
dedd fd       Zy)PostProcessedInfoNfeaturesresources_checksumsc                     | j                   @t        | j                   t              s%t        j                  | j                         | _         y y y N)r,   
isinstancer   	from_dictselfs    r   __post_init__zPostProcessedInfo.__post_init__P   s8    ==$Zx-P$..t}}=DM .Q$r   post_processed_info_dictreturnc           
          t        j                  |       D ch c]  }|j                   }} | di |j                         D ci c]  \  }}||v s|| c}}S c c}w c c}}w Nr   dataclassesfieldsnameitems)clsr5   ffield_nameskvs         r   r1   zPostProcessedInfo.from_dictU   s`    '2'9'9#'>?!qvv??]'?'E'E'G\tq!1P[K[ad\]] @\   AA!A!)r   r   r   r,   r   r   r   r-   dictr4   classmethodr1   r   r   r   r+   r+   K   sL    #'Hhx '*.$.>
 ^ ^:M ^ ^r   r+   c                      e Zd ZU dZ ej
                  e      Zeed<    ej
                  e      Z	eed<    ej
                  e      Z
eed<    ej
                  e      Zeed<   dZee   ed<   dZee   ed	<   dZee   ed
<   dZee   ed<   dZee   ed<   dZee   ed<   dZeeeef      ed<   dZee   ed<   dZee   ed<   dZee   ed<   dZee   ed<   dZee   ed<   dZ ee   ed<   g dZ!e"e#e      ed<   d Z$d(dee   fdZ%d)dZ&d Z'e(de#d    fd       Z)e(d*dedee   dd fd        Z*e(d!edd fd"       Z+d+d,d#Z,d-d$Z-defd%Z.e(d&edd fd'       Z/y).DatasetInfoa	  Information about a dataset.

    `DatasetInfo` documents datasets, including its name, version, and features.
    See the constructor arguments and properties for a full list.

    Not all fields are known on construction and may be updated later.

    Attributes:
        description (`str`):
            A description of the dataset.
        citation (`str`):
            A BibTeX citation of the dataset.
        homepage (`str`):
            A URL to the official homepage for the dataset.
        license (`str`):
            The dataset's license. It can be the name of the license or a paragraph containing the terms of the license.
        features ([`Features`], *optional*):
            The features used to specify the dataset's column types.
        post_processed (`PostProcessedInfo`, *optional*):
            Information regarding the resources of a possible post-processing of a dataset. For example, it can contain the information of an index.
        supervised_keys (`SupervisedKeysData`, *optional*):
            Specifies the input feature and the label for supervised learning if applicable for the dataset (legacy from TFDS).
        builder_name (`str`, *optional*):
            The name of the `GeneratorBasedBuilder` subclass used to create the dataset. It is also the snake_case version of the dataset builder class name.
        config_name (`str`, *optional*):
            The name of the configuration derived from [`BuilderConfig`].
        version (`str` or [`Version`], *optional*):
            The version of the dataset.
        splits (`dict`, *optional*):
            The mapping between split name and metadata.
        download_checksums (`dict`, *optional*):
            The mapping between the URL to download the dataset's checksums and corresponding metadata.
        download_size (`int`, *optional*):
            The size of the files to download to generate the dataset, in bytes.
        post_processing_size (`int`, *optional*):
            Size of the dataset in bytes after post-processing, if any.
        dataset_size (`int`, *optional*):
            The combined size in bytes of the Arrow tables for all splits.
        size_in_bytes (`int`, *optional*):
            The combined size in bytes of all files associated with the dataset (downloaded files + Arrow files).
        **config_kwargs (additional keyword arguments):
            Keyword arguments to be passed to the [`BuilderConfig`] and used in the [`DatasetBuilder`].
    )default_factorydescriptioncitationhomepagelicenseNr,   post_processedsupervised_keysbuilder_namedataset_nameconfig_nameversionsplitsdownload_checksumsdownload_sizepost_processing_sizedataset_sizesize_in_bytes)rQ   rU   rW   r,   rS   _INCLUDED_INFO_IN_YAMLc                    | j                   >t        | j                   t              s$t        j                  | j                         | _         | j                  >t        | j                  t
              s$t
        j                  | j                        | _        | j                  st        | j                  t              sYt        | j                  t              rt        | j                        | _        n$t        j                  | j                        | _        | j                  >t        | j                  t              s$t        j                  | j                        | _	        | j                  nt        | j                  t              sSt        | j                  t        t        f      rt        | j                   | _        y t        di | j                  | _        y y y r8   )r,   r0   r   r1   rM   r+   rR   r   r   rS   r   from_split_dictrN   r   tuplelistr2   s    r   r4   zDatasetInfo.__post_init__   s>   ==$Zx-P$..t}}=DM*:d>Q>QSd3e"3"="=d>Q>Q"RD<<#Jt||W,M$,,,&t||4&00>;;":dkk9+M#33DKK@DK+Jt?S?SUg4h$..>'94;O;O'P$'9'QD<P<P'Q$	 5i+r   storage_optionsc                    t        |fi |xs i ^}}|j                  t        j                  |t        j
                        d      5 }| j                  ||       ddd       | j                  rO|j                  t        j                  |t        j                        d      5 }| j                  |       ddd       yy# 1 sw Y   exY w# 1 sw Y   yxY w)a  Write `DatasetInfo` and license (if present) as JSON files to `dataset_info_dir`.

        Args:
            dataset_info_dir (`str`):
                Destination directory.
            pretty_print (`bool`, defaults to `False`):
                If `True`, the JSON will be pretty-printed with the indent level of 4.
            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.

                <Added version="2.9.0"/>

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
        >>> ds.info.write_to_directory("/path/to/directory/")
        ```
        wb)pretty_printN)
r   open	posixpathjoinr   DATASET_INFO_FILENAME
_dump_inforL   LICENSE_FILENAME_dump_license)r3   dataset_info_dirra   r^   fs_r?   s          r   write_to_directoryzDatasetInfo.write_to_directory   s    , +G0E2GQWWY^^$4f6R6RSUYZ 	:^_OOALO9	:<<(8&:Q:QRTXY &]^""1%& & 	: 	:& &s   B?#C?CCc                     |j                  t        j                  t        |       |rdnd      j	                  d             y)zQDump info in `file` file-like object open in bytes mode (to support remote files)   Nindentutf-8)writejsondumpsr   encode)r3   filera   s      r   rf   zDatasetInfo._dump_info   s-    

4::fTl1$OVVW^_`r   c                 X    |j                  | j                  j                  d             y)zTDump license in `file` file-like object open in bytes mode (to support remote files)rq   N)rr   rL   ru   )r3   rv   s     r   rh   zDatasetInfo._dump_license   s    

4<<&&w/0r   dataset_infosc                 :   D cg c]  }||j                          c}t              dkD  rt        fdD              rd   S dj                  t	        d D                    j                         }dj                  t	        d D                    j                         }dj                  t	        d D                    j                         }dj                  t	        d D                    j                         }d }d } | ||||||      S c c}w )	Nr   c              3   .   K   | ]  }d    |k(    yw)r   Nr   ).0	dset_inforx   s     r   	<genexpr>z)DatasetInfo.from_merge.<locals>.<genexpr>   s     )gI-*:i*G)gs   z

c              3   4   K   | ]  }|j                     y wr/   )rI   r{   infos     r   r}   z)DatasetInfo.from_merge.<locals>.<genexpr>   s     /[T0@0@/[   c              3   4   K   | ]  }|j                     y wr/   )rJ   r   s     r   r}   z)DatasetInfo.from_merge.<locals>.<genexpr>        ,UtT]],Ur   c              3   4   K   | ]  }|j                     y wr/   )rK   r   s     r   r}   z)DatasetInfo.from_merge.<locals>.<genexpr>   r   r   c              3   4   K   | ]  }|j                     y wr/   )rL   r   s     r   r}   z)DatasetInfo.from_merge.<locals>.<genexpr>   s     +STDLL+Sr   )rI   rJ   rK   rL   r,   rN   )copylenallrd   r   strip)	r>   rx   r|   rI   rJ   rK   rL   r,   rN   s	    `       r   
from_mergezDatasetInfo.from_merge   s    ;HbiILa)b}!c)gYf)g&g ##kk-/[]/["[\bbd;;},U},UUV\\^;;},U},UUV\\^++m+S]+SSTZZ\#+
 	
 cs
   DDri   r6   c                 R   t        |fi |xs i ^}}t        j                  d|        |st        d      |j	                  t        j                  |t        j                        dd      5 }t        j                  |      }ddd       | j                        S # 1 sw Y   xY w)a   Create [`DatasetInfo`] from the JSON file in `dataset_info_dir`.

        This function updates all the dynamically generated fields (num_examples,
        hash, time of creation,...) of the [`DatasetInfo`].

        This will overwrite all previous metadata.

        Args:
            dataset_info_dir (`str`):
                The directory containing the metadata file. This
                should be the root directory of a specific dataset version.
            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.

                <Added version="2.9.0"/>

        Example:

        ```py
        >>> from datasets import DatasetInfo
        >>> ds_info = DatasetInfo.from_directory("/path/to/directory/")
        ```
        zLoading Dataset info from zECalling DatasetInfo.from_directory() with undefined dataset_info_dir.rrq   encodingN)r   loggerdebug
ValueErrorrb   rc   rd   r   re   rs   loadr1   )r>   ri   r^   rj   rk   r?   dataset_info_dicts          r   from_directoryzDatasetInfo.from_directory   s    4 +G0E2GQ12B1CDEdeeWWY^^$4f6R6RSUXcjWk 	-op $		!	-}}.//	- 	-s   .BB&r   c           
          t        j                  |       D ch c]  }|j                   }} | di |j                         D ci c]  \  }}||v s|| c}}S c c}w c c}}w r8   r9   )r>   r   r?   r@   rA   rB   s         r   r1   zDatasetInfo.from_dict  s_    '2'9'9#'>?!qvv??V'8'>'>'@Utq!ADTadUVV @UrC   c                     | j                   } |j                  di |j                   j                         D ci c]  \  }}||s|t        j                  |      ! c}} y c c}}w r8   )__dict__updater=   r   deepcopy)r3   other_dataset_infoignore_none	self_dictrA   rB   s         r   r   zDatasetInfo.update  se    MM		 	
 /77==?AqM 4==##	
s   $A#c                      | j                   di | j                  j                         D ci c]  \  }}|t        j                  |       c}}S c c}}w r8   )	__class__r   r=   r   r   )r3   rA   rB   s      r   r   zDatasetInfo.copy(  sC    t~~XATATAV WADMM!$4!4 WXX Ws    Ac                     i }t        |       }|D ]b  }|| j                  v st        | |      }t        |d      r|j	                         ||<   >t        |d      r|j                         ||<   ^|||<   d |S )N_to_yaml_list_to_yaml_string)r   rY   getattrhasattrr   r   )r3   	yaml_dictr   r"   r#   s        r   _to_yaml_dictzDatasetInfo._to_yaml_dict+  s    	"4L$ 	+Cd111c*5/2%*%8%8%:IcNU$56%*%:%:%<IcN%*IcN	+ r   	yaml_datac           
         t        j                  |      }|j                  d      t        j                  |d         |d<   |j                  d      t        j                  |d         |d<   t        j                  |       D ch c]  }|j                   }} | di |j                         D ci c]  \  }}||v s|| c}}S c c}w c c}}w )Nr,   rS   r   )
r   r   getr   _from_yaml_listr   r:   r;   r<   r=   )r>   r   r?   r@   rA   rB   s         r   _from_yaml_dictzDatasetInfo._from_yaml_dict9  s    MM),	==$0$,$<$<Yz=R$SIj!=="."+";";Ih<O"PIh'2'9'9#'>?!qvv??Ny'8Mtq!A<LadMNN @Ms   C	1C>C)FN)Fr/   )T)r   rG   )r6   rG   )0r   r   r   r'   r:   fieldr   rI   r   rJ   rK   rL   r,   r   r   rM   r+   rN   r   rO   rP   rQ   rR   r   r   rS   rD   rT   rU   intrV   rW   rX   rY   r   r]   r4   rl   rf   rh   rE   r   r   r1   r   r   r   r   r   r   r   rG   rG   [   s   *Z ){((=K=%K%%c:Hc:%K%%c:Hc:$;$$S9GS9#'Hhx '26NH./648OX018 #'L(3-&"&L(3-&!%K#%-1GXeCL)*1!FHTN!)--#'M8C='*.(3-."&L(3-&#'M8C='3HT#Y/ R&&X`aeXf &:a1 
tM': 
 
. 0c 0HTN 0^k 0 0B W$ W= W W
Yt  O O O Or   rG   c                   R    e Zd Zdd	dZed
d       Zededd fd       ZdeddfdZy)DatasetInfosDictr6   Nc           	      Z   i }t         j                  j                  |t        j                        }t         j                  j                  |t        j
                        }|s| j                  |      }|j                  |        t         j                  j                  |      r_t        |dd      5 }|j                         D 	ci c]  \  }}	|t        |	       }
}}	t        j                  |
||rdnd        d d d        t         j                  j                  |      r"t        j                  |      }|j                   }nd }t#               }|rJ|j%                  |       |t        dt'        |      z   dz         n|}|j)                  t+        |             y y c c}	}w # 1 sw Y   xY w)Nwrq   r   rn   ro   z---
z
---
)ospathrd   r   DATASETDICT_INFOS_FILENAMEREPOCARD_FILENAMEr   r   existsrb   r=   r   rs   dumpr	   r   datar
   to_dataset_card_datar   saver   )r3   dataset_infos_dir	overwritera   total_dataset_infosdataset_infos_pathdataset_readme_pathr?   rQ   r|   dataset_infos_dictdataset_carddataset_card_datas                r   rl   z#DatasetInfosDict.write_to_directoryE  sz    WW\\*;V=^=^_ ggll+<f>V>VW"&"5"56G"H""4(77>>,-(#@ UAQdQjQjQl&7M{IK	!22&" & 		,a\tT	U 77>>-.&++,?@L , 1 1L / 1445FGMYMaGc*;&<<yHIgs  d#678 &U Us   0F!FF!F!!F*c                 R   t         j                  d|        t        j                  j	                  t        j                  j                  |t        j                              rNt        j                  t        |      t        j                  z        j                  }d|v r| j                  |      S t        j                  j	                  t        j                  j                  |t        j                              rt        t        j                  j                  |t        j                        d      5 } | t        j                  |      j!                         D ci c]  \  }}|t"        j%                  |       c}}      cd d d        S  |        S c c}}w # 1 sw Y   y xY w)NzLoading Dataset Infos from dataset_inforq   r   )r   r   r   r   r   rd   r   r   r	   r   r   r   from_dataset_card_datar   rb   rs   r=   rG   r1   )r>   r   r   r?   rQ   r   s         r   r   zDatasetInfosDict.from_directorya  s;   23D2EFG77>>"'',,'8&:R:RST + 0 06G1H6KcKc1c d i i!22112CDD77>>"'',,'8&:[:[\]bggll#4f6W6WXcjk op ?Ciil>P>P>R:K): $[%:%:;L%MM  5L s   6)F F?FFF&r   c           	         t        |j                  d      t        t        f      rt        |d   t              r= | |d   D ci c](  }|j                  dd      t        j                  |      * c}      S t        j                  |d         }|d   j                  dd      |_         | |j                  |i      S  |        S c c}w )Nr   rQ   default)r0   r   r]   rD   rG   r   rQ   )r>   r   dataset_info_yaml_dictr   s       r   r   z'DatasetInfosDict.from_dataset_card_datau  s    '++N;dD\J+N;TB
 7H6W	 3 /22=)LkNiNi2O    +::;L^;\]+<^+L+P+PQ^`i+j(L44lCDD5Ls   -Cc                    | rcd|v r.t        |d   t              r|d   j                  dd      |d   i}n0d|v r*t        |d   t              r|d   D ci c]  }|d   |
 }}ni }i || j	                         D ci c]  \  }}||j                          c}}}|j	                         D ]
  \  }}||d<    t        |      dk(  rMt        t        |j                                     |d<   |d   j                  dd       }|dk7  rd|i|d   |d<   y y g |d<   t        |j	                               D ]1  \  }}|j                  dd        d|i|}|d   j                  |       3 y y c c}w c c}}w )Nr   rQ   r   r   )r0   rD   r   r]   r=   r   r   nextitervaluespopsortedappend)	r3   r   dataset_metadata_infosconfig_metadatarQ   r|   r   dset_info_yaml_dictr   s	            r   r   z%DatasetInfosDict.to_dataset_card_data  s   !22zBSTbBcei7j%n599-SUfguUv*&  #44DUVdDegk9l ,=^+L*' $M2OC*& *
 *,&#(#X\XbXbXde>Tk9;	 7 7 99e#
 5H4M4M4O A005@#M2A&'1,48>Q>X>X>Z9[4\!.1/?CCMSWX)+ &{9+N;9%n5 , 57!.1;ABUB[B[B];^ U7K!7*..}dC.;[-cLb-c*%n5<<=ST	UC * fs   E(<E-)FF)r6   N)r6   r   )	r   r   r   rl   rE   r   r
   r   r   r   r   r   r   r   D  sT    98  &  K]  $&Uo &U$ &Ur   r   ),r'   r   r:   rs   r   rc   r   pathlibr   typingr   r   r   fsspecfsspec.corer   huggingface_hubr	   r
   r   r   r,   r   rS   r   utilsr   utils.loggingr   utils.py_utilsr   r   r   r   r   r!   	Exceptionr%   r)   r+   rG   rD   r   r   r   r   r   <module>r      s        	  !  , ,  ! 8     % 1 
H	   
   
FI F:) : ^ ^ ^ eO eO eOPjUtC,- jUr   