
    bi *                    B   d dl mZ d dlZd dlZd dlmZ d dlmZ d dlZ	d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ 	 	 	 	 	 	 	 	 	 ddZd Z ej>                  d      eddddddddde	j@                  ddfd              Z 	 ddZ!d Z"d Z#y)    )annotationsN)partial)zip_longest)
open_files)compute)
read_bytes)flatten)dataframe_creation_dispatch)insert_meta_param_description	make_meta)delayedrecordsutf-8strictc           	     n   ||dk(  }|dk7  r|rt        d      ||d<   |xr |dk(  |d<   t        |df|||
| j                  |d|xs i }t        || j	                               D cg c]  \  }} t        t              |||       }}}|r |	
t               }	t        t        |i |	      S |S c c}}w )a  Write dataframe into JSON text files

    This utilises ``pandas.DataFrame.to_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    produces the kind of JSON output that is most common in big-data
    applications, and which can be chunked when reading (see ``read_json()``).

    Parameters
    ----------
    df: dask.DataFrame
        Data to save
    url_path: str, list of str
        Location to write to. If a string, and there are more than one
        partitions in df, should include a glob character to expand into a
        set of file names, or provide a ``name_function=`` parameter.
        Supports protocol specifications such as ``"s3://"``.
    encoding, errors:
        The text encoding to implement, e.g., "utf-8" and how to respond
        to errors in the conversion (see ``str.encode()``).
    orient, lines, kwargs
        passed to pandas; if not specified, lines=True when orient='records',
        False otherwise.
    storage_options: dict
        Passed to backend file-system implementation
    compute: bool
        If true, immediately executes. If False, returns a set of delayed
        objects, which can be computed at a later time.
    compute_kwargs : dict, optional
        Options to be passed in to the compute method
    compression : string or None
        String like 'gzip' or 'xz'.
    name_function : callable, default None
        Function accepting an integer (partition index) and producing a
        string to replace the asterisk in the given filename globstring.
        Should preserve the lexicographic order of partitions.
    r   <Line-delimited JSON is only available with orient="records".orientlineswt)encodingerrorsname_functionnumcompression)

ValueErrorr   npartitionszip
to_delayedr   write_json_partitiondictlistdask_compute)dfurl_pathr   r   storage_optionsr   r   r   r   compute_kwargsr   kwargsoutfilesoutfiledpartss                   Q/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/dask/dataframe/io/json.pyto_jsonr-      s    h })#uWXXF83) 3F7O	 #NN	  b	H h8GQ 	&$%a&9E  !!VNL%:>:;;s   )!B1c                    |5 } | j                   |fi | d d d        t        j                  j                  |j                        S # 1 sw Y   2xY wN)r-   ospathnormpath)r#   openfiler'   fs       r,   r   r   d   sH    	  Q

1 77HMM**   s   AApandasi   inferFc                   ||dk(  }|dk7  r|rt        d      |r|dk7  s|st        d      |xs i }|du rd}d t        |
t              rt        t        j
                  |
      }
|rt        | d	f||||d
|}|rN|\  }}} |d         }t	        j                  fd|D              }t        fdt        ||      D              }n|\  }}d}d}d}t        |      }|	t        ||||
||||      }	t        |	      }	t        ||      D cg c]#  \  }} t        t              ||||
|||||		      % }}}nht        | df|||d|}t	        j                  fd|D              }|D cg c].  } t        t              ||||
| |j                         ||      0 }}t#        j$                  ||	      S c c}}w c c}w )a  Create a dataframe from a set of JSON files

    This utilises ``pandas.read_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    is appropriate for line-delimited "JSON-lines" data, the kind of JSON output
    that is most common in big-data scenarios, and which can be chunked when
    reading (see ``read_json()``). All other options require blocksize=None,
    i.e., one partition per input file.

    Parameters
    ----------
    url_path: str, list of str
        Location to read from. If a string, can include a glob character to
        find a set of file names.
        Supports protocol specifications such as ``"s3://"``.
    encoding, errors:
        The text encoding to implement, e.g., "utf-8" and how to respond
        to errors in the conversion (see ``str.encode()``).
    orient, lines, kwargs
        passed to pandas; if not specified, lines=True when orient='records',
        False otherwise.
    storage_options: dict
        Passed to backend file-system implementation
    blocksize: None or int
        If None, files are not blocked, and you get one partition per input
        file. If int, which can only be used for line-delimited JSON files,
        each partition will be approximately this size in bytes, to the nearest
        newline character.
    sample: int
        Number of bytes to pre-load, to provide an empty dataframe structure
        to any blocks without data. Only relevant when using blocksize.
    encoding, errors:
        Text conversion, ``see bytes.decode()``
    compression : string or None
        String like 'gzip' or 'xz'.
    engine : callable or str, default ``pd.read_json``
        The underlying function that dask will use to read JSON files. By
        default, this will be the pandas JSON reader (``pd.read_json``).
        If a string is specified, this value will be passed under the ``engine``
        key-word argument to ``pd.read_json`` (only supported for pandas>=2.0).
    include_path_column : bool or str, optional
        Include a column with the file path where each row in the dataframe
        originated. If ``True``, a new column is added to the dataframe called
        ``path``. If ``str``, sets new column name. Default is ``False``.
    path_converter : function or None, optional
        A function that takes one argument and returns a string. Used to convert
        paths in the ``path`` column, for instance, to strip a common prefix from
        all the paths.
    $META

    Returns
    -------
    dask.DataFrame

    Examples
    --------
    Load single file

    >>> dd.read_json('myfile.1.json')  # doctest: +SKIP

    Load multiple files

    >>> dd.read_json('myfile.*.json')  # doctest: +SKIP

    >>> dd.read_json(['myfile.1.json', 'myfile.2.json'])  # doctest: +SKIP

    Load large line-delimited JSON files using partitions of approx
    256MB size

    >> dd.read_json('data/file*.csv', blocksize=2**28)
    Nr   r   zSJSON file chunking only allowed for JSON-linesinput (orient='records', lines=True).Tr1   c                    | S r/    )xs    r,   <lambda>zread_json.<locals>.<lambda>   s    1     )engine   
)	blocksizesampler   include_pathr   c              3  .   K   | ]  } |        y wr/   r9   ).0ppath_converters     r,   	<genexpr>zread_json.<locals>.<genexpr>   s     ,N1^A->,Ns   c              3  N   K   | ]  \  }} |      gt        |      z    y wr/   )len)rC   rD   chunkrE   s      r,   rF   zread_json.<locals>.<genexpr>   s+      !5=Q"#c%j0!s   "%r/   )metart)r   r   r   c              3  B   K   | ]  } |j                           y wr/   )r1   )rC   r4   rE   s     r,   rF   zread_json.<locals>.<genexpr>  s     (OA)?(Os   )r   
isinstancestrr   pd	read_jsonr   CategoricalDtyper	   r   read_json_chunkr   r   r   r   read_json_filer1   ddfrom_delayed)r$   r   r   r%   r?   r@   r   r   r   rJ   r=   include_path_columnrE   r'   b_outfirstchunkspaths
first_path
path_dtype
flat_pathsflat_chunksrI   r1   r+   filesr4   s               `              r,   rP   rP   j   se   v })#uWXXf	)4
 	
 &+Od"$$ &#f5
  #,
 
 #( E65'a1J,,,N,NNJ  !ADUFAS! J "ME6J JJfo<"#	D   +;
C
 t %GO$#

 
 
 #
 
 (((O(OO
 
  $GN##qvv&	
 
 ??5t,,M
0
s   (F>23Gc	                    t        j                  | j                  ||            }	|	j                  d        ||	fddd|}
||
j                  r|S |rt        |
|||      }
|
S )Nr   r   Tr   r   )ioStringIOdecodeseekemptyadd_path_column)rI   r   r   r=   column_namer1   r\   r'   rJ   sr#   s              r,   rR   rR   &  sg     	ELL623AFF1I		:)4	:6	:BBHHRdJ?Ir<   c                l    | 5 } ||f||d|}	d d d        |rt        	|||      }		S # 1 sw Y   xY w)Nra   )rg   )
r4   r   r   r=   rh   r1   r\   r'   	open_filer#   s
             r,   rS   rS   5  sN    	
 EiIDfEDVDERdJ?I	E Es   *3c                    || j                   v rt        d| d       | j                  di |t        j                  |gt        |       z  |      iS )Nz(Files already contain the column name: 'z^', so the path column cannot use this name. Please set `include_path_column` to a unique name.)dtyper9   )columnsr   assignrO   SeriesrH   )r#   rh   r1   rm   s       r,   rg   rg   =  s^    bjj 6{m D 
 	

 299ORYYvB/?u%MNOOr<   )	r   NNTr   r   NNNr/   )$
__future__r   rb   r0   	functoolsr   	itertoolsr   r5   rO   fsspec.corer   dask.dataframe	dataframerT   	dask.baser   r"   
dask.bytesr   	dask.corer	   dask.dataframe.backendsr
   dask.dataframe.utilsr   r   dask.delayedr   r-   r   register_inplacerP   rR   rS   rg   r9   r<   r,   <module>r~      s    " 	 	  !  "  - !  ? I   
M`+ .--h7 
	<<w-  8w-v RVPr<   