
    bi,                         d dl Z d dlmZmZ d dlmZ d dlmZ d dlm	Z	m
Z
mZmZmZmZ d dlZddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZ e	r
d dlZddlmZ ddde fdZ!e G d d             Z"ddde#fdZ$y)    N)	dataclassfield)BytesIO)Path)TYPE_CHECKINGAnyClassVarDictOptionalUnion   )config)DownloadConfig)
array_cast)is_local_pathxopen)no_op_if_value_is_nullstring_to_dict   )FeatureTypepdfpdfplumber.pdf.PDFreturnc                     t               5 }| j                  D ]'  }|j                  |j                  j                         ) |j                         cddd       S # 1 sw Y   yxY w)z-Convert a pdfplumber.pdf.PDF object to bytes.N)r   pageswriter   streamgetvalue)r   bufferpages      P/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/datasets/features/pdf.pypdf_to_bytesr"      sO    	 !fII 	*DLL)	* ! ! !s   AAA$c                      e Zd ZU dZdZeed<    edd      Ze	e
   ed<   dZee
   ed	<    ej                   ej                          ej                          d
      Zee   ed<    ed dd      Ze
ed<   d Zdee
eeedf   defdZddeddfdZdedee
df   f   fdZdeej:                  ej<                  ej>                  f   dej<                  fdZ ddej<                  dej<                  fdZ!y)Pdfa1  
    **Experimental.**
    Pdf [`Feature`] to read pdf documents from a pdf file.

    Input: The Pdf feature accepts as input:
    - A `str`: Absolute path to the pdf file (i.e. random access is allowed).
    - A `pathlib.Path`: path to the pdf file (i.e. random access is allowed).
    - A `dict` with the keys:
        - `path`: String with relative path of the pdf file in a dataset repository.
        - `bytes`: Bytes of the pdf file.
      This is useful for archived files with sequential access.

    - A `pdfplumber.pdf.PDF`: pdfplumber pdf object.

    Args:
        mode (`str`, *optional*):
            The mode to convert the pdf to. If `None`, the native mode of the pdf is used.
        decode (`bool`, defaults to `True`):
            Whether to decode the pdf data. If `False`,
            returns the underlying dictionary in the format `{"path": pdf_path, "bytes": pdf_bytes}`.

    Examples:

    ```py
    >>> from datasets import Dataset, Pdf
    >>> ds = Dataset.from_dict({"pdf": ["path/to/pdf/file.pdf"]}).cast_column("pdf", Pdf())
    >>> ds.features["pdf"]
    Pdf(decode=True, id=None)
    >>> ds[0]["pdf"]
    <pdfplumber.pdf.PDF object at 0x7f8a1c2d8f40>
    >>> ds = ds.cast_column("pdf", Pdf(decode=False))
    >>> ds[0]["pdf"]
    {'bytes': None,
    'path': 'path/to/pdf/file.pdf'}
    ```
    TdecodeNF)defaultrepridr   dtypebytespathpa_type)r&   initr'   _typec                     | j                   S N)r-   )selfs    r!   __call__zPdf.__call__M   s    ||    valuer   c                 x   t         j                  rddl}nd}t        |t              r|ddS t        |t
              rt	        |j                               ddS t        |t        t        f      rd|dS |+t        ||j                  j                        rt        |      S |j                  d      6t        j                  j                  |d         rd|j                  d      dS |j                  d      |j                  d      #|j                  d      |j                  d      dS t!        d| d      )	zEncode example into a format for Arrow.

        Args:
            value (`str`, `bytes`, `pdfplumber.pdf.PDF` or `dict`):
                Data passed as input to Pdf feature.

        Returns:
            `dict` with "path" and "bytes" fields
        r   Nr,   r+   r,   r*   r+   zRA pdf sample should have one of 'path' or 'bytes' but they are missing or None in .)r   PDFPLUMBER_AVAILABLE
pdfplumber
isinstancestrr   absoluter+   	bytearrayr   PDFencode_pdfplumber_pdfgetosr,   isfile
ValueError)r2   r5   r:   s      r!   encode_examplezPdf.encode_exampleP   s    &&JeS!!D11t$ 01DAAy12 511#
5*..:L:L(M(//YYv*rww~~eFm/L!599V+<==YYw+uyy/@/L"YYw/69JKKdejdkklm r4   c                    | j                   st        d      t        j                  rddl}nt        d      |i }|d   |d   }}||t        d| d      t        |      r |j                  |      }|S |j                  d	      d
   }|j                  t        j                        rt        j                  nt        j                  }	 t        ||      d   }	|j                  |	      }
t!        |
      }t#        |d|      } |j                  |      S  |j                  t%        |            5 }|}ddd       |S # t        $ r d}
Y aw xY w# 1 sw Y   S xY w)ai  Decode example pdf file into pdf data.

        Args:
            value (`str` or `dict`):
                A string with the absolute pdf file path, a dictionary with
                keys:

                - `path`: String with absolute or relative pdf file path.
                - `bytes`: The bytes of the pdf file.

            token_per_repo_id (`dict`, *optional*):
                To access and decode pdf files from private repositories on
                the Hub, you can pass a dictionary
                repo_id (`str`) -> token (`bool` or `str`).

        Returns:
            `pdfplumber.pdf.PDF`
        zKDecoding is disabled for this feature. Please use Pdf(decode=True) instead.r   Nz6To support decoding pdfs, please install 'pdfplumber'.r,   r+   z@A pdf should have one of 'path' or 'bytes' but both are None in r8   ::repo_idtokenrbdownload_config)r%   RuntimeErrorr   r9   r:   ImportErrorrD   r   opensplit
startswithHF_ENDPOINTHUB_DATASETS_URLHUB_DATASETS_HFFS_URLr   rA   r   r   r   )r2   r5   token_per_repo_idr:   r,   bytes_r   
source_urlpatternrI   rK   rN   fps                 r!   decode_examplezPdf.decode_examples   so   & {{lmm&&VWW$ "V}eGnf>| #cdicjjk!lmm &)*//$/C( 
% "&D!1"!5J &001C1CD //#99 
%"0W"Ei"P 1 5 5g > '55&AOdD/JA*:??1-- 1 Q 
 & % $% 
s    E  3E EEEr   c                 L    ddl m} | j                  r| S  |d       |d      dS )zfIf in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary.r   )Valuebinarystringr*   )featuresr_   r%   )r2   r_   s     r!   flattenzPdf.flatten   s2    # {{ 	
 xh	
r4   storagec                    t         j                  j                  |j                        rlt        j                  dgt        |      z  t        j                               }t         j                  j                  ||gddg|j                               }nt         j                  j                  |j                        rlt        j                  dgt        |      z  t        j                               }t         j                  j                  ||gddg|j                               }n*t         j                  j                  |j                        r |j                  j                  d      dk\  r|j                  d      }n6t        j                  dgt        |      z  t        j                               }|j                  j                  d      dk\  r|j                  d      }n6t        j                  dgt        |      z  t        j                               }t         j                  j                  ||gddg|j                               }t        || j                         S )a  Cast an Arrow array to the Pdf arrow storage type.
        The Arrow types that can be converted to the Pdf pyarrow storage type are:

        - `pa.string()` - it must contain the "path" data
        - `pa.binary()` - it must contain the image bytes
        - `pa.struct({"bytes": pa.binary()})`
        - `pa.struct({"path": pa.string()})`
        - `pa.struct({"bytes": pa.binary(), "path": pa.string()})`  - order doesn't matter
        - `pa.list(*)` - it must contain the pdf array data

        Args:
            storage (`Union[pa.StringArray, pa.StructArray, pa.ListArray]`):
                PyArrow array to cast.

        Returns:
            `pa.StructArray`: Array in the Pdf arrow storage type, that is
                `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
        Ntyper+   r,   maskr   )patypes	is_stringrg   arraylenr`   StructArrayfrom_arraysis_null	is_binaryra   	is_structget_field_indexr   r   r-   )r2   rd   bytes_array
path_arrays       r!   cast_storagezPdf.cast_storage   s   & 88gll+((D6CL#8ryy{KKnn00+w1G'SYIZahapapar0sGXX-4&3w<"7biikJJnn00':1FRXHY`g`o`o`q0rGXX-||++G49%mmG4 hhvG'<299;O||++F3q8$]]62
XXtfs7|&;"))+N
nn00+z1JWV\L]dkdsdsdu0vG'4<<00r4   c           	      r   i t         fd       }t        j                  |j                         D cg c]  }||d    ||d         n|d   nd c}t        j                               }t        j                  |j                  d      j                         D cg c]%  }|t        j                  j                  |      nd' c}t        j                               }t        j                  j                  ||gddg|j                               }t        || j                        S c c}w c c}w )a4  Embed PDF files into the Arrow array.

        Args:
            storage (`pa.StructArray`):
                PyArrow array to embed.

        Returns:
            `pa.StructArray`: Array in the PDF arrow storage type, that is
                `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
        Nc                 r   | j                  d      d   }|j                  t        j                        rt        j                  nt        j
                  }t        ||      }|j                  |d         nd }t        |      }t        | d|      5 }|j                         cd d d        S # 1 sw Y   y xY w)NrG   rH   rI   rJ   rL   rM   )rR   rS   r   rT   rU   rV   r   rA   r   r   read)r,   rY   rZ   source_url_fieldsrK   rN   r[   rW   s          r!   path_to_bytesz(Pdf.embed_storage.<locals>.path_to_bytes   s    D)"-J+5+@+@ASAS+T''Z`ZvZv  !/z7 CK\Kh%))*;I*FGnrE,59OtT?C  qvvx     s   B--B6r+   r,   rf   rh   )r   rj   rm   	to_pylistr`   r   rB   r,   basenamera   ro   rp   rq   r   r-   )r2   rd   rW   r|   xru   r,   rv   s     `     r!   embed_storagezPdf.embed_storage   s%    $ "			  
 		  hh !**, UVTaQwZ-?qy)QwZgkk 
 XXNUmm\bNcNmNmNopdt'7RWWd#TAp

 ..,,k:-FRXHY`k`s`s`u,v'4<<00 qs    D/ *D4r1   )"__name__
__module____qualname____doc__r%   bool__annotations__r   r(   r   r<   r)   r	   rj   structr`   ra   r-   r   r/   r3   r   r+   r>   dictrE   r]   r
   rc   StringArrayro   	ListArrayrw   r    r4   r!   r$   r$      s+   #J FDd7B7 0E8C=/&RYYibiik'RSGXc]Su5u=E3=!E#uiG[*[$\ !ae !F8D 8EY 8t
}d33E.FFG 
#1E".."..",,*V$W #1\^\j\j #1J&1R^^ &1PRP^P^ &1r4   r$   c                     t        | d      rEt        | j                  d      r/| j                  j                  r| j                  j                  ddS dt        |       dS )aA  
    Encode a pdfplumber.pdf.PDF object into a dictionary.

    If the PDF has an associated file path, returns the path. Otherwise, serializes
    the PDF content into bytes.

    Args:
        pdf (pdfplumber.pdf.PDF): A pdfplumber PDF object.

    Returns:
        dict: A dictionary with "path" or "bytes" field.
    r   nameNr7   )hasattrr   r   r"   )r   s    r!   r@   r@     sJ     sH'#**f"=#**//

$77 |C'899r4   )%rB   dataclassesr   r   ior   pathlibr   typingr   r   r	   r
   r   r   pyarrowrj    r   download.download_configr   tabler   utils.file_utilsr   r   utils.py_utilsr   r   r:   rb   r   r+   r"   r$   r   r@   r   r4   r!   <module>r      sy    	 (   F F   5  3 C %!* !u ! f1 f1 f1R:3 : :r4   