
    bi                         d dl Z d dlmZ d dlmZ d dlmZ d dlZd dl	Z	d dl
mZ d dlmZ e	j                  j                  j!                  e      Ze G d de	j&                               Z G d	 d
e	j*                        Zy)    N)	dataclass)StringIO)Optionalrequire_storage_cast)
table_castc                       e Zd ZU dZdZeej                     ed<   dZ	e
ed<   dZee
   ed<   dZeed<   d	Zeed
<   dZe
ed<   y)
TextConfigzBuilderConfig for text files.Nfeatureszutf-8encodingencoding_errorsi   	chunksizeFkeep_linebreaksline	sample_by)__name__
__module____qualname____doc__r   r   datasetsFeatures__annotations__r   strr   r   intr   boolr        ^/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/datasets/packaged_modules/text/text.pyr
   r
      sP    ',0Hhx(()0Hc%)OXc])Is!OT!Isr   r
   c                   Z    e Zd ZeZd Zd Zdej                  dej                  fdZ	d Z
y)Textc                 V    t        j                  | j                  j                        S )N)r   )r   DatasetInfoconfigr   )selfs    r   _infoz
Text._info   s    ##T[[-A-ABBr   c                    | j                   j                  s"t        d| j                   j                         d|j                  _        |j                  | j                   j                        }g }|j                         D ]^  \  }}t        |t              r|g}|D cg c]  }|j                  |       }}|j                  t        j                  |d|i             ` |S c c}w )a  The `data_files` kwarg in load_dataset() can be a str, List[str], Dict[str,str], or Dict[str,List[str]].

        If str or List[str], then the dataset returns only the 'train' split.
        If dict, then keys should be from the `datasets.Split` enum.
        z=At least one data file must be specified, but got data_files=Tfiles)name
gen_kwargs)r#   
data_files
ValueErrordownload_configextract_on_the_flydownload_and_extractitems
isinstancer   
iter_filesappendr   SplitGenerator)r$   
dl_managerr*   splits
split_namer'   files          r   _split_generatorszText._split_generators"   s     {{%%\]a]h]h]s]s\tuvv8<
""544T[[5K5KL
!+!1!1!3 	aJ%%=BCTZ**40CECMM(11zwX]N^_`		a
  Ds   C$pa_tablereturnc                    | j                   j                  u| j                   j                  j                  }t        d | j                   j                  j	                         D              r|j                  |      }|S t        ||      }|S |j                  t        j                  dt        j                         i            S )Nc              3   4   K   | ]  }t        |         y w)Nr   ).0features     r   	<genexpr>z#Text._cast_table.<locals>.<genexpr>7   s     b+G44bs   text)
r#   r   arrow_schemaallvaluescastr   paschemastring)r$   r9   rF   s      r   _cast_tablezText._cast_table4   s    ;;+[[))66FbDKKDXDXD_D_Dabb#==0 O &h7O==FBIIK+@!ABBr   c           
   #     K   | j                   j                  t        | j                   j                        ndg}t        t        j
                  j                  |            D ]  \  }}t        || j                   j                  | j                   j                        5 }| j                   j                  dk(  rd}	 |j                  | j                   j                        }|snQ||j                         z  }t        |      j                         }| j                   j                   s|D cg c]  }|j#                  d       }}t$        j&                  j)                  t%        j*                  |      g|      }	||f| j-                  |	      f |dz  }| j                   j                  dk(  rd}d	}	 |j                  | j                   j                        }
|
sn||
z  }||j                         z  }|j/                  d
      }t$        j&                  j)                  t%        j*                  |d d D cg c]  }|s|	 c}      g|      }	||f| j-                  |	      f |dz  }|d   }|rt$        j&                  j)                  t%        j*                  |g      g|      }	||f| j-                  |	      f nt| j                   j                  dk(  r[|j                         }t$        j&                  j)                  t%        j*                  |g      g|      }	|| j-                  |	      f d d d         y c c}w c c}w # 1 sw Y   xY ww)Nr@   )r   errorsr   r   
)names   	paragraph z

document)r#   r   list	enumerate	itertoolschainfrom_iterableopenr   r   r   readr   readliner   	readlinesr   rstriprE   Tablefrom_arraysarrayrH   split)r$   r'   pa_table_namesfile_idxr7   f	batch_idxbatchr   r9   	new_batchexampler@   s                r   _generate_tableszText._generate_tablesA   s    7;{{7K7K7Wdkk223^d]e'	(E(Ee(LM -	?NHddT[[%9%9$++B]B]^ +?bc;;((F2 !I !t{{'<'< =$!- ( 9 9 ;#{{::CH$I4T[[%6$IE$I#%88#7#7%8IQ_#7#`  (3T5E5Eh5OOO!Q	  [[**k9 !IE$%FF4;;+@+@$A	(!*- %F 3#%88#7#7XXeCRj&T7Gw&TUV^l $8 $  (3T5E5Eh5OOO!Q	 %b	   #%88#7#75'9J8KSa#7#b'3T5E5Eh5OOO[[**j8668D!xx33RXXtf5E4Fn3]H"D$4$4X$>>>W+? +?-	? %J$ 'U;+? +?sE   BM BM,MC3M7M?MC4M7M
MM	MN)r   r   r   r
   BUILDER_CONFIG_CLASSr%   r8   rE   r\   rH   rg   r   r   r   r    r       s6    %C$CBHH C C/?r   r    )rT   dataclassesr   ior   typingr   pyarrowrE   r   datasets.features.featuresr   datasets.tabler   utilslogging
get_loggerr   loggerBuilderConfigr
   ArrowBasedBuilderr    r   r   r   <module>ru      sn     !     ; % 
			*	*8	4 ''  T?8%% T?r   