
    bi              
          d dl Z d dlmZmZmZmZ d dlZd dlmZmZ d dl	m
Z
 d dlmZ ddlmZ  e
j                  ej                          e
j                  d      k\  r8d d	lmZ  e ed
d       ed
d      d       ed
d       ed
d      ddZn, ed
d       ed
d      dg ed
d       ed
d      ddZ	 ddeded   dee   fdZdefdZ	 ddeeef   dedee   dee   fdZy)    N)CallableLiteralOptionalUnion)DatasetValue)version)AutoTokenizer   )ConstantLengthDatasetz4.0.0)Liststring)dtypeid)contentrole)
completionprompt)chatmlinstruction	tokenizermessages_field)messagesconversationstoolsc                       fd}|S )z
    return a callable function that takes in a "messages" dataset and returns a formatted dataset, based on the
    tokenizer apply chat template to the dataset along with the schema of the list of functions in the tools list.
    c                     t        |    d   t              rIg }t        t        |                D ]+  }|j	                  j                  |    |   d             - |S j                  |    d      S )Nr   F)tokenizer   
isinstancelistrangelenappendapply_chat_template)examplesoutput_textsir   r   r   s      X/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/trl/extras/dataset_formatting.pyformat_datasetz9conversations_formatting_function.<locals>.format_dataset0   s    h~.q148L3x789 ##11(>2J12MX]ej1k  00.1ITYaf0gg     )r   r   r   r*   s   ``` r)   !conversations_formatting_functionr-   (   s    	h r+   c                       fd}|S )z
    return a callable function that takes in an "instructions" dataset and returns a formatted dataset, based on the
    tokenizer apply chat template to the dataset
    c                     t        | d   t              rXg }t        t        | d               D ]:  }d| d   |   dd| d   |   dg}|j	                  j                  |d             < |S d| d   dd| d   dg}j                  |d      S )Nr   user)r   r   	assistantr   F)r   r   )r&   r'   r(   converted_sampler   s       r)   r*   z8instructions_formatting_function.<locals>.format_datasetD   s    hx($/L3x123 e#0B10EF(Xl5KA5NO$  ##I$A$ABR]b$A$cde    HX,>?$,1GH  001AE0RRr+   r,   )r   r*   s   ` r)    instructions_formatting_functionr3   >   s    S" r+   datasetreturnc                    t        | t              rd| j                  v r;| j                  d   t        d   k(  r"t	        j
                  d       t        |d|      S d| j                  v r<| j                  d   t        d   k(  r"t	        j
                  d       t        |d|      S y| j                  t        d   k(  r t	        j
                  d       t        |      S y)a  
    Finds the correct formatting function based on the dataset structure. Currently supported datasets are:
    - `ChatML` with [{"role": str, "content": str}]
    - `instruction` with [{"prompt": str, "completion": str}]

    Args:
        dataset (Dataset): User dataset
        tokenizer (AutoTokenizer): Tokenizer used for formatting

    Returns:
        Callable: Formatting function if the dataset format is supported else None
    r   r   z%Formatting dataset with chatml formatr   r   z*Formatting dataset with instruction formatN)r    r   featuresFORMAT_MAPPINGlogginginfor-   r3   )r4   r   r   s      r)    get_formatting_func_from_datasetr;   X   s     '7#)))
+~h/GGDE8JPUVVg...0N84LLDE8OUZ[[
 	 !>>LLEF3I>>r+   )N)r9   typingr   r   r   r   datasetsr   r   	packagingr	   transformersr
   trainer.utilsr   parse__version__r   r8   r!   r-   r3   r;   r,   r+   r)   <module>rC      sB    5 5  #  & 1 7==%%&-'--*@@ 5xD#A5W_dhKijk&+(t&DPU\dimPnoN  %8=uS[`dGefg&+(t&DPU\dimPnoN mq.56Q.R[cdh[i, 6 gk7112?LU]^bUchr+   