
    bi~                        d dl Z d dlmZmZ d dlmZ d dlmZ d dlm	Z	m
Z
mZmZmZ d dlZd dlZd dlmZ d dlZd dlmZmZ d dlmZ  edee      Zd	eee	f   d
efdZ	 d&d	eee eeef      f   dedee eee
f         d
eeef   fdZ!	 d&d	eee eeef      f   dedee eee
f         d
eeef   fdZ"de eee eeef      f      d
e eee eeef      f      fdZ#	 d'dedee$   dee   d
efdZ%	 d'dedee$   dee   d
efdZ&d	eeef   d
eeef   fdZ'd	eee f   d
eee f   fdZ(deee e    f   de$d
eee e    f   fdZ) G d d      Z*dejV                  de$d
ejV                  fdZ,dejV                  de$d
ejV                  fdZ-	 d(dede$ded eeee	f      d
ef
d!Z.	 d&ded"e$d eeee	f      d
efd#Z/d	eee	f   d
efd$Z0d	eee f   d
eee f   fd%Z1y))    N)defaultdictdeque)Sequence)	takewhile)AnyCallableOptionalTypeVarUnion)DatasetDatasetDict)PreTrainedTokenizerBaseDatasetTypeexamplereturnc                     g d}| j                         D ch c]	  }||v s| }}|rC|j                         }| |   }t        |t              r|d   }t        |t              r	d|v rd|v ryyc c}w )aM  
    Check if the example is in a conversational format.

    Args:
        example (`dict[str, Any]`):
            A single data entry of a dataset. The example can have different keys depending on the dataset type.

    Returns:
        `bool`:
            `True` if the data is in a conversational format, `False` otherwise.

    Examples:

    ```python
    >>> example = {"prompt": [{"role": "user", "content": "What color is the sky?"}]}
    >>> is_conversational(example)
    True

    >>> example = {"prompt": "The sky is"}
    >>> is_conversational(example)
    False
    ```
    )promptchosenrejected
completionmessagesr   rolecontentTF)keyspop
isinstancelistdict)r   supported_keyskeyexample_keysmaybe_messagesmaybe_messages         I/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/trl/data_utils.pyis_conversationalr%       s    0 PN#*<<>KCSN5JCKLK   nd+*1-M-.6]3Jy\iOi Ls
   	A-A-	tokenizertoolsc           
         g d}| j                         D ch c]	  }||v s| }}|dhdhddhh dddhh dfvrt        d	|       d| v r|j                  | d   |d
      }d| v rF| d   d   d   }|dk(  rd}d
}	n|dk(  rd
}d}	nt        d|       |j                  | d   ||	d
|      }
d| v rd| v rX|j                  | d   | d   z   |d
      }dj	                  d t        d t        
|            D              }
|t        |
      d }d| v r\d| v rX|j                  | d   | d   z   |d
      }dj	                  d t        d t        
|            D              }
|t        |
      d }d| v r|j                  | d   | d   z   |d
      }dj	                  d t        d t        
|            D              }
|t        |
      d }n6d| v r|j                  | d   |d
      }d| v r|j                  | d   |d
      }i }d| v r|d<   d| v r
|d<   d| v r|d<   d| v r|d<   d| v r|d<   d| v r| d   |d<   |S c c}w )z
    Apply a chat template to a conversational example along with the schema for a list of functions in `tools`.

    For more details, see [`maybe_apply_chat_template`].
    )r   r   r   r   r   labelr   r   r   >   r   r   r   r   r   >   r)   r   r   zInvalid keys in the example: F)r'   tokenizer   userT	assistantz"Invalid role in the last message: )r'   continue_final_messager*   add_generation_prompt c              3   &   K   | ]	  \  }}|  y wN .0x_s      r$   	<genexpr>z&apply_chat_template.<locals>.<genexpr>   s     i41aQi   c                     | d   | d   k(  S Nr      r3   r6   s    r$   <lambda>z%apply_chat_template.<locals>.<lambda>       qtqt|     Nc              3   &   K   | ]	  \  }}|  y wr2   r3   r4   s      r$   r8   z&apply_chat_template.<locals>.<genexpr>   s     k41aQkr9   c                     | d   | d   k(  S r;   r3   r=   s    r$   r>   z%apply_chat_template.<locals>.<lambda>   r?   r@   c              3   &   K   | ]	  \  }}|  y wr2   r3   r4   s      r$   r8   z&apply_chat_template.<locals>.<genexpr>   s     m41aQmr9   c                     | d   | d   k(  S r;   r3   r=   s    r$   r>   z%apply_chat_template.<locals>.<lambda>   r?   r@   textr)   )r   KeyErrorapply_chat_template
ValueErrorjoinr   ziplen)r   r&   r'   r   r    r!   r   	last_roler/   r.   r   prompt_chosenr   prompt_rejectedr   prompt_completionr   outputs                     r$   rG   rG   I   s3    YN#*<<>KCSN5JCKLK		
	< (	:)  6|nEFF W001DE\a0b 7H%b)&1	$(!%*"+%$)!%)"A)MNN..H#9"7 / 
 7w%99!GH$55UU : M WWi95KSQWYfMg+hiiF"3v;=1F X%8';;!GJ$77uu < O WWk95KSQWYhMi+jkkF&s6{}5H7" ) = =!GL$99QV !> ! WWm95KSQWYjMk+lmmF*3v;=9Jw22783DE\a2bF  44WZ5HPU`e4fH FW!v7!x7!xW%zw)|'!'*wMc Ls
   	H?H?c                 6    t        |       rt        | ||      S | S )a:	  
    If the example is in a conversational format, apply a chat template to it.

    Args:
        example (`dict[str, list[dict[str, str]]`):
            Dictionary representing a single data entry of a conversational dataset. Each data entry can have different
            keys depending on the dataset type. The supported dataset types are:

                - Language modeling dataset: `"messages"`.
                - Prompt-only dataset: `"prompt"`.
                - Prompt-completion dataset: `"prompt"` and `"completion"`.
                - Preference dataset: `"prompt"`, `"chosen"`, and `"rejected"`.
                - Preference dataset with implicit prompt: `"chosen"` and `"rejected"`.
                - Unpaired preference dataset: `"prompt"`, `"completion"`, and `"label"`.

            For keys `"messages"`, `"prompt"`, `"chosen"`, `"rejected"`, and `"completion"`, the values are lists of
            messages, where each message is a dictionary with keys `"role"` and `"content"`.
        tokenizer (`PreTrainedTokenizerBase`):
            Tokenizer to apply the chat template with.
        tools (`list[Union[dict, Callable]]` or `None`, *optional*, defaults to `None`):
            A list of tools (callable functions) that will be accessible to the model. If the template does not support
            function calling, this argument will have no effect

    Returns:
        `dict[str, str]`:
            Formatted example with the chat template applied.

    Notes:
        - This function does not alter the keys, except for Language modeling dataset, where `"messages"` is replaced
        by `"text"`.

        - In case of prompt-only data, if the last role is `"user"`, the generation prompt is added to the prompt.
        Else, if the last role is `"assistant"`, the final message is continued.

    Example:

    ```python
    >>> from transformers import AutoTokenizer

    >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
    >>> example = {
    ...     "prompt": [{"role": "user", "content": "What color is the sky?"}],
    ...     "completion": [{"role": "assistant", "content": "It is blue."}],
    ... }
    >>> apply_chat_template(example, tokenizer)
    {'prompt': '<|user|>\nWhat color is the sky?<|end|>\n<|assistant|>\n', 'completion': 'It is blue.<|end|>\n<|endoftext|>'}
    ```
    )r%   rG   )r   r&   r'   s      r$   maybe_apply_chat_templaterR      s!    j !"7Iu==r@   examplesc                 x    t        | d         }| d   | d   z   dg|z  dg|z  z   d}d| v r| d   | d   z   |d<   |S )Nr   r   TF)r   r)   r   )rK   )rS   
batch_sizenew_rowss      r$   _unpair_rowrW      sh    Xh'(Jx(8J+??*$w';;H 8%h/(82DDOr@   datasetnum_procdescc                 :    | j                  t        dddg||      S )a  
    Unpair a preference dataset.

    Args:
        dataset (`Dataset` or `DatasetDict`):
            Preference dataset to unpair. The dataset must have columns `"chosen"`, `"rejected"` and optionally
            `"prompt"`.
        num_proc (`int` or `None`, *optional*, defaults to `None`):
            Number of processes to use for processing the dataset.
        desc (`str` or `None`, *optional*, defaults to `None`):
            Meaningful description to be displayed alongside with the progress bar while mapping examples.

    Returns:
        `Dataset`: The unpaired preference dataset.

    Example:

    ```python
    >>> from datasets import Dataset

    >>> dataset_dict = {
    ...     "prompt": ["The sky is", "The sun is"],
    ...     "chosen": [" blue.", "in the sky."],
    ...     "rejected": [" green.", " in the sea."],
    ... }
    >>> dataset = Dataset.from_dict(dataset_dict)
    >>> dataset = unpair_preference_dataset(dataset)
    >>> dataset
    Dataset({
        features: ['prompt', 'completion', 'label'],
        num_rows: 4
    })

    >>> dataset[0]
    {'prompt': 'The sky is', 'completion': ' blue.', 'label': True}
    ```
    Tr   r   )batchedremove_columnsrY   rZ   )maprW   )rX   rY   rZ   s      r$   unpair_preference_datasetr_      s%    P ;;{D(JAWbjqu;vvr@   c                     t        | t              r*| t        | j                               d      j                  }n| j                  }d|v rd|v rt        | ||      S | S )a  
    Unpair a preference dataset if it is paired.

    Args:
        dataset (`Dataset` or `DatasetDict`):
            Preference dataset to unpair. The dataset must have columns `"chosen"`, `"rejected"` and optionally
            `"prompt"`.
        num_proc (`int` or `None`, *optional*, defaults to `None`):
            Number of processes to use for processing the dataset.
        desc (`str` or `None`, *optional*, defaults to `None`):
            Meaningful description to be displayed alongside with the progress bar while mapping examples.

    Returns:
        `Dataset` or `DatasetDict`: The unpaired preference dataset if it was paired, otherwise the original dataset.

    Example:

    ```python
    >>> from datasets import Dataset

    >>> dataset_dict = {
    ...     "prompt": ["The sky is", "The sun is"],
    ...     "chosen": [" blue.", "in the sky."],
    ...     "rejected": [" green.", " in the sea."],
    ... }
    >>> dataset = Dataset.from_dict(dataset_dict)
    >>> dataset = unpair_preference_dataset(dataset)
    >>> dataset
    Dataset({
        features: ['prompt', 'completion', 'label'],
        num_rows: 4
    })

    >>> dataset[0]
    {'prompt': 'The sky is', 'completion': ' blue.', 'label': True}
    ```
    r   r   r   )rY   rZ   )r   r   r   r   column_namesr_   )rX   rY   rZ   ra   s       r$   maybe_unpair_preference_datasetrb     s]    P ';'tGLLN3A67DD++<J,$>(8$OOr@   c           	          t        t        t        | d         t        | d                     D ](  }| d   |   | d   |   k7  s| d   |dz
     dk(  r|dz  } n | d   d | d   |d | d   |d dS )z
    Extracts the shared prompt from a preference data example, where the prompt is implicit within both the chosen and
    rejected completions.

    For more details, see [`maybe_extract_prompt`].
    r   r   r<    N)r   r   r   )rangeminrK   )r   idxs     r$   extract_promptrh   L  s     SWX./WZ5H1IJK 8S!WZ%8%==x q)S0q	 (#DS)(#CD)J'- r@   c                     d| vsd| vr| S d| v r*t        d| d   i      }t        d| d   i      }|r|s|s|s| S t        | d   | d   d      S )a  
    Extracts the shared prompt from a preference data example, where the prompt is implicit within both the chosen and
    rejected completions.

    If the example already contains a `"prompt"` key, the function returns the example as is. Else, the function
    identifies the longest common sequence (prefix) of conversation turns between the "chosen" and "rejected"
    completions and extracts this as the prompt. It then removes this prompt from the respective "chosen" and
    "rejected" completions.

    Args:
        example (`dict[str, list]`):
            A dictionary representing a single data entry in the preference dataset. It must contain the keys
            `"chosen"` and `"rejected"`, where each value is either conversational or standard (`str`).

    Returns:
        `dict[str, list]`: A dictionary containing:
            - `"prompt"`: The longest common prefix between the "chosen" and "rejected" completions.
            - `"chosen"`: The remainder of the "chosen" completion, with the prompt removed.
            - `"rejected"`: The remainder of the "rejected" completion, with the prompt removed.

    Examples:

    ```python
    >>> example = {
    ...     "chosen": [
    ...         {"role": "user", "content": "What color is the sky?"},
    ...         {"role": "assistant", "content": "It is blue."},
    ...     ],
    ...     "rejected": [
    ...         {"role": "user", "content": "What color is the sky?"},
    ...         {"role": "assistant", "content": "It is green."},
    ...     ],
    ... }
    >>> extract_prompt(example)
    {'prompt': [{'role': 'user', 'content': 'What color is the sky?'}],
     'chosen': [{'role': 'assistant', 'content': 'It is blue.'}],
     'rejected': [{'role': 'assistant', 'content': 'It is green.'}]}
    ```

    Or, with the `map` method of `datasets.Dataset`:

    ```python
    >>> from trl import extract_prompt
    >>> from datasets import Dataset

    >>> dataset_dict = {
    ...     "chosen": [
    ...         [
    ...             {"role": "user", "content": "What color is the sky?"},
    ...             {"role": "assistant", "content": "It is blue."},
    ...         ],
    ...         [
    ...             {"role": "user", "content": "Where is the sun?"},
    ...             {"role": "assistant", "content": "In the sky."},
    ...         ],
    ...     ],
    ...     "rejected": [
    ...         [
    ...             {"role": "user", "content": "What color is the sky?"},
    ...             {"role": "assistant", "content": "It is green."},
    ...         ],
    ...         [
    ...             {"role": "user", "content": "Where is the sun?"},
    ...             {"role": "assistant", "content": "In the sea."},
    ...         ],
    ...     ],
    ... }
    >>> dataset = Dataset.from_dict(dataset_dict)
    >>> dataset = dataset.map(extract_prompt)
    >>> dataset[0]
    {'prompt': [{'role': 'user', 'content': 'What color is the sky?'}],
     'chosen': [{'role': 'assistant', 'content': 'It is blue.'}],
     'rejected': [{'role': 'assistant', 'content': 'It is green.'}]}
    ```
    r   r   r   )r   r   )r%   rh   )r   chosen_convprompt_convs      r$   maybe_extract_promptrl   _  sq    d w*G";7'783D(EF'783D(EFK[NWX%6GJDWXYYr@   
seq_lengthc                 R   t        j                  dt               | j                         D ci c]  \  }}|t	        |g        } }}| j                         D ci c]0  \  }}|t        dt        |      |      D cg c]
  }||||z     c}2 } }}}| S c c}}w c c}w c c}}}w )a  
    Pack examples into chunks of size `seq_length`.

    Args:
        examples (`dict[str, list[list]]`):
            Dictionary of examples with keys as strings and values as lists of lists.
        seq_length (`int`):
            Maximum sequence length.

    Returns:
        `dict[str, list[list]]`: Dictionary of examples with keys as strings and values as lists of lists.

    Example:

    ```python
    >>> from trl import pack_examples

    >>> examples = {
    ...     "input_ids": [[1, 2, 3], [4, 5, 6, 7], [8]],
    ...     "attention_mask": [[0, 1, 1], [0, 0, 1, 1], [1]],
    ... }
    >>> pack_examples(examples, seq_length=5)
    {'input_ids': [[1, 2, 3, 4, 5], [6, 7, 8]], 'attention_mask': [[0, 1, 1, 0, 0], [1, 1, 1]]}

    >>> pack_examples(examples, seq_length=2)
    {'input_ids': [[1, 2], [3, 4], [5, 6], [7, 8]], 'attention_mask': [[0, 1], [1, 0], [0, 1], [1, 1]]}
    ```
    zo`pack_examples` is deprecated and will be removed in version 0.20.0. Use `pack_dataset` with a dataset instead.r   )warningswarnDeprecationWarningitemssumre   rK   )rS   rm   kvis        r$   pack_examplesrw     s    : MM	 +3..*:;$!Q3q":;H;]e]k]k]mnnUYUVXY5CFJ3OPaAa!j.)PPnHnO <Pns   B!B"=BB"B"c                   .    e Zd ZdZdefdZd Zd Zd Zy)_SegmentTreea   
    A segment tree data structure that, when initialized as `_SegmentTree(maxval)`, efficiently finds the next larger
    value for a given input within the range [1, maxval].

    See [Fewer Truncations Improve Language Modeling](https://arxiv.org/abs/2404.10830) for more details.
    maxvalc                 .    || _         dgd|z  z  | _        y )Nr      rz   tree)selfrz   s     r$   __init__z_SegmentTree.__init__  s    C1v:&	r@   c                    d|cxk  r| j                   k  sJ  J | j                   |z   dz
  }|| j                  |<   |dkD  rI|dz  }| j                  |dz     | j                  |dz  dz      }}||k\  r|n|| j                  |<   |dkD  rHy y r;   r}   r   valrv   leftrights        r$   addz_SegmentTree.add  s    3%$++%%%%%KK#!		!!e!GA))AF+TYYQ!|-D%D#'5=4eDIIaL	 !er@   c                    d|cxk  r| j                   k  sJ  J | j                   |z   dz
  }d| j                  |<   |dkD  rI|dz  }| j                  |dz     | j                  |dz  dz      }}||k\  r|n|| j                  |<   |dkD  rHy y r;   r}   r   s        r$   removez_SegmentTree.remove  s    3%$++%%%%%KK#!		!!e!GA))AF+TYYQ!|-D%D#'5=4eDIIaL	 !er@   c                     d|cxk  r| j                   k  sJ  J d}|| j                   k  r3| j                  |dz     |k\  r|dz  }n|dz  dz   }|| j                   k  r3| j                  |   S r;   r}   )r   r   rv   s      r$   searchz_SegmentTree.search  sw    3%$++%%%%%$++oyya C'F!VqL	 $++o
 yy|r@   N)	__name__
__module____qualname____doc__intr   r   r   r   r3   r@   r$   ry   ry     s"    's '<<r@   ry   c           
         | d   }|j                         D cg c]  }t        t        t        |                  ! }}t	        j
                  || d   j                        }| j                  d|      } g }d}t        | j                        D ]  \  }}	t        j                  j                  |	j                        s)t        j                  j                  |	j                        rt        j                  |	d|      }	||}|j!                  |	        t        j"                  j%                  || j&                        } t)        j*                  t        |             }
|J t        j,                  t        j.                  | |         j1                         |
      }|j3                  dd      }t5        |      }|j7                  |       t9        t:              }g }t=        |j?                  d      jA                         |j?                  d	      jA                               D ]  \  }}|jC                  |      }||k  r||   jE                         }ng dd
}|j!                  |       |d   j!                  |       |dxx   |z  cc<   ||k  r||   s|jG                  |       ||z
  }||   j!                  |       |dkD  s|j7                  |        t        jH                  | |D cg c]  }|d   D ]  }|  c}}      } t)        j
                  dg|D cg c]  }|d   	 c}z         }t)        jJ                  |      }g }| j                  D ]  }	t        |	jL                        d	k(  sJ |	jL                  d   }	t        j                  j                  |	j                        s)t        j                  j                  |	j                        rX|	jN                  j                  jQ                         }t        |	      j%                  |jS                  |      |	jT                        }	|j!                  |	        t        j"                  j%                  || j&                        S c c}w c c}}w c c}w )zFPack sequences in a pyarrow Table using First Fit Decreasing strategy.	input_ids)typeposition_idsNr   names
descending)byr<   )idslengthr   r   )+	to_pylistr   re   rK   paarrayr   append_column	enumeratecolumnspyarrowtypesis_listis_large_listpc
list_sliceappendTablefrom_arraysra   nparangemake_structlist_value_lengthcombine_chunkssortry   r   r   r   rJ   fieldto_numpyr   popleftr   takecumsumchunksoffsetsto_pandas_dtypeastypevalues)rS   rm   r   sequenceposition_ids_pythonposition_ids_arrayr   list_column_idxrg   columnr   lengthssegment_treespace_to_binbinsr   spacebinid_r   dtypes                        r$   	_pack_ffdr     s    %IFOFYFYF[\(4c(m 45\\"5H[<Q<V<VW%%n6HIHGO !1!12 V==  -1L1LV[[1Y]]61j9F&"%v xx##G83H3H#IH
))CM
"C&&&nnR11(?2KL[[]_bcGll<Al.G
+LZ u%L D7==+446a8H8Q8Q8ST $##F+:u%--/C*CKKE
#H:l5&9&U""3'19U##$& wwx!L#U!L##!L#!LMHhhst<c(m<<=Gii GG"" 6==!Q&&&q!88FKK(BHH,B,B6;;,ONN''779E&\--gnnU.CV]]SFv 88x/D/DEEq ]X "M<s   $Q%)Q*Q0
c                    g }| j                   D ]@  }t        j                  j                  |j                        s)t        j                  j                  |j                        rt        |t        j                        r|j                         }|j                  |j                  }}||d   j                         |d   j                          }t        |      }|j                  j                         }t        j                   d|||      }t        j"                  ||gf      }t	        |      j%                  ||      }|j'                  |       C t        j(                  j%                  || j*                        S )z;Pack sequences in a pyarrow Table using a wrapped strategy.r   r+   )r   r   )r   r   r   r   r   r   r   r   ChunkedArrayr   r   r   as_pyrK   r   r   r   concatenater   r   r   ra   )rS   rm   r   r   r   r   num_elementsr   s           r$   _pack_wrappedr   O  s"   G"" ==  -1L1LV[[1Y&"//2..0$nnfmmVGGAJ,,.1B1B1DEFv;LLL002Eii<5IGnng~%>?G&\--gv>Fv 88x/D/DEEr@   strategy
map_kwargsc                     |i }| j                  d      } |dk(  r | j                  t        fdd|id|} n1|dk(  r | j                  t        fdd|id|} nt	        d| d	      | j                  d      } | S )
a  
    Pack sequences in a dataset into chunks of size `seq_length`.

    Args:
        dataset (`Dataset` or `DatasetDict`):
            Dataset to pack
        seq_length (`int`):
            Target sequence length to pack to.
        strategy (`str`, *optional*, defaults to `"ffd"`):
            Packing strategy to use. Can be either:

            - `"ffd"` (First Fit Decreasing): Slower but preserves sequence boundaries. Sequences are never cut in the
                middle.
            - `"wrapped"`: Faster but more aggressive. Ignores sequence boundaries and will cut sequences in the middle
                to completely fill each packed sequence with data.
        map_kwargs (`dict` or `None`, *optional*, defaults to `None`):
            Additional keyword arguments to pass to the dataset's map method when packing examples.

    Returns:
        `Dataset` or `DatasetDict`: The dataset with packed sequences. The number of examples may decrease as sequences
        are combined.

    Example:
    ```python
    >>> from datasets import Dataset
    >>> from trl import pack_dataset

    >>> examples = {
    ...     "input_ids": [[1, 2, 3], [4, 5], [6, 7, 8], [9]],
    ...     "attention_mask": [[1, 1, 0], [1, 0], [1, 0, 0], [1]],
    ... }
    >>> dataset = Dataset.from_dict(examples)
    >>> packed_dataset = pack_dataset(dataset, seq_length=4, strategy="ffd")
    >>> packed_dataset[:]
    {'input_ids': [[1, 2, 3, 9], [6, 7, 8, 4, 5]],
     'attention_mask': [[1, 1, 0, 1], [1, 0, 0, 1, 0]]}
    ```
    NarrowffdTrm   )r\   	fn_kwargswrappedzInvalid packing strategy: z. Use 'ffd' or 'wrapped'.)with_formatr^   r   r   rH   )rX   rm   r   r   s       r$   pack_datasetr   a  s    R 
!!'*G5'++ij,PZA[j_ij	Y	'++mnTlT^E_ncmn5hZ?XYZZ!!$'GNr@   
max_lengthc                     |i }t        | t              r>fd}| j                  d      }  | j                  |fddi|} | j                  d      } | S fd} | j                  |fddi|} | S )a  
    Truncate sequences in a dataset to a specifed `max_length`.

    Args:
        dataset (`Dataset` or `DatasetDict`):
            Dataset to truncate.
        seq_length (`int`):
            Maximum sequence length to truncate to.
        map_kwargs (`dict` or `None`, *optional*, defaults to `None`):
            Additional keyword arguments to pass to the dataset's map method when truncating examples.

    Returns:
        `Dataset` or `DatasetDict`: The dataset with truncated sequences.

    Example:
    ```python
    >>> from datasets import Dataset

    >>> examples = {
    ...     "input_ids": [[1, 2, 3], [4, 5, 6, 7], [8]],
    ...     "attention_mask": [[0, 1, 1], [0, 0, 1, 1], [1]],
    ... }
    >>> dataset = Dataset.from_dict(examples)
    >>> truncated_dataset = truncate_dataset(dataset, max_length=2)
    >>> truncated_dataset[:]
    {'input_ids': [[1, 2], [4, 5], [8]],
     'attention_mask': [[0, 1], [0, 0], [1]]}
    ```
    Nc                 t   g }| j                   D ]|  }t        j                  j                  |j                        s)t        j                  j                  |j                        rt        j                  |d      }|j                  |       ~ t        j                  j                  || j                        S )Nr   r   )r   r   r   r   r   r   r   r   r   r   r   r   ra   )rS   truncated_columnsr   r   s      r$   truncatez"truncate_dataset.<locals>.truncate  s     ""** 1==((59T9TU[U`U`9a]]61jAF!((01 88''(9AVAV'WWr@   r   r\   Tc                     i }| j                         D ]1  \  }}|r%t        |d   t              r|D cg c]  }|d  	 }}|||<   3 |S c c}w )Nr   )rr   r   r   )rS   truncated_examplesr    r   r   r   s        r$   r   z"truncate_dataset.<locals>.truncate  sf    !#'~~/ 1VjD9:@A3c+:.AFA*0"3'1 &% Bs   A
)r   r   r   r^   )rX   r   r   r   s    `  r$   truncate_datasetr     s    @ 
'7#	X %%g.'++hCC
C%%d+  N	& '++

 

 Nr@   c                     | j                  d      }t        |t              r|d   }t        |t              r	d|v rd|v ryy)aj  
    Check if the example is in a conversational format (from/value). Note that this format isn't recommended. Prefer
    the ChatML format (role/content)

    Args:
        example (`dict[str, Any]`):
            A single data entry of a dataset. The example can have different keys depending on the dataset type.

    Returns:
        `bool`:
            `True` if the data is in a conversational Chatformat, `False` otherwise.

    Examples:

    ```python
    >>> example = {"conversations": [{"from": "user", "value": "What color is the sky?"}]}
    >>> is_conversational_from_value(example)
    True
    >>> example = {"conversations": [{"role": "user", "content": "What color is the sky?"}]}
    >>> is_conversational_from_value(example)
    False
    >>> example = {"conversations": "The sky is"})
    >>> is_conversational_from_value(example)
    False
    ```
    conversationsr   fromvalueTF)getr   r   r   )r   r"   r#   s      r$   is_conversational_from_valuer     sE    6 [[1N.$'&q)mT*v/F7VcKcr@   c                    dD ]i  }|| v st        | |   t              s| |   }|D ]D  }t        |t              sd|v r|j                  d      |d<   d|v s1|j                  d      |d<   F k d| v r| j                  d      | d<   | S )a3  
    Convert a conversational dataset with fields `from` and `value` to ChatML format.

    This function modifies conversational data to align with OpenAI's ChatML format:
    - Replaces the key `"from"` with `"role"` in message dictionaries.
    - Replaces the key `"value"` with `"content"` in message dictionaries.
    - Renames `"conversations"` to `"messages"` for consistency with ChatML.

    Args:
        example (`dict[str, list]`):
            A single data entry containing a list of messages.

    Returns:
        `dict[str, list]`:
            Example reformatted to ChatML style.

    Example:
    ```python
    >>> from trl import maybe_convert_to_chatml

    >>> example = {
    ...     "conversations": [
    ...         {"from": "user", "value": "What color is the sky?"},
    ...         {"from": "assistant", "value": "It is blue."},
    ...     ]
    ... }
    >>> maybe_convert_to_chatml(example)
    {'messages': [{'role': 'user', 'content': 'What color is the sky?'},
                  {'role': 'assistant', 'content': 'It is blue.'}]}
    ```
    )r   r   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   )r   r    r   messages       r$   maybe_convert_to_chatmlr     s    B [ B'>jt<s|H# Bgt,(*1++f*=')-4[[-A	*BB '!%kk/:
Nr@   r2   )NN)r   N)2ro   collectionsr   r   collections.abcr   	itertoolsr   typingr   r   r	   r
   r   numpyr   r   r   pyarrow.computecomputer   pyarrow.typesdatasetsr   r   transformersr   r   r   strboolr%   r   rG   rR   rW   r   r_   rb   rh   rl   rw   ry   r   r   r   r   r   r   r   r3   r@   r$   <module>r      s}    * $  : :     ) 0 mWk:&tCH~ &$ &X 48]#tDcN++,]&] DtX~./0] 
#s(^	]F 488#tDcN++,8&8 DtX~./08 
#s(^	8v$tCd38n)=$=>? DcSWX\]`be]eXfSgNgIhDi  QU(w(w$,SM(w@H(w(wX QU//$,SM/@H//dDh/ Dh4G &ZZ$sDy/ ZZd39o ZZz&Dd4j1 &s &tCQUVZQ[OG\ &R( (V<F <Fc <Fbhh <F~FBHH F# F"(( F& jn44&)4584NVW[\_ad\dWeNf44p SW>>&)>7?S#X7O>>B#$sCx. #T #L/T#t)_ /c4i /r@   