o
    }oi"                     @   s   d dl Z d dlmZ e dZe dZG dd dZedd Zdd
dZdd Z	dd Z
dd Zdd Zdd Zdd Zdd ZdS )    N)cachez^\{_[A-Za-z][A-Za-z0-9_]*_\}$z({_[^}]+_})c                   @   s    e Zd Zdd Zedd ZdS )ChatTemplateMixinc                 C   s   | j d usJ t| || j S N)chat_templatetokenize_with_chat_template)selfmessages r	   j/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/common/tokenizers/chat_template_mixin.pyapply_chat_template   s   z%ChatTemplateMixin.apply_chat_templatec                 C   s
   | j d uS r   )r   )r   r	   r	   r
   has_chat_template   s   
z#ChatTemplateMixin.has_chat_templateN)__name__
__module____qualname__r   propertyr   r	   r	   r	   r
   r      s    r   c                 C   s   t t| S r   )rematchTEMPLATE_VAR_VALIDATION_PATsr	   r	   r
   is_template_var    s   r   Tc                 c   s,    t t| D ]}|r|dkrq|V  qd S )N )r   splitTEMPLATE_VAR_SEARCH_PAT)template
skip_emptypartr	   r	   r
   extract_template_parts&   s   r   c                 C   s   t | s| S | dd S )N   )r   r   r	   r	   r
   strip_template_wrap.   s   r    c                 c   sx    g }t t|D ](\}}t|r,t|}|dkr!|| d  q	d||fV  g }q	|| q	d|dfV  dS )av  Renders a chat turn based on template

    Args:
        message (Dict)
        e.g. {'role': ['user'], 'content': ['What is your favourite fruit?']},
        template (Str):
            "[INST] {_content_} [/INST]",

    Returns:
        (str, token_id/None): the template formatted message
        e.g.
            "[INST] What is your favourite fruit? [/INST]", None
    contentr   N)	enumerater   r   r    appendjoin)messager   ansitemplate_partr	   r	   r
   render_chat_turn5   s   r)   c                 C   s   g }t |tr|dkr|| |7 }nt |tr(t|dkr(|| d|7 }|durAt| |s9J d| d|t| |g7 }|S )a  
    Tokenizes a string or a list of string into their corresponding token_ids
    and appends (at the end) a special_token if present.

    Args:
        tokenizer: (SPM)
        inputs: (Str, List[Str])
        e.g. "Alex" or ["Alex", "nvidia"]
        special_token: (Str):
        e.g. "eos"

        Returns:
         (list[int]): list of token_ids
         e.g.
            input="Alex", special_token="eos"
            Alex->[3413]
            eos->[2]

            Will return the following:
            [3413, 2]
    r   r   NzSpecial_token z is not part of tokenizer)
isinstancestrtext_to_idslistlenr$   hasattrgetattr)	tokenizerinputsspecial_tokenr&   r	   r	   r
    encode_string_with_special_tokenS   s   r4   c                    s  t |sJ dt|dksJ dd|v sJ dg } fdd}d|v r9td	|d D ]\}}||||7 }q-g }|D ]8}|d
 |d v sQJ |d
 |d f|d |d
  }	t||	D ]\}
}||
g7 }|d urt||||7 }g }q^q=|||d 7 }t|dksJ d|S )Nz"Expected input to be chat-templater   zExpected non-empty messagesrolesz&Expected template to have key `roles`.c                    s   t  | |S r   )r4   )xyr1   r	   r
   <lambda>z   s    z-tokenize_with_chat_template.<locals>.<lambda>prefixr   rolezExpected non-empty output)is_chat_inputr.   r)   )r1   r   r   r&   encoder   r3   bufferr%   msg_templatetemplated_messagesr	   r8   r
   r   u   s,   $
r   c                    s.   g }| D ]}|  fdd| D  q|S )a'  
    a collated messages can have multiple chat messages in each dict,
    this extracts (vertically) one of them, for example:

    messages = [
        {'role': ['user', 'user'], 'content': ['What is your favourite condiment?', 'What is your favourite fruit?']},
        {'role': ['assistant', 'assistant'], 'content': ["Well, I'm quite partial to a ", "good squeeze of fresh lemon"]},
        {'role': ['user', 'user'], 'content': ['Do you have mayonnaise recipes?', 'Do you have tomato salad recipes?']}
    ]
    ans = extract_turns(messages, axis=1)

    ans = [
        {'role': ['user'], 'content': ['What is your favourite fruit?']},
        {'role': ['assistant'], 'content': ["good squeeze of fresh lemon"]},
        {'role': ['user'], 'content': ['Do you have tomato salad recipes?']}
    ]
    c                    s   i | ]	\}}||  qS r	   r	   ).0kvaxisr	   r
   
<dictcomp>   s    z!extract_turns.<locals>.<dictcomp>)r#   items)r   rE   r&   turnr	   rD   r
   extract_turns   s   rI   c                 #   s    t | ts
J dt| dksJ dttdd | s!J dttdd | s.J dt| d d	  tt fd
d| sEJ dt D ]	}t| |dV  qIdS )a+  
    Example input
    [
       {'role': ['user', 'user'], 'content': ['What is your favourite condiment?', 'What is your favourite fruit?']},
       {'role': ['assistant', 'assistant'], 'content': ["Well, I'm quite partial to a ", "good squeeze of fresh lemon"]},
       {'role': ['user', 'user'], 'content': ['Do you have mayonnaise recipes?', 'Do you have tomato salad recipes?']}
    ]

    Notice the 2D axis system of the messages variable, one for the list and one for each item in the list (i.e.
    the 'content' contains multiple messages).
    zExpected messages to be a listr   zExpected non empty messagesc                 S   s
   t | tS r   )r*   dictr6   r	   r	   r
   r9      s   
 z-explode_chat_template_input.<locals>.<lambda>z"Expected messages to contain dictsc                 S   s   d| v od| v S )Nr;   r!   r	   rK   r	   r	   r
   r9          zBExpected messages each dict to contain 'role' and 'content' fieldsr;   c                    s   t | d  kS )Nr;   )r.   rK   nr	   r
   r9      rL   zIExpected all batch messages to contain equal number of roles in all turnsrD   N)r*   r-   r.   allmaprangerI   )r   r'   r	   rM   r
   explode_chat_template_input   s$   rR   c                 C   s$   t | tot| dkot | d tS )Nr   )r*   r-   r.   rJ   )r   r	   r	   r
   r<      s   $r<   )T)r   	functoolsr   compiler   r   r   r   r   r    r)   r4   r   rI   rR   r<   r	   r	   r	   r
   <module>   s   




"