o
    witI                     @   s   d dl mZ d dlmZ d dlmZmZ d dlZd dlm	Z	m
Z
 dZdZdZG d	d
 d
ZG dd deZG dd deZG dd dZG dd deZdedefddZdedefddZdS )    )ABC)	lru_cache)AnyTypeN)AggregateTokenizerTokenizerSpecpreamblez|bos|z|eos|c                   @   s*   e Zd ZededefddZdd ZdS )BaseModalityTypevaluereturnc                 C   s   t N)NotImplementedErrorr
    r   f/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/common/prompts/formatter.pymatches&   s   zBaseModalityType.matchesc                 C   s   d| j j dS )N	Modality.z())	__class____name__selfr   r   r   __repr__*   s   zBaseModalityType.__repr__N)r   
__module____qualname__staticmethodr   boolr   r   r   r   r   r   r	   %   s    r	   c                   @   s&   e Zd ZdZededefddZdS )TextzModality for text values.r
   r   c                 C   s
   t | tS r   )
isinstancestrr   r   r   r   r   1   s   
zText.matchesN)r   r   r   __doc__r   r   r   r   r   r   r   r   r   .   s    r   c                   @   s.   e Zd Zdd ZdedefddZdd Zd	S )
TextLiteralc                 G   s
   || _ d S r   )allowed_values)r   itemsr   r   r   __init__7   s   
zTextLiteral.__init__r
   r   c                 C   s   t |to	|| jv S r   )r   r   r!   )r   r
   r   r   r   r   :   s   zTextLiteral.matchesc                 C   s   d| j j d| j dS )Nr   z(allowed_values=))r   r   r!   r   r   r   r   r   =   s   zTextLiteral.__repr__N)r   r   r   r#   r   r   r   r   r   r   r   r   r    6   s    r    c                   @   s   e Zd ZdZeZeZdS )Modalityz>
    Modalities supported as PromptFormatter slot values.
    N)r   r   r   r   r   r    r   r   r   r   r%   A   s    r%   c                	       s|  e Zd ZdZdZdZdZdZdZdZ	dZ
i Zd+dedee dB ddfdd	Zd, fd
dZededed  fddZeeddee fddZededeeef fddZededefddZdee fddZdedeeef deeef dee fddZdee deeejf fdd Z d+d!ed"edB dee fd#d$Z!d%eeef d&eeef ddfd'd(Z"d)d* Z#  Z$S )-PromptFormatterap  
    :class:`~nemo.collections.common.prompts.formatter.PromptFormatter` is intended to simplify
    working with various prompt format templates and encoding them into token ID tensors.

    It assumes a dialog-like structure, which is a list of turns, with each turn assigned to a role.
    Sub-classes of PromptFormatter define turn templates for each role under TEMPLATE class attribute.
    Each template may define some constant parts (e.g. begin-of-turn or end-of-turn tokens, whitespaces, etc.)
    and variable parts which we call "slots", that will be provided by the user during training or inference.

    A role is typically "user" and "assistant", and some popular models also use a "system" role.
    Other roles may be defined as well. We expect the role corresponding to the model's responses
    will be registered under class attribute called OUTPUT_ROLE.
    We reserve a special "preamble" role with no slots that will be inserted at the beginning of
    the formatted prompt, if "preamble" is present in TEMPLATE.

    A turn is a dict with keys "role" and "slots", where "slots" are a dict that maps slot names
    to values that should be filled in the template.
    For example, a user role template may be ``"Question: |message|"`` and corresponding ``slots`` would then be
    ``{"message": "What time is it?"}``.

    There is a special slot called ``|prompt_language|`` that's used to select the sub-tokenizer in
    :class:`~nemo.collections.common.tokenizers.aggregate_tokenizer.AggregateTokenizer`.
    It's only used when the tokenizer is aggregate; otherwise it's discarded.

    PromptFormatter supports constructing prompts for training (complete context and answers)
    and for inference (context-only).
    Training/inference is determined automatically; if the last role in a dialog is the OUTPUT_ROLE,
    that's an 'asked-and-answered' scenario, so we assume it's inteded for training.
    We'll create a dict with tokenized results available under the following keys:

    * ``context_ids`` (all turns minus last one),
    * ``answer_ids`` (last turn)
    * ``input_ids`` (previous two values concatenated)
    * ``mask`` (boolean mask tensor of the same lenth as ``input_ids`` that's set to True on OUTPUT_ROLE turns)

    Typically, the user will use the ``encode_dialog`` method providing a list of turns to it.
    Example showing how to construct model inputs/outputs for training::

        >>> formatter = PromptFormatter(tokenizer)
        ... encoded_for_training = formatter.encode_dialog(
        ...     turns=[
        ...         {"role": "user", "slots": {"message": "What time is it?"}},
        ...         {"role": "assistant", "slots": {"message": "Ten o'clock."}},
        ...         {"role": "user", "slots": {"message": "PM or AM?"}},
        ...         {"role": "assistant", "slots": {"message": "AM, naturally! It's bright outside"}},
        ...     ]
        ... )

    Another example that shows how to use the same method to generate prompts for inference::


        >>> formatter = PromptFormatter(tokenizer)
        ... encoded_for_inference = formatter.encode_dialog(
        ...     turns=[
        ...         {"role": "user", "slots": {"message": "What time is it?"}},
        ...         {"role": "assistant", "slots": {"message": "Ten o'clock."}},
        ...         {"role": "user", "slots": {"message": "PM or AM?"}},
        ...     ]
        ... )

    prompt_languageNF	tokenizerdefaultsr   c                 C   s$   || _ |d ur	|ng | _|   d S r   )r(   	_defaults_validate_defaults)r   r(   r)   r   r   r   r#      s   zPromptFormatter.__init__c                    s*  d}| j | jvrAdD ]}t| |d d us J | d|  d| q
| j| jvs;J d| j  d| j d| j| j  d| | j| j< d	| jv r`t| jd	 d
g dks`J | d| jd	  d|  D ]%}| |}| 	|D ]}t
||v sJ | d| d| d| dqpqdt jdi | d S )Nz*PromptFormatter subclass definition error:)NAMETEMPLATEOUTPUT_ROLEz PromptFormatter subclass z" did not define a class attribute zCannot register z under z#: another prompt formatter of type z- has already been registered under this name.r   slotsr   z= Slots are not allowed for preamble template, but we found: ''z Slot 'z' not found in template 'z' for role 'r   )r   _REGISTERED_FORMATTERSgetattrr,   r-   lenget	get_rolesget_template	get_slots_mangledsuper__init_subclass__)clskwargsERRattrroletemplateslotr   r   r   r:      s0   


z!PromptFormatter.__init_subclass__namec                 C   s6   || j vrtd| dd| j   d| j | S )NzUnknown prompt formatter: 'z' (known formats: z, r$   )r1   RuntimeErrorjoinkeys)r;   rC   r   r   r   resolve   s
   

zPromptFormatter.resolve   c                 C   s   t | j S r   )listr-   rF   )r;   r   r   r   r5      s   zPromptFormatter.get_rolesr?   c                 C   s   | j | di  S )Nr/   )r-   r4   copyr;   r?   r   r   r   r7      s   zPromptFormatter.get_slotsc                 C   s   | j | d S )Nr@   )r-   rK   r   r   r   r6      s   zPromptFormatter.get_templatec                    s.   dt dtffdd  fdd D S )a  
        Returns a list of dialog turns that can be used as a skeleton to fill with actual slot values.
        If ``PromptFormatter`` was initialized with ``defaults`` argument, this method will return the
        defaults. Otherwise, every slot is pre-filled with ``None``.
        r?   r   c                    s$    j D ]}|d | kr|  S qi S )Nr?   )r*   )r?   turnr   r   r   _get_default_for_role   s
   
zGPromptFormatter.get_default_dialog_slots.<locals>._get_default_for_rolec                    s6   g | ]  j kr  fd d D dqS )c                    s$   i | ]}|  d i  |qS )r/   )r4   ).0rA   )rM   r?   r   r   
<dictcomp>   s    zGPromptFormatter.get_default_dialog_slots.<locals>.<listcomp>.<dictcomp>)r?   r/   )r.   r7   )rN   rM   r   )r?   r   
<listcomp>   s    
z<PromptFormatter.get_default_dialog_slots.<locals>.<listcomp>)r   dictr5   r   r   rP   r   get_default_dialog_slots   s   z(PromptFormatter.get_default_dialog_slotsprompt_templateexpected_slotsslot_valuesc                 C   s`   |}|D ] }| |}|d usJ d|d|d||t||}q| j|| | jdS )NzMissing required slot=z in slot_values=z for prompt_template=)lang)r4   replacer8   _apply_tokenizerPROMPT_LANGUAGE_SLOT)r   rT   rU   rV   promptrA   r
   r   r   r   encode_turn   s   
"zPromptFormatter.encode_turnturnsc                    s  |   }t|dksJ d|D ] d v sJ d  d |v s/J d d d| qg }g }g | jrJ|| jj |d d d	| jv r{d
d t|D }|sgdd	i| jd	 g| }nt|dkrs|d dks{J d| d|d d | jk}|D ][  d }| 	|}d v rt|dkr fdd|
 D }	n di }	|r|	sJ d| d | ||	 | |}
| |
||	}|| |t| || jk q|r| jd ur| | j}|| |t| d | jr|s|| jj |d  d7  < d dtj|tjdi}d r[|d d |d   |d< |d |d  d  |d< tjfddt|D tjd|d< |S |d |d< |S )Nr   zEmpty dialog is not supported.r?   z5A turn must have have a 'role' key. We received turn=zFound turn with turn['role']=z, but available roles are rH   Fr   c                 S   s    g | ]\}}|d  dkr|qS )r?   r   r   )rN   idxtr   r   r   rQ     s     z1PromptFormatter.encode_dialog.<locals>.<listcomp>zPPreamble can only be presented at turn 0 but we found preamble turns at indexes .contentc                    s   i | ]}| d  qS )rb   r   )rN   k)rL   r   r   rO   )  s    z1PromptFormatter.encode_dialog.<locals>.<dictcomp>r/   zA turn for role zF must have have a non-empty value under 'slots' key. We received turn=T	input_ids)dtypecontext_ids
answer_idsc                    s&   g | ]\}}t |D ]} | q
qS r   )range)rN   turn_idxturn_len_)turn_mask_valuesr   r   rQ   K  s    mask)r5   r3   
INSERT_BOSappendr(   bosr-   	enumerater.   r7   rF   r4   _validate_slot_valuesr6   r\   extendINFERENCE_PREFIXrY   
INSERT_EOSeostorchtensorlongr   )r   r]   rolesturn_tokensturn_token_countspreamble_turnsis_inferencer?   rU   rV   r@   tokensinference_prefixansr   )rL   rl   r   encode_dialog	  sx   &













zPromptFormatter.encode_dialogtextrW   c           	      C   s   t | jt}|r|d usJ d| j d|t}|t}|r)|ttd  }|r4|d tt  }|r>| j	||}n| j	|}|rW|rN| j
|n| jj}|| |rj|ra| j|n| jj}|g| }|S )NzMissing key 'zX' in slot_values -- cannot resolve the correct sub-tokenizer in the aggregate tokenizer.)r   r(   r   rZ   
startswithBOS_SLOTendswithEOS_SLOTr3   text_to_idsget_eosrv   ro   get_bosrp   )	r   r   rW   is_agghas_boshas_eosr   eos_idbos_idr   r   r   rY   V  s*   




z PromptFormatter._apply_tokenizerexpectedreceivedc                 C   sd   t |t | }|rJ d| |D ]}|| }|| }||s/J d|d|d| qd S )Nz-The following slot values were not provided: zslot=z received value=z which does not match modality )setr   )r   r   r   missingrA   expected_modalityr
   r   r   r   rr   x  s   z%PromptFormatter._validate_slot_valuesc                 C   s   | j sd S d}t| j tsJ | j D ]g}t|tsJ d|v s(J | d||d }||  v sDJ | d|d|d|   d| | }ryd|v s\J | d	|d
| d|d D ]}||v sxJ | d|d|d|d| 	q`qd S )Nz#Error in default prompt definition:r?   z/ Missing required 'role' key. We received turn=z Invalid role=z	 in turn=z - supported roles are: r`   r/   z& Missing required 'slots' key in turn=z3 - we expected the following slots to be provided: z Invalid slot=z-. The following slots are supported for role=z: )r*   r   rI   rR   r5   r7   )r   errrL   r?   rU   rA   r   r   r   r+     s6   


z"PromptFormatter._validate_defaultsr   )r   N)%r   r   r   r   rZ   r,   r-   r.   rt   rn   ru   r1   r   rI   rR   r#   r:   classmethodr   r   rG   r   r5   r%   r7   r6   rS   r   intr\   rw   Tensorr   rY   rr   r+   __classcell__r   r   rB   r   r&   J   sF    ? 


  M&"
r&   rA   r   c                 C   s(   | d dkr| d dksd|  dS | S )Nr   |ra   r   rA   r   r   r   r8        r8   c                 C   s(   | d dkr| d dkr| dd S | S )Nr   r   ra   rH   r   r   r   r   r   
_unmangled  r   r   )abcr   	functoolsr   typingr   r   rw   "nemo.collections.common.tokenizersr   r   PREAMBLE_ROLEr   r   r	   r   r    r%   r&   r   r8   r   r   r   r   r   <module>   s"   			  S