o
    }oiG                  	   @   s   d dl mZ d dlZd dlmZ d dlmZmZ d dlm	Z	 d dl
mZ d dlmZmZ d dlmZmZmZmZmZ G d	d
 d
eZdeeef deeef fddZeeedededeeejf fddZdS )    )AnyN)MonoCut)CutMixedCut)ifnone)registered_prompt_format_fn)ModalityPromptFormatter)
CANARY_BOS
CANARY_EOSCANARY_NOPNC
CANARY_PNCCANARY_SPECIAL_TOKENIZERc                       s   e Zd ZdZdZde dejeddddd	d
dejeddddddddddddddede	 dejidiZ
deeef deeef ddf fd d!Zd"ed#ed$edee f fd%d&Z  ZS )'CanaryPromptFormattercanary	assistantuserz%|source_lang||task||target_lang||pnc|asrast	translate
transcribes2t_translation<|transcribe|><|translate|>yesnotrueTruefalseFalse10pncnopncz<|pnc|>z	<|nopnc|>)source_langtasktarget_langr"   )templateslotsz|text|textexpectedreceivedreturnNc                    s.   d|v rd|vr| d|d< t j||dS )Ntasknamer%   )r*   r+   )popsuper_validate_slot_values)selfr*   r+   	__class__ Z/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/common/prompts/canary.pyr0   :   s   z+CanaryPromptFormatter._validate_slot_valuesprompt_templateexpected_slotsslot_valuesc                    s   t |}t j|||dS )N)r6   r7   r8   )%map_manifest_values_to_special_tokensr/   encode_turn)r1   r6   r7   r8   r2   r4   r5   r:   ?   s   z!CanaryPromptFormatter.encode_turn)__name__
__module____qualname__NAMEOUTPUT_ROLEr
   r   TextTextLiteralr   TEMPLATEdictstrr   r0   listintr:   __classcell__r4   r4   r2   r5   r   !   s.    **r   r8   r,   c                 C   s  |   } d}dD ] }|| v r(| |  }dr|ds(d| |  d | |< d}qd}|| v rE| | ttfvrE| | dv r?tnt| |< d}dD ]0}|| v rw| | d	vrw| | d
v r^d| d< n| | dv rid| d< nJ d| |  d| d}qG|rtj| vrt| tj< | S )NF)r$   r&   z<|z|>Tr"   )r   r    r   r   r"   )r%   r-   )r   r   >   r   r   r   r   r%   >   r   r   r   zTask z invalid task for slot )copy
startswithendswithr   r   r	   PROMPT_LANGUAGE_SLOTr   )r8   any_special_token_presentkvr4   r4   r5   r9   L   s.   $


r9   cutpromptc                    sL  t  tr j t  tstd dt|d}|t j }d|v r9d jv r9 jd  jd< |d |rGt	d j
 d| dtdi  fd	d
|D |jtidg}ddd  jD }|tdd||jt jd j jdid ||}|d d  |jjksJ d|d  |d dd |d< |S )a`  
    Prepend and append control tokens to the token sequence as per Canary format.

    We use the following special tokens:
    * <|startoftranscript|>
    * <|transcribe|>
    * <|translate|>
    * <|nopnc|>
    * <|pnc|>
    * <|endoftext|>
    * <|LANG|> - for each supported language.
    * <|nospeech|>

    The prompt format syntax is as follows:

        <|startoftranscript|> [ <|nospeech|> | <|LANG|> [ <|transcribe|> | <|translate|> ] <|LANG|> [ <|pnc|> | <|nopnc|> ] TEXT <|endoftext|> ]

    Where expression ``[ a | b ]`` denotes expression ``a`` or expression ``b``, and can be nested.
    Note that ``<|LANG|>`` appears twice: the first occurrence is for the "source" language
    (i.e., spoken language in the recording) and the second occurrence is for the "target" language
    (i.e., the language in which we are going to output the text).
    z_Expected input audio to have a single channel (required MonoCut/MixedCut, but we received: cut=)r   r%   r-   zWe found cut with ID z% that is missing the following keys: zNPlease ensure that every utterance in the input manifests contains these keys.c                    s   i | ]}| j | qS r4   )custom).0slotrO   r4   r5   
<dictcomp>   s    zcanary.<locals>.<dictcomp>)roler(    c                 s   s     | ]}|j d ur|j V  qd S )N)r)   )rS   sr4   r4   r5   	<genexpr>   s    zcanary.<locals>.<genexpr>r   r)   r   r&   
answer_idsz<Expected the last token in answer_ids to be EOS, but we got N)
isinstancer   _first_non_padding_cutr   	TypeErrorset	get_slotsrR   removeRuntimeErroridrC   rK   r   joinsupervisionsappendr   languagegetencode_dialogitem	tokenizereos)rO   rP   r7   missing_keysturnsr)   ansr4   rU   r5   r   n   sN   






r   )typingr   torchlhotser   
lhotse.cutr   r   lhotse.utilsr   &nemo.collections.common.data.prompt_fnr   )nemo.collections.common.prompts.formatterr   r	   3nemo.collections.common.tokenizers.canary_tokenizerr
   r   r   r   r   r   rC   rD   r9   Tensorr   r4   r4   r4   r5   <module>   s   	"+"&