o
    wi                  	   @   s  d dl mZ d dlZd dlmZ d dlmZmZ d dlm	Z	 d dl
mZ d dlmZmZ d dlmZmZmZmZmZ h d	Zh d
ZeddhB ZeddhB Zh dZh dZG dd deZdeeef deeef fddZeeedededeeejf fddZ dS )    )AnyN)MonoCut)CutMixedCut)ifnone)registered_prompt_format_fn)ModalityPromptFormatter)
CANARY_BOS
CANARY_EOSCANARY_NOPNC
CANARY_PNCCANARY_SPECIAL_TOKENIZER>   1YesyesTruetrue>   0NonoFalsefalsepncz<|pnc|>nopncz	<|nopnc|>>   asr
transcribe<|transcribe|>>   ast	translates2t_translation<|translate|>c                	       s   e Zd ZdZdZdZde dejej	e
eB  ejej	eeB  ddede d	ejidiZd
eeef deeef ddf fddZdedededee f fddZ  ZS )CanaryPromptFormatterzCanary Promptcanary	assistantuserz%|source_lang||task||target_lang||pnc|)source_langtasktarget_langr   )templateslotsz|text|textexpectedreceivedreturnNc                    s.   d|v rd|vr| d|d< t j||dS )Ntasknamer'   )r,   r-   )popsuper_validate_slot_values)selfr,   r-   	__class__ c/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/common/prompts/canary.pyr2   D   s   z+CanaryPromptFormatter._validate_slot_valuesprompt_templateexpected_slotsslot_valuesc                    s   t |}t j|||dS )N)r8   r9   r:   )%map_manifest_values_to_special_tokensr1   encode_turn)r3   r8   r9   r:   r4   r6   r7   r<   I   s   z!CanaryPromptFormatter.encode_turn)__name__
__module____qualname____doc__NAMEOUTPUT_ROLEr
   r   TextTextLiteralTASK_TRANSLATETASK_TRANSCRIBEPNC_TRUE	PNC_FALSEr   TEMPLATEdictstrr   r2   listintr<   __classcell__r6   r6   r4   r7   r"   -   s(    	**r"   r:   r.   c                 C   s  |   } d}dD ] }|| v r(| |  }dr|ds(d| |  d | |< d}qd}|| v rE| | ttfvrE| | dv r?tnt| |< d}dD ]0}|| v rw| | d	vrw| | d
v r^d| d< n| | dv rid| d< nJ d| |  d| d}qG|rtj| vrt| tj< | S )z7Convert manifest values to Canary special token format.F)r&   r(   z<|z|>Tr   )r   r   r   r   r   )r'   r/   )r   r!   >   r   r   r    r!   r'   >   r   r   r   zTask z invalid task for slot )copy
startswithendswithr   r   r	   PROMPT_LANGUAGE_SLOTr   )r:   any_special_token_presentkvr6   r6   r7   r;   V   s.   $


r;   cutpromptc                    sL  t  tr j t  tstd dt|d}|t j }d|v r9d jv r9 jd  jd< |d |rGt	d j
 d| dtdi  fd	d
|D |jtidg}ddd  jD }|tdd||jt jd j jdid ||}|d d  |jjksJ d|d  |d dd |d< |S )a  
    Prepend and append control tokens to the token sequence as per Canary format.

    We use the following special tokens:
    * <|startoftranscript|>
    * <|transcribe|>
    * <|translate|>
    * <|nopnc|>
    * <|pnc|>
    * <|endoftext|>
    * <|LANG|> - for each supported language.
    * <|nospeech|>

    The prompt format syntax is as follows:

        <|startoftranscript|> [ <|nospeech|> | <|LANG|> [ <|transcribe|> | <|translate|> ] <|LANG|> [ <|pnc|> | <|nopnc|> ] TEXT <|endoftext|> ]  # pylint: disable=line-too-long

    Where expression ``[ a | b ]`` denotes expression ``a`` or expression ``b``, and can be nested.
    Note that ``<|LANG|>`` appears twice: the first occurrence is for the "source" language
    (i.e., spoken language in the recording) and the second occurrence is for the "target" language
    (i.e., the language in which we are going to output the text).
    z_Expected input audio to have a single channel (required MonoCut/MixedCut, but we received: cut=)r%   r'   r/   zWe found cut with ID z% that is missing the following keys: zNPlease ensure that every utterance in the input manifests contains these keys.c                    s   i | ]}| j | qS r6   )custom).0slotrV   r6   r7   
<dictcomp>   s    zcanary.<locals>.<dictcomp>)roler*    c                 s   s     | ]}|j d ur|j V  qd S )N)r+   )rZ   sr6   r6   r7   	<genexpr>   s    zcanary.<locals>.<genexpr>r$   r+   r   r(   
answer_idsz<Expected the last token in answer_ids to be EOS, but we got N)
isinstancer   _first_non_padding_cutr   	TypeErrorset	get_slotsrY   removeRuntimeErroridrJ   rR   r   joinsupervisionsappendr   languagegetencode_dialogitem	tokenizereos)rV   rW   r9   missing_keysturnsr+   ansr6   r\   r7   r#   y   sN   






r#   )!typingr   torchlhotser   
lhotse.cutr   r   lhotse.utilsr   &nemo.collections.common.data.prompt_fnr   )nemo.collections.common.prompts.formatterr   r	   3nemo.collections.common.tokenizers.canary_tokenizerr
   r   r   r   r   	BOOL_TRUE
BOOL_FALSErG   rH   rF   rE   r"   rJ   rK   r;   Tensorr#   r6   r6   r6   r7   <module>   s$   	")#&