o
    }oi!                  	   @   s   d dl Z d dlmZ d dlmZmZ d dlmZ d dlm	Z	 d dl
mZmZ d dlmZmZmZmZmZ G dd	 d	eZd
eeef deeef fddZe	eedededeee jf fddZdS )    N)MonoCut)CutMixedCut)ifnone)registered_prompt_format_fn)ModalityPromptFormatter)CANARY2_BOCTX
CANARY_BOS
CANARY_EOSCANARY_SPECIAL_TOKENIZERCanaryTokenizerc                       s  e Zd ZdZdZde de deje	dddd	d
ejeje	dddddddddddde	dddddddddddde	dddddddddddde	dddddddddd d!d"d#d$d%e de d&ejid$ed'e
 d(ejid$iZd)ed*ed+ed,ee f fd-d.Z  ZS )/Canary2PromptFormattercanary2	assistantuserz|decodercontext|zA|emotion||source_lang||target_lang||pnc||itn||timestamp||diarize|<|emo:undefined|>z<|emo:neutral|>z<|emo:angry|>z<|emo:happy|>z<|emo:sad|>yesnotrueTruefalseFalse10pncnopnc<|pnc|>z	<|nopnc|>itnnoitnz<|itn|>	<|noitn|>	timestampnotimestampz<|timestamp|><|notimestamp|>diarize	nodiarizez<|diarize|><|nodiarize|>)decodercontextemotionsource_langtarget_langr   r   r!   r$   )templateslotsuser_partialr'   z|text|textprompt_templateexpected_slotsslot_valuesreturnc                    s   t |}t j|||dS )N)r/   r0   r1   )%map_manifest_values_to_special_tokenssuperencode_turn)selfr/   r0   r1   	__class__ [/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/common/prompts/canary2.pyr5   o   s   z"Canary2PromptFormatter.encode_turn)__name__
__module____qualname__NAMEOUTPUT_ROLEr	   r
   r   TextTextLiteralr   TEMPLATEstrdictlistintr5   __classcell__r9   r9   r7   r:   r   !   sv    
8*Kr   r1   r2   c                 C   s   |   } d}dD ]"}|| v r*| |  }dr|ds*| | }d| d | |< d}qdD ]-}d| d}d| d}|| v rZ| | ||fvrZ| | dd	d
d|fv rT|n|| |< d}q-|rgtj| vrgt| tj< | S )NF)r)   r*   z<|z|>T)r   r   r!   r$   z<|nor   r   r   r   )copy
startswithendswithr   PROMPT_LANGUAGE_SLOTr   )r1   any_special_token_presentkvval
true_tokenfalse_tokenr9   r9   r:   r3   |   s$   $"
r3   cutpromptc                    s  t  tr j t  tstd dddh}|t j }|r.td j d| ddd	d
dddd} fdd|D }t	||j
< | D ]\}}| jv rW j| n|||< qItd|dg}ddd  jD }	|tdd|	|j
t jd j jdid ||}
t |jtr|jj}n|jt}|dksJ d|
d d  |ksJ d|
d  |
d dd |
d< |
S )z
    Prepend and append control tokens to the token sequence as per Canary 2.0 format.

    The prompt format syntax is defined in :class:`Canary2PromptFormatter`
    z_Expected input audio to have a single channel (required MonoCut/MixedCut, but we received: cut=)r)   r*   zWe found cut with ID z% that is missing the following keys: zNPlease ensure that every utterance in the input manifests contains these keys. r   r    r#   r&   r   )r'   r(   r   r!   r$   r   c                    s   i | ]}| j | qS r9   )custom).0slotrR   r9   r:   
<dictcomp>   s    zcanary2.<locals>.<dictcomp>r   )roler,    c                 s   s     | ]}|j d ur|j V  qd S )N)r.   )rW   sr9   r9   r:   	<genexpr>   s    zcanary2.<locals>.<genexpr>r   r.   r   zGInvalid tokenizer: tokenizer.token_to_id('{CANARY_EOS}') returned {eos}
answer_idsz<Expected the last token in answer_ids to be EOS, but we got N)
isinstancer   _first_non_padding_cutr   	TypeErrorsetrV   RuntimeErroridr   rK   itemsrD   joinsupervisionsappendr   languagegetencode_dialog	tokenizerr   eostoken_to_idr   item)rR   rS   r0   missing_keysoptional_slotsr,   rM   rN   turnsr.   ansro   r9   rY   r:   r      sV   




	
r   )torchlhotser   
lhotse.cutr   r   lhotse.utilsr   &nemo.collections.common.data.prompt_fnr   )nemo.collections.common.prompts.formatterr   r   3nemo.collections.common.tokenizers.canary_tokenizerr	   r
   r   r   r   r   rD   rC   r3   Tensorr   r9   r9   r9   r:   <module>   s   	"[&