o
    wih                   	   @   s(  d dl Z d dlmZ d dlmZmZ d dlmZ d dlm	Z	 d dl
mZmZmZmZ d dlmZmZ d dlmZmZmZmZmZ ed	d
hB ZeddhB ZeddhB ZeddhB ZeddhB ZeddhB ZG dd deZdee e f dee e f fddZ!e	eedededee e j"f fddZ#dS )    N)MonoCut)CutMixedCut)ifnone)registered_prompt_format_fn)
BOOL_FALSE	BOOL_TRUE	PNC_FALSEPNC_TRUE)ModalityPromptFormatter)CANARY2_BOCTX
CANARY_BOS
CANARY_EOSCANARY_SPECIAL_TOKENIZERCanaryTokenizeritnz<|itn|>noitn	<|noitn|>	timestampz<|timestamp|>notimestamp<|notimestamp|>diarizez<|diarize|>	nodiarize<|nodiarize|>c                       s   e Zd ZdZdZde de deje	dddd	d
ejejej	e
eB  ej	eeB  ej	eeB  ej	eeB  ddde de dejidede dejidiZdedededee f fddZ  ZS )Canary2PromptFormattercanary2	assistantuserz|decodercontext|zA|emotion||source_lang||target_lang||pnc||itn||timestamp||diarize|<|emo:undefined|>z<|emo:neutral|>z<|emo:angry|>z<|emo:happy|>z<|emo:sad|>)decodercontextemotionsource_langtarget_langpncr   r   r   )templateslotsuser_partialr    z|text|textprompt_templateexpected_slotsslot_valuesreturnc                    s   t |}t j|||dS )N)r)   r*   r+   )%map_manifest_values_to_special_tokenssuperencode_turn)selfr)   r*   r+   	__class__ d/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/common/prompts/canary2.pyr/   ]   s   z"Canary2PromptFormatter.encode_turn)__name__
__module____qualname__NAMEOUTPUT_ROLEr   r   r   TextTextLiteralr
   r	   ITN_TRUE	ITN_FALSETIMESTAMP_TRUETIMESTAMP_FALSEDIARIZE_TRUEDIARIZE_FALSEr   TEMPLATEstrdictlistintr/   __classcell__r3   r3   r1   r4   r   -   s:    
*-r   r+   r,   c                 C   s   |   } d}dD ]"}|| v r*| |  }dr|ds*| | }d| d | |< d}qdD ]-}d| d}d| d}|| v rZ| | ||fvrZ| | dd	d
d|fv rT|n|| |< d}q-|rgtj| vrgt| tj< | S )NF)r"   r#   z<|z|>T)r$   r   r   r   z<|noyes1Truetrue)copy
startswithendswithr   PROMPT_LANGUAGE_SLOTr   )r+   any_special_token_presentkvval
true_tokenfalse_tokenr3   r3   r4   r-   j   s$   $"
r-   cutpromptc                    s  t  tr j t  tstd dddh}|t j }|r.td j d| ddd	d
dddd} fdd|D }t	||j
< | D ]\}}| jv rW j| n|||< qItd|dg}ddd  jD }	|tdd|	|j
t jd j jdid ||}
t |jtr|jj}n|jt}|dksJ d|
d d  |ksJ d|
d  |
d dd |
d< |
S )z
    Prepend and append control tokens to the token sequence as per Canary 2.0 format.

    The prompt format syntax is defined in :class:`Canary2PromptFormatter`
    z_Expected input audio to have a single channel (required MonoCut/MixedCut, but we received: cut=)r"   r#   zWe found cut with ID z% that is missing the following keys: zNPlease ensure that every utterance in the input manifests contains these keys. r   r   r   r   z<|pnc|>)r    r!   r   r   r   r$   c                    s   i | ]}| j | qS r3   )custom).0slotrV   r3   r4   
<dictcomp>   s    zcanary2.<locals>.<dictcomp>r   )roler&    c                 s   s     | ]}|j d ur|j V  qd S )N)r(   )r[   sr3   r3   r4   	<genexpr>   s    zcanary2.<locals>.<genexpr>r   r(   r   zGInvalid tokenizer: tokenizer.token_to_id('{CANARY_EOS}') returned {eos}
answer_idsz<Expected the last token in answer_ids to be EOS, but we got N)
isinstancer   _first_non_padding_cutr   	TypeErrorsetrZ   RuntimeErroridr   rO   itemsrD   joinsupervisionsappendr   languagegetencode_dialog	tokenizerr   eostoken_to_idr   item)rV   rW   r*   missing_keysoptional_slotsr&   rQ   rR   turnsr(   ansrs   r3   r]   r4   r      sV   




	
r   )$torchlhotser   
lhotse.cutr   r   lhotse.utilsr   &nemo.collections.common.data.prompt_fnr   &nemo.collections.common.prompts.canaryr   r   r	   r
   )nemo.collections.common.prompts.formatterr   r   3nemo.collections.common.tokenizers.canary_tokenizerr   r   r   r   r   r<   r=   r>   r?   r@   rA   r   rD   rC   r-   Tensorr   r3   r3   r3   r4   <module>   s*   	"=&