o
    }oi                      @   s   d dl Z d dlZd dlmZmZmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ G dd dejjjZe jdd Z	 e jdd Ze jdd Zdd Z	 e jdd Zdd Z dS )    N)CutSetMonoCutSupervisionSegment)dummy_recording)	OmegaConf)!get_lhotse_dataloader_from_config)SourceTargetTextExample)SentencePieceTokenizer)create_spt_model) LhotseAudioQuestionAnswerDataset)PromptFormatterTextProcessingc                   @   s   e Zd Zdd ZdS )Identityc                 C   s   |S )N )selfcutsr   r   \/home/ubuntu/.local/lib/python3.10/site-packages/tests/collections/multimodal/test_emmett.py__getitem__   s   zIdentity.__getitem__N)__name__
__module____qualname__r   r   r   r   r   r      s    r   c              	   C   st   d}| d}|d }|| |   tt|dddt|dd W d    n1 s-w   Y  tt|d	 S )
Nay  
    Example system message.
    Example user message.
    Example assistant message.
    TEST
    [INST]
    [/INST]
    <s>
    </s>
    <<SYS>>
    <</SYS>>
    User: Assistant:
    user model
    Instruct Output
    


    <start_of_turn> <end_of_turn>
    <|
    |>
    <|en|> <|de|> <|fr|> <|es|> <|transcribe|> <|translate|> <|pnc|> <|nopnc|> <|startoftranscript|> <|endoftext|>
    Feel free to add new tokens for your own tests!?
    But know that if you do so, you may need to update the token IDs in the existing tests!
    So, it might be a good idea to create a new tokenizer instead when adding new prompt formats.
    bpe_tokenizerztext.txti   FT)
vocab_sizesample_sizedo_lower_case
output_dirremove_extra_whitespacesztokenizer.model)mktemp
write_textdisabledr
   strr	   )capsystmp_path_factoryTOKENIZER_TRAIN_TEXTtmpdir	text_pathr   r   r   	tokenizer!   s   


	r&   c                   C   s>   t tddddtdddddddgtdddd	d
dddgS )Nex0r   g      @zdummy-recording-0000some transcriptionen)idrecording_idstartdurationtextlanguageT)r-   	with_data<en>zsome desired answer)contextanswer)r*   r,   r-   channelsupervisions	recordingcustom)r   r   r   r   r   r   r   r   r   N   s,   
r   c                 C   s.   |  d}|d }|d }||| |S )Ndatazcuts.jsonl.gzaudio)r   save_audiosto_file)r"   r   tmp_pathppar   r   r   	cuts_pathk   s
   
r?   c           
      C   s  t d| dgdddddddd}t|ddt |d}d	d
 |D }t|dks+J |d }t|ts6J t|dks>J |d }t|tsIJ |dsPJ t	
|jsXJ ||jdksbJ |dsiJ t	
|jsqJ ||jdks{J |dsJ t	
|jsJ ||jdksJ t|dd}t|ddddd}||d  }	||	d d dksJ ||	d d dksJ ||	d d dksJ ||	d d dksJ d S )Nlhotse)typer?   t5nmtTr      	input_cfgprompt_formatforce_finiteshufflenum_workers
batch_sizeseed
shard_seedconfigglobal_rank
world_sizedatasetr&   c                 S      g | ]}|qS r   r   .0batchr   r   r   
<listcomp>       z<test_audio_example_with_prompt_emmett_t5.<locals>.<listcomp>context_idsr1   
answer_idsr(   	input_idsz<en> some transcriptionr&   rF   F@   text_processordefault_contexttokens_to_generatepad_to_max_lengthmax_seq_lengthtokensz<en> some transcriptiolabelszen> some transcriptioncontextsanswers)r   creater   r   len
isinstancer   r   
has_customtorch	is_tensorrX   ids_to_textrY   rZ   r   r   )
r?   r&   rN   dlbatchesbexr^   rQ   rU   r   r   r   (test_audio_example_with_prompt_emmett_t5t   s\   rr   c                 C   sJ   |  d}|d }|d }|d }|d |d |d |||fS )Nnmtdatazsrc.txtztgt.txtzq.txtzfake germanreal englishr1   )r   r   )r"   r<   srctgtqr   r   r   	nmt_paths   s   




rx   c                 C   s  | \}}}t d||dd|ddgdddddddd	}t|ddt |d
}dd |D }t|dks5J |d }t|ts@J t|dksHJ |d }	t|	tsSJ t	|	j
s[J ||	j
dkseJ t	|	jsmJ ||	jdkswJ t	|	jsJ ||	jdksJ t|dd}
t|
ddddd}||d  }||d d dksJ ||d d dksJ ||d d dksJ d S )Ntxt_pairder)   )rA   source_pathstarget_pathssource_languagetarget_languagequestions_pathquestions_languagerB   Tr   rC   rD   rM   c                 S   rR   r   r   rS   r   r   r   rV      rW   z;test_text_example_with_prompt_emmett_t5.<locals>.<listcomp>z<en> fake germanrt   z<en> fake german real englishr[   r1   Fr\   r]   text_input_idstext_context_idstext_answer_ids)r   rg   r   r   rh   ri   r   r   rk   rl   rX   rm   rY   rZ   r   r   )rx   r&   ru   rv   rw   rN   rn   ro   rp   rq   r^   rQ   rU   r   r   r   'test_text_example_with_prompt_emmett_t5   s`   
r   )!pytestrk   r@   r   r   r   lhotse.testing.dummiesr   	omegaconfr   .nemo.collections.common.data.lhotse.dataloaderr   1nemo.collections.common.data.lhotse.text_adaptersr   "nemo.collections.common.tokenizersr	   :nemo.collections.common.tokenizers.sentencepiece_tokenizerr
   :nemo.collections.multimodal.speech_llm.data.lhotse_datasetr   =nemo.collections.multimodal.speech_llm.parts.utils.data_utilsr   utilsr8   Datasetr   fixturer&   r   r?   rr   rx   r   r   r   r   r   <module>   s0   
'

?
