o
    }oiA                     @   s   d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZmZ d dlmZmZmZmZmZ d dlmZ d dlmZmZ G d	d
 d
ejjjZejdddd Zdd Zejdd Z dd Z!dd Z"dd Z#dd Z$dd Z%dd Z&dS )    N)dummy_recording)	OmegaConf)!get_lhotse_dataloader_from_config)*MultimodalFixedBucketBatchSizeConstraint2DMultimodalSamplingConstraint)	AudioTurnNeMoMultimodalConversation&NeMoMultimodalConversationJsonlAdapter#NeMoMultimodalConversationTarWriterTextTurn)Llama2PromptFormatter)SentencePieceTokenizercreate_spt_modelc                   @   s"   e Zd ZdejdejfddZdS )Identitycutsreturnc                 C   s   |S )N )selfr   r   r   o/home/ubuntu/.local/lib/python3.10/site-packages/tests/collections/common/test_lhotse_multimodal_dataloading.py__getitem__&   s   zIdentity.__getitem__N)__name__
__module____qualname__lhotseCutSetr   r   r   r   r   r   %   s    r   session)scopec              
   C   s   |  d}|d }dddddddd	d
dddddddd	ddddddddddgdg}tj|| tdd
dd |d  tdddd |d  |S )N	text_datazmanifest.jsonconvo_1%Can you help summarize the following?Usertext)valuefromtypez123.wavaudioQ@)r"   r#   r$   duration;I'm glad to assist you with your request. Here's a summary:	Assistantz123_answer.wavq=
ףp@Can you further shorten it?
Of course!)idconversationsr   T)	with_data)mktempr   serializationsave_to_jsonlr   to_cut
save_audio)tmp_path_factorytmp_pathen_pathdatar   r   r   multimodal_conversations_path*   sH   
'r9   c              
   C   s<  t d| ddgddddddd}t|ddt d}d	d
 |D }t|dks*J |d }t|tjs6J t|dks>J |d }t|tsIJ |j	dksPJ t|j
dksYJ |j
d }t|tseJ |jdkslJ |jdkssJ |j
d }t|tsJ |jdksJ |jdksJ |jjdksJ |j jdksJ |j
d }t|tsJ |jdksJ |jdksJ |j
d }t|tsJ |jdksJ |jdksJ |jjdksJ |j jdksJ |j
d }t|tsJ |jdksJ |jdksJ |j
d }t|tsJ |jdksJ |jdksJ d S )Nmultimodal_conversation[audio]r$   manifest_filepathaudio_locator_tagTr      )	input_cfgforce_finiteshufflenum_workers
batch_sizeseed
shard_seed)configglobal_rank
world_sizedatasetc                 S      g | ]}|qS r   r   .0batchr   r   r   
<listcomp>q       z6test_multimodal_conversation_input.<locals>.<listcomp>r      userr   r&   )r?   i f    	assistantr(      r*   )r?   i`    r+      r,   )r   creater   r   len
isinstancer   r   r   r-   turnsr   roler"   r   r>   cutr'   
load_audioshape)r9   rG   dlbatchesbextr   r   r   "test_multimodal_conversation_input[   sh   





re   c                 C   sf   |  d}|d }|ddd tj|D  t|dddt|d	d	g d
d	d	 tt|d S )Nmulti_convo_tokenizerztext.txt
c                 s   s&    | ]}|d  D ]}|d V  qqdS )r.   r"   Nr   )rM   itemturnr   r   r   	<genexpr>   s    ztokenizer.<locals>.<genexpr>   FT)z[INST]z[/INST]z<<SYS>>z<</SYS>>r;   )
vocab_sizesample_sizedo_lower_case
output_dirboseosuser_defined_symbolsremove_extra_whitespacesztokenizer.model)	r0   
write_textjoinr   r1   
load_jsonlr   strr   )r5   r9   tmpdir	text_pathr   r   r   	tokenizer   s&   


r{   c                 C   s  t d| ddgdddddddd}t|ddt |d	}d
d |D }t|dks,J |d }t|tjs8J t|dks@J |d }t|tsKJ t	
|jsSJ |jjdks[J ||jdkseJ t	
|jsmJ |jjdksuJ ||jdksJ t	
|jsJ |jjdksJ ||jdksJ t	
|jsJ |jjdksJ |jd d dk sJ |jdd dk sJ |jdd dk sJ |jdd  dk sJ d S )Nr:   r;   r<   llama2Tr   r?   )r@   prompt_formatrA   rB   rC   rD   rE   rF   )rG   rH   rI   rJ   r{   c                 S   rK   r   r   rL   r   r   r   rO      rP   zBtest_multimodal_conversation_input_with_prompt.<locals>.<listcomp>)i   z[INST] Can you help summarize the following? [audio] [/INST] I'm glad to assist you with your request. Here's a summary: [audio] [INST] Can you further shorten it? [/INST] Of course!)_   z[INST] Can you help summarize the following? [audio] [/INST] I'm glad to assist you with your request. Here's a summary: [audio] [INST] Can you further shorten it? [/INST])
   r,      FH   r   )r   rX   r   r   rY   rZ   r   r   r   torch	is_tensor	input_idsr_   ids_to_textcontext_ids
answer_idsmaskall)r9   r{   rG   r`   ra   rb   rc   r   r   r   .test_multimodal_conversation_input_with_prompt   sX   

r   c                 C   s  t dtddtddgd}|t| }| |jdksJ | |jdks)J | |jdks3J |jt	|j  krBd	ksEJ  J |j
t	|j  krTd
ksWJ  J |jt	|j  krfdksiJ  J tdd}||d	kswJ tdd}||dksJ tg dg ddd}||dksJ ||j|dksJ tg dg ddd}||dksJ ||j|dksJ d S )Nz
textonly-1hellorR   hirT   )r-   r[   z[INST] hello [/INST] hiz[INST] hello [/INST]r   rV      Fmeasure_total_lengthT)rW   r      )rU   rS   r?   max_seq_len_bucketsbatch_sizesr   rS   ))rW   rS   )rW   rW   )r   rU   )r   rQ   )r   r   )rW   rV   rU   rS   r?   )r   rV   rU   )r   r   apply_prompt_formatr   r   r   r   r   input_lengthrY   output_lengthtotal_lengthr   measure_lengthr   select_bucketr   )r{   convoconstrr   r   r   .test_text_only_conversation_length_measurement   s:   $$$

r   c                 C   s  | d}tdddd |d }tdddd |d	 }td
t|ddt|ddgdd}|t| }| |j	dksCJ | |j
dksMJ | |jdksWJ t|j
dks`J |jdksgJ t|jdkspJ |jdkswJ t|j	dksJ |jdksJ tdd}||dksJ tdd}||dksJ tg dg ddd}||dksJ ||j|dksJ tg dg ddd}||d ksJ ||j|d!ksJ d S )"Nr%   r   p=
ף@Tr'   r/   1.wavr?   Q/@2.wavzaudioonly-1rR   r;   rT   皙?r-   r[   token_equivalent_durationz[INST] [audio] [/INST] [audio]z[INST] [audio] [/INST]   N   rU      r      Fr   d      ,    rU   rS   r?   r?   r   rS   )2   r   )r   r   )r   r   )r   r   )r      )r   r   )r   r   )r   r      r   rQ   rW   rV   rU   rS   r?   )r   r   rW   )r0   r   r3   r4   r   r   r   r   r   r   r   r   rY   r   r   r   r   r   r   r   r   r{   r5   	audio_dirc1c2r   r   r   r   r   /test_audio_only_conversation_length_measurement  sH   





r   c                 C   s"  | d}tdddd |d }tdddd |d	 }td
tddt|ddtddtddtddt|ddgdd}|t| }t	| | 
|jdksWJ | 
|jdksaJ | 
|jdkskJ t|jdkstJ |jd  krdksJ  J t|jdksJ |jd  krdksJ  J t|jdksJ |jd  krdksJ  J tdd}||dksJ tdd}||dksJ tg dg d dd!}||dksJ ||j|d"ksJ tg d#g d$dd!}||d%ksJ ||j|dksJ d S )&Nr%   r   r   Tr   r   r?   r   r   zmultimodal-1z'listen to this and tell me your opinionrR   r;   zits finerT   zremove the noisesurer   r   zt[INST] listen to this and tell me your opinion [audio] [/INST] its fine [INST] remove the noise [/INST] sure [audio]zg[INST] listen to this and tell me your opinion [audio] [/INST] its fine [INST] remove the noise [/INST]zsure [audio]B      r      I   i/  Fr   r   r   r   rU   r   r   )r   r   )r0   r   r3   r4   r   r   r   r   r   printr   r   r   r   rY   r   r   r   r   r   r   r   r   r   r   r   r   /test_multimodal_conversation_length_measurementN  sZ   







r   c                 C   s*  t t| d\}|d}t|}|| W d    n1 s!w   Y  t t|d d|d d\}|j|jks<J t|jt|jksHJ t|j|jD ]C\}}t	|t	|ks]J |j
|j
kseJ t|trs|j|jksrJ qO|j|jks{J |jj|jjksJ tj|j |j  qOd S )Nr;   multi_convo_tarredzmanifest_0.jsonlzaudio_0.tarr=   r>   tarred_audio_filepaths)listr	   r0   r
   writer-   rY   r[   zipr$   r\   rZ   r   r"   r>   r]   nptestingassert_allcloser^   )r9   r5   conversationtar_dirwriterrestored_conversationlhsrhsr   r   r   *test_multimodal_conversation_tarred_format  s.   


r   c                    s   t t| d\}|d}t|dd}tdD ]}|| qW d    n1 s*w   Y  t|d d|d d}t | t dksFJ t fd	d
 dd  D sWJ d S )Nr;   r   r   )
shard_sizer   zmanifest_{0..2}.jsonlzaudio_{0..2}.tarr   c                 3   s    | ]	}| d  kV  qdS )r   Nr   )rM   crestoredr   r   rj     s    zLtest_multimodal_conversation_tarred_format_sharding_works.<locals>.<genexpr>r?   )r   r	   r0   r
   ranger   rY   r   )r9   r5   r   r   r   iloaderr   r   r   9test_multimodal_conversation_tarred_format_sharding_works  s   
&r   )'r   numpyr   pytestr   lhotse.testing.dummiesr   	omegaconfr   #nemo.collections.common.data.lhotser   ,nemo.collections.common.data.lhotse.samplingr   r   1nemo.collections.common.data.lhotse.text_adaptersr   r   r	   r
   r   nemo.collections.common.promptsr   :nemo.collections.common.tokenizers.sentencepiece_tokenizerr   r   utilsr8   Datasetr   fixturer9   re   r{   r   r   r   r   r   r   r   r   r   r   <module>   s.   

0>
;&;D