o
    }oiC                     @   s   d dl Z d dlZd dlmZmZmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ e jd	d
 Ze jdd Zdd Ze jdd Zdd Zdd ZdS )    N)CutSetMonoCutSupervisionSegment)dummy_recording)tensor)SentencePieceTokenizer)create_spt_model) LhotseAudioQuestionAnswerDataset)PromptFormatterTextProcessingc              	   C   st   d}| d}|d }|| |   tt|dddt|dd W d    n1 s-w   Y  tt|d	 S )
Nay  
    Example system message.
    Example user message.
    Example assistant message.
    TEST
    [INST]
    [/INST]
    <s>
    </s>
    <<SYS>>
    <</SYS>>
    User: Assistant:
    user model
    Instruct Output
    


    <start_of_turn> <end_of_turn>
    <|
    |>
    <|en|> <|de|> <|fr|> <|es|> <|transcribe|> <|translate|> <|pnc|> <|nopnc|> <|startoftranscript|> <|endoftext|>
    Feel free to add new tokens for your own tests!?
    But know that if you do so, you may need to update the token IDs in the existing tests!
    So, it might be a good idea to create a new tokenizer instead when adding new prompt formats.
    bpe_tokenizertext.txt   FT)
vocab_sizesample_sizedo_lower_case
output_dirremove_extra_whitespacestokenizer.modelmktemp
write_textdisabledr   strr   capsystmp_path_factoryTOKENIZER_TRAIN_TEXTtmpdir	text_path r    g/home/ubuntu/.local/lib/python3.10/site-packages/tests/collections/multimodal/test_speechllm_dataset.py	tokenizer   s   


	r"   c                   C   s@   t tddddtdddddddgtdddd	d
ddddgS )Nex0r   g      @zdummy-recording-0000some transcriptionen)idrecording_idstartdurationtextlanguageT)r)   	with_datanon default prompt contextzsome desired answerzAPlease answer the following based on the previous speech feature.)contextanswersystem_prompt)r&   r(   r)   channelsupervisions	recordingcustom)r   r   r   r   r    r    r    r!   cutsB   s.   
r5   c           	      C   s  t | dd}t|ddddd}|| }h d}|t| }t|| }|s'|r1J d	|d
||d dgks:J |d ddigksEJ tj|d tdg tj|d tdg t|d sfJ t|d soJ |d j	dksxJ tj|d tdgtj
d tg dg}tj|d | tj|d tdg | |dd df  dksJ tg dg}tj|d | tj|d  td!g | |dd d!f  d"ksJ tg d#g}tj|d$ | | |dd d%f  d&ksJ tg d'g}tj|d( | | |dd df  dks%J tj|d) tg d*g tj|d+ tg d,g d S )-Nplainr"   prompt_formatdo this taskr   T@   text_processordefault_contexttokens_to_generatepad_to_max_lengthmax_seq_length>   labelstokensanswerscontextsmetadata	loss_mask
max_length
sample_idsaudio_ratioaudio_signalposition_idstokens_lengthcontext_lengthsaudio_signal_lengthmissing_keys= unexpected_keys=rH   r#   rE   audio_filepathex0.wavrI         ?rG   rJ      8 rN   rV   dtype)?rU   N   	   rU   r:   P      K         rU               r_   rU      rb   rZ   r_   ?   r_   L   rb   I      rU   8   d   )   rb   rZ   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rB   rL       !   z-non default prompt context some transcription)@rU   rY   rZ   rU   r:   r[   r\   r]   r^   r_   rU   r`   ra   rb   rc   r_   rU   rd   rb   rZ   r_   re   r_   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rD   rM   rc   r-   )@rf   rb   rg   rh   rU   ri   rj   rk   rb   rZ   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rC   
   r$   )?rY   rZ   rU   r:   r[   r\   r]   r^   r_   rU   r`   ra   rb   rc   r_   rU   rd   rb   rZ   r_   re   r_   rf   rb   rg   rh   rU   ri   rj   rk   rb   rZ   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rA   rK   )@r   rU   rh         r\   r_         rZ   rn      r`      rb   r^                        rc   ra         rd               rl   rm   "   #   $   %   &   '   (   rk   *   +   ,   -   .   /   0   1   2   3   4   5   6   7   ri   9   :   ;   <   =   >   re   rF   )?        r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rS   rS   rS   rS   rS   rS   rS   rS   rS   rS   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r
   r	   settorchtestingassert_closer   	is_tensoris_floating_pointshapeint32ids_to_texttolist)	r"   r5   r<   datasetbatchexpected_keysmissing_keysunexpected_keysexpectedr    r    r!   test_speechllm_dataset`   sV   ""$$r   c                 C   s~   d}| d}|d }|| |   tt|dddt|ddg ddd		 W d    n1 s2w   Y  tt|d
 S )Na  
    a b c d e f g h i j k l m n o p q r s t u v x y z
    A B C D E F G H I J K L M N O P Q R S T U V X Y Z
    [EOG]
    Example system message.
    Example user message.
    Example assistant message.
    TEST
    [INST]
    [/INST]
    <s>
    </s>
    <<SYS>>
    <</SYS>>
    User: Assistant:
    user model
    Instruct Output
    


    <start_of_turn> <end_of_turn>
    <|
    |>
    <|en|> <|de|> <|fr|> <|es|> <|transcribe|> <|translate|> <|pnc|> <|nopnc|> <|startoftranscript|> <|endoftext|>
    Feel free to add new tokens for your own tests!?
    But know that if you do so, you may need to update the token IDs in the existing tests!
    So, it might be a good idea to create a new tokenizer instead when adding new prompt formats.
    r   r   r   r   FT)z[INST]z[/INST]z<<SYS>>z<</SYS>>z[EOG])r   r   r   r   boseosuser_defined_symbolsr   r   r   r   r    r    r!   llama_tokenizer   s$   


r   c                 C   s  | }t |dd}t|ddddd}|| }t| h d}|t| }t|| }|s-|r7J d	|d
||d dgks@J |d ddigksKJ tj|d tdg tj|d tdg t|d slJ t	|d suJ |d j
dks~J tj|d tdgtjd dD ]}	td|	 d|||	 d  qtg dg}
tj|d |
 tj|d tdg ||
dd df  dksJ tg d g}
tj|d! |
 tj|d" td#g ||
dd d#f  d$ksJ tg d%g}
tj|d& |
 ||
dd d'f  d(ksJ tg d)g}
tj|d* |
 ||
dd d+f  dks@J tj|d, tg d-g tj|d. tg d/g d S )0Nllama2r7   r9   r   T   r;   >   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rH   r#   rE   rQ   rR   rI   rS   rG   rJ   rT   rN   rV   rW   )rB   rD   rC   rA   zbatch['z']=)rU   rr   ro   rr   r\   rr   i   rw   rZ   r`   rv   rZ   rk   rb   rv   r{   }   r   rZ   u   rx   rw   rw   O   r   r^   \   r`   rv   rZ   r   rr   rx   rb   r   rZ   U   rz   rZ   r   r   rx   V   rv   H   ry   rZ   rZ   rl   r   r   rZ   {   E   rZ   r|   rr   r_   rr   ]   rb   rr   J   X   r`   r   rw   rt   r   rz   rx   rd   rt   t   rx   rb   rt   rY   rt   rr   rp   r   rx   T   rZ   rr   A   x   r   rx   rb   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rB   rL   [   z[INST] <<SYS>> Please answer the following based on the previous speech feature. <</SYS>> non default prompt context [/INST] some transcription)rU   rr   ro   rr   r\   rr   r   rw   rZ   r`   rv   rZ   rk   rb   rv   r{   r   r   rZ   r   rx   rw   rw   r   r   r^   r   r`   rv   rZ   r   rr   rx   rb   r   rZ   r   rz   rZ   r   r   rx   r   rv   r   ry   rZ   rZ   rl   r   r   rZ   r   r   rZ   r|   rr   r_   rr   r   rb   rr   r   r   r`   r   rw   rt   r   rz   rx   rd   rt   r   rx   rb   rt   rY   rt   rr   rp   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rD   rM   Q   z|[INST] <<SYS>> Please answer the following based on the previous speech feature. <</SYS>> non default prompt context [/INST])r   rx   r   rZ   rr   r   r   r   rx   rb   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rC   rs   r$   )rr   ro   rr   r\   rr   r   rw   rZ   r`   rv   rZ   rk   rb   rv   r{   r   r   rZ   r   rx   rw   rw   r   r   r^   r   r`   rv   rZ   r   rr   rx   rb   r   rZ   r   rz   rZ   r   r   rx   r   rv   r   ry   rZ   rZ   rl   r   r   rZ   r   r   rZ   r|   rr   r_   rr   r   rb   rr   r   r   r`   r   rw   rt   r   rz   rx   rd   rt   r   rx   rb   rt   rY   rt   rr   rp   r   rx   r   rZ   rr   r   r   r   rx   rb   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rh   rA   Z   rK   )r   rU   rh   ro   rp   r\   r_   rq   rr   rZ   rn   rs   r`   rt   rb   r^   ru   rv   rw   rx   ry   rz   r{   rc   ra   r|   r}   rd   r~   r   r   r   rl   rm   r   r   r   r   r   r   r   rk   r   r   r   r   r   r   r   r   r   r   r   r   r   r   ri   r   r   r   r   r   r   re   r:   r   B   C   D   r   F   G   r   rg   r   r]   rf   M   rY   r   r[   r   R   S   r   r   r   W   r   Y   r   r   r   r   ^   _   `   a   b   c   rj   e   f   g   h   r   j   k   l   m   n   o   p   q   r   s   r   r   v   w   r   y   z   r   |   r   ~      rF   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rS   rS   rS   rS   rS   rS   rS   rS   rS   rS   rS   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r
   r	   printr   r   r   r   r   r   r   r   r   r   r   )r   r5   r"   r<   r   r   r   r   r   kr   r    r    r!   &test_speechllm_dataset_prompt_template   s^   "
"
"
$
$r   c                 C   s   | }t |dd}t|ddddd}|| }|d jd	ksJ |d
 jd	ks'J |d jdks0J |d jdks9J |d jdksBJ t|ddddd}|| }|d jd	ksXJ |d
 jd	ksaJ |d jdksjJ |d jdkssJ |d jdks|J d S )Nr   r7   r9   r   Fr   r;   rB   )rU   r   rA   rD   )rU   r   rC   )rU   rs   rK   )rU   r      )rU   iQ  )r
   r	   r   )r   r5   r"   r<   r   r   r    r    r!   ;test_speechllm_dataset_tokens_to_generate_increases_seq_lenm  s8   r   )pytestr   lhotser   r   r   lhotse.testing.dummiesr   r   "nemo.collections.common.tokenizersr   :nemo.collections.common.tokenizers.sentencepiece_tokenizerr   :nemo.collections.multimodal.speech_llm.data.lhotse_datasetr	   =nemo.collections.multimodal.speech_llm.parts.utils.data_utilsr
   fixturer"   r5   r   r   r   r   r    r    r    r!   <module>   s&   
'
^
- 