o
    wi                      @   s8  d dl Z d dlZd dlZd dlmZmZ d dlmZmZ d dl	m
Z
 d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ ej rRed dd ZdZdZejdddd Zejdddd Z ejdddd Z!ejdddd Z"dd Z#dd Z$dd Z%d d! Z&d"d# Z'd$d% Z(dS )&    N)CutSetSupervisionSegment)	dummy_cutdummy_recording)GenerationConfig)NeMoMultimodalConversation)	AudioTurnTextTurn)move_data_to_device)PromptFormatter)SALMDataset)SALMcudac                   C   s    t jdrdddS dddS )Nz)/home/TestData/speechlm/pretrained_modelszC/home/TestData/speechlm/pretrained_models/TinyLlama--TinyLlama_v1.1z>/home/TestData/speechlm/pretrained_models/canary-1b-flash.nemo)pretrained_llmpretrained_asrznvidia/canary-1b-flashzTinyLlama/TinyLlama_v1.1)r   r   )ospathexists r   r   b/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/tests/collections/speechlm2/test_salm.pyresolve_pretrained_models!   s   r   z<|audioplaceholder|>llama2session)scopec                  C   s   i t  dttddi dddddgddd	d d
ddddddddddddddddddddddddd d!d"d#dd$d%dd&d'd(dd)d*d+d,d-dd.d/d0d1d2d3dd4id5} t| }tj rp|d6 |S )7NFzCnemo.collections.speechlm2.modules.perception.AudioPerceptionModulei   _target_z-nemo.collections.asr.modules.ConformerEncoderatt_context_sizecausal_downsamplingconv_context_sizeconv_kernel_size	   conv_norm_type
batch_normd_modeli   dropoutg?dropout_attdropout_embg        dropout_pre_encoderfeat_in   feat_outff_expansion_factor   n_heads   n_layers   pos_emb_max_leni  rel_posdw_striding   )self_attention_modelsubsamplingsubsampling_conv_channelssubsampling_factorz?nemo.collections.speechlm2.modules.perception.IdentityConnector)r   r#   z>nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessorgh㈵>   Ti   per_featurer   >  hanng?g{Gz?)r   ditherfeaturesframe_splicinglogn_fft	normalizepad_to	pad_valuesample_ratewindowwindow_sizewindow_stride)target
output_dimencodermodality_adapterpreprocessorztorch.optim.AdamW)pretrained_weightsprompt_formataudio_locator_tag
perception	optimizerr   )r   PROMPTAUDIO_LOCATOR_TAGr   torchr   is_availableto)cfgmodelr   r   r   rY   4   s   	
.5

rY   c                 C   s
   t | jS N)r   	tokenizerrY   r   r   r   datasetq   s   
r]   c                 C   s   t t| jS rZ   )r   resolverS   r[   r\   r   r   r   prompt_formatterv   s   r_   c               	   C   sj   t dtdddd} t| j| jddddg| _ttdtd	d
dt	d	| t
dtd| jd jdgddgS )Nr   T	with_data)	recordingg      ?zSome text transcription.)idrecording_idstartdurationtextz	example-0userzRepeat after me:)rolevalue)ri   cutrP   	assistantg{Gz?)rc   turnstoken_equivalent_duration)r   r   r   rc   rd   supervisionsr   r   r	   r   rT   rg   )rk   r   r   r   training_cutset_batch{   s    
rp   c                    sr   |j  fddd d}|d j} jj|dt dksJ | | }dD ]}||v s-J t|| s6J q%d S )Nc                    
   |   S rZ   apply_prompt_formatcr_   r   r   <lambda>      
 z#test_salm_dataset.<locals>.<lambda>apply_fnr   z<s> [INST] Repeat after me: z'  [/INST] Some text transcription. </s>)audios
audio_lens	input_ids	loss_mask)mapr}   r[   decoderT   rU   	is_tensor)r]   r_   rp   	tokenizedbatchkeyr   rv   r   test_salm_dataset   s   

r   c                    sr   |j  fddd d}|| }t|| jd}| j|dd}t|d s&J t|d r/J |d dks7J d S )Nc                    rq   rZ   rr   rt   rv   r   r   rw      rx   z)test_salm_training_step.<locals>.<lambda>ry   devicer   	batch_idxloss)r   r
   r   training_steprU   r   isnanrY   r]   r_   rp   r   resultsr   rv   r   test_salm_training_step   s   r   c                    sV   |    |j fddd d}|| }t|| jd}| jd|idd}|d u s)J d S )Nc                    rq   rZ   rr   rt   rv   r   r   rw      rx   z+test_salm_validation_step.<locals>.<lambda>ry   r   dummy_val_setr   r   )on_validation_epoch_startr   r
   r   validation_stepr   r   rv   r   test_salm_validation_step   s   r   c                 C   sz   | j dddt idggtddtdgdd}|jd	ks"J |jtjks*J |d
k s2J || j	k  s;J d S )Nrh   messageRepeat after me: )ri   slotsr9   r;   r,   promptsr{   r|   max_new_tokensr9   r,   r   )
generaterT   rU   randntensorshapedtypelongalltext_vocab_sizerY   answerr   r   r   test_salm_generation   s   


r   c                 C   s   |d }t ddd| | jddt |gdgddt dt ||gdggtd	d
d}|jdks6J |jtjks>J |dk	 sFJ || j
k 	 sOJ d S )Nz	audio.wavr   Tr`   rh   r   )ri   contentaudioz and r,   )r   )r   generation_config)r0   r,   )r   
save_audior   rT   r   r   r   rU   r   r   r   )rY   tmp_path
audio_pathr   r   r   r   &test_salm_generation_audios_via_prompt   s    
r   c                 C   s   | j tddddddd| jggtddtdgdd	}|jd
ks%J |jtjks-J |dk s5J || j	k  s>J d S )Nr9   r0      r,            r;   r   r   r   )
r   rU   r   audio_locator_tag_idr   r   r   r   r   r   r   r   r   r   &test_salm_generation_prompts_as_tensor   s   

r   ))r   pytestrU   lhotser   r   lhotse.testing.dummiesr   r   transformersr   #nemo.collections.common.data.lhotser   1nemo.collections.common.data.lhotse.text_adaptersr   r	   "nemo.collections.common.data.utilsr
   nemo.collections.common.promptsr   nemo.collections.speechlm2.datar   !nemo.collections.speechlm2.modelsr   r   rV   set_default_devicer   rT   rS   fixturerY   r]   r_   rp   r   r   r   r   r   r   r   r   r   r   <module>   s>   




<





	