o
    }oi                     @   s(  d dl Z d dlZd dlZd dlmZmZ d dlmZmZ d dl	m
Z
 d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ ej rRed dd ZdZdZejdddd Zejdddd Z ejdddd Z!ejdddd Z"dd Z#dd Z$dd Z%d d! Z&dS )"    N)CutSetSupervisionSegment)	dummy_cutdummy_recording)GenerationConfig)NeMoMultimodalConversation)	AudioTurnTextTurn)move_data_to_device)PromptFormatter)SALMDataset)SALMcudac                   C   s    t jdrdddS dddS )Nz)/home/TestData/speechlm/pretrained_modelszC/home/TestData/speechlm/pretrained_models/TinyLlama--TinyLlama_v1.1z>/home/TestData/speechlm/pretrained_models/canary-1b-flash.nemo)pretrained_llmpretrained_asrznvidia/canary-1b-flashzTinyLlama/TinyLlama_v1.1)r   r   )ospathexists r   r   Y/home/ubuntu/.local/lib/python3.10/site-packages/tests/collections/speechlm2/test_salm.pyresolve_pretrained_models!   s   r   z<|audioplaceholder|>llama2session)scopec                  C   sP   i t  dttddddddddd	d
id} t| }tj r&|d |S )NFzCnemo.collections.speechlm2.modules.perception.AudioPerceptionModulez-nemo.collections.asr.modules.ConformerEncoderi      )_target_feat_infeat_outn_layersd_model)r   modality_adapterr   ztorch.optim.AdamW)pretrained_weightsprompt_formataudio_locator_tag
perception	optimizerr   )r   PROMPTAUDIO_LOCATOR_TAGr   torchr   is_availableto)cfgmodelr   r   r   r-   4   s(   


r-   c                 C   s
   t | jS N)r   	tokenizerr-   r   r   r   datasetM   s   
r1   c                 C   s   t t| jS r.   )r   resolver'   r/   r0   r   r   r   prompt_formatterR   s   r3   c               	   C   sj   t dtdddd} t| j| jddddg| _ttdtd	d
dt	d	| t
dtd| jd jdgddgS )Nr   T)	with_data)	recordingg      ?zSome text transcription.)idrecording_idstartdurationtextz	example-0userzRepeat after me:)rolevalue)r<   cutr$   	assistantg{Gz?)r6   turnstoken_equivalent_duration)r   r   r   r6   r7   supervisionsr   r   r	   r   r(   r:   )r>   r   r   r   training_cutset_batchW   s    
rC   c                    sr   |j  fddd d}|d j} jj|dt dksJ | | }dD ]}||v s-J t|| s6J q%d S )Nc                    
   |   S r.   apply_prompt_formatcr3   r   r   <lambda>p      
 z#test_salm_dataset.<locals>.<lambda>apply_fnr   z<s> [INST] Repeat after me: z'  [/INST] Some text transcription. </s>)audios
audio_lens	input_ids	loss_mask)maprP   r/   decoder(   r)   	is_tensor)r1   r3   rC   	tokenizedbatchkeyr   rI   r   test_salm_datasetn   s   

rX   c                    sr   |j  fddd d}|| }t|| jd}| j|dd}t|d s&J t|d r/J |d dks7J d S )Nc                    rD   r.   rE   rG   rI   r   r   rJ      rK   z)test_salm_training_step.<locals>.<lambda>rL   devicer   	batch_idxloss)rR   r
   rZ   training_stepr)   rT   isnanr-   r1   r3   rC   rV   resultsr   rI   r   test_salm_training_step~   s   rb   c                    sV   |    |j fddd d}|| }t|| jd}| jd|idd}|d u s)J d S )Nc                    rD   r.   rE   rG   rI   r   r   rJ      rK   z+test_salm_validation_step.<locals>.<lambda>rL   rY   dummy_val_setr   r[   )on_validation_epoch_startrR   r
   rZ   validation_stepr`   r   rI   r   test_salm_validation_step   s   rf   c                 C   s   | j dddt idggtddtdgtddd	}|jd
ks%J |jtjks-J |dk	 s5J || j
k 	 s>J d S )Nr;   messagezRepeat after me: )r<   slotsr   i>     )max_new_tokens)promptsrN   rO   generation_config)r   ri   r   )generater(   r)   randntensorr   shapedtypelongalltext_vocab_size)r-   answerr   r   r   test_salm_generation   s   


rv   )'r   pytestr)   lhotser   r   lhotse.testing.dummiesr   r   transformersr   #nemo.collections.common.data.lhotser   1nemo.collections.common.data.lhotse.text_adaptersr   r	   "nemo.collections.common.data.utilsr
   nemo.collections.common.promptsr   nemo.collections.speechlm2.datar   !nemo.collections.speechlm2.modelsr   r   r*   set_default_devicer   r(   r'   fixturer-   r1   r3   rC   rX   rb   rf   rv   r   r   r   r   <module>   s:   










	