o
    }oi                     @   s   d dl Z d dlZd dlZd dlmZmZ d dlmZmZ d dl	m
Z
 d dlmZ d dlmZ ej r8ed dd	 Zejd
ddd Zejd
ddd Zejd
ddd Zdd Zdd Zdd Zdd ZdS )    N)CutSetSupervisionSegment)	dummy_cutdummy_recording)move_data_to_device)DuplexS2SDataset)DuplexS2SModelcudac                   C   s(   t jdrdddddS ddd	d
dS )Nz)/home/TestData/speechlm/pretrained_modelszC/home/TestData/speechlm/pretrained_models/TinyLlama--TinyLlama_v1.1zP/home/TestData/speechlm/pretrained_models/low-frame-rate-speech-codec-22khz.nemoz_/home/TestData/speechlm/pretrained_models/stt_en_fastconformer_hybrid_large_streaming_80ms.nemozT/home/TestData/speechlm/pretrained_models/stt_en_fastconformer_transducer_large.nemo)pretrained_llmpretrained_audio_codecpretrained_asrscoring_asr0stt_en_fastconformer_hybrid_large_streaming_80ms%stt_en_fastconformer_transducer_largezTinyLlama/TinyLlama_v1.1z(nvidia/low-frame-rate-speech-codec-22khz)r   r   r
   r   )ospathexists r   r   _/home/ubuntu/.local/lib/python3.10/site-packages/tests/collections/speechlm2/test_duplex_s2s.pyresolve_pretrained_models   s   	r   session)scopec                  C   sV   i t  ddgdddddddddd	d
ddid} t| }tj r)|d |S )NFz^audio_codec\..+$      zCnemo.collections.speechlm2.modules.perception.AudioPerceptionModulez-nemo.collections.asr.modules.ConformerEncoderi   )_target_feat_infeat_outn_layersd_modelsubsampling_factor)r   modality_adapterr   ztorch.optim.AdamW)pretrained_weightsfreeze_paramsaudio_loss_weighttext_loss_weight
perception	optimizerr	   )r   r   torchr	   is_availableto)cfgmodelr   r   r   r,   0   s,   

r,   c                 C   s   t | jddddgdgdS )Ng{Gz?>  "V  user	assistant)frame_lengthsource_sample_ratetarget_sample_rateinput_rolesoutput_roles)r   	tokenizer)r,   r   r   r   datasetK   s   r7   c                  C   s   t dtdddd} tddd| _t| j| jddddd	t| j| jd
dddd	t| j| jddddd	t| j| jddddd	g| _t| gS )Nr   T)	with_data)	recordingr   g?hir/   )idrecording_idstartdurationtextspeakerg333333?hellor0   g      ?okg333333?g?okay)r   r   target_audior   r;   r<   supervisionsr   )cutr   r   r   training_cutset_batchW   sH   
"rG   c                 C   s   | | }dD ]}||v sJ t || sJ q|d jdks!J |d jdks*J |d dgks3J |d  g d	gks@J |d
  g dgksMJ d S )N)source_audiorD   source_audio_lenstarget_audio_lenstarget_tokenstarget_token_lenssource_tokenssource_token_lensrH   )r   r-   rD   )r   r.   target_textsz
hello okayrK   )r   r   r   r   r      r   r   r   iQ  r   r   r   rM   )r   rP   r   r   r   r   r   ig  rP   r   r   r   r   )r(   	is_tensorshapetolist)r7   rG   batchkeyr   r   r   test_s2s_dataset   s   
rV   c                 C   sd   |    || }t|| jd}| j|dd}t|d sJ t|d r(J |d dks0J d S )Ndevicer   	batch_idxloss)on_train_epoch_startr   rX   training_stepr(   rQ   isnanr,   r7   rG   rT   resultsr   r   r   test_s2s_training_step   s   ra   c                 C   s@   |    || }t|| jd}| jd|idd}|d u sJ d S )NrW   dummy_val_setr   rY   )on_validation_epoch_startr   rX   validation_stepr_   r   r   r   test_s2s_validation_step   s
   re   c                 C   s  | j tddtdgd}| h dksJ t|d ts"J t|d d ts-J |d }|jdks8J |j	tj
ks@J |dk sHJ || jk  sQJ |d	 }|jd
ks\J |j	tj
ksdJ |dk slJ || jk  suJ |d }|j	tjksJ d S )Nr   r-   )input_signalinput_signal_lens>   r?   audio	audio_len
tokens_lentokens_texttokens_audior?   r   rk   )r      rl   )r   rm      rh   )offline_inferencer(   randntensorkeys
isinstanceliststrrR   dtypelongalltext_vocab_sizespeech_vocab_sizefloat32)r,   ansgen_textgen_audio_codes	gen_audior   r   r   test_s2s_offline_generation   s&   

r   )r   pytestr(   lhotser   r   lhotse.testing.dummiesr   r   "nemo.collections.common.data.utilsr   nemo.collections.speechlm2.datar   !nemo.collections.speechlm2.modelsr   r	   r)   set_default_devicer   fixturer,   r7   rG   rV   ra   re   r   r   r   r   r   <module>   s*   







(
