o
    wi1                     @   s   d dl Z d dlZd dlZd dlmZmZ d dlmZmZ d dl	m
Z
 d dlmZ d dlmZ ej r8ed dd	 Zejd
ddd Zejd
ddd Zejd
ddd Zdd Zdd Zdd Zdd ZdS )    N)CutSetSupervisionSegment)	dummy_cutdummy_recording)move_data_to_device)DuplexS2SDataset)DuplexS2SModelcudac                   C   s(   t jdrdddddS ddd	d
dS )Nz)/home/TestData/speechlm/pretrained_modelszC/home/TestData/speechlm/pretrained_models/TinyLlama--TinyLlama_v1.1zP/home/TestData/speechlm/pretrained_models/low-frame-rate-speech-codec-22khz.nemoz_/home/TestData/speechlm/pretrained_models/stt_en_fastconformer_hybrid_large_streaming_80ms.nemozT/home/TestData/speechlm/pretrained_models/stt_en_fastconformer_transducer_large.nemo)pretrained_llmpretrained_audio_codecpretrained_asrscoring_asr0stt_en_fastconformer_hybrid_large_streaming_80ms%stt_en_fastconformer_transducer_largezTinyLlama/TinyLlama_v1.1z(nvidia/low-frame-rate-speech-codec-22khz)r   r   r
   r   )ospathexists r   r   h/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/tests/collections/speechlm2/test_duplex_s2s.pyresolve_pretrained_models   s   	r   session)scopec                  C   s   i t  ddgddddi ddd	d
d
gdddd dddddddddddddddddd
ddddd d!d"d#d$d%d&dd'd(dd)d*d+ddd,d-d.d/dd0d1d2d3d4d5dd6id7} t| }tj rr|d8 |S )9NFz^audio_codec\..+$      zCnemo.collections.speechlm2.modules.perception.AudioPerceptionModulei   _target_z-nemo.collections.asr.modules.ConformerEncoderatt_context_sizecausal_downsamplingconv_context_sizeconv_kernel_size	   conv_norm_type
batch_normd_modeli   dropout皙?dropout_attdropout_embg        dropout_pre_encoderfeat_in   feat_outff_expansion_factor   n_heads   n_layers   pos_emb_max_leni  rel_posdw_striding   )self_attention_modelsubsamplingsubsampling_conv_channelssubsampling_factorz?nemo.collections.speechlm2.modules.perception.IdentityConnector)r   r#   z>nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessorgh㈵>Ti   per_featurer   >  hanng?g{Gz?)r   ditherfeaturesframe_splicinglogn_fft	normalizepad_to	pad_valuesample_ratewindowwindow_sizewindow_stride)target
output_dimencodermodality_adapterpreprocessorztorch.optim.AdamW)pretrained_weightsfreeze_paramsaudio_loss_weighttext_loss_weight
perception	optimizerr	   )r   r   torchr	   is_availableto)cfgmodelr   r   r   rX   0   s   	
.6

rX   c                 C   s   t | jddddgdgdS )Ng{Gz?r;   "V  user	assistant)frame_lengthsource_sample_ratetarget_sample_rateinput_rolesoutput_roles)r   	tokenizer)rX   r   r   r   datasetn   s   rb   c                  C   s   t dtdddd} tddd| _t| j| jddddd	t| j| jd
dddd	t| j| jddddd	t| j| jddddd	g| _t| gS )Nr   T)	with_data)	recordingr   r%   hirZ   )idrecording_idstartdurationtextspeakerg333333?hellor[   g      ?okg333333?g?okay)r   r   target_audior   rf   rg   supervisionsr   )cutr   r   r   training_cutset_batchz   sH   
"rr   c                 C   s   | | }dD ]}||v sJ t || sJ q|d jdks!J |d jdks*J |d dgks3J |d  g d	gks@J |d
  g dgksMJ d S )N)source_audioro   source_audio_lenstarget_audio_lenstarget_tokenstarget_token_lenssource_tokenssource_token_lensrs   )r   r;   ro   )r   rY   target_textsz
hello okayrv   )r   r   r   r   r   r1   r   r   r   iQ  r   r   r   rx   )r   r1   r   r   r   r   r   ig  r1   r   r   r   r   )rT   	is_tensorshapetolist)rb   rr   batchkeyr   r   r   test_s2s_dataset   s   
r   c                 C   sd   |    || }t|| jd}| j|dd}t|d sJ t|d r(J |d dks0J d S )Ndevicer   	batch_idxloss)on_train_epoch_startr   r   training_steprT   r{   isnanrX   rb   rr   r~   resultsr   r   r   test_s2s_training_step   s   r   c                 C   s@   |    || }t|| jd}| jd|idd}|d u sJ d S )Nr   dummy_val_setr   r   )on_validation_epoch_startr   r   validation_stepr   r   r   r   test_s2s_validation_step   s
   r   c                 C   s  | j tddtdgd}| h dksJ t|d ts"J t|d d ts-J |d }|jdks8J |j	tj
ks@J |dk sHJ || jk  sQJ |d	 }|jd
ks\J |j	tj
ksdJ |dk slJ || jk  suJ |d }|j	tjksJ d S )Nr   r;   )input_signalinput_signal_lens>   rj   audio	audio_len
tokens_lentokens_texttokens_audiorj   r   r   )r      r   )r   r   r/   r   )offline_inferencerT   randntensorkeys
isinstanceliststrr|   dtypelongalltext_vocab_sizespeech_vocab_sizefloat32)rX   ansgen_textgen_audio_codes	gen_audior   r   r   test_s2s_offline_generation   s&   

r   )r   pytestrT   lhotser   r   lhotse.testing.dummiesr   r   "nemo.collections.common.data.utilsr   nemo.collections.speechlm2.datar   !nemo.collections.speechlm2.modelsr   r	   rU   set_default_devicer   fixturerX   rb   rr   r   r   r   r   r   r   r   r   <module>   s*   




=


(
