o
    }o™iÓ.  ã                   @   s
  d dl Z d dlZd dlmZ d dlmZ d dlZd dl	Z	d dl
Z
d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ G d
d„ dejƒZdd„ Ze	jdd„ ƒZe	jdd„ ƒZe	jdd„ ƒZe	jdd„ ƒZ e	j!j"ddG dd„ dƒƒZ#dS )é    N)ÚPath)ÚTorchElasticEnvironment)Úparallel_state)Ú
DictConfigÚ	OmegaConf)Úmodular_models)Úshift_tokens_by_multi_audios)ÚGPTModel)ÚNLPDDPStrategyc                   @   s   e Zd Zdd„ ZdS )ÚModularAudioGPTModelc                 O   s   d S )N© )ÚselfÚargsÚkwargsr   r   úf/home/ubuntu/.local/lib/python3.10/site-packages/tests/collections/multimodal/test_speechllm_models.pyÚlog#   s   zModularAudioGPTModel.logN)Ú__name__Ú
__module__Ú__qualname__r   r   r   r   r   r   !   s    r   c                  C   sJ   t  d¡ d} d}d}| |d | 7 } tjjddd| d t dd¡ d S )	Né   ztcp://Ú	localhostÚ6000ú:Úgloor   )ÚbackendÚ
world_sizeÚrankÚinit_method)ÚplÚseed_everythingÚtorchÚdistributedÚinit_process_groupr   Úinitialize_model_parallel)r   Ú	master_ipÚmaster_portr   r   r   Úsetup_module'   s   
r&   c                  C   s\   t j t j t¡¡} t t j | d¡¡}d|j_	d|j_
d|j_d|jjj_d|jjj_|S )NzO../../../examples/multimodal/speech_llm/conf/modular_audio_gpt_config_peft.yamlzu/root/home/works/TestData/pretrained_models/megatron_gpt/gpt_pretrain_220m_len_4096_pos_alibi_step_595508_gbs256.nemoé   zE/root/home/works/TestData/datasets/LibriSpeech/dev_clean_cleaned.json)ÚosÚpathÚdirnameÚabspathÚ__file__r   ÚloadÚjoinÚmodelÚrestore_from_pathÚmicro_batch_sizeÚglobal_batch_sizeÚdataÚvalidation_dsÚmanifest_filepathÚtrain_ds)Úthis_test_dirÚconfigr   r   r   Úllm_model_config2   s   þÿ
ÿ
ÿr9   c                  C   sœ   t i ƒ} tj ¡ rd}t d¡ nd}|| _d| _d| _d| _d| _	d| _
d| _dtjd	< dtjd
< dtjd< tƒ }tƒ g}tjdd||dœ| ¤Ž}|| fS )NÚgpuÚcudaÚcpur   é   g      ð?é    Ú0Ú
LOCAL_RANKÚRANKÚ1Ú
WORLD_SIZEF)ÚloggerÚpluginsÚstrategyr   )r   r    r;   Úis_availableÚset_default_deviceÚacceleratorÚdevicesÚ	num_nodesÚ
max_epochsÚ	max_stepsÚval_check_intervalÚ	precisionr(   Úenvironr
   r   r   ÚTrainer)Úconfig_trainerrI   rF   rE   Útrainerr   r   r   Útrainer_configJ   s&   



rT   c                  C   sB   ddi} dddddddgdœ}t d	t | ƒt |ƒt |ƒd
dœƒ}|S )NÚ_target_z>nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessorz-nemo.collections.asr.modules.ConformerEncoderé@   é   Úrel_pos_local_attné€   )rU   Úfeat_inÚn_layersÚd_modelÚself_attention_modelÚatt_context_sizezXnemo.collections.multimodal.speechllm.modules.speechllm_perception.AudioPerceptionModulei   )rU   ÚpreprocessorÚencoderÚmodality_adapterÚ
output_dim)r   )r_   r`   Úmodel_configr   r   r   Úperception_model_configk   s"   ú	ûÿ	rd   c               	   C   sÆ   t  t ddg¡¡} t  d¡ dd¡ ¡ }|d d …d d…f }|d d …dd …f }t  ddg¡ ¡ }t  g d¢g d	¢g¡}| ||t  d
¡ dd¡ ¡ t  ddg¡ ¡ |||dœ}t  ddg¡|d< |S )Ni ú  é
   r'   é   éÿÿÿÿr   é   )r   r   r   r   )r   r   r   r   i  é‚   )Úaudio_signal_lengthÚtokensÚtokens_lengthÚcontextsÚcontext_lengthsÚlabelsÚanswersÚ	loss_maskÚaudio_signal)	r    Ú
from_numpyÚnpÚarrayÚarangeÚreshapeÚintÚTensorÚrandn)Ú
signal_lenÚ
transcriptrk   ro   Útranscript_lengthrq   Úbatchr   r   r   Ú
test_batchƒ   s"   ø
r   z?nedd to move pretrained GPT model to /home/works/TestData first)Úreasonc                   @   s|   e Zd Zejjdd„ ƒZejjdd„ ƒZejjdd„ ƒZejjdd„ ƒZ	ejjd	d
„ ƒZ
ejjdd„ ƒZejjdd„ ƒZdS )ÚTestModularAudioGPTModelc                 C   sˆ   d|j _||j _|\}|_tj||d}t|j tƒsJ ‚t 	¡ }t
t|ƒd ƒ}| ¡  | |¡ W d   ƒ d S 1 s=w   Y  d S )NÚ%stt_en_fastconformer_transducer_large©rS   z
model.nemo)r/   Úpretrained_audio_modelÚ
perceptionrS   r   Úrestore_from_pretrained_modelsÚ
isinstancer	   ÚtempfileÚTemporaryDirectoryÚstrr   ÚtrainÚsave_to)r   r9   rd   rT   rS   r/   ÚtmpdirÚ	save_pathr   r   r   Útest_init_and_trainœ   s   

"ýz,TestModularAudioGPTModel.test_init_and_trainc                 C   sè   d|j _||j _|\}|_tj||d}| ¡  | ¡  dd„ | ¡ D ƒ}| 	|¡\}}	}
}}|j
dks6J ‚t | ¡  ¡  ¡  ¡ d¡sFJ ‚|	j
dksMJ ‚|
j
dksTJ ‚t |jd	d
 ¡  ¡ dd	g¡sfJ ‚t | ¡  ¡ d¡srJ ‚d S )Nr‚   rƒ   c                 S   ó   i | ]\}}||j d d“qS ©T)Únon_blocking©r;   ©Ú.0ÚkeyÚvalr   r   r   Ú
<dictcomp>±   ó    zCTestModularAudioGPTModel.test_prepare_llm_input.<locals>.<dictcomp>)é   r'   i   gÍV^ò?‘/@)r'   r   rš   rš   )r'   rš   r   )Úaxisr'   )é   é   )r/   r„   r…   rS   r   r†   r;   r‹   ÚitemsÚprepare_llm_inputÚshapert   ÚallcloseÚsumr<   ÚdetachÚnumpy)r   r9   rd   rT   r   rS   r/   r~   Úencoder_inputÚattention_maskro   rq   Úencoder_lengthr   r   r   Útest_prepare_llm_input©   s   
 $z/TestModularAudioGPTModel.test_prepare_llm_inputc                 C   sz   d|j _||j _|\}|_tj||d}| ¡  | ¡  | ¡  | 	¡  | 
t|gƒd ¡}t | ¡  ¡  ¡ d¡s;J ‚d S )Nr‚   rƒ   çŠŽäòÒ@)r/   r„   r…   rS   r   r†   r;   Úon_train_startÚsetupr‹   Útraining_stepÚiterrt   r¡   r<   r£   r¤   )r   r9   rd   rT   r   rS   r/   Ú	loss_meanr   r   r   Útest_training_stepº   s   
 z+TestModularAudioGPTModel.test_training_stepc           	      C   s€   d|j _||j _|\}|_tj||d}| ¡  | ¡  dd„ | ¡ D ƒ}| 	t
|gƒd¡}t |d  ¡  ¡  ¡ d¡s>J ‚d S )Nr‚   rƒ   c                 S   r   r‘   r“   r”   r   r   r   r˜   Ï   r™   zATestModularAudioGPTModel.test_validation_step.<locals>.<dictcomp>r   Úlossr©   )r/   r„   r…   rS   r   r†   r;   r‹   rž   Úvalidation_stepr­   rt   r¡   r<   r£   r¤   )	r   r9   rd   rT   r   rS   r/   r~   r®   r   r   r   Útest_validation_stepÇ   s   
$z-TestModularAudioGPTModel.test_validation_stepc           
      C   st   d|j _||j _|\}|_tj||d}| ¡  | ¡  dd„ | ¡ D ƒ}| 	|dd¡}d}	|d d |	ks8J ‚d S )Nr‚   rƒ   c                 S   r   r‘   r“   r”   r   r   r   r˜   Û   r™   z>TestModularAudioGPTModel.test_predict_step.<locals>.<dictcomp>r   u`   to suit you. Please note these are lecture notes from an alternate presentation. Copyright  â‡ Ú	sentences)
r/   r„   r…   rS   r   r†   r;   r‹   rž   Úpredict_step)
r   r9   rd   rT   r   rS   r/   r~   ÚresponseÚground_truthr   r   r   Útest_predict_stepÓ   s   
z*TestModularAudioGPTModel.test_predict_stepc                 C   s`  d|j _||j _|\}|_tj||d}| ¡  d}t dd|g¡t dd|g¡g}t 	g d¢¡t 	g d¢¡g}t 
dd|g¡}	t 	ddg¡}
g d	¢g d
¢g}| |||	|
|¡\}}|jdd|fksdJ ‚|jdkskJ ‚t | ¡  ¡ d¡swJ ‚|dd |d d …f  ¡ dks‰J ‚t |d|d d |d d |d d  …f t |d d |g¡¡s®J ‚d S )Nr‚   rƒ   r>   rh   rœ   ©é   rW   r=   r'   é   ©r   r=   r¹   é   ©r   rW   rœ   é   é8   )r'   )r¿   é4   r   r   )r/   r„   r…   rS   r   r†   Úevalr    ÚonesÚ
LongTensorÚzerosÚ_concat_multi_featuresr    rt   r¡   r<   r¤   r¢   )r   r9   rd   rT   rS   r/   Úfeat_dimÚencodedÚencoded_lenÚinput_embedsÚinput_lengthÚcontext_start_idxr¥   r§   r   r   r   Útest_concat_multi_featuresà   s,   
 
ÿ$*þz3TestModularAudioGPTModel.test_concat_multi_featuresc                 C   sê   d}t  g d¢¡t  g d¢¡g}t  ddg¡}t  ddg¡}g d¢g d¢g}t|||||ƒ}|jdks5J ‚t |d	d
|d	 d …f t  |d	 d g¡¡sOJ ‚t |d	|d	 d |d	 d |d	 d	  …f t  |d	 d	 g¡¡ssJ ‚d
S )zVThis test is put here because its functionality is similar to _concat_multi_features()rV   r¸   r'   r>   rº   r»   r½   )r'   rV   r   Nr   )r    rÃ   rÂ   r   r    rt   r¡   rÄ   )r   Úencoder_max_lengthÚ	audio_lenÚcontext_tokensÚcontext_lengthrË   Únew_context_tokensr   r   r   Ú!test_shift_tokens_by_multi_audiosú   s   
ÿ4*þz:TestModularAudioGPTModel.test_shift_tokens_by_multi_audiosN)r   r   r   ÚpytestÚmarkÚunitr   r¨   r¯   r²   r·   rÌ   rÒ   r   r   r   r   r   š   s    





r   )$r(   rˆ   Úpathlibr   Úlightning.pytorchÚpytorchr   r¤   rt   rÓ   r    Ú&lightning.pytorch.plugins.environmentsr   Úmegatron.corer   Ú	omegaconfr   r   Ú-nemo.collections.multimodal.speech_llm.modelsr   Ú=nemo.collections.multimodal.speech_llm.parts.utils.data_utilsr   Ú@nemo.collections.nlp.models.language_modeling.megatron.gpt_modelr	   Ú(nemo.collections.nlp.parts.nlp_overridesr
   r   r&   Úfixturer9   rT   rd   r   rÔ   Úskipr   r   r   r   r   Ú<module>   s4   

 

