o
    …wÖiL.  ã                   @   sþ   d dl Z d dlZd dlmZ d dlmZ d dlZd dl	Z	d dl
Z
d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ G d	d
„ d
ejƒZdd„ Ze	jdd„ ƒZe	jdd„ ƒZe	jdd„ ƒZe	jdd„ ƒZe	jj ddG dd„ dƒƒZ!dS )é    N)ÚPath)ÚTorchElasticEnvironment)Úparallel_state)Ú
DictConfigÚ	OmegaConf)Úmodular_models)Úshift_tokens_by_multi_audios)ÚNLPDDPStrategyc                   @   s   e Zd Zdd„ ZdS )ÚModularAudioGPTModelc                 O   s   d S )N© )ÚselfÚargsÚkwargsr   r   úo/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/tests/collections/multimodal/test_speechllm_models.pyÚlog"   s   zModularAudioGPTModel.logN)Ú__name__Ú
__module__Ú__qualname__r   r   r   r   r   r
       s    r
   c                  C   sJ   t  d¡ d} d}d}| |d | 7 } tjjddd| d t dd¡ d S )	Né   ztcp://Ú	localhostÚ6000ú:Úgloor   )ÚbackendÚ
world_sizeÚrankÚinit_method)ÚplÚseed_everythingÚtorchÚdistributedÚinit_process_groupr   Úinitialize_model_parallel)r   Ú	master_ipÚmaster_portr   r   r   Úsetup_module&   s   
r%   c                  C   s\   t j t j t¡¡} t t j | d¡¡}d|j_	d|j_
d|j_d|jjj_d|jjj_|S )NzO../../../examples/multimodal/speech_llm/conf/modular_audio_gpt_config_peft.yamlzu/root/home/works/TestData/pretrained_models/megatron_gpt/gpt_pretrain_220m_len_4096_pos_alibi_step_595508_gbs256.nemoé   zE/root/home/works/TestData/datasets/LibriSpeech/dev_clean_cleaned.json)ÚosÚpathÚdirnameÚabspathÚ__file__r   ÚloadÚjoinÚmodelÚrestore_from_pathÚmicro_batch_sizeÚglobal_batch_sizeÚdataÚvalidation_dsÚmanifest_filepathÚtrain_ds)Úthis_test_dirÚconfigr   r   r   Úllm_model_config1   s   þÿ
ÿ
ÿr8   c                  C   sœ   t i ƒ} tj ¡ rd}t d¡ nd}|| _d| _d| _d| _d| _	d| _
d| _dtjd	< dtjd
< dtjd< tƒ }tƒ g}tjdd||dœ| ¤Ž}|| fS )NÚgpuÚcudaÚcpur   é   g      ð?é    Ú0Ú
LOCAL_RANKÚRANKÚ1Ú
WORLD_SIZEF)ÚloggerÚpluginsÚstrategyr   )r   r   r:   Úis_availableÚset_default_deviceÚacceleratorÚdevicesÚ	num_nodesÚ
max_epochsÚ	max_stepsÚval_check_intervalÚ	precisionr'   Úenvironr	   r   r   ÚTrainer)Úconfig_trainerrH   rE   rD   Útrainerr   r   r   Útrainer_configI   s&   



rS   c                  C   sB   ddi} dddddddgdœ}t d	t | ƒt |ƒt |ƒd
dœƒ}|S )NÚ_target_z>nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessorz-nemo.collections.asr.modules.ConformerEncoderé@   é   Úrel_pos_local_attné€   )rT   Úfeat_inÚn_layersÚd_modelÚself_attention_modelÚatt_context_sizezXnemo.collections.multimodal.speechllm.modules.speechllm_perception.AudioPerceptionModulei   )rT   ÚpreprocessorÚencoderÚmodality_adapterÚ
output_dim)r   )r^   r_   Úmodel_configr   r   r   Úperception_model_configj   s"   ú	ûÿ	rc   c               	   C   sÆ   t  t ddg¡¡} t  d¡ dd¡ ¡ }|d d …d d…f }|d d …dd …f }t  ddg¡ ¡ }t  g d¢g d	¢g¡}| ||t  d
¡ dd¡ ¡ t  ddg¡ ¡ |||dœ}t  ddg¡|d< |S )Ni ú  é
   r&   é   éÿÿÿÿr   é   )r   r   r   r   )r   r   r   r   i  é‚   )Úaudio_signal_lengthÚtokensÚtokens_lengthÚcontextsÚcontext_lengthsÚlabelsÚanswersÚ	loss_maskÚaudio_signal)	r   Ú
from_numpyÚnpÚarrayÚarangeÚreshapeÚintÚTensorÚrandn)Ú
signal_lenÚ
transcriptrj   rn   Útranscript_lengthrp   Úbatchr   r   r   Ú
test_batch‚   s"   ø
r~   z?nedd to move pretrained GPT model to /home/works/TestData first)Úreasonc                   @   s|   e Zd Zejjdd„ ƒZejjdd„ ƒZejjdd„ ƒZejjdd„ ƒZ	ejjd	d
„ ƒZ
ejjdd„ ƒZejjdd„ ƒZdS )ÚTestModularAudioGPTModelc                 C   sx   d|j _||j _|\}|_tj||d}t ¡ }tt	|ƒd ƒ}| 
¡  | |¡ W d   ƒ d S 1 s5w   Y  d S )NÚ%stt_en_fastconformer_transducer_large©rR   z
model.nemo)r.   Úpretrained_audio_modelÚ
perceptionrR   r
   Úrestore_from_pretrained_modelsÚtempfileÚTemporaryDirectoryÚstrr   ÚtrainÚsave_to)r   r8   rc   rS   rR   r.   ÚtmpdirÚ	save_pathr   r   r   Útest_init_and_train›   s   

"ýz,TestModularAudioGPTModel.test_init_and_trainc                 C   sè   d|j _||j _|\}|_tj||d}| ¡  | ¡  dd„ | ¡ D ƒ}| 	|¡\}}	}
}}|j
dks6J ‚t | ¡  ¡  ¡  ¡ d¡sFJ ‚|	j
dksMJ ‚|
j
dksTJ ‚t |jd	d
 ¡  ¡ dd	g¡sfJ ‚t | ¡  ¡ d¡srJ ‚d S )Nr   r‚   c                 S   ó   i | ]\}}||j d d“qS ©T)Únon_blocking©r:   ©Ú.0ÚkeyÚvalr   r   r   Ú
<dictcomp>¯   ó    zCTestModularAudioGPTModel.test_prepare_llm_input.<locals>.<dictcomp>)é   r&   i   gÍV^ò?‘/@)r&   r   r˜   r˜   )r&   r˜   r   )Úaxisr&   )é   é   )r.   rƒ   r„   rR   r
   r…   r:   r‰   ÚitemsÚprepare_llm_inputÚshapers   ÚallcloseÚsumr;   ÚdetachÚnumpy)r   r8   rc   rS   r~   rR   r.   r}   Úencoder_inputÚattention_maskrn   rp   Úencoder_lengthr   r   r   Útest_prepare_llm_input§   s   
 $z/TestModularAudioGPTModel.test_prepare_llm_inputc                 C   sz   d|j _||j _|\}|_tj||d}| ¡  | ¡  | ¡  | 	¡  | 
t|gƒd ¡}t | ¡  ¡  ¡ d¡s;J ‚d S )Nr   r‚   çŠŽäòÒ@)r.   rƒ   r„   rR   r
   r…   r:   Úon_train_startÚsetupr‰   Útraining_stepÚiterrs   rŸ   r;   r¡   r¢   )r   r8   rc   rS   r~   rR   r.   Ú	loss_meanr   r   r   Útest_training_step¸   s   
 z+TestModularAudioGPTModel.test_training_stepc           	      C   s€   d|j _||j _|\}|_tj||d}| ¡  | ¡  dd„ | ¡ D ƒ}| 	t
|gƒd¡}t |d  ¡  ¡  ¡ d¡s>J ‚d S )Nr   r‚   c                 S   rŽ   r   r‘   r’   r   r   r   r–   Í   r—   zATestModularAudioGPTModel.test_validation_step.<locals>.<dictcomp>r   Úlossr§   )r.   rƒ   r„   rR   r
   r…   r:   r‰   rœ   Úvalidation_stepr«   rs   rŸ   r;   r¡   r¢   )	r   r8   rc   rS   r~   rR   r.   r}   r¬   r   r   r   Útest_validation_stepÅ   s   
$z-TestModularAudioGPTModel.test_validation_stepc           
      C   st   d|j _||j _|\}|_tj||d}| ¡  | ¡  dd„ | ¡ D ƒ}| 	|dd¡}d}	|d d |	ks8J ‚d S )Nr   r‚   c                 S   rŽ   r   r‘   r’   r   r   r   r–   Ù   r—   z>TestModularAudioGPTModel.test_predict_step.<locals>.<dictcomp>r   u`   to suit you. Please note these are lecture notes from an alternate presentation. Copyright  â‡ Ú	sentences)
r.   rƒ   r„   rR   r
   r…   r:   r‰   rœ   Úpredict_step)
r   r8   rc   rS   r~   rR   r.   r}   ÚresponseÚground_truthr   r   r   Útest_predict_stepÑ   s   
z*TestModularAudioGPTModel.test_predict_stepc                 C   s`  d|j _||j _|\}|_tj||d}| ¡  d}t dd|g¡t dd|g¡g}t 	g d¢¡t 	g d¢¡g}t 
dd|g¡}	t 	ddg¡}
g d	¢g d
¢g}| |||	|
|¡\}}|jdd|fksdJ ‚|jdkskJ ‚t | ¡  ¡ d¡swJ ‚|dd |d d …f  ¡ dks‰J ‚t |d|d d |d d |d d  …f t |d d |g¡¡s®J ‚d S )Nr   r‚   r=   rg   rš   ©é   rV   r<   r&   é   ©r   r<   r·   é   ©r   rV   rš   é   é8   )r&   )r½   é4   r   r   )r.   rƒ   r„   rR   r
   r…   Úevalr   ÚonesÚ
LongTensorÚzerosÚ_concat_multi_featuresrž   rs   rŸ   r;   r¢   r    )r   r8   rc   rS   rR   r.   Úfeat_dimÚencodedÚencoded_lenÚinput_embedsÚinput_lengthÚcontext_start_idxr£   r¥   r   r   r   Útest_concat_multi_featuresÞ   s,   
 
ÿ$*þz3TestModularAudioGPTModel.test_concat_multi_featuresc                 C   sê   d}t  g d¢¡t  g d¢¡g}t  ddg¡}t  ddg¡}g d¢g d¢g}t|||||ƒ}|jdks5J ‚t |d	d
|d	 d …f t  |d	 d g¡¡sOJ ‚t |d	|d	 d |d	 d |d	 d	  …f t  |d	 d	 g¡¡ssJ ‚d
S )zVThis test is put here because its functionality is similar to _concat_multi_features()rU   r¶   r&   r=   r¸   r¹   r»   )r&   rU   r   Nr   )r   rÁ   rÀ   r   rž   rs   rŸ   rÂ   )r   Úencoder_max_lengthÚ	audio_lenÚcontext_tokensÚcontext_lengthrÉ   Únew_context_tokensr   r   r   Ú!test_shift_tokens_by_multi_audiosø   s   
ÿ4*þz:TestModularAudioGPTModel.test_shift_tokens_by_multi_audiosN)r   r   r   ÚpytestÚmarkÚunitr   r¦   r­   r°   rµ   rÊ   rÐ   r   r   r   r   r€   ™   s    





r€   )"r'   r†   Úpathlibr   Úlightning.pytorchÚpytorchr   r¢   rs   rÑ   r   Ú&lightning.pytorch.plugins.environmentsr   Úmegatron.corer   Ú	omegaconfr   r   Ú-nemo.collections.multimodal.speech_llm.modelsr   Ú=nemo.collections.multimodal.speech_llm.parts.utils.data_utilsr   Ú(nemo.collections.nlp.parts.nlp_overridesr	   r
   r%   Úfixturer8   rS   rc   r~   rÒ   Úskipr€   r   r   r   r   Ú<module>   s2   

 

