o
    wix(                     @   s  d dl mZ d dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZmZ d dlm  mZ d d	lmZmZ d d
lmZ d dlmZ d dlmZ d dl m!Z!m"Z" G dd de
Z#G dd dej$Z%G dd dej$Z&eG dd deej'Z(dS )    )	dataclass)OptionalN)ModelParallelConfig)MegatronModule)TransformerConfig)	OmegaConf)
AutoConfigAutoFeatureExtractorAutoModelForSpeechSeq2Seq)get_nested_attrto_dict_config)Serialization)NeuralModule)io)loggingmodel_utilsc                	       s   e Zd ZdZ		ddedeej deej f fddZ				ddd	Z	d
ee
j dee
j dee
j dee
j fddZ  ZS )MCoreASRModulea@  
    Wrapper class for ASR encoder from `nemo.collections.asr.models.ASRModel`.

    `TransformerConfig` is a dummy config to satisfy the `MegatronModule` constructor.
    `num_attention_heads` is set to 16 such that it's divisible by the value of TP.
    `num_layers` and `hidden_size` are set to 1 since not used.
    Nencoderpreprocessorspec_augmentc                    s.   t  jtddddd || _|| _|| _d S )N      )
num_layershidden_sizenum_attention_heads)config)super__init__r   r   r   spec_augmentation)selfr   r   r   	__class__ i/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/speechlm/modules/asr_module.pyr   +   s   
zMCoreASRModule.__init__c                 C   sZ   |d uo|d u}|d uo|d u}||A du rt | j d|s)| j||d\}}||fS )NFz Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive  with ``processed_signal`` and ``processed_signal_len`` arguments.)input_signallength)
ValueErrorr!   r   )r   r$   input_signal_lengthprocessed_signalprocessed_signal_lengthhas_input_signalhas_processed_signalr"   r"   r#   maybe_preprocess_audio6   s   

z%MCoreASRModule.maybe_preprocess_audior$   r'   r(   r)   c                 C   sL   |  ||||\}}| jd ur| jr| j||d}| j||d\}}||fS )N)
input_specr%   )audio_signalr%   )r,   r   trainingr   )r   r$   r'   r(   r)   encodedencoded_lenr"   r"   r#   forwardL   s   zMCoreASRModule.forward)NN)NNNN)__name__
__module____qualname____doc__r   r   nnModuler   r,   torchTensorr2   __classcell__r"   r"   r    r#   r   "   s2    
r   c                       sD   e Zd Zdedef fddZdejdejfddZd	d
 Z	  Z
S )HFWrappedPreprocessorr   sample_ratec                    s   t    || _|| _d S N)r   r   r   r=   )r   r   r=   r    r"   r#   r   `   s   

zHFWrappedPreprocessor.__init__r$   r%   c                 C   sb   | j |  | jdd}|d }||j|}tj|j	d g|j	d  |j|j
d}||fS )Npt)sampling_ratereturn_tensorsinput_features   r   )devicedtype)r   cpunumpyr=   torD   type_asr9   tensorshaperE   )r   r$   r%   	processedr(   processed_signal_lenr"   r"   r#   r2   e   s   zHFWrappedPreprocessor.forwardc                 O      | j |i |S r>   r2   r   argskwargsr"   r"   r#   __call__p      zHFWrappedPreprocessor.__call__)r3   r4   r5   r	   intr   r9   r:   r2   rS   r;   r"   r"   r    r#   r<   _   s    r<   c                       sB   e Zd Zdejf fddZdejdejfddZdd	 Z	  Z
S )
HFWrappedEncoderr   c                    s   t    || _d S r>   )r   r   r   )r   r   r    r"   r#   r   u   s   

zHFWrappedEncoder.__init__r.   r%   c                 C   sV   |  || j j}|d }|dd}tj|jd g|jd  |jd }||fS )Nlast_hidden_stater   rC   r   )rD   )	r   typerE   	transposer9   rJ   rK   rD   long)r   r.   r%   outputr0   r1   r"   r"   r#   r2   y   s   &zHFWrappedEncoder.forwardc                 O   rN   r>   rO   rP   r"   r"   r#   rS      rT   zHFWrappedEncoder.__call__)r3   r4   r5   r7   r8   r   r9   r:   r2   rS   r;   r"   r"   r    r#   rV   t   s    
rV   c                   @   s  e Zd ZU dZee ed< dZee ed< dZee	 ed< dZ
ee ed< dZee ed< dZee ed< dZee ed	< dZee ed
< dZee ed< dZee ed< dZee	 ed< dZee ed< dZee ed< dZee ed< dd Zdd Zdd ZdS )ASRModuleConfigN_target_znvidia/canary-1bpretrained_modelr   r   preprocessor_configspec_augment_configinit_from_pretrained_modelinit_from_nemo_modelinit_from_ptl_ckptr   target_modulei>  r=   Fuse_hf_auto_modelhf_trust_remote_codeThf_load_pretrained_weightsc                 C   sZ  | j d urt| j }ntjj}| jd ur:| jd u r:t| j	dr)|
| j}n|j| jd}tj|jdd| _nt| j}||d}t| j| j| jd}|| |}| jd urct|| j}|d u rstd| j  d| j | jd urtt| j}nt|d	r|j}nd }td| j  d
 | j|j krtd| j d|j  d||fS )Nz.nemo)
model_nameT)resolve)cfg)ra   rb   rc   zModel z does not have attribute r   z0 does not have a preprocessor, use with caution.z'Sample rate mismatch: ASRModuleConfig (z) != preprocessor (zE). Please provide a preprocessor config with the correct sample rate.)!r]   r   import_class_by_pathnemo_asrmodelsASRModelr^   r   strendswithrestore_fromfrom_pretrainedr   to_containerrj   createra   rb   rc   %maybe_init_from_pretrained_checkpointrd   r   r&   r_   r   from_config_dictr   hasattrr   r   warningr=   _sample_rate)r   imported_cls	asr_modelrj   init_cfgmodelr   r"   r"   r#   configure_nemo_asr_model   sF   





z(ASRModuleConfig.configure_nemo_asr_modelc                 C   s   t j| j| jd}t|| j}| jrtj| jtj	| jdd}nt
j| j| jd}tj|| jd}|}| jd ur=t|| j}t|}||fS )N)trust_remote_codeT)torch_dtyper   use_safetensors)r	   rr   r^   rf   r<   r=   rg   r
   r9   bfloat16r   from_configrd   r   rV   )r   hf_preprocessorr   r{   r   r}   r"   r"   r#   configure_hf_auto_model   s$   
z'ASRModuleConfig.configure_hf_auto_modelc                 C   sT   | j r
|  \}}n|  \}}d|_| jd ur!tt| j}nd }t|||dS )NT)r   r   r   )	re   r   r~   tensor_parallel_grad_reducer`   r   rv   r   r   )r   r}   r   r   r"   r"   r#   configure_model   s   
zASRModuleConfig.configure_model)r3   r4   r5   r]   r   ro   __annotations__r^   r   rU   r   dictr_   r`   ra   rb   rc   rd   r=   re   boolrf   rg   r~   r   r   r"   r"   r"   r#   r\      s$   
 /r\   ))dataclassesr   typingr   r9   torch.nnr7   #megatron.core.model_parallel_configr    megatron.core.transformer.moduler   ,megatron.core.transformer.transformer_configr   	omegaconfr   transformersr   r	   r
   nemo.collections.asrcollectionsasrrl   nemo.collections.speechlm.utilsr   r   nemo.core.classes.commonr   nemo.core.classes.moduler   nemo.lightningr   
nemo.utilsr   r   r   r8   r<   rV   IOMixinr\   r"   r"   r"   r#   <module>   s(   =