o
    iO                     @   s   d dl Z d dlZd dlmZ d dlmZ d dlm  mZ	 d dl
mZmZ d dlmZmZmZmZ d dlmZmZmZ d dlmZmZ G dd deZdS )	    N)partial)CallableDict)	LayerNormSamePadTransposeLastConvFeatureExtractionModel)ModalityBlockEncoder	Decoder1d)ModalitySpecificEncoderget_alibi_biasc                
       s^   e Zd Zdedeegejf deegejf de	de
f
 fddZdd	 Z fd
dZ  ZS )AudioEncoder	embed_dim
make_block
norm_layerlayer_norm_firstalibi_biasesc                    s@  t j| _| jd d }t| jdjdd}tt t|t	|}	j
}
tdj|
 tjt gfddt|
D t R  }jrTtt|}tjjj tt fd	d
tjD |st|nd |jj}jd urtjnd }tt|d}t j||	d ||||d	 d S )Nr   g        F)conv_layersdropoutmode	conv_bias   c                    sJ   g | ]!}t t j  d  jdtt t ddt t  qS )   )kernel_sizepaddinggroupsF)elementwise_affine)nn
SequentialConv1dconv_pos_groupsr   r   r   GELU).0_)r   kmodality_cfg S/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/emotion2vec/audio.py
<listcomp>8   s"    
z)AudioEncoder.__init__.<locals>.<listcomp>c                 3   s    | ]	} | V  qd S Nr(   )r$   i)dprr   r(   r)   	<genexpr>U   s    z(AudioEncoder.__init__.<locals>.<genexpr>)r   )	r'   r   local_encoderproject_featuresfixed_positional_encoderrelative_positional_encodercontext_encoderdecoderr   )evalfeature_encoder_specfeature_enc_layersr   extractor_moder   r    r   r   Linearconv_pos_depthmaxconv_pos_widthrangeconv_pos_pre_lnnplinspacestart_drop_path_rateend_drop_path_rateprenet_depthr
   
ModuleListprenet_layerdropprenet_dropoutr4   r   r   r   super__init__)selfr'   r   r   r   r   r   feature_embed_dimr/   r0   num_pos_layerspositional_encoderr3   r4   alibi_bias_fn	__class__)r-   r   r&   r   r'   r)   rH      sh   

	
zAudioEncoder.__init__c                    s   dt jf fdd}|d urad|  d}||}| rRt j|jd d |j|jd}d|t j	|jd |jd	|d f< d|
dgd
dg  }|S t j|jd d t j|jd}|S )
Ninput_lengthsc                    sH   dd }t t jD ]}||  j| d  j| d } q| tjS )zP
            Computes the output length of the convolutional layers
            c                 S   s   t | | | d S )N   )torchfloor)input_lengthr   strider(   r(   r)   _conv_out_lengtht   s   zdAudioEncoder.convert_padding_mask.<locals>.get_feat_extract_output_lengths.<locals>._conv_out_lengthrQ   r   )r=   lenr7   torR   long)rP   rV   r,   rI   r(   r)   get_feat_extract_output_lengthso   s   zJAudioEncoder.convert_padding_mask.<locals>.get_feat_extract_output_lengthsrQ   r   r   )dtypedevicer   )r]   )rR   
LongTensorrY   sumanyzerosshaper\   r]   arangeflipcumsumbool)rI   xpadding_maskr[   rP   output_lengthsr(   rZ   r)   convert_padding_maskn   s    	"z!AudioEncoder.convert_padding_maskc                    sJ   t    | j D ]}t|tjr|  q
| jd ur#| j  d S d S r+   )rG   reset_parametersr0   children
isinstancer   r9   r4   )rI   modrN   r(   r)   rk      s   

zAudioEncoder.reset_parameters)__name__
__module____qualname__intr   floatr   rD   r   rf   r   rH   rj   rk   __classcell__r(   r(   rN   r)   r      s    U(r   )rR   numpyr?   torch.nnr   	functoolsr   torch.nn.functional
functionalFtypingr   r   )funasr.models.emotion2vec.fairseq_modulesr   r   r   r   !funasr.models.emotion2vec.modulesr	   r
   r   funasr.models.emotion2vec.baser   r   r   r(   r(   r(   r)   <module>   s   