o
    i %                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 d dl
m  mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ eeZeejed
krbd dlmZ nedddZe ddG dd dejj!Z"dS )    N)partial)	OmegaConf)contextmanager)LooseVersion)tables)AltBlock)AudioEncoder)load_audio_text_image_videoz1.6.0)autocastTc                 c   s    d V  d S )N )enabledr   r   S/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/emotion2vec/model.pyr
      s   
r
   model_classesEmotion2vecc                       sh   e Zd ZdZ fddZ									dddZ	dd	d
Z				ddefddZdd Z	  Z
S )r   z
    Author: Ziyang Ma, Zhisheng Zheng, Jiaxin Ye, Jinchao Li, Zhifu Gao, Shiliang Zhang, Xie Chen
    emotion2vec: Self-Supervised Pre-Training for Speech Emotion Representation
    https://arxiv.org/abs/2312.15185
    c                    sj  t    t|d   | _ttjj 	d 	ddd fdd	i | _
tj | _t jj 	d 	d| j
}|| jd	< d | _ 	d
| _ 	d| _ 	d| _tj 	d| _t 	d 	d 	dtjfddt 	dD | _d | _ 	dr 	d| _|	dd}d | _|dkrtj 	d|| _d S d S )N
model_confnorm_epsnorm_affine)epselementwise_affinec                    sp   t |d u r
 dn||d u r dn| dd d d d d|  d	 d
 dS )N	embed_dim	num_heads	mlp_ratioTencoder_dropoutattention_dropoutactivation_dropoutpost_mlp_droplayer_norm_firstend_of_block_targets)	qkv_biasdrop	attn_dropmlp_dropr   	drop_path
norm_layerr   ffn_targets)r   get)r"   dimheads)cfgmake_layer_normr   r   
make_block4   s   
z(Emotion2vec.__init__.<locals>.make_blockr   r   AUDIOaverage_top_k_layers	loss_beta
loss_scaledropout_inputstart_drop_path_rateend_drop_path_ratedepthc                    s   g | ]} | qS r   r   ).0i)dprr*   r   r   
<listcomp>]   s    z(Emotion2vec.__init__.<locals>.<listcomp>
vocab_sizer   )NN)super__init__r   creater(   r   torchnn	LayerNormr%   alibi_biases
ModuleDictmodality_encodersr   
modalitiesaudioemar,   r-   r.   Dropoutr/   nplinspace
ModuleListrangeblocksnormprojLinear)selfkwargsencr7   	__class__)r(   r5   r*   r)   r   r:   *   sF   

(
zEmotion2vec.__init__NTFc              	   K   s  | j d }d }||||| p||s| jdnd||
d}|d }|d }|d }|dd }|d	d }| jd ur?| |}g }t| jD ]M\}}| jrb| jd
ddksbtj | jd
dkr|}|d ur|d ur|	ddkrw|| n|
d}||| }||||d\}}|r|| qF| jd ur| |}|r|	r|d d |jjd f }|d ur|d d |jjd f }||||dS d S )Nr+   clone_batch   )remove_maskedrS   
mask_seedsprecomputed_maskxencoder_maskpadding_mask
alibi_biasalibi_scale	layerdropr   )rZ   r[   )rX   rZ   layer_resultsmask)rA   r(   r%   r/   	enumeraterJ   trainingrF   randomsizesqueezetype_asappendrK   modality_cfgnum_extra_tokens)rN   sourcetargetidmoderZ   r_   features_onlyforce_remove_maskedremove_extra_tokensrW   rO   feature_extractorrV   extractor_outrX   rY   masked_padding_maskmasked_alibi_biasr\   r^   r4   blkabscalelrr   r   r   forwardh   sf   



 



zEmotion2vec.forwardc                 C   s   | j ||||d|d}|S )NT)rl   rZ   r_   rm   ro   )rx   )rN   ri   rl   rZ   r_   ro   resr   r   r   extract_features   s   zEmotion2vec.extract_featureskeyc              
      sD  | dd}| dd}| jd u rd}i }	t }
t|d| dd| dd|d	}t }||
 d
|	d< t|d | dd |	d< g }| d}|rTtj|dd t|D ]\}}|j	|d d}| j
jrot||j}|dd}| j|d d}|d }|d d  }|dkr|}n|dkrtj|dd}|r|rttj|d|| | |d ur|jng }g  | jr|jdd}| |}t|D ]\}}|drtj n|d d |f |d d |f< qtj|dd}|d   dd |D } fddt|D }|| ||d}|r||d< | | qX||	fS ) Ngranularity	utteranceextract_embeddingTi>  fs	data_typesound)r   audio_fsr   	tokenizerz0.3f	load_datar   batch_data_time
output_dir)exist_okdevice)r   rT   r8   )rZ   rX   frame)axisz{}.npy)r&   unusec                 S   s   g | ]	}| d s|qS r   
startswith)r3   lbr   r   r   r6     s    z)Emotion2vec.inference.<locals>.<listcomp>c                    s"   g | ]\}}| d s | qS r   r   )r3   idxr   scoresr   r   r6     s   " )r{   labelsr   feats)!r%   rL   timeperf_counterr	   lenosmakedirsr`   tor(   	normalizeF
layer_normshapeviewrz   rd   cpunumpyrF   meansavepathjoinformat
token_listr   infr<   softmaxtolistrf   )rN   data_indata_lengthsr{   r   frontendrO   r|   r~   	meta_datatime1audio_sample_listtime2resultsr   r4   wavri   r   rX   r   r   labselect_labelselect_scoreresult_ir   r   r   	inference   sd   



 
0zEmotion2vec.inferencec                 K   s"   ddl m} |dd| i|}|S )NrT   )export_rebuild_modelmodelr   )export_metar   )rN   rO   r   modelsr   r   r   export  s   zEmotion2vec.export)	NNNNTFFTN)NNFT)NNNN)__name__
__module____qualname____doc__r:   rx   rz   listr   r   __classcell__r   r   rQ   r   r   "   s.    A
M

L)T)#r   r   r<   loggingr   rF   	functoolsr   	omegaconfr   torch.nn.functionalr=   
functionalr   
contextlibr   distutils.versionr   funasr.registerr   !funasr.models.emotion2vec.modulesr   funasr.models.emotion2vec.audior   funasr.utils.load_utilsr	   	getLoggerr   logger__version__torch.cuda.ampr
   registerModuler   r   r   r   r   <module>   s*   

