o
    i#                     @   s   d dl Z d dlZd dlmZ d dlmZmZ d dlm  m	Z
 d dlmZ d dlmZmZmZ G dd deZeG dd dZG d	d
 d
ejZG dd dejZG dd dejZG dd dejZG dd deZG dd dejZG dd dejZdS )    N)Enumauto)	dataclass)	LayerNormSamePadTransposeLastc                   @   s   e Zd Ze ZdS )ModalityN)__name__
__module____qualname__r   AUDIO r   r   U/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/emotion2vec/modules.pyr      s    
r   c                   @   s   e Zd ZU dZeed< dZeed< dZeed< dZeed< dZ	e
ed	< d
Zeed< d
Zeed< dZeed< dZeed< dZe
ed< dS )D2vDecoderConfigi  decoder_dim   decoder_groups   decoder_kerneldecoder_layersg?input_dropoutFadd_positions_maskedadd_positions_allTdecoder_residual   projection_layersg       @projection_ratioN)r	   r
   r   r   int__annotations__r   r   r   r   floatr   boolr   r   r   r   r   r   r   r   r      s   
 r   c                       $   e Zd Z fddZdd Z  ZS )FixedPositionalEncoderc                       t    || _d S N)super__init__	positions)self	pos_embed	__class__r   r   r&   (      

zFixedPositionalEncoder.__init__c                 C   s   | j S r$   )r'   r(   xpadding_maskr   r   r   forward,   s   zFixedPositionalEncoder.forwardr	   r
   r   r&   r0   __classcell__r   r   r*   r   r"   '   s    r"   c                       s(   e Zd ZdZ fddZdd Z  ZS )TextFeatPositionalEncoderz
    Original encoder expects (B, T) long input. This module wraps it to take
    local_encoder output which are (B, T, D) float tensors
    c                    r#   r$   )r%   r&   pos_encoder)r(   r4   r*   r   r   r&   6   r,   z"TextFeatPositionalEncoder.__init__c                 C   s   |  |d S )N).r   )r4   r-   r   r   r   r0   :   s   z!TextFeatPositionalEncoder.forward)r	   r
   r   __doc__r&   r0   r2   r   r   r*   r   r3   0   s    r3   c                       r!   )BlockEncoderc                    s6   t    || _|| _|| _|| _tj|dd| _d S )NTinplace)	r%   r&   blocksnormlayer_norm_first	layerdropnnDropoutdropout)r(   r9   
norm_layerr;   r<   r?   r*   r   r   r&   A   s   
zBlockEncoder.__init__c           
      C   s   | j d ur| js|  |}| |}t| jD ]=\}}| jr+| jdks+tj | jkrT|}|d urL|d urL|	ddkr@|| n|
d}||| }||||\}}	q| j d urb| jrb|  |}|S )Nr   r   )r:   r;   r?   	enumerater9   trainingr<   nprandomsizesqueezetype_as)
r(   r.   r/   
alibi_biasalibi_scaleiblkabscale_r   r   r   r0   I   s   

  
zBlockEncoder.forwardr1   r   r   r*   r   r6   @   s    r6   c                       s<   e Zd ZU eed< def fddZdd Zdd Z  ZS )	DecoderBasedecoder_cfgcfgc                    r#   r$   )r%   r&   rP   )r(   rQ   r*   r   r   r&   `   s   

zDecoderBase.__init__c                 C   s(   | j  D ]}t|tjr|  qd S r$   )projmodules
isinstancer=   Linearreset_parameters)r(   modr   r   r   rV   e   s
   zDecoderBase.reset_parametersc                 C   s4   |d u s| j jr|d|dkr|S || }|S )Nr   )rP   r   rE   )r(   r.   residualrJ   	mask_inforetr   r   r   add_residualj   s   zDecoderBase.add_residual)	r	   r
   r   r   r   r&   rV   r[   r2   r   r   r*   r   rO   ]   s
   
 rO   c                       s*   e Zd Zdef fddZdd Z  ZS )	Decoder1drQ   c                    s   t     fddtj fddt jD  | _g } j}t jd D ]!}|dkr7t	| j
 n|}|t|| |t  |}q*|t| t|dkrb|d | _d S tj| | _d S )Nc                    sN   t j|  j j jd  jdt jt t jddt t  g}t j	| S )N   )kernel_sizepaddinggroupsF)elementwise_affine)
r=   Conv1dr   r   r   r   r   r   GELU
Sequential)in_dimblock)rQ   r   r   
make_block{   s   
z&Decoder1d.__init__.<locals>.make_blockc                    s"   g | ]}|d krn j qS )r   )r   ).0rJ   rQ   	input_dimrg   r   r   
<listcomp>   s    z&Decoder1d.__init__.<locals>.<listcomp>r   r   )r%   r&   r=   rd   ranger   r9   r   r   r   r   appendrU   rc   lenrR   )r(   rQ   rj   projscurr_dimrJ   next_dimr*   ri   r   r&   x   s$   zDecoder1d.__init__c                 C   sZ   | dd}|}t| jD ]\}}||}| ||||}|}q| dd}| |}|S )Nr   r]   )	transposerA   r9   r[   rR   )r(   r.   rY   rX   rJ   layerr   r   r   r0      s   
zDecoder1d.forward)r	   r
   r   r   r&   r0   r2   r   r   r*   r   r\   w   s    )r\   c                       sF   e Zd Zddddddddejejdddf fdd	Zd
dd	Z  ZS )AltBlockg      @FN        Tc              	      s   t    || _|| _ddlm}m} ||| _t|||||||d| _	|
dkr,||
nt
 | _||| _t|| }|||||d| _t
j|	dd| _d S )Nr   )DropPathMlp)	num_headsqkv_biasqk_scale	attn_drop	proj_dropcosine_attentionru   )in_featureshidden_features	act_layerdropFr7   )r%   r&   r;   ffn_targets&funasr.models.emotion2vec.timm_modulesrv   rw   norm1AltAttentionattnr=   Identity	drop_pathnorm2r   mlpr>   post_mlp_dropout)r(   dimrx   	mlp_ratiory   rz   r   r{   mlp_droppost_mlp_dropr   r   r@   r;   r   r}   rv   rw   mlp_hidden_dimr*   r   r   r&      s0   



zAltBlock.__init__c                 C   s   | j r1|| | | ||| }| | | }}|}|| | | }| js-|}||fS || | ||| }| | }}| |}|}| || | | }| js]|}||fS r$   )r;   r   r   r   r   r   r   r   )r(   r.   r/   rH   rtr   r   r   r0      s    

zAltBlock.forwardNN)	r	   r
   r   r=   rc   r   r&   r0   r2   r   r   r*   r   rt      s     /rt   c                       s4   e Zd Z						d	 fdd	Zd
ddZ  ZS )r      FNru   c           	   	      s   t    || _|| }|p|d | _tj||d |d| _t|| _t||| _	t|| _
|| _|rKtjtdt|ddf dd| _d S d S )Ng         )bias
   r   T)requires_grad)r%   r&   rx   rM   r=   rU   qkvr>   r{   rR   r|   r}   	Parametertorchlogoneslogit_scale)	r(   r   rx   ry   rz   r{   r|   r}   head_dimr*   r   r   r&      s   

zAltAttention.__init__c                 C   s  |j \}}}| |||d| j|| j ddddd}|d |d |d }}	}
|j}| jrXtj|ddtj|	dd	dd }t
j| jt
t
d	d
 }|| }n|| j }||		dd }|d ur||}|d d d |df  |7  < |d ur| r||ddt
jtd}|jdt
jdj|d}| |}||
 	dd}||||}| |}| |}|S )Nr   r]   r   r      )r   g      Y@)maxz-inf)r   dtype)r   )shaper   reshaperx   permuter   r}   F	normalizerr   r   clampr   r   tensorexprM   rG   rE   anymasked_fill	unsqueezetor    r   softmaxfloat32r{   rR   r|   )r(   r.   r/   rH   BNCr   qkvr   r   r   r   r   r   r0     sF   
$


"


zAltAttention.forward)r   FNru   ru   Fr   r1   r   r   r*   r   r      s    r   )r   numpyrC   torch.nnr=   enumr   r   torch.nn.functional
functionalr   dataclassesr   )funasr.models.emotion2vec.fairseq_modulesr   r   r   r   r   Moduler"   r3   r6   rO   r\   rt   r   r   r   r   r   <module>   s"   	:D