o
    i?                     @   sR  d dl Z d dlZd dlmZ d dlmZmZmZ d dlZ	d dl
Z
d dlm  mZ d dl
mZmZ ddlmZ ddlmZ ddlmZ eG d	d
 d
ZG dd dejZG dd dejZG dd dejZd!ddZG dd dejZG dd dejZeedZG dd dejZ G dd dejZ!G dd dejZ"G dd  d ejZ#dS )"    N)	dataclass)DictIterableOptional)Tensornn   )decode)detect_language)
transcribec                   @   sj   e Zd ZU eed< eed< eed< eed< eed< eed< eed< eed< eed	< eed
< dZeed< dS )ModelDimensionsn_melsn_audio_ctxn_audio_staten_audio_headn_audio_layern_vocab
n_text_ctxn_text_staten_text_headn_text_layerdefaultatt_typeN)__name__
__module____qualname__int__annotations__r   str r   r   _/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/sense_voice/whisper_lib/model.pyr      s   
 r   c                       s$   e Zd Z fddZdd Z  ZS )	LayerNormc                    s   t  j|i | d S N)super__init__)selfargskwargs	__class__r   r    r$   %   s   zLayerNorm.__init__c                 C   sL   t | | j| jd ur| j nd | jd ur| j nd | j}||S r"   )F
layer_normfloatnormalized_shapeweightbiasepstype_as)r%   inputoutputr   r   r    forward(   s   
zLayerNorm.forward)r   r   r   r$   r4   __classcell__r   r   r(   r    r!   $   s    r!   c                   @   s   e Zd ZdedefddZdS )Linearxreturnc                 C   s2   t || j|j| jd u rd S | j|jS r"   )r*   linearr.   todtyper/   )r%   r7   r   r   r    r4   4   s   zLinear.forwardN)r   r   r   r   r4   r   r   r   r    r6   3   s    r6   c                       s2   e Zd Zdededee def fddZ  ZS )Conv1dr7   r.   r/   r8   c                    s.   t  |||j|d u rd S ||jS r"   )r#   _conv_forwardr:   r;   )r%   r7   r.   r/   r(   r   r    r=   =   s
   
zConv1d._conv_forward)r   r   r   r   r   r=   r5   r   r   r(   r    r<   <   s    *r<   '  c                 C   s   |d dksJ t ||d d  }t| t|d  }t| ddt jf |t jddf  }tjt|t|gddS )z*Returns sinusoids for positional embedding   r   r   Ndim)	nplogtorchexparangenewaxiscatsincos)lengthchannelsmax_timescalelog_timescale_incrementinv_timescalesscaled_timer   r   r    	sinusoidsC   s
   *rQ   c                
       v   e Zd Zdedef fddZ			ddedee dee d	ee fd
dZ	ddedededee fddZ	  Z
S )MultiHeadAttentionn_staten_headc                    H   t    || _t||| _t||dd| _t||| _t||| _d S NF)r/   r#   r$   rU   r6   querykeyvalueoutr%   rT   rU   r(   r   r    r$   M      
zMultiHeadAttention.__init__Nr7   xamaskkv_cachec                 K   s   | dd}| |}|d u s|d u s| j|vr/| |d u r |n|}| |d u r+|n|}	n
|| j }|| j }	| j|||	||d\}
}| |
|fS )Nis_pad_maskF)rb   getrY   rZ   r[   qkv_attentionr\   r%   r7   r_   r`   ra   r'   rb   qkvwvqkr   r   r    r4   U   s   


zMultiHeadAttention.forwardrg   rh   ri   c                 K   sl  | dd}|j\}}}	|	| j d }
|jg |jd d | jdR  dddd|
 }|jg |jd d | jdR  dddd|
 }|jg |jd d | jdR  dddd}|| }|d ur|sv||d |d |f  }n|dd}td	 }|||}| }t	j
|dd
|j}|d ur|r||d}|| ddddjdd| fS )Nrb   Fg      пr?   r   r      infr@           	start_dim)rd   shaperU   viewpermute	unsqueezeeqr,   masked_fillr*   softmaxr:   r;   flattendetach)r%   rg   rh   ri   r`   r'   rb   n_batchn_ctxrT   scalerk   	min_valuewr   r   r    re   n   s(   440$z MultiHeadAttention.qkv_attentionNNNr"   r   r   r   r   r$   r   r   dictr4   re   r5   r   r   r(   r    rS   L   0    
rS   c                
       rR   )MultiHeadAttentionSdparT   rU   c                    rV   rW   rX   r]   r(   r   r    r$      r^   zMultiHeadAttentionSdpa.__init__Nr7   r_   r`   ra   c                 K   s   | dd}| |}|d u s|d u s| j|vr/| |d u r |n|}| |d u r+|n|}	n
|| j }|| j }	| j|||	||dd\}
}| |
|fS )Nrb   F)rb   	is_causalrc   rf   r   r   r    r4      s   


zMultiHeadAttentionSdpa.forwardrg   rh   ri   c              	   K   sF  | dd}| dd}|j\}}	}
|
| j d }|jg |jd d | jdR  dddd	}|jg |jd d | jdR  dddd	}|jg |jd d | jdR  dddd	}|d uru|sld }d
}n	|dtj}tj	j
j||||d||d}|d ur||dd	 d}|dd}|jdd}|d fS )Nrb   Fr   g      r?   rl   r   r   rm   Tro   )	attn_mask	dropout_pr   r}   rp   )rd   rr   rU   rs   rt   ru   r:   rD   boolr   
functionalscaled_dot_product_attentionrw   	transposelogical_notry   )r%   rg   rh   ri   r`   r'   rb   r   r{   r|   rT   r}   attn_outputr   r   r    re      s4   000	z$MultiHeadAttentionSdpa.qkv_attentionr   r"   r   r   r   r(   r    r      r   r   )r   sdpac                
       sZ   e Zd Zddededef fddZ			dded	ee d
ee dee fddZ	  Z
S )ResidualAttentionBlockFrT   rU   cross_attentionc                    s   t    |dd}t| ||| _t|| _|r!t| ||nd | _|r*t|nd | _|d }t	
t||t	 t||| _t|| _d S )Nr   r      )r#   r$   rd   att_type_dictattnr!   attn_ln
cross_attncross_attn_lnr   
Sequentialr6   GELUmlpmlp_ln)r%   rT   rU   r   r'   r   n_mlpr(   r   r    r$      s   

 zResidualAttentionBlock.__init__Nr7   r_   r`   ra   c           	      K   s   | dd}| dd}| dd }|| j| ||||dd  }| jr6|| j| |||||dd  }|| | | }|S )Nrb   Fis_pad_memory_maskmemory_mask)r`   ra   rb   r   )rd   r   r   r   r   r   r   )	r%   r7   r_   r`   ra   r'   rb   r   r   r   r   r    r4      s&    
zResidualAttentionBlock.forward)Fr   )r   r   r   r   r   r$   r   r   r   r4   r5   r   r   r(   r    r      s    r   c                
       s@   e Zd Zdededededef
 fddZdefd	d
Z  ZS )AudioEncoderr   r|   rT   rU   n_layerc                    st   t    t|dddd| _tdddd| _| dt| t fddt	|D | _
t| _d S )Nrm   r?   r   )kernel_sizestridepaddingpositional_embeddingc              	      s"   g | ]}t  d ddqS )r   r   r   )r   rd   .0_r'   rU   rT   r   r    
<listcomp>  s    z)AudioEncoder.__init__.<locals>.<listcomp>)r#   r$   r<   conv1conv2register_bufferrQ   r   
ModuleListrangeblocksr!   ln_post)r%   r   r|   rT   rU   r   r'   r(   r   r    r$     s   
zAudioEncoder.__init__r7   c                 C   sx   t | |}t | |}|ddd}|| jd|dddf  |j}| j	D ]}||}q.| 
|}|S )zt
        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
            the mel spectrogram of the audio
        r   r?   r   N)r*   gelur   r   rt   r   sizer:   r;   r   r   )r%   r7   blockr   r   r    r4     s   (


zAudioEncoder.forward)r   r   r   r   r$   r   r4   r5   r   r   r(   r    r     s    "r   c                
       sl   e Zd Zdededededef
 fddZdd	ed
edee fddZdd Z	de
fddZdd Z  ZS )TextDecoderr   r|   rT   rU   r   c                    s   t    t|| _tt|| _t	 fddt
|D | _t| _t||tj d}| jd|dd d S )Nc                    s   g | ]
}t  d ddqS )Tr   )r   r   )r   r   rU   rT   r   r    r   6  s    z(TextDecoder.__init__.<locals>.<listcomp>r   r`   F
persistent)r#   r$   r   	Embeddingtoken_embedding	ParameterrD   emptyr   r   r   r   r!   lnfill_rB   rn   triu_r   )r%   r   r|   rT   rU   r   r'   r`   r(   r   r    r$   /  s   

zTextDecoder.__init__Nr7   r_   ra   c                 C   s   |rt t| jd nd}| || j|||jd    }||j}| jD ]}|||| j	|d}q)| 
|}|t| jj|jdd  }|S )z
        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
            the text tokens
        xa : torch.Tensor, shape = (batch_size, n_audio_ctx, n_audio_state)
            the encoded audio features to be attended on
        r   r   rl   )r`   ra   )nextitervaluesrr   r   r   r:   r;   r   r`   r   rD   r   r.   r,   )r%   r7   r_   ra   offsetr   logitsr   r   r    r4   E  s   "

"zTextDecoder.forwardc                 C   s   i }|S r"   r   )r%   r7   stater   r   r    
init_stateX  s   zTextDecoder.init_stater8   c                 C   s   dS )zScore eos (optional).

        Args:
            state: Scorer state for prefix tokens

        Returns:
            float: final score

        ro   r   )r%   r   r   r   r    final_score]  s   
zTextDecoder.final_scorec                 C   s>   | j ||d|d}tj|dd}|dddddf |fS )zScore.r   )cacherl   r@   N)r4   ru   rD   log_softmax)r%   ysr   r7   logpr   r   r    scorei  s   zTextDecoder.scorer"   )r   r   r   r   r$   r   r   r   r4   r   r,   r   r   r5   r   r   r(   r    r   .  s    "r   c                       s   e Zd Zdef fddZdefddZdejfdd	Z	d
ejdejfddZ
dejd
ejdeeejf fddZedd Zedd Zedd Zddee fddZeZeZeZ  ZS )Whisperdimsc                    s   t    || _t| jj| jj| jj| jj| jj| jj	d| _
t| jj| jj| jj| jj| jj| jj	d| _tj| jj| jjtjd}d|| jjd d < d S )Nr   r;   Tr?   )r#   r$   r   r   r   r   r   r   r   r   encoderr   r   r   r   r   r   decoderrD   zerosr   )r%   r   	all_headsr(   r   r    r$   s  s(   

zWhisper.__init__dumpc                 C   sP   t jtt|td }t	|
| jj| jj}| jd| dd d S )Nr   alignment_headsFr   )rB   
frombuffergzip
decompressbase64	b85decoder   copyrD   
from_numpyreshaper   r   r   r   	to_sparse)r%   r   arrayr`   r   r   r    set_alignment_heads  s   zWhisper.set_alignment_headsmelc                 C   s
   |  |S r"   )r   )r%   r   r   r   r    embed_audio  s   
zWhisper.embed_audiotokensaudio_featuresc                 C   s   |  ||S r"   )r   )r%   r   r   r   r   r    r     s   zWhisper.logitsr8   c                 C   s   |  || |S r"   )r   r   )r%   r   r   r   r   r    r4     s   zWhisper.forwardc                 C   s   t |  jS r"   )r   
parametersdevicer%   r   r   r    r     s   zWhisper.devicec                 C   s   | j jdkS )Ni  )r   r   r   r   r   r    is_multilingual  s   zWhisper.is_multilingualc                 C   s   | j jd t| j S )Ni5  )r   r   r   r   r   r   r   r    num_languages  s   zWhisper.num_languagesNr   c                    sP    duri  ni  g  fdddt jffdd}j|  fS )a  
        The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value
        tensors calculated for the previous positions. This method returns a dictionary that stores
        all caches, and the necessary hooks for the key and value projection modules that save the
        intermediate tensors to be reused during later calculations.

        Returns
        -------
        cache : Dict[nn.Module, torch.Tensor]
            A dictionary object mapping the key/value projection modules to its cache
        hooks : List[RemovableHandle]
            List of PyTorch RemovableHandle objects to stop the hooks to be called
        Nc                    sP   |  vs|j d jjkr| | <  |  S tj |  |gdd  | <  |  S )Nr   r@   )rr   r   r   rD   rH   rz   )moduler   r3   )r   r%   r   r    save_to_cache  s
   z5Whisper.install_kv_cache_hooks.<locals>.save_to_cachelayerc                    s6   t | tr | j  | j d S d S r"   )
isinstancerS   appendrZ   register_forward_hookr[   )r   )hooksr   r   r    install_hooks  s   
z5Whisper.install_kv_cache_hooks.<locals>.install_hooks)r   Moduler   apply)r%   r   r   r   )r   r   r   r%   r    install_kv_cache_hooks  s   zWhisper.install_kv_cache_hooksr"   )r   r   r   r   r$   bytesr   rD   r   r   r   r   r   r4   propertyr   r   r   r   r   r   detect_language_functionr
   transcribe_functionr   decode_functionr	   r5   r   r   r(   r    r   r  s     $


!r   )r>   )$r   r   dataclassesr   typingr   r   r   numpyrB   rD   torch.nn.functionalr   r   r*   r   decodingr	   r   r
   r   r   r   r   r!   r6   r<   rQ   r   rS   r   r   r   r   r   r   r   r   r   r    <module>   s4    	
	DJ,#D