o
    Xi%                     @   s.  d dl mZ d dlmZ d dlmZmZ d dlZd dlZd dl	m
  mZ d dlmZ d dlm
Z
 ddlmZ dd	lmZmZ eG d
d dZG dd de
jZG dd de
jZG dd de
jZdddZG dd de
jZG dd de
jZG dd de
jZG dd de
jZG dd de
jZdS )     )	dataclass)Dict)IterableOptionalN)Tensor)nn   )
transcribe)detect_languagedecodec                   @   s^   e Zd ZU eed< eed< eed< eed< eed< eed< eed< eed< eed	< eed
< dS )ModelDimensionsn_melsn_audio_ctxn_audio_staten_audio_headn_audio_layern_vocab
n_text_ctxn_text_staten_text_headn_text_layerN)__name__
__module____qualname__int__annotations__ r   r   A/home/ubuntu/.local/lib/python3.10/site-packages/whisper/model.pyr      s   
 r   c                       s&   e Zd Zdedef fddZ  ZS )	LayerNormxreturnc                    s   t  | |jS N)superforwardfloattypedtypeselfr   	__class__r   r   r#      s   zLayerNorm.forward)r   r   r   r   r#   __classcell__r   r   r)   r   r      s    r   c                   @   s   e Zd ZdedefddZdS )Linearr   r    c                 C   s2   t || j|j| jd u rd S | j|jS r!   )Flinearweighttor&   biasr'   r   r   r   r#   #   s
   zLinear.forwardN)r   r   r   r   r#   r   r   r   r   r,   "   s    r,   c                       s2   e Zd Zdededee def fddZ  ZS )Conv1dr   r/   r1   r    c                    s.   t  |||j|d u rd S ||jS r!   )r"   _conv_forwardr0   r&   )r(   r   r/   r1   r)   r   r   r3   *   s
   
zConv1d._conv_forward)r   r   r   r   r   r3   r+   r   r   r)   r   r2   )   s    *r2   '  c                 C   s   |d dksJ t ||d d  }t| t|d  }t| ddt jf |t jddf  }tjt|t|gddS )z*Returns sinusoids for positional embedding   r   r   Ndim)	nplogtorchexparangenewaxiscatsincos)lengthchannelsmax_timescalelog_timescale_incrementinv_timescalesscaled_timer   r   r   	sinusoids0   s
   *rG   c                
       st   e Zd Zdedef fddZ			ddedee dee d	ee fd
dZddedededee fddZ	  Z
S )MultiHeadAttentionn_staten_headc                    sH   t    || _t||| _t||dd| _t||| _t||| _d S )NF)r1   )r"   __init__rJ   r,   querykeyvalueout)r(   rI   rJ   r)   r   r   rK   :   s   
zMultiHeadAttention.__init__Nr   xamaskkv_cachec           	      C   s   |  |}|d u s|d u s| j|vr)| |d u r|n|}| |d u r%|n|}n
|| j }|| j }| ||||}| |S r!   )rL   rM   rN   qkv_attentionrO   )	r(   r   rP   rQ   rR   qkvwvr   r   r   r#   B   s   



zMultiHeadAttention.forwardrT   rU   rV   c                 C   s  |j \}}}|| j d }|jg |j d d | jdR  dddd| }|jg |j d d | jdR  dddd| }|jg |j d d | jdR  dddd}|| }	|d urm|	|d |d |f  }	tj|	 dd|j}
|
| ddddj	ddS )	Ng      пr5   r   r      r6   )	start_dim)
shaperJ   viewpermuter-   softmaxr$   r0   r&   flatten)r(   rT   rU   rV   rQ   n_batchn_ctxrI   scaleqkwr   r   r   rS   X   s   440z MultiHeadAttention.qkv_attentionNNNr!   )r   r   r   r   rK   r   r   dictr#   rS   r+   r   r   r)   r   rH   9   s    
(rH   c                
       sZ   e Zd Zddededef fddZ			dded	ee d
ee dee fddZ	  Z
S )ResidualAttentionBlockFrI   rJ   cross_attentionc                    s|   t    t||| _t|| _|rt||nd | _|r t|nd | _|d }t	t
||t t
||| _t|| _d S )N   )r"   rK   rH   attnr   attn_ln
cross_attncross_attn_lnr   
Sequentialr,   GELUmlpmlp_ln)r(   rI   rJ   rh   n_mlpr)   r   r   rK   h   s   

 zResidualAttentionBlock.__init__Nr   rP   rQ   rR   c                 C   sR   || j | |||d }| jr|| j| |||d }|| | | }|S )NrQ   rR   )rR   )rj   rk   rl   rm   rp   rq   )r(   r   rP   rQ   rR   r   r   r   r#   u   s
   zResidualAttentionBlock.forward)Fre   )r   r   r   r   boolrK   r   r   rf   r#   r+   r   r   r)   r   rg   g   s    rg   c                
       s@   e Zd Zdededededef
 fddZdefd	d
Z  ZS )AudioEncoderr   ra   rI   rJ   n_layerc                    sp   t    t|ddd| _tdddd| _| dt| t fddt	|D | _
t| _d S )	NrY   r   )kernel_sizepaddingr5   )rw   striderx   positional_embeddingc                    s   g | ]}t  qS r   rg   .0_rJ   rI   r   r   
<listcomp>   s    z)AudioEncoder.__init__.<locals>.<listcomp>)r"   rK   r2   conv1conv2register_bufferrG   r   
ModuleListrangeblocksr   ln_post)r(   r   ra   rI   rJ   rv   r)   r   r   rK      s   
zAudioEncoder.__init__r   c                 C   s   t | |}t | |}|ddd}|jdd | jjks&J d|| j |j}| j	D ]}||}q2| 
|}|S )zt
        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
            the mel spectrogram of the audio
        r   r5   r   Nzincorrect audio shape)r-   gelur   r   r]   r[   rz   r0   r&   r   r   )r(   r   blockr   r   r   r#      s   


zAudioEncoder.forward)r   r   r   r   rK   r   r#   r+   r   r   r)   r   ru      s    "ru   c                
       sN   e Zd Zdededededef
 fddZdd	ed
edee fddZ  Z	S )TextDecoderr   ra   rI   rJ   rv   c                    s   t    t|| _tt|| _t	 fddt
|D | _t| _t||tj d}| jd|dd d S )Nc                    s   g | ]	}t  d dqS )T)rh   r{   r|   r   r   r   r      s    z(TextDecoder.__init__.<locals>.<listcomp>r   rQ   F)
persistent)r"   rK   r   	Embeddingtoken_embedding	Parameterr:   emptyrz   r   r   r   r   lnfill_r8   inftriu_r   )r(   r   ra   rI   rJ   rv   rQ   r)   r   r   rK      s   

zTextDecoder.__init__Nr   rP   rR   c                 C   s   |rt t| jd nd}| || j|||jd    }||j}| jD ]}|||| j	|d}q)| 
|}|t| jj|jdd  }|S )z
        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
            the text tokens
        xa : torch.Tensor, shape = (batch_size, n_mels, n_audio_ctx)
            the encoded audio features to be attended on
        r   r   rX   rs   )nextitervaluesr[   r   rz   r0   r&   r   rQ   r   r:   	transposer/   r$   )r(   r   rP   rR   offsetr   logitsr   r   r   r#      s   "

"zTextDecoder.forwardr!   )
r   r   r   r   rK   r   r   rf   r#   r+   r   r   r)   r   r      s    "$r   c                       s   e Zd Zdef fddZdejfddZdejdejfd	d
Zdejdejde	e
ejf fddZedd Zedd Zddee fddZeZeZeZ  ZS )Whisperdimsc                    s`   t    || _t| jj| jj| jj| jj| jj| _	t
| jj| jj| jj| jj| jj| _d S r!   )r"   rK   r   ru   r   r   r   r   r   encoderr   r   r   r   r   r   decoder)r(   r   r)   r   r   rK      s    

zWhisper.__init__melc                 C   s
   |  |S r!   )r   )r(   r   r   r   r   embed_audio   s   
zWhisper.embed_audiotokensaudio_featuresc                 C   s   |  ||S r!   )r   )r(   r   r   r   r   r   r      s   zWhisper.logitsr    c                 C   s   |  || |S r!   )r   r   )r(   r   r   r   r   r   r#      s   zWhisper.forwardc                 C   s   t |  jS r!   )r   
parametersdevicer(   r   r   r   r      s   zWhisper.devicec                 C   s   | j jdkS )Ni  )r   r   r   r   r   r   is_multilingual   s   zWhisper.is_multilingualNcachec                    sP    duri  ni  g  fdddt jffdd}j|  fS )a  
        The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value
        tensors calculated for the previous positions. This method returns a dictionary that stores
        all caches, and the necessary hooks for the key and value projection modules that save the
        intermediate tensors to be reused during later calculations.

        Returns
        -------
        cache : Dict[nn.Module, torch.Tensor]
            A dictionary object mapping the key/value projection modules to its cache
        hooks : List[RemovableHandle]
            List of PyTorch RemovableHandle objects to stop the hooks to be called
        Nc                    sV   |  vs|j d jjj d kr| | <  |  S tj |  |gdd  | <  |  S )Nr   r   r6   )r[   r   rz   r:   r>   detach)moduler~   output)r   r(   r   r   save_to_cache   s
    z5Whisper.install_kv_cache_hooks.<locals>.save_to_cachelayerc                    s6   t | tr | j  | j d S d S r!   )
isinstancerH   appendrM   register_forward_hookrN   )r   )hooksr   r   r   install_hooks  s   
z5Whisper.install_kv_cache_hooks.<locals>.install_hooks)r   Moduler   apply)r(   r   r   r   )r   r   r   r(   r   install_kv_cache_hooks   s   zWhisper.install_kv_cache_hooksr!   )r   r   r   r   rK   r:   r   r   r   r   strr#   propertyr   r   r   rf   r   detect_language_functionr
   transcribe_functionr	   decode_functionr   r+   r   r   r)   r   r      s    $

 r   )r4   ) dataclassesr   typingr   r   r   numpyr8   r:   torch.nn.functionalr   
functionalr-   r   r	   r   decodingr
   r   r   r   r   r   r,   r2   rG   r   rH   rg   ru   r   r   r   r   r   r   <module>   s*    
	.#