o
    i                     @   sr   d dl Z d dlmZmZmZ d dlZd dlmZ d dlZd dlm	Z	 d dl
mZ eddG dd dejZdS )	    N)AnyListTuple)nn)make_pad_mask)tablesdecoder_classesOpenAIWhisperDecoderWarpc                       s   e Zd ZdZ				d dededed	ef fd
dZdej	dej	dej	dej	de
ej	ej	f f
ddZ	d!dej	dej	dej	deej	 de
ej	eej	 f f
ddZdd Zdej	dee dej	de
ej	ee f fddZ  ZS )"r	   zvTransformer-based Speech-to-Text Decoder from OpenAI's Whisper Model:

    URL: https://github.com/openai/whisper
            smallNFdropout_ratewhisper_modeldownload_diruse_padmaskc                    sf   t    |t v sJ tj||dd}t|j| _| jj	j
}tj|| _| j  ~|| _d S )Ncpu)download_rootdevice)super__init__whisperavailable_models
load_modelcopydeepcopydecoderdecoderstoken_embeddingembedding_dimtorchr   Dropoutdropouttrainr   )selfr   r   r   r   _modelattention_dim	__class__ U/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/whisper_lid/decoder.pyr      s   



z!OpenAIWhisperDecoderWarp.__init__hs_padhlens	ys_in_pad
ys_in_lensreturnc              	   C   s   ||}}| j || j jd|d  }| |}||j}| jr7t|dddddf  |j	}nd}t
| j jD ]\}	}
|
||| j j|ddd}|	t| j jd k r^| |}q?| j |}|t| j jj|jdd  }||fS )a@  Forward decoder.

        Args:
            hs_pad: encoded memory, float32  (batch, maxlen_in, feat)
            hlens: (batch)
            ys_in_pad:
                input token ids, int64 (batch, maxlen_out)
                if input_layer == "embed"
                input tensor (batch, maxlen_out, #mels) in the other cases
            ys_in_lens: (batch)
        Returns:
            (tuple): tuple containing:

            x: decoded token score before softmax (batch, maxlen_out, token)
                if use_output_layer is True,
            olens: (batch, )
        N   FT)maskmemory_maskis_pad_maskis_pad_memory_maskr   )r   r   positional_embeddingsizer    todtyper   r   r   	enumerateblocksr/   lenlnr   	transposeweightfloat)r"   r)   r*   r+   r,   tgtmemoryxr0   layerblockr'   r'   r(   forward+   s,   
"
&	
$z OpenAIWhisperDecoderWarp.forwardr>   tgt_maskr?   cachec           	      C   s   | j || j jd|d  }| |}||j}t| j jD ]\}}|||| j j	d}|t
| j jd k r>| |}q"| j |}|dddf }|t| j jj|jdd  }tj|dd}|dfS )a  Forward one step.

        Args:
            tgt: input token ids, int64 (batch, maxlen_out)
            tgt_mask: input token mask,  (batch, maxlen_out)
                      dtype=torch.uint8 in PyTorch 1.2-
                      dtype=torch.bool in PyTorch 1.2+ (include 1.2)
            memory: encoded memory, float32  (batch, maxlen_in, feat)
            cache: cached output list of (batch, max_time_out-1, size)
        Returns:
            y, cache: NN output value and cache per `self.decoders`.
            y.shape` is (batch, maxlen_out, token)
        NOTE (Shih-Lun):
            cache implementation is ignored for now
            for simplicity & correctness
        Nr.   )r/   r   )dim)r   r   r3   r4   r    r5   r6   r7   r8   r/   r9   r:   r   r;   r<   r=   log_softmax)	r"   r>   rD   r?   rE   r@   rA   rB   yr'   r'   r(   forward_one_step`   s   "

$z)OpenAIWhisperDecoderWarp.forward_one_stepc                 C   s6   | j |dtd|d|d\}}|d|fS )zScore.r   rE   )rJ   	unsqueezer   emptysqueeze)r"   ysstater@   logpr'   r'   r(   score   s   
zOpenAIWhisperDecoderWarp.scorerO   statesxsc                 C   s$   | j |td|dd\}}|dfS )a  Score new token batch.

        Args:
            ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen).
            states (List[Any]): Scorer states for prefix tokens.
            xs (torch.Tensor):
                The encoder feature that generates ys (n_batch, xlen, n_feat).

        Returns:
            tuple[torch.Tensor, List[Any]]: Tuple of
                batchfied scores for next token with shape of `(n_batch, n_vocab)`
                and next state list for ys.

        r   NrK   )rJ   r   rM   )r"   rO   rS   rT   rQ   r'   r'   r(   batch_score   s   z$OpenAIWhisperDecoderWarp.batch_score)r
   r   NF)N)__name__
__module____qualname____doc__r=   strboolr   r   Tensorr   rC   r   rJ   rR   r   rU   __classcell__r'   r'   r%   r(   r	      s`    
:
')r   typingr   r   r   r   r   r   *funasr.models.transformer.utils.nets_utilsr   funasr.registerr   registerModuler	   r'   r'   r'   r(   <module>   s   
