o
    iL                     @   s>  d Z ddlmZmZmZmZ ddlZddlmZ ddl	m
Z
 ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$ G dd de
e$Z%G dd de%Z&G dd de%Z'G dd de%Z(G dd de%Z)G dd de%Z*dS )zDecoder definition.    )AnyListSequenceTupleN)check_argument_types)
AbsDecoder)make_pad_mask)MultiHeadedAttention)DecoderLayer)DynamicConvolution)DynamicConvolution2D)PositionalEncoding)	LayerNorm)LightweightConvolution)LightweightConvolution2D)subsequent_mask)PositionwiseFeedForward)repeat)BatchScorerInterfacec                       s   e Zd ZdZddddedfdedededed	ed
edef fddZ	de
jde
jde
jde
jdee
je
jf f
ddZ	d#de
jde
jde
jdee
j dee
jee
j f f
ddZdd Zde
jdee d e
jdee
jee f fd!d"Z  ZS )$BaseTransformerDecodera  Base class of Transfomer decoder module.

    Args:
        vocab_size: output dim
        encoder_output_size: dimension of attention
        attention_heads: the number of heads of multi head attention
        linear_units: the number of units of position-wise feed forward
        num_blocks: the number of decoder blocks
        dropout_rate: dropout rate
        self_attention_dropout_rate: dropout rate for attention
        input_layer: input layer type
        use_output_layer: whether to use output layer
        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
        normalize_before: whether to use layer_norm before the first block
        concat_after: whether to concat attention layer's input and output
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied.
            i.e. x -> x + att(x)
    皙?embedT
vocab_sizeencoder_output_sizedropout_ratepositional_dropout_rateinput_layeruse_output_layernormalize_beforec	           
   	      s   t  sJ t   |}	|dkr!tjtj||	||	|| _n*|dkrDtjtj||	tj	|	tj
|tj ||	|| _ntd| || _| jrVt	|	| _|ratj|	|| _nd | _d | _d S )Nr   linearz'only 'embed' or 'linear' is supported: )r   super__init__torchnn
Sequential	Embeddingr   Linearr   DropoutReLU
ValueErrorr   
after_normoutput_layerdecoders)
selfr   r   r   r   r   r   pos_enc_classr   attention_dim	__class__ [/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/asr/decoder/transformer_decoder.pyr!   2   s0   





zBaseTransformerDecoder.__init__hs_padhlens	ys_in_pad
ys_in_lensreturnc                 C   s  |}t |dddddf  |j}t|d|jdd}||@ }|}t ||dd dddddf |j}	|	jd |jd kr^|jd |	jd  }
tjj	
|	d|
fdd}	| |}| ||||	\}}}}	| jrw| |}| jdur| |}|d}||fS )	a@  Forward decoder.

        Args:
            hs_pad: encoded memory, float32  (batch, maxlen_in, feat)
            hlens: (batch)
            ys_in_pad:
                input token ids, int64 (batch, maxlen_out)
                if input_layer == "embed"
                input tensor (batch, maxlen_out, #mels) in the other cases
            ys_in_lens: (batch)
        Returns:
            (tuple): tuple containing:

            x: decoded token score before softmax (batch, maxlen_out, token)
                if use_output_layer is True,
            olens: (batch, )
        Ndevicer      )maxlenconstantF)r   tor;   r   size	unsqueezeshaper"   r#   
functionalpadr   r,   r   r*   r+   sum)r-   r4   r5   r6   r7   tgttgt_maskmmemorymemory_maskpadlenxolensr2   r2   r3   forward\   s.   $&




zBaseTransformerDecoder.forwardNrF   rG   rI   cachec                 C   s   |  |}|du rdgt| j }g }t|| jD ]\}}||||d|d\}}}}	|| q| jr?| |dddf }
n|dddf }
| jdurVtj	| |
dd}
|
|fS )a5  Forward one step.

        Args:
            tgt: input token ids, int64 (batch, maxlen_out)
            tgt_mask: input token mask,  (batch, maxlen_out)
                      dtype=torch.uint8 in PyTorch 1.2-
                      dtype=torch.bool in PyTorch 1.2+ (include 1.2)
            memory: encoded memory, float32  (batch, maxlen_in, feat)
            cache: cached output list of (batch, max_time_out-1, size)
        Returns:
            y, cache: NN output value and cache per `self.decoders`.
            y.shape` is (batch, maxlen_out, token)
        NrO   r9   )dim)
r   lenr,   zipappendr   r*   r+   r"   log_softmax)r-   rF   rG   rI   rO   rL   	new_cachecdecoderrJ   yr2   r2   r3   forward_one_step   s   


z'BaseTransformerDecoder.forward_one_stepc                 C   sH   t t||jdd}| j|d||d|d\}}|d|fS )zScore.r:   r   rP   )r   rR   r;   rA   rZ   squeeze)r-   ysstaterL   ys_masklogpr2   r2   r3   score   s
   
zBaseTransformerDecoder.scorer\   statesxsc                    s   t | t | jd du rd}n fddtD }t|d|jdd}| j||||d\}fddt D }||fS )	a  Score new token batch.

        Args:
            ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen).
            states (List[Any]): Scorer states for prefix tokens.
            xs (torch.Tensor):
                The encoder feature that generates ys (n_batch, xlen, n_feat).

        Returns:
            tuple[torch.Tensor, List[Any]]: Tuple of
                batchfied scores for next token with shape of `(n_batch, n_vocab)`
                and next state list for ys.

        r   Nc                    s*   g | ] t  fd dtD qS )c                       g | ]}|   qS r2   r2   ).0b)ira   r2   r3   
<listcomp>       ABaseTransformerDecoder.batch_score.<locals>.<listcomp>.<listcomp>)r"   stackrangerd   )n_batchra   )rf   r3   rg      s    z6BaseTransformerDecoder.batch_score.<locals>.<listcomp>r9   r:   rP   c                    s$   g | ]  fd dt D qS )c                    rc   r2   r2   )rd   rf   )re   ra   r2   r3   rg      rh   ri   )rk   rl   )n_layersra   )re   r3   rg      s   $ )rR   r,   rk   r   r@   r;   rA   rZ   )r-   r\   ra   rb   batch_stater^   r_   
state_listr2   )rm   rn   ra   r3   batch_score   s   
z"BaseTransformerDecoder.batch_scoreN)__name__
__module____qualname____doc__r   intfloatstrboolr!   r"   Tensorr   rN   r   rZ   r`   r   rq   __classcell__r2   r2   r0   r3   r      sp    	*
<
'r   c                       sl   e Zd Zdddddddddeddfd	ed
edededededededededededef fddZ  Z	S )TransformerDecoder         r           r   TFr   r   attention_headslinear_units
num_blocksr   r   self_attention_dropout_ratesrc_attention_dropout_rater   r   r   concat_afterc              
      sP   t  sJ t j||||
||d | t| fdd| _d S )Nr   r   r   r   r   r   r.   r   c                    s,   t  t t t S rr   )r
   r	   r   lnumr/   r   r   r   r   r   r   r   r2   r3   <lambda>	  s    
z-TransformerDecoder.__init__.<locals>.<lambda>)r   r    r!   r   r,   )r-   r   r   r   r   r   r   r   r   r   r   r   r.   r   r   r0   r   r3   r!      s    

zTransformerDecoder.__init__)
rs   rt   ru   r   rw   rx   ry   rz   r!   r|   r2   r2   r0   r3   r}      sP    	
r}   c                !          e Zd Zdddddddddedddd	dfd
ededededededededededededededee def  fddZ	  Z
S )(LightweightConvolutionTransformerDecoderr~   r   r   r   r   r   TF   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   conv_wshareconv_kernel_lengthconv_usebiasc                    z   t  sJ t|krtdt d| t j||||
||d | t| 	
fdd| _d S )NCconv_kernel_length must have equal number of values to num_blocks:  != r   c              
      8   t  t 	|  ddt 
t S NT)wsharen_featr   kernel_sizeuse_kernel_maskuse_bias)r
   r   r	   r   r   r/   r   r   r   r   r   r   r   r   r   r   r2   r3   r   B  $    
zCLightweightConvolutionTransformerDecoder.__init__.<locals>.<lambda>r   rR   r)   r    r!   r   r,   r-   r   r   r   r   r   r   r   r   r   r   r   r.   r   r   r   r   r   r0   r   r3   r!     0   

z1LightweightConvolutionTransformerDecoder.__init__rs   rt   ru   r   rw   rx   ry   rz   r   r!   r|   r2   r2   r0   r3   r     b    	
r   c                !       r   )*LightweightConvolution2DTransformerDecoderr~   r   r   r   r   r   TFr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   c                    r   )Nr   r   r   c              
      r   r   )r
   r   r	   r   r   r   r2   r3   r     r   zELightweightConvolution2DTransformerDecoder.__init__.<locals>.<lambda>r   r   r0   r   r3   r!   X  r   z3LightweightConvolution2DTransformerDecoder.__init__r   r2   r2   r0   r3   r   W  r   r   c                !       r   )$DynamicConvolutionTransformerDecoderr~   r   r   r   r   r   TFr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   c                    r   )Nr   r   r   c              
      r   r   )r
   r   r	   r   r   r   r2   r3   r     r   z?DynamicConvolutionTransformerDecoder.__init__.<locals>.<lambda>r   r   r0   r   r3   r!     0   


z-DynamicConvolutionTransformerDecoder.__init__r   r2   r2   r0   r3   r     r   r   c                !       r   )&DynamicConvolution2DTransformerDecoderr~   r   r   r   r   r   TFr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   c                    r   )Nr   r   r   c              
      r   r   )r
   r   r	   r   r   r   r2   r3   r     r   zADynamicConvolution2DTransformerDecoder.__init__.<locals>.<lambda>r   r   r0   r   r3   r!     r   z/DynamicConvolution2DTransformerDecoder.__init__r   r2   r2   r0   r3   r     r   r   )+rv   typingr   r   r   r   r"   	typeguardr   espnet2.asr.decoder.abs_decoderr   &espnet.nets.pytorch_backend.nets_utilsr   1espnet.nets.pytorch_backend.transformer.attentionr	   5espnet.nets.pytorch_backend.transformer.decoder_layerr
   4espnet.nets.pytorch_backend.transformer.dynamic_convr   6espnet.nets.pytorch_backend.transformer.dynamic_conv2dr   1espnet.nets.pytorch_backend.transformer.embeddingr   2espnet.nets.pytorch_backend.transformer.layer_normr   1espnet.nets.pytorch_backend.transformer.lightconvr   3espnet.nets.pytorch_backend.transformer.lightconv2dr   ,espnet.nets.pytorch_backend.transformer.maskr   Aespnet.nets.pytorch_backend.transformer.positionwise_feed_forwardr   .espnet.nets.pytorch_backend.transformer.repeatr   espnet.nets.scorer_interfacer   r   r}   r   r   r   r   r2   r2   r2   r3   <module>   s2    M1>>>