o
    :i                     @   s  d dl Z d dlZd dlZd dlZd dlmZmZmZ d dlZd dlm	Z	m
Z
 ej r.dZnejj r7dZndZd dlmZmZmZmZmZmZmZmZmZmZmZmZmZ d+d	d
ZG dd de
jZ d,de!de!defddZ"G dd de
jZ#G dd de
jZ$G dd de
jZ%G dd de
jZ&G dd dej
jZ'G dd dej
jZ(G dd  d ej
jZ)G d!d" d"e
jZ*G d#d$ d$e
jZ+G d%d& d&e
jZ,G d'd( d(e
jZ-G d)d* d*e
jZ.dS )-    N)OptionalTupleUnion)Tensornncudampscpu)ActivationDropoutAndLinearBalancerBiasNormDropout2	FloatLikeIdentityScaledLinearScheduledFloatSwooshRWhitenlimit_param_valuepenalize_abs_values_gtsoftmax'  c              	   C   s   |d }t t| t jd|t j| jd | }|  dkr&| dd} | d 	 |d  }t j
t |t |gdd}|d rUt j
|t |d	ddf gdd}|S )
a!  Create sinusoidal timestep embeddings.

    :param timesteps: shape of (N) or (N, T)
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :return: an Tensor of positional embeddings. shape of (N, dim) or (T, N, dim)
       r   )startenddtypedevice   ).NNdim.)torchexpmathlogarangefloat32r   r    	transposefloatcatcossin
zeros_like)	timestepsr    
max_periodhalffreqsargs	embedding r3   8/home/ubuntu/LuxTTS/zipvoice/models/modules/zipformer.pytimestep_embedding6   s   
$r5   c                )       s   e Zd ZdZ											
				
		
	d+dededeeee f deeee f deeee f dededededededededede	dede	d ed!e	d"df( fd#d$Z
			d,d%ed&ee d'ee d(ee d"eeef f
d)d*Z  ZS )-TTSZipformera  
    Args:

    Note: all "int or Tuple[int]" arguments below will be treated as lists of the same
    length as downsampling_factor if they are single ints or one-element tuples.
    The length of downsampling_factor defines the number of stacks.

        downsampling_factor (Tuple[int]): downsampling factor for each encoder stack.
           Note: this is in addition to the downsampling factor of 2 that is applied in
           the frontend (self.encoder_embed).
        encoder_dim (Tuple[int]): embedding dimension of each of the encoder stacks,
            one per encoder stack.
        num_encoder_layers (int or Tuple[int])): number of encoder layers for each stack
        query_head_dim (int or Tuple[int]): dimension of query and key per attention
           head: per stack, if a tuple..
        pos_head_dim (int or Tuple[int]): dimension of positional-encoding projection
            per attention head
        value_head_dim (int or Tuple[int]): dimension of value in each attention head
        num_heads: (int or Tuple[int]): number of heads in the self-attention mechanism.
              Must be at least 4.
        feedforward_dim (int or Tuple[int]): hidden dimension in feedforward modules
        cnn_module_kernel (int or Tuple[int])): Kernel size of convolution module

        pos_dim (int): the dimension of each positional-encoding vector prior to
            projection, e.g. 128.

        dropout (float): dropout rate
        warmup_batches (float): number of batches to warm up over; this controls
          dropout of encoder layers.
        use_time_embed: (bool): if True, take time embedding as an additional input.
        time_embed_dim: (int): the dimension of the time embedding.
        use_guidance_scale_embed (bool): if True, take guidance scale embedding as
            an additional input.
        guidance_scale_embed_dim: (int): the dimension of the guidance scale embedding.
    r      r8                       N     @@TFin_dimout_dimdownsampling_factornum_encoder_layerscnn_module_kernelencoder_dimquery_head_dimpos_head_dimvalue_head_dim	num_headsfeedforward_dimpos_dimdropoutwarmup_batchesuse_time_embedtime_embed_dimuse_guidance_scale_embedguidance_scale_embed_dimuse_convreturnc                    s  t t|   |d u rtdd}t tr f  fdd}dd }|   | _||}|| | _}|| _|| _	|| _
|	| _|
| _|| _|| _|| _| jrV|dksUJ nd}|| _t||| _t||| _g }t }t|D ]J}t|||
|||	|||| |d
}t||| |||||d	  |d	  ||d
  |d	  d | d  d} | d	krt|| | d}|| qst|| _| jrtt||d
 t t|d
 || _nd | _| jrt ||ddd| _!d S d | _!d S )N)        333333?)     @皙?c                    sR   t | tr| f} t| dkr| t  } | S t| t kr%t | d ts'J | S )zoConverts a single int or a 1-tuple of an int to a tuple with the same
            length as downsampling_factorr   r   )
isinstanceintlen)xrC   r3   r4   	_to_tuple   s   
"z(TTSZipformer.__init__.<locals>._to_tuplec                 S   s   | d dkr| d dksJ t dt| d d D ]}| | | |d  d ks)J qt t| d d t| D ]}| | d | |d  ksGJ q7dS )z.assert downsampling_factor follows u-net styler   r   r   r   N)ranger[   )factorsir3   r3   r4   _assert_downsampling_factor   s   z:TTSZipformer.__init__.<locals>._assert_downsampling_factorr   )
	embed_dimrL   rJ   rG   rH   rI   rK   rS   rE   rM   r   r   gQ?      ?)rc   rP   rL   warmup_begin
warmup_endfinal_layerdrop_rate)r    
downsampleFrX   biasinitial_scale)"superr6   __init__r   rY   rZ   rC   rE   rF   rD   rG   rI   rJ   rO   rQ   rP   rR   r   Linearin_projout_projr[   r_   Zipformer2EncoderLayerZipformer2EncoderDownsampledZipformer2Encoderappend
ModuleListencoders
Sequentialr   
time_embedr   guidance_scale_embed)selfrA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   r^   rb   rv   num_encodersra   encoder_layerencoder	__class__r]   r4   rm   t   s   



zTTSZipformer.__init__r\   tpadding_maskguidance_scalec           
      C   s   | ddd}| |}|durO| dks!| dks!J |jt|| j}|durI| dks<| dks<J |j| t|| j}|| }| |}nd}d}t	| j
D ]\}}	|	||||d}qX| |}| ddd}|S )a>  
        Args:
          x:
            The input tensor. Its shape is (batch_size, seq_len, feature_dim).
          t:
            A t tensor of shape (batch_size,) or (batch_size, seq_len)
          padding_mask:
            The mask for padding, of shape (batch_size, seq_len); True means
            masked position. May be None.
          guidance_scale:
            The guidance scale in classifier-free guidance of distillation model.
        Returns:
          Return the output embeddings. its shape is
            (batch_size, output_seq_len, encoder_dim)
        r   r   r   N)time_embsrc_key_padding_mask	attn_mask)permutero   r    shaper5   rP   ry   rR   rx   	enumeraterv   rp   )
rz   r\   r   r   r   r   guidance_scale_embr   ra   moduler3   r3   r4   forward   s4   
"

zTTSZipformer.forward)r7   r8   r9   r:   r;   r8   r<   r=   r>   r?   Nr@   Tr?   Fr?   TNNN)__name__
__module____qualname____doc__rZ   r   r   r   r(   boolrm   r   r   r   __classcell__r3   r3   r~   r4   r6   O   s    (	
 	
r6          @r\   ratiorT   c                 C   s   t d| fd||  f| dS )NrU   rW   default)r   )r\   r   r3   r3   r4   _whitening_schedule/  s   r   c                #       s  e Zd ZdZdddeddddd	eddddd	ed
ddd	edddedddedddd	f	dededededededededededededededed ed!d"f" fd#d$Zd%e	d&e
d!ee	 fd'd(Zd%e	d&e
d!e	fd)d*Z	"	"	"d2d+e	d,e	d-ee	 d.ee	 d/ee	 d!e	fd0d1Z  ZS )3rq   ac  
    Args:
        embed_dim: the number of expected features in the input (required).
        nhead: the number of heads in the multiheadattention models (required).
        feedforward_dim: the dimension of the feedforward network model (required).
        dropout: the dropout value (default=0.1).
        cnn_module_kernel (int): Kernel size of convolution module (default=31).

    Examples::
        >>> encoder_layer = Zipformer2EncoderLayer(embed_dim=512, nhead=8)
        >>> src = torch.rand(10, 32, 512)
        >>> pos_emb = torch.rand(32, 19, 512)
        >>> out = encoder_layer(src, pos_emb)
    rX   r9   TrU   皙?)r@   皙?)i>  rU   r   r   rU         ?)r@   皙?rU   rX   )r@   {Gz?)g     j@rU   rU   rd   r@   g{Gz?rc   rL   rJ   rG   rH   rI   rK   rM   rE   rS   attention_skip_rateconv_skip_rateconst_attention_rateff2_skip_rateff3_skip_ratebypass_skip_raterT   Nc              
      s  t t|   || _t||dd| _t|dd| _t|| _	t|| _
t|| _t|| _t|| _t|||||dd| _t|||| _t|||| _t||d d || _t|||| _t||d d || _t|d| d d	| _|
| _| jrt||	| _t||	| _t|| _t|d
ddddd| _ t|d
ddt!dddd| _"t|d
ddt!ddddddd| _#t|d
ddt!ddddddd| _$t%dt&ddddd d!| _'t|d
ddd"dd| _(d S )#Nr   )	skip_ratestraight_through_rater   rU   )rL   rJ   rG   rH   rM      r8      )hidden_channelsr   g?g?r   g      @channel_dimmin_positivemax_positivemin_absmax_absrV   gffffff?)rU   gMbp?r   r   )r   r   r   r   prob)rU   rU   )r@   rX   r   r   r   r   r   r   r   r   )r@   r   r         @r   r   r   r   
num_groupswhitening_limitr   
grad_scalerX   ))rl   rq   rm   rc   BypassModulebypass
bypass_midcopydeepcopyr   r   r   r   r   $RelPositionMultiheadAttentionWeightsself_attn_weightsSelfAttention
self_attn1
self_attn2FeedforwardModulefeed_forward1feed_forward2feed_forward3NonlinAttentionnonlin_attentionrS   ConvolutionModuleconv_module1conv_module2r   normr   	balancer1r   balancer_nabalancer_ff2balancer_ff3r   r   whiten	balancer2)rz   rc   rL   rJ   rG   rH   rI   rK   rM   rE   rS   r   r   r   r   r   r   r~   r3   r4   rm   C  s   	




zZipformer2EncoderLayer.__init__r\   dropout_ratec                 C   sR   |dks| j rtj stj rd S |jd }tj|d|jd|k|j	}|S )NrU   r   r   )
trainingr!   jitis_scripting
is_tracingr   randr   tor   )rz   r\   r   
batch_sizemaskr3   r3   r4   get_sequence_dropout_mask  s   
z0Zipformer2EncoderLayer.get_sequence_dropout_maskc                 C   s    |  ||}|du r|S || S )zf
        Apply sequence-level dropout to x.
        x shape: (seq_len, batch_size, embed_dim)
        N)r   )rz   r\   r   dropout_maskr3   r3   r4   sequence_dropout  s   z'Zipformer2EncoderLayer.sequence_dropoutsrcpos_embr   r   r   c                 C   s  |}t j st j rd}n
| jrt| jnd}| j||||d}|dur*|| }|| | }| 	||}	|dd }
t j sGt j rHn%| jrmt

 t| jk rm|
dd }
|
dk|
j}
|
d|
jddd	  }
| | ||
}||	du r}|n||	  }| ||}||	du r|n||	  }| jrt j st j rd}n
| jrt| jnd}|dur|| }|| | j||d
| }t j st j rd}n
| jrt| jnd}|| | | || }| ||}| ||}||	du r|n||	  }| jr4t j st j rd}n| jrt| jnd}|dur'|| }|| | j||d
| }t j s@t j rCd}n| jrLt| jnd}|| | | || }| |}|  |}| !||}| "|}| #|}|S )a  
        Pass the input through the encoder layer.
        Args:
          src: the sequence to the encoder (required):
            shape (seq_len, batch_size, embedding_dim).
          pos_emb: (1, 2*seq_len-1, pos_emb_dim) or
            (batch_size, 2*seq_len-1, pos_emb_dim)
          time_emb: the embedding representing the current timestep
            shape (batch_size, embedding_dim) or (seq_len, batch_size, embedding_dim).
          attn_mask: the attention mask, of shape (batch_size, seq_len, seq_len)
            or (seq_len, seq_len), interpreted as (batch_size, tgt_seq_len, src_seq_len)
            or (tgt_seq_len, src_seq_len). True means masked position. May be None.
          src_key_padding_mask:  the mask for padding, of shape (batch_size, seq_len);
            True means masked position.  May be None.

        Returns:
           A tensor which has the same shape as src
        rU   )r   r   key_padding_maskNr   r         ?r   T)r    keepdim)r   )$r!   r   r   r   r   r(   r   r   r   r   randomr   r   r   sumr   r   r   rS   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )rz   r   r   r   r   r   src_origr   attn_weightsself_attn_dropout_maskselected_attn_weightsna	self_attnr   r   r   r3   r3   r4   r     s   




zZipformer2EncoderLayer.forwardr   )r   r   r   r   r   rZ   r   r   rm   r   r(   r   r   r   r   r   r3   r3   r~   r4   rq   3  s    	
 
rq   c                       s   e Zd ZdZ		ddejdedededed	ed
edededdf fddZ			dde	de
e	 de
e	 de
e	 de	f
ddZ  ZS )rr   a&  Zipformer2Encoder is a stack of N encoder layers

    Args:
        encoder_layer: an instance of the Zipformer2EncoderLayer() class (required).
        num_layers: the number of sub-encoder-layers in the encoder (required).
        pos_dim: the dimension for the relative positional encoding

    Examples::
        >>> encoder_layer = Zipformer2EncoderLayer(embed_dim=512, nhead=8)
        >>> zipformer_encoder = Zipformer2Encoder(encoder_layer, num_layers=6)
        >>> src = torch.rand(10, 32, 512)
        >>> out = zipformer_encoder(src)
    rd   r   r|   
num_layersrc   rP   rL   re   rf   initial_layerdrop_raterg   rT   Nc
                    s   t    t|ddd| _|dkrtt t||| _nd | _t	 fddt
|D | _|| _d|  kr>|ksAJ  J d| ||  }
|}t
|D ]}||
 }t||f||	fdd	| j| j_|}qOd S )
Ng333333?r   )r   length_factorr   c                    s   g | ]}t  qS r3   )r   r   ).0ra   r|   r3   r4   
<listcomp>  s    z.Zipformer2Encoder.__init__.<locals>.<listcomp>r   rU   r   )rl   rm   CompactRelPositionalEncodingencoder_posr   rw   r   rn   r   ru   r_   layersr   r   r   r   )rz   r|   r   rc   rP   rL   re   rf   r   rg   delta	cur_beginra   cur_endr~   r   r4   rm     s4   

zZipformer2Encoder.__init__r   r   r   r   c           	      C   sf   |  |}| jdur|dusJ | |}n|du sJ |}t| jD ]\}}||||||d}q#|S )aU  Pass the input through the encoder layers in turn.

        Args:
            src: the sequence to the encoder (required):
                shape (seq_len, batch_size, embedding_dim).
            time_emb: the embedding representing the current timestep:
                shape  (batch_size, embedding_dim)
                or (seq_len, batch_size, embedding_dim) .
            attn_mask: the attention mask, of shape (batch_size, seq_len, seq_len)
                or (seq_len, seq_len), interpreted as
                (batch_size, tgt_seq_len, src_seq_len) or (tgt_seq_len, src_seq_len).
                True means masked position. May be None.
            src_key_padding_mask:  the mask for padding, of shape (batch_size, seq_len);
                True means masked position.  May be None.

        Returns: a Tensor with the same shape as src.
        Nr   r   r   )r   r   r   r   )	rz   r   r   r   r   r   outputra   modr3   r3   r4   r     s   

zZipformer2Encoder.forward)rd   r   r   )r   r   r   r   r   ModulerZ   r(   rm   r   r   r   r   r3   r3   r~   r4   rr     sN    	
-rr   c                       sj   e Zd ZdZddedddddfded	ed
ededef
 fddZdefddZde	de	fddZ
  ZS )r   aA  
    An nn.Module that implements a learnable bypass scale, and also randomized
    per-sequence layer-skipping.  The bypass is limited during early stages of training
    to be close to "straight-through", i.e. to not do the bypass operation much
    initially, in order to force all the modules to learn something.
    rU   )rU   g?)rW   r   r   r   r   rc   r   r   	scale_min	scale_maxc                    sT   t    tt|fd| _t|| _	t|| _
t|| _t|| _d S )Nrd   )rl   rm   r   	Parameterr!   fullbypass_scaler   r   r   r   r  r  )rz   rc   r   r   r  r  r~   r3   r4   rm     s   
zBypassModule.__init__r   c                 C   s   t j st j s| js| jS t| jt| jt| j	d}t| j
}|dkr7t j|df|jd|k}|| }t| j}|dkrVt j|df|jd|k }t |||j}|S )N)minmaxrU   r   r   )r!   r   r   r   r   r  r   r(   r  r  r   r   r   r   maximumr   r   )rz   r   ansr   r   r   r3   r3   r4   _get_bypass_scale	  s&   

zBypassModule._get_bypass_scaler   r   c                 C   s    |  |jd }||| |  S )z
        Args: src_orig and src are both of shape (seq_len, batch_size, num_channels)
        Returns: something with the same shape as src and src_orig
        r   )r
  r   )rz   r   r   r  r3   r3   r4   r   %  s   zBypassModule.forward)r   r   r   r   r   rZ   r   rm   r
  r   r   r   r3   r3   r~   r4   r     s&    
r   c                       sb   e Zd ZdZdejdedef fddZ			dded	e	e d
e	e de	e def
ddZ
  ZS )rs   a
  
    DownsampledZipformer2Encoder is a zipformer encoder evaluated at a reduced frame
    rate, after convolutional downsampling, and then upsampled again at the output, and
    combined with the origin input, so that the output has the same shape as the input.
    r}   r    rh   c                    sH   t t|   || _t|| _|j| _|| _t|| _	t
|dd| _d S )Nr   r   )rl   rs   rm   downsample_factorSimpleDownsamplerh   r   r}   SimpleUpsampleupsampler   out_combiner)rz   r}   r    rh   r~   r3   r4   rm   5  s   

z%DownsampledZipformer2Encoder.__init__Nr   r   r   r   rT   c                 C   s   |}|  |}| j}|dur| dkr|dd| }|dur+|dd|dd|f }|dur8|ddd|f }| j||||d}| |}|d|jd  }| ||S )a  Downsample, go through encoder, upsample.

        Args:
            src: the sequence to the encoder (required):
                shape (seq_len, batch_size, embedding_dim).
            time_emb: the embedding representing the current timestep:
                shape  (batch_size, embedding_dim)
                or (seq_len, batch_size, embedding_dim) .
            feature_mask: something that broadcasts with src, that we'll multiply `src`
                by at every layer: if a Tensor, likely of shape
                (seq_len, batch_size, embedding_dim)
            attn_mask: the attention mask, of shape (batch_size, seq_len, seq_len)
                or (seq_len, seq_len), interpreted as
                (batch_size, tgt_seq_len, src_seq_len) or (tgt_seq_len, src_seq_len).
                True means masked position. May be None.
            src_key_padding_mask:  the mask for padding, of shape (batch_size, seq_len);
                True means masked position.  May be None.

        Returns: a Tensor with the same shape as src.
        Nr   .r   r   )rh   r  r    r}   r  r   r  )rz   r   r   r   r   r   dsr3   r3   r4   r   >  s$   

z$DownsampledZipformer2Encoder.forwardr   )r   r   r   r   r   r   rZ   rm   r   r   r   r   r3   r3   r~   r4   rs   .  s"    rs   c                       8   e Zd ZdZdef fddZdedefddZ  ZS )	r  z<
    Does downsampling with attention, by weighted sum.
    rh   c                    s0   t t|   tt|| _d | _|| _	d S N)
rl   r  rm   r   r  r!   zerosrj   namerh   )rz   rh   r~   r3   r4   rm   u  s   
zSimpleDownsample.__init__r   rT   c                 C   s   |j \}}}| j}|| d | }|| | }||j d d d ||j d |j d }tj||fdd}|j d || ksAJ |||||}| jjdd}	|	dd}	||	 j	dd}
|
S )z
        x: (seq_len, batch_size, in_channels)
        Returns a tensor of shape
           ( (seq_len+downsample-1)//downsample, batch_size, channels)
        r   r   Nr   r   r   )
r   rh   expandr!   r)   reshaperj   r   	unsqueezer   )rz   r   seq_lenr   in_channelsr  	d_seq_lenpad	src_extraweightsr	  r3   r3   r4   r   ~  s   ,zSimpleDownsample.forward	r   r   r   r   rZ   rm   r   r   r   r3   r3   r~   r4   r  p  s    	r  c                       r  )	r  zG
    A very simple form of upsampling that just repeats the input.
    r  c                    s   t t|   || _d S r  )rl   r  rm   r  )rz   r  r~   r3   r4   rm     s   
zSimpleUpsample.__init__r   rT   c                 C   s>   | j }|j\}}}|d||||}||| ||}|S )z
        x: (seq_len, batch_size, num_channels)
        Returns a tensor of shape
           ( (seq_len*upsample), batch_size, num_channels)
        r   )r  r   r  r  r  )rz   r   r  r  r   num_channelsr3   r3   r4   r     s
   zSimpleUpsample.forwardr  r3   r3   r~   r4   r    s    r  c                       sl   e Zd ZdZ		ddededededd	f
 fd
dZddededd	fddZ	ddededefddZ
  ZS )r   a  
    Relative positional encoding module.  This version is "compact" meaning it is able
    to encode the important information about the relative position in a relatively
    small number of dimensions. The goal is to make it so that small differences between
    large relative offsets (e.g. 1000 vs. 1001) make very little difference to the
    embedding.   Such differences were potentially important when encoding absolute
    position, but not important when encoding relative position because there is now no
    need to compare two large offsets with each other.

    Our embedding works by projecting the interval [-infinity,infinity] to a finite
    interval using the atan() function, before doing the Fourier transform of that fixed
    interval.  The atan() function would compress the "long tails" too small, making it
    hard to distinguish between different magnitudes of large offsets, so we use a
    logarithmic function to compress large offsets to a smaller range before applying
    atan(). Scalings are chosen in such a way that the embedding can clearly distinguish
    individual offsets as long as they are quite close to the origin, e.g. abs(offset)
    <= about sqrt(embedding_dim)


    Args:
        embed_dim: Embedding dimension.
        dropout_rate: Dropout rate.
        max_len: Maximum input length: just a heuristic for initialization.
        length_factor: a heuristic scale (should be >= 1.0) which, if larger, gives
           less weight to small differences of offset near the origin.
      r   rc   r   max_lenr   rT   Nc                    sh   t t|   || _|d dksJ |t|| _d| _|dks$J ||| _| t	
d| dS )z0Construct a CompactRelPositionalEncoding object.r   r   Nr   rU   )rl   r   rm   rc   r   rM   per   	extend_per!   tensorr  )rz   rc   r   r!  r   r~   r3   r4   rm     s   
z%CompactRelPositionalEncoding.__init__r   r\   left_context_lenc                 C   s^  | d| }| jdur%| j d|d d kr%| jj|j|jd| _dS tj|d  ||jdtjd}dtj| j	d |jd }| j	d }||
  | |  t|  }| j| j	 dtj  }||  }||  }	||  }
tj|jd | j	|jd}|	|dddddf< |
|dddddf< d	|ddd
f< |j|jd| _dS )zReset the positional encodings.r   Nr   r   )r   r   r   rd   r   r   r   )r   )sizer"  r   r   r   r!   r%   r&   r  rc   signabsr$   r#   r   piatanr*   r+   r  r   )rz   r\   r%  Tr0   compression_lengthx_compressedlength_scalex_atancosinessinesr"  r3   r3   r4   r#    s.   
&

z&CompactRelPositionalEncoding.extend_pec                 C   sn   |  || |d| }| j| jdd | d | jdd |d ddf }|d}| |S )a  Create positional encoding.

        Args:
            x (Tensor): Input tensor (time, batch, `*`).
            left_context_len: (int): Length of cached left context.

        Returns:
            positional embedding, of shape (batch, left_context_len + 2*time-1, `*`).
        r   r   r   N)r#  r&  r"  r  rM   )rz   r\   r%  x_size_leftr   r3   r3   r4   r     s"   


z$CompactRelPositionalEncoding.forward)r   r   )r   )r   r   r   r   rZ   r   r(   rm   r   r#  r   r   r3   r3   r~   r4   r     s"     3r   c                       s   e Zd ZdZdeddfdedededed	ed
ededdf fddZ		dde	de	de
e	 de
e	 de	f
ddZde	fddZ  ZS )r   a$  Module that computes multi-head attention weights with relative position
    encoding. Various other modules consume the resulting attention weights:
    see, for example, the SimpleAttention module which allows you to compute
    conventional attention.

    This is a quite heavily modified from: "Transformer-XL: Attentive Language
        Models Beyond a Fixed-Length Context",
    we have to write up the differences.


    Args:
           embed_dim: number of channels at the input to this module, e.g. 256
             pos_dim: dimension of the positional encoding vectors, e.g. 128.
           num_heads:  number of heads to compute weights for, e.g. 8
     query_head_dim: dimension of the query (and key), per head.  e.g. 24.
       pos_head_dim: dimension of the projected positional encoding per head, e.g. 4.
            dropout: dropout probability for attn_output_weights. Default: 0.0.
       pos_emb_skip_rate: probability for skipping the pos_emb part of the scores on
                     any given call to forward(), in training time.
    rU   r   )r@   rU   rc   rL   rJ   rG   rH   rM   pos_emb_skip_raterT   Nc           
   	      s   t    || _|| _|| _|| _|| _t|| _	d | _
|}|| | | }	t||	d|d d| _t|tdddd| _t|| dd	d
dddd| _t||| ddd| _t | _t | _d S )NTg      пri   r   r   r   r   r   g?g333333?rU   g      Y@r   Fr   )rl   rm   rc   rJ   rG   rH   rM   r   r   r3  r  r   ro   r   r   whiten_keysr   balance_keys
linear_posr   copy_pos_query
copy_query)
rz   rc   rL   rJ   rG   rH   rM   r3  key_head_dimin_proj_dimr~   r3   r4   rm   @  sH   

z-RelPositionMultiheadAttentionWeights.__init__r\   r   r   r   c                 C   s  |  |}| j}| j}| j}|j\}}	}
|| }|dd|f }|d|d| f }|dd| df }|jd || ksGJ |jd ||f| |}| | |}| |}|	||	||}|	||	||}|	||	||}|
dddd}|
dddd}|
dddd}t||}d}tj stj rd	}n| jrt t| jkrd	}|r;| |}d| d }|	d|||
dddd}t||}tj r|j\}}	}}tj|d ddd
}t|}||	| d}|| }|	d|}tj|d|d}|	||	||}n(|j||	||f|d|d|d|d |df|d|d  d}|| }tj sGtj rHn| jr\t dk r\t|dd| jd}|j||	||fkshJ |dur||jtjksvJ ||d}|dur|j|	|fksJ |j||dd}t |dd}tj stj rnt dk r| js| !| t"j#j$|| j$| jd}|S )aE  
        Args:
            x: input of shape (seq_len, batch_size, embed_dim)
            pos_emb: Positional embedding tensor, of shape (1, 2*seq_len - 1, pos_dim)
            key_padding_mask: a bool tensor of shape (batch_size, seq_len).
                Positions that are True in this mask will be ignored as sources in the
                attention weighting.
            attn_mask: mask of shape (seq_len, seq_len) or
                (batch_size, seq_len, seq_len), interpreted as
                ([batch_size,] tgt_seq_len, src_seq_len)
               saying which positions are allowed to attend to which other positions.
        Returns:
           a tensor of attention weights, of
            shape (hum_heads, batch_size, seq_len, seq_len)
           interpreted as (hum_heads, batch_size, tgt_seq_len, src_seq_len).
        .r   r   Nr   r   r   FT)r   r   step)r    index)storage_offsetrX   g      9@g-C6?)limitpenaltyr  ir   gMbP?)pr   )%ro   rG   rH   rJ   r   r8  r4  r5  r7  r  r   r!   matmulr   r   r   r   r   r(   r3  r6  r%   repeatr  gather
as_stridedstrider   r  r   r   masked_fillr   _print_attn_entropyr   
functionalrM   )rz   r\   r   r   r   rG   rH   rJ   r  r   _	query_dimqkr@  attn_scoresuse_pos_scoresseq_len2
pos_scorestime1nrowscolsindexesr   r3   r3   r4   r     s   









	

z,RelPositionMultiheadAttentionWeights.forwardr   c              	   C   s   |j \}}}}t L tjjtdd+ |tj}|d  | j	ddj
dd }td| j d|  W d    n1 sBw   Y  W d    d S W d    d S 1 sZw   Y  d S )	NF)enabledg#B;r   r   )r   r   zname=z, attn_weights_entropy = )r   r!   no_gradampautocastDEVICE_TYPEr   r&   r$   r   meanloggingdebugr  )rz   r   rJ   r   r  attn_weights_entropyr3   r3   r4   rG  #  s"   
"z8RelPositionMultiheadAttentionWeights._print_attn_entropy)NN)r   r   r   r   r   rZ   r(   r   rm   r   r   r   rG  r   r3   r3   r~   r4   r   *  sH    	H
  r   c                       sH   e Zd ZdZdedededdf fddZd	ed
edefddZ  ZS )r   a[  
    The simplest possible attention module.  This one works with already-computed
    attention weights, e.g. as computed by RelPositionMultiheadAttentionWeights.

    Args:
          embed_dim: the input and output embedding dimension
          num_heads: the number of attention heads
          value_head_dim: the value dimension per head
    rc   rJ   rI   rT   Nc                    sT   t    tj||| dd| _t|| |ddd| _tdtdddd	d
d| _	d S )NTrj   r   ri   r         @r   r   r   r   r   )
rl   rm   r   rn   ro   r   rp   r   r   r   )rz   rc   rJ   rI   r~   r3   r4   rm   ?  s   

zSelfAttention.__init__r\   r   c                 C   s   |j \}}}|j d }|j ||||fksJ | |}||||ddddd}|j d }t||}|dddd |||| }| |}| 	|}|S )ah  
        Args:
          x: input tensor, of shape (seq_len, batch_size, embed_dim)
         attn_weights: a tensor of shape (num_heads, batch_size, seq_len, seq_len),
          with seq_len being interpreted as (tgt_seq_len, src_seq_len).  Expect
          attn_weights.sum(dim=-1) == 1.
        Returns:
           a tensor with the same shape as x.
        r   r   r   r   r   )
r   ro   r  r   r!   rA  
contiguousviewrp   r   )rz   r\   r   r  r   rc   rJ   rI   r3   r3   r4   r   V  s   




zSelfAttention.forwardr  r3   r3   r~   r4   r   4  s$    
r   c                       s<   e Zd ZdZdededef fddZdefdd	Z  Z	S )
r   z)Feedforward module in TTSZipformer model.rc   rK   rM   c              	      sd   t t|   t||| _t|dddddd| _t||d|dd	d
d| _	t
dtdddd| _d S )Nr   rV   r         ?      @r   SwooshLr   TrX   )
activation	dropout_pdropout_shared_dimrj   rk   r   r`  r   r   r   )rl   r   rm   r   rn   ro   r   hidden_balancerr
   rp   r   r   
out_whiten)rz   rc   rK   rM   r~   r3   r4   rm     s2   

zFeedforwardModule.__init__r\   c                 C   s,   |  |}| |}| |}| |}|S r  )ro   ri  rp   rj  )rz   r\   r3   r3   r4   r     s
   



zFeedforwardModule.forward)
r   r   r   r   rZ   r   rm   r   r   r   r3   r3   r~   r4   r   ~  s    r   c                       sD   e Zd ZdZdededdf fddZded	edefd
dZ  ZS )r   a`  This is like the ConvolutionModule, but refactored so that we use multiplication
       by attention weights (borrowed from the attention module) in place of actual
       convolution.  We also took out the second nonlinearity, the one after the
       attention mechanism.

    Args:
        channels (int): The number of channels of conv layers.
    channelsr   rT   Nc                    s   t    || _tj||d dd| _t|dtddtddd	d
d| _t	 | _
t | _t | _t | _t||ddd| _tdtd
ddd| _tdtd
ddddd| _d S )Nr   Tr_  r   r   )rW   r   )rU   rc  )rW   gffffff?rd   rd  r   r   ri   r   r   r   r   r   r   )rl   rm   r   r   rn   ro   r   r   balancerTanhtanhr   	identity1	identity2	identity3r   rp   r   r   whiten1whiten2)rz   rk  r   r~   r3   r4   rm     s<   


zNonlinAttention.__init__r\   r   c                 C   s  |  |}|j\}}}| j}|jddd\}}}| |}| |}|d|||}| |}|| }| 	|}|j\}}}	|jd }
|j|
|||fksPJ ||||
d
dddd}t||}|
dddd||d}| |}|| }| |}| |}| |}|S )z.
        Args:
            x: a Tensor of shape (seq_len, batch_size, num_channels)
            attn_weights: a Tensor of shape (num_heads, batch_size, seq_len, seq_len)
        Returns:
            a Tensor with the same shape as x
        r   r   r   r   r   r   )ro   r   r   chunkrl  rn  r  r  rr  ro  r   r!   rA  rp  rq  rp   rs  )rz   r\   r   r  r   rI  r   syrc   rJ   r3   r3   r4   r     s,   









zNonlinAttention.forwardr  r3   r3   r~   r4   r     s     	/r   c                       sL   e Zd ZdZdededdf fddZ	dded	ee defd
dZ  Z	S )r   zConvolutionModule in Zipformer2 model.

    Args:
        channels (int): The number of channels of conv layers.
        kernel_size (int): Kernerl size of conv layers.
        bias (bool): Whether to use bias in conv layers (default=True).

    rk  kernel_sizerT   Nc                    s   t t|   |d d dksJ |}t|d| | _t|dtddddtd	d
ddd| _t	 | _
t | _t	 | _|d dksDJ tj|||||d d| _t|dtdddtdddd| _tdtdddd| _t||dddd| _dS )z%Construct a ConvolutionModule object.r   r   r   r   )rU   r   )     @@r   r   g      ?)rU   rd  )rx        $@r   r   )r  out_channelsgroupsrw  paddingr   )rx  r   r   )rW   rd   ry  r`  r   r   r   r   rU   r   )rf  rg  rk   N)rl   r   rm   r   rn   ro   r   r   r   r   activation1Sigmoidsigmoidactivation2Conv1ddepthwise_convr   r   r   r   r
   rp   )rz   rk  rw  bottleneck_dimr~   r3   r4   rm     s^   	
	zConvolutionModule.__init__r\   r   c                 C   s   |  |}|jddd\}}| |}| |}| |}|| }| |}|ddd}|dur=||d	|d}| 
|}| |}|ddd}| |}| |}|S )aK  Compute convolution module.

        Args:
            x: Input tensor (#time, batch, channels).
           src_key_padding_mask: the mask for the src keys per batch (optional):
               (batch, #time), contains True in masked positions.

        Returns:
            Tensor: Output tensor (#time, batch, channels).

        r   r   r   r   NrU   )ro   rt  r   r  r}  r  r   rF  r  	expand_asr  r   r   rp   )rz   r\   r   ru  r3   r3   r4   r   m  s    








zConvolutionModule.forwardr  )
r   r   r   r   rZ   rm   r   r   r   r   r3   r3   r~   r4   r     s"    	Tr   )r   )r   )/r   r\  r#   r   typingr   r   r   r!   r   r   r   is_availablerZ  backendsr   zipvoice.models.modules.scalingr
   r   r   r   r   r   r   r   r   r   r   r   r   r5   r   r6   r(   r   rq   rr   r   rs   r  r  r   r   r   r   r   r   r3   r3   r3   r4   <module>   sB   
<
 a  [f<B+y  J+i