o
    @Tis                  /   @   s  d dl mZ d dlmZmZmZ d dlZd dlZd dlmZm	Z	 d dl
mZmZ ddlmZ ddlmZmZmZ d	d
 ZG dd dejZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd  d eZ	!dKd"ed#eee e e f  d$e!d%e!d&ef
d'd(Z"	!	!	!	!dLd)e d*e d+e#d,e d-e d.e d/ee! d0ee! d1ee  d2e d3e#d4ee  d5e#d6e#d7e!d8e#d9e!d:e!d;e!d<e!d&ef*d=d>Z$	!	!	!	!dLd)e d*e d+e#d,e d-e d.e d/ee! d0ee! d?ee  d@eee   dAe dBe d3e#d4ee  d5e#d6e#d7e!d8e#d9e!d:e!d;e!d<e!d&ef.dCdDZ%dEe	dFe	d&e	fdGdHZ&G dIdJ dJej'j(Z)dS )M    )defaultdict)ListOptionalTupleN)nnTensor)Module	Parameter   )HardConcrete)prune_linear_layerprune_conv1d_layerprune_layer_normc                 C   sv   dd }t | tjr|| jj | jdur| jj  t | tjr7|| jj | jdur9| jj| j   dS dS dS )a  
    Initialize the weights of Transformer module in Wav2Vec2/HuBERT.

    If the module is ``nn.Linear``, normalize the weight with mean 0 and standard deviation 0.02.
    If ``bias`` is set to ``True`` in the module, set ``bias`` to 0.

    If the module is ``nn.Embedding``, normalize the weight with mean 0 and standard deviation 0.02.
    If ``padding_idx`` is not None, set the weight of padding to 0.

    Note:
        Ths method corresponds to
        `init_bert_params
        <https://github.com/facebookresearch/fairseq/blob/main/fairseq/modules/transformer_sentence_encoder.py#L21>`__
        in the original ``fairseq`` implementation.
    c                 S   s$   |  |  jddd| j d S )N        g{Gz?)meanstd)copy_cpunormal_todevice)data r   O/home/ubuntu/.local/lib/python3.10/site-packages/linacodec/module/components.pyr   "   s   $z)_init_transformer_params.<locals>.normal_N)	
isinstancer   Linearweightr   biaszero_	Embeddingpadding_idx)moduler   r   r   r   _init_transformer_params   s   

r"   c                   @   s"   e Zd ZdZdedefddZdS )	LayerNormzLayer norm with transposeinputreturnc                 C   s8   | dd}tj|| j| j| j| j}| dd}|S )N)	transposer   
functional
layer_normnormalized_shaper   r   eps)selfr$   xr   r   r   forward2   s   zLayerNorm.forwardN)__name__
__module____qualname____doc__r   r/   r   r   r   r   r#   /   s    r#   c                       st   e Zd ZdZ	ddedededededee d	ef fd
dZde	dee	 de
e	ee	 f fddZdd Z  ZS )ConvLayerBlockz$Convolution unit of FeatureExtractorFin_channelsout_channelskernel_sizestrider   r*   prune_conv_channelsc                    sR   t    || _|| _|| _tj|||||d| _|r$t|dd| _	d S d | _	d S )N)r5   r6   r7   r8   r   {Gz?n_in	init_mean)
super__init__r7   r8   r*   r   Conv1dconvr   hard_concrete)r-   r5   r6   r7   r8   r   r*   r9   	__class__r   r   r?   <   s   


zConvLayerBlock.__init__r.   lengthr%   c                 C   s   |  |}| jdur| |}tj|}| jdur%|  }||d }|dur@tj|| j	 | j
ddd }tt||}||fS )a#  
        Args:
            x (Tensor): Shape: ``[batch, in_channels, in_frame]``.
            length (Tensor or None, optional): Shape ``[batch, ]``.
        Returns:
            Tensor: Shape ``[batch, out_channels, out_frames]``.
            Optional[Tensor]: Shape ``[batch, ]``.
        Nr'   floor)rounding_moder
   )rA   r*   r   r)   gelurB   	unsqueezetorchdivr7   r8   max
zeros_like)r-   r.   rE   channel_maskr   r   r   r/   W   s   



zConvLayerBlock.forwardc                 C   s^   | j d ur| j  }n| jj}|| | j }| jjd ur ||7 }| jd ur+||d 7 }||fS N   )rB   l0_normrA   r6   r7   r   r*   )r-   r5   r6   
num_paramsr   r   r   get_num_params_and_out_channelss   s   

z.ConvLayerBlock.get_num_params_and_out_channelsF)r0   r1   r2   r3   intboolr   r   r?   r   r   r/   rS   __classcell__r   r   rC   r   r4   9   s4    

r4   c                	       s^   e Zd ZdZdejf fddZdedee de	eee f fdd	Z
d
d Zdd Z  ZS )FeatureExtractorzoExtract features from audio

    Args:
        conv_layers (nn.ModuleList):
            convolution layers
    conv_layersc                    s8   t    || _tjtj|d jjtj	ddd| _
d S )Nr'   dtypeFrequires_grad)r>   r?   rY   r   r	   rJ   onesrA   r6   float32dummy_weight)r-   rY   rC   r   r   r?      s   
zFeatureExtractor.__init__r.   rE   r%   c                 C   sT   |j dkr	td|d}| jD ]	}|||\}}q|dd}|| j }||fS )a  
        Args:
            x (Tensor):
                Input Tensor representing a batch of audio,
                shape: ``[batch, time]``.
            length (Tensor or None, optional):
                Valid length of each input sample. shape: ``[batch, ]``.

        Returns:
            Tensor:
                The resulting feature, shape: ``[batch, frame, feature]``
            Optional[Tensor]:
                Valid length of each output sample. shape: ``[batch, ]``.
        rP   zNExpected the input Tensor to be 2D (batch, time), but received {list(x.shape)}r
   )ndim
ValueErrorrI   rY   r(   r`   )r-   r.   rE   layerr   r   r   r/      s   



zFeatureExtractor.forwardc                 C   s:   d}d}| j D ]}||\}}||7 }q||7 }||fS )Nr
   r   )rY   rS   )r-   r5   rR   rc   layer_paramsr   r   r   %get_num_params_and_final_out_channels   s   

z6FeatureExtractor.get_num_params_and_final_out_channelsc                 C   sX  g }t | jD ]\}}|jdur|jjrJ | }| d}t|dks.J d| |t||j|j	f t
|j|d |jdurLt|j| |t| jd kro| j j|9  _tj| jd|  dd| _n| j|d  jj j|d9  _t
| j|d  j|d	d
 d|_q||jj|j|j	f tj|jjtjd}q||fS )zu"Prune conv layers and dummy weight based on hardconcrete parameters.
        This is an in-place operation.
        Nr'   r   z&Conv channels pruned to zero at index outputr
   Fr\   r$   dimrZ   )	enumeraterY   rB   trainingnonzerosqueezelenappendr7   r8   r   rA   r*   r   r`   r   r   r	   index_selectclonedetachr   rI   r6   rJ   arangelong)r-   
new_configidxrc   maskindexr   r   r   prune   s4   


"zFeatureExtractor.prune)r0   r1   r2   r3   r   
ModuleListr?   r   r   r   r/   re   rx   rW   r   r   rC   r   rX      s    
rX   c                       s>   e Zd ZdZdededef fddZdd Zd	d
 Z  Z	S )FeatureProjectionzLayer that connects FeatureExtractor and Encoder

    Projects features to encoder dimension.

    Args:
        in_features (int): Input feature dim.
        out_features (int): Output feature dim.
        dropout (float): Dropout probability.
    in_featuresout_featuresdropoutc                    s4   t    t|| _t||| _t|| _d S N)	r>   r?   r   r#   r*   r   
projectionDropoutr}   )r-   r{   r|   r}   rC   r   r   r?      s   
zFeatureProjection.__init__c                 C   s"   |  |}| |}| |}|S )z
        Args:
            x (Tensor):
                Feature Tensor. shape: ``[batch, frame, in_feature]``
        Returns:
            Tensor: Projected features. ``[batch, frame, out_feature]``.
        )r*   r   r}   r-   r.   r   r   r   r/      s   


zFeatureProjection.forwardc                 C   s   |d |d | j j  S )NrP   r
   )r   r|   )r-   r{   r   r   r   get_num_params  s   z FeatureProjection.get_num_params)
r0   r1   r2   r3   rU   floatr?   r/   r   rW   r   r   rC   r   rz      s    
rz   c                       s>   e Zd ZdZdededef fddZdd Zd	d
 Z  ZS ) ConvolutionalPositionalEmbeddinga  Positional embedding which is placed at the beginning of Transformer.

    Args:
        embed_dim (int): Feature dimension of the input Tensor.
        kernel_size (int): The number of frames to be use.
        groups (int): The number of groups in feature dimensions.
    	embed_dimr7   groupsc                    sf   t    || _|| _tj||||d |d| _tjj| jddd| _|d dkr.d| _	d S d| _	d S )NrP   )r5   r6   r7   paddingr   r   )namerh   r   r
   )
r>   r?   r   r7   r   r@   rA   utilsweight_norm
num_remove)r-   r   r7   r   rC   r   r   r?     s   
 z)ConvolutionalPositionalEmbedding.__init__c                 C   s<   | j j D ]}|jdkr|jjdkrtjj	| j  q| S )Nztorch.nn.utils.weight_norm
WeightNorm)
rA   _forward_pre_hooksvaluesr1   rD   r0   rJ   r   r   remove_weight_norm)r-   hookr   r   r   __prepare_scriptable__.  s
   z7ConvolutionalPositionalEmbedding.__prepare_scriptable__c                 C   sR   | dd}| |}| jdkr|dd| j f }tjj|}| dd}|S )z
        Args:
            x (Tensor): shape ``[batch, frame, feature]``.

        Returns:
            Tensor: The resulting feature. Shape ``[batch, frame, feature]``.
        r&   r'   r   .N)r(   rA   r   rJ   r   r)   rH   r   r   r   r   r/   8  s   

z(ConvolutionalPositionalEmbedding.forward)	r0   r1   r2   r3   rU   r?   r   r/   rW   r   r   rC   r   r     s    
r   c                       s   e Zd ZdZ			ddededededed	ef fd
dZ			ddede	e de	e de	e de
ee	e f f
ddZdd Zdd Z  ZS )SelfAttentiona   Multihead Self Attention module

    Args:
        embed_dim (int): Total dimension of the model.
        num_heads (int): The number of heads.
        dropout (float, optional):
            Dropout probability on attn_output_weights. Default: ``0.0``
    r   Fr   	num_headshead_dimr}   prune_headsprune_layerc                    s   t    || _|| _|| _tj|| _| jd | _	tj
||| dd| _tj
||| dd| _tj
||| dd| _tj
|| |dd| _|rQt|dd| _nd | _|r_tddd| _d S d | _d S )Ng      Tr   r:   r;   r
   )r>   r?   r   r   r   rJ   r   r   r}   scalingr   k_projv_projq_projout_projr   hard_concrete_for_headshard_concrete_for_layer)r-   r   r   r   r}   r   r   rC   r   r   r?   S  s    
	
zSelfAttention.__init__Nr.   attention_maskposition_biaskey_padding_maskr%   c                 C   sf  |j dks|jd | jkrtd| j d|j d| \}}}||| j| jf}| |j| 	dd}	| 
|j| dddd}
| |j| 	dd}| j|	 |
 }|dur^||7 }||jd	d
dd  }tjjj|d	d}| |}|| }| jdur|  }||d	d	 }|	dd||| j| j }| |}| jdur|  }|| }|dfS )a  
        Args:
            x (Tensor): shape: ``[batch_size, sequence_length, embed_dim]``.
            attention_mask (Tensor or ``None``, optional):
                shape: ``[batch_size, 1, sequence_length, sequence_length]``
            position_bias: Not used. Only for the compatibility with :py:class:`WavLMSelfAttention`.
            key_padding_mask (Tensor or ``None``): Not used. Only for the compatibility with
                :py:class:`WavLMSelfAttention`.
        Returns:
            (Tensor, ``None``): The resulting attention output and ``None`` (necessary for compatibility
                with :py:class:`WavLMSelAttention`).
                Attention output shape: ``[batch, sequence_length, embed_dim]``.
           rP   z9The expected input shape is (batch, sequence, embed_dim==z	). Found .r
   r   Nr'   T)rh   keepdimrg   )ra   shaper   rb   sizer   r   r   viewr(   r   permuter   r   rL   rJ   r   r)   softmaxr}   r   rI   reshaper   r   )r-   r.   r   r   r   
batch_sizerE   r   r   qkvweightsrf   	head_mask
layer_maskr   r   r   r/   t  s2   



zSelfAttention.forwardc                 C   sd   | j d ur| j  }n| j}| jd | | j d || j d | j  }| jd ur0|| j 9 }|S )Nr
   r   )r   rQ   r   r   r   r   )r-   r   rR   r   r   r   r     s   

zSelfAttention.get_num_paramsc                 C   s  d| j d}| jd ur2| jjrJ |  }| jj j|9  _| jj j|9  _|dkr/d|d< d | _| jd ur| jjr=J |  }t|	 |d< |d dkrTd|d< n2|
| j}|	 d}t| j|d t| j|d t| j|d | jj j|9  _t| j|d	 d | _|S )
NT)use_attentionr   r   Fr   r   r'   rf   r$   )r   r   rj   r   r   r   r   r   rm   rk   repeat_interleaver   rl   r   r   r   r   r-   rt   r   r   	full_mask
full_indexr   r   r   rx     s4   


zSelfAttention.prune)r   FFNNN)r0   r1   r2   r3   rU   r   rV   r?   r   r   r   r/   r   rx   rW   r   r   rC   r   r   I  sD    $
;r   c                       s   e Zd ZdZ									d&ded	ed
eee  dedededededededef fddZ	dedede
fddZd'de
defddZ			d(de
dee
 d ee
 d!ee
 dee
ee
 f f
 fd"d#Zd$d% Z  ZS ))WavLMSelfAttentiona  Multi-headed self-attention for WavLM model :cite:`chen2022wavlm`.

    Args:
        embed_dim (int): Total dimension of the model.
        num_heads (int): The number of heads.
        dropout (float, optional): Dropout probability on attn_output_weights. (Default: to ``0.0``)
        bias (bool, optional): If ``True``, add bias to input / output projection layers. (Default: ``True``)
        has_relative_attention_bias (bool, optional): If ``True``, apply relative position embedding.
            Necessary in the first encoder layer, but not in the subsequent ones. (Default: ``False``)
        num_buckets (int, optional): Number of buckets for relative position embedding. (Default: ``32``)
        max_distance (int, optional): Naximum distance for relative position embedding. (Default: ``128``)
        gru_rel_pos (bool, optional): If ``True``, apply gated relative position embedding. (Default: ``False``)
    Nr   TF       r   total_num_headsremaining_headsr}   r   has_relative_attention_biasnum_bucketsmax_distancegru_rel_posr   r   c                    s.  || _ |d u rtt|| _n|| _|| | _t |t| j| j||
| || _|| _	|| _
|r9t||| _nd | _tj|t| j| j |d| _tj|t| j| j |d| _tj|t| j| j |d| _tjt| j| j ||d| _|	| _| jrt| jd| _ttd|dd| _d| _d S )Nr      r
   T)r   listranger   r   r>   r?   rm   r   r   r   r   r   rel_attn_embedr   r   r   r   r   r   gru_rel_pos_linearr	   rJ   r^   gru_rel_pos_consthas_position_bias)r-   r   r   r   r}   r   r   r   r   r   r   r   rC   r   r   r?     s*   

zWavLMSelfAttention.__init__query_length
key_lengthr%   c                 C   sz   t j|t jddddf }t j|t jddddf }|| }| j|dd}|| jjj}| |}|g d}|S )a  Compute relative position embeddings for WavLM model.
        Args:
            query_length (int): Query position can take values between 0 and ``query_length - 1``.
            key_length (int): Key position can take values between 0 and ``key_length - 1``.
        Returns:
            Tensor of shape `(num_heads, query_length, key_length)`, relative positions embeddings
        rZ   NT)bidirectional)rP   r   r
   )	rJ   rr   rs   _relative_positions_bucketr   r   r   r   r   )r-   r   r   context_positionmemory_positionrelative_positionrelative_position_bucketr   r   r   r   compute_bias  s   
zWavLMSelfAttention.compute_biasrelative_positionsr   c           	      C   s   | j }| j}tj|tjd}|r&|d }||dktj| 7 }t|}n
t|t| }|d }||k }|t|	 | t
||  ||  tj }t|t||d }|t|||7 }|S )a  Compute relative position buckets for WavLM model. Computation similar to formula (5) in WavLM
           paper :cite:`chen2022wavlm`.
        Args:
            relative_positions (Tensor): Relative offsets between query and key positions,
                of shape ``(query_length, key_length)``.
            bidirectional (bool): If ``True``, values will be filled both above and below the diagonal in the resulting
                matrix. If ``False``, the elements above the diagonal (i.e. with negative relative offsets) will be set
                to zero. (Default ``True``)
        Returns:
            Tensor of shape ``(query_length, key_length)`` filled bucketed values of with relative positions.
        rZ   rP   r   r
   )r   r   rJ   rM   rs   r   absminlogr   math	full_likewhere)	r-   r   r   r   r   relative_buckets	max_exactis_smallrelative_postion_if_larger   r   r   r   ,  s.   z-WavLMSelfAttention._relative_positions_bucketqueryr   r   r   c              	      s  |  \}}}|| jksJ |du sJ | jdur6|du r6| ||}|d|ddd|| j ||}d}|dur|}| jr|||| jd}	|		dddd}	t
| |	|| j|ddjddd	jddd
\}
}|
|| j d  d }||| j dd| }|d||f}||| j||dd| jddddf }|}|dur|| }|dur|||dd|td}t j||d\}}||fS )a  
        Args:
            query (Tensor): Input of shape ``(batch_size, src_len, embed_dim)``.
            key_padding_mask (Tensor or None, optional): Mask to exclude keys that are pads, of shape
                `(batch, src_len)`, where padding elements are indicated by 1s. (Default: ``None``)
            attn_mask: Needs to be ``None``. The argument exists for compatibility with
                ``EncoderLayer``. (Default: ``None``)
            position_bias (Tensor or None, optional): Position bias of shape
                ``(batch_size * num_heads, src_len, src_len)``. When used inside WavLM model encoder, will be
                generated in the first layer and then passed from each encoder layer to the next one.
                (Default: ``None``)
        Returns:
            attn_output (Tensor): Attention output of shape ``(batch_size, src_len, embed_dim)``.
            position_bias (Tensor or None): Position bias of shape ``(batch_size * num_heads, src_len, src_len)``.
        Nr   r
   r'   rP   r      F)r   rg   g      ?g       @z-infr   )r   r   r   r   rI   repeatr   r   r   r   rJ   sigmoidr   sumchunkr   r   r   masked_fillr   r>   r/   )r-   r   r   r   r   bszseq_lenr   attn_mask_rel_posquery_layergate_agate_bgate_a_1	attn_maskattn_output_rC   r   r   r/   S  s>   &"
,zWavLMSelfAttention.forwardc                 C   s   d| j d}| jd ur2| jjrJ |  }| jj j|9  _| jj j|9  _|dkr/d|d< d | _| jd ur| jjr=J |  }| 	d
 |d< t|d dkrYd|d< n2|| j}| 	d}t| j|d t| j|d t| j|d | jj j|9  _t| j|d	 d | _|S )
NT)r   r   r   Fr   r'   r   rf   r$   )r   r   rj   r   r   r   r   r   rk   rl   tolistrm   r   r   r   r   r   r   r   r   r   r   rx     s4   


zWavLMSelfAttention.prune)	Nr   TFr   r   TFF)Tr   )r0   r1   r2   r3   rU   r   r   r   rV   r?   r   r   r   r   r/   rx   rW   r   r   rC   r   r     sf    
	
-*;r   c                       sX   e Zd ZdZ		ddedededededef fd	d
Zdd Zdd Z	dd Z
  ZS )FeedForwardz4Layer that follows attention layer in encoder layer.Fio_featuresintermediate_featuresintermediate_dropoutoutput_dropoutprune_intermediater   c                    sx   t    t||| _t|| _t||| _t|| _|r)t	|dd| _
nd | _
|r7t	ddd| _d S d | _d S )Ng      ?r;   r
   r:   )r>   r?   r   r   intermediate_denser   r   output_denser   r   hard_concrete_for_intermediater   )r-   r   r   r   r   r   r   rC   r   r   r?     s   
	

zFeedForward.__init__c                 C   sn   |  |}tjj|}| |}| jdur|  }|| }| |}| |}| j	dur5| 	 }|| }|S )z
        Args:
            x (Tensor): shape: `(batch, sequence_length, io_features)`
        Returns:
            x (Tensor): shape: `(batch, sequence_length, io_features)`
        N)
r   rJ   r   r)   rH   r   r   r   r   r   )r-   r.   intermediate_maskr   r   r   r   r/     s   





zFeedForward.forwardc                 C   sZ   | j j}| jd ur| j }n| j j}|d | |d |  }| jd ur+|| j 9 }|S )Nr
   )r   r{   r   rQ   r|   r   )r-   r   r   rR   r   r   r   r     s   

zFeedForward.get_num_paramsc                 C   s   d| j jd}| jd ur3| jjrJ |  }| jj j|9  _| jj j|9  _|dkr0d|d< d | _| jd urt| jjr>J |  }|	 
d}t||d< |d dkrZd|d< nt| j |d | jj j|9  _t| j|d	 d | _|S )
NT)use_feed_forwardff_interm_featuresr   Fr   r'   r   rf   r$   )r   r|   r   rj   r   r   r   r   r   rk   rl   rm   r   )r-   rt   r   interm_maskinterm_indexr   r   r   rx     s.   


zFeedForward.prune)FF)r0   r1   r2   r3   rU   r   rV   r?   r/   r   rx   rW   r   r   rC   r   r     s(    r   c                       s   e Zd ZdZdee dededee def
 fddZ							dd
e
dee
 dee
 dee
 dee
ee
 f f
ddZdd Z  ZS )EncoderLayerzLA layer unit in encoder. Combines multihead self attention and feed forward.	attentionr}   layer_norm_firstfeed_forwardr   c                    sJ   t    || _t|| _t|| _|| _|| _	t|| _
|| _d S r~   )r>   r?   r   r   r   r}   r#   r*   r   r   final_layer_normr   )r-   r   r}   r   r   r   rC   r   r   r?     s   

zEncoderLayer.__init__Nr.   r   r   r   r%   c                 C   s   | j dur#|}| jr| |}| j ||||d\}}| |}|| }| jr9| jdur5|| | | }||fS | |}| jdurJ|| | }| |}||fS )af  
        Args:
            x (Tensor): Input of shape ``(batch, sequence_length, embed_dim)``.
            attention_mask (Tensor or ``None``, optional): attention mask
                of shape ``(batch, 1, sequence_length, sequence_length)``. (Default: ``None``)
            position_bias (Tensor or ``None``, optional): position bias of shape
                ``(batch_size * num_heads, src_len, src_len)``.
                Only necessary for WavLM model, ``None`` otherwise. (Default: ``None``)
            key_padding_mask (Tensor or ``None``, optional): key padding mask of shape ``(batch_size, src_len)``.
                Only used for WavLM model, ignored otherwise. (Default: ``None``)
        Returns:
            (x, position_bias): Shapes are the same as in the input. Position bias is only relevant for WaLM model,
                ``None`` otherwise.
        N)r   r   r   )r   r   r*   r}   r   r  )r-   r.   r   r   r   residualr   r   r   r/   '  s$   







zEncoderLayer.forwardc                 C   sB   | j d d }| jd ur|| j 7 }| jd ur|| j 7 }|S rO   )r   r   r   r   )r-   rR   r   r   r   r   T  s   

zEncoderLayer.get_num_paramsr   )r0   r1   r2   r3   r   r   r   rV   rU   r?   r   r   r/   r   rW   r   r   rC   r   r     s8    
-r   c                       s   e Zd Zdededededef
 fddZdefd	d
Z		ddede	e de	e defddZ
			ddede	e de	e de	e dee f
ddZdd Zdd Z  ZS )Transformerpos_conv_embedr}   layersr   
layer_dropc                    s@   t    || _t|j| _|| _|| _t	|| _
|| _d S r~   )r>   r?   r  r   r#   r   r*   r   r  r   r}   r  )r-   r  r}   r  r   r  rC   r   r   r?   ^  s   

zTransformer.__init__r.   c                 C   s,   ||  | }| jr| |}| |}|S r~   )r  r   r*   r}   r   r   r   r   _preprocessn  s
   

zTransformer._preprocessNr   r   r%   c                 C   sV   |  |}| jD ]}| jrtd | jks ||||d\}}q| js)| |}|S )Nr
   r   )	r  r  rj   rJ   randitemr  r   r*   )r-   r.   r   r   rc   r   r   r   r/   w  s   


zTransformer.forward
num_layersc                 C   s   |d urd|  k rt | jksn tdt | j dg }| |}| jD ]}||||d\}}|| |d urEt ||krE|  S q'|S )Nr   z!`num_layers` must be between [1, ]r  )rm   r  rb   r  rn   )r-   r.   r   r  r   retrc   r   r   r   get_intermediate_outputs  s   


z$Transformer.get_intermediate_outputsc                 C   s@   t dd | j D | jjd  }| jD ]}|| 7 }q|S )Nc                 s   s    | ]}|  V  qd S r~   )numel).0pr   r   r   	<genexpr>  s    z-Transformer.get_num_params.<locals>.<genexpr>rP   )r   r  
parametersr   r  r   )r-   rR   rc   r   r   r   r     s   $
zTransformer.get_num_paramsc                 C   s   t t}| jD ]L}|j }|d |d  d|v r%|d |d  n	|d |d  |d s5d |_|j }|d |d  |d |d  |d sSd |_q|S )Nr   r   r   r   r   )r   r   r  r   rx   rn   r   )r-   rt   rc   attention_config	ff_configr   r   r   rx     s    


zTransformer.pruneNNr   )r0   r1   r2   r   r   rV   r?   r   r  r   r/   rU   r   r  r   rx   rW   r   r   rC   r   r  ]  sP    

r  c                
       s   e Zd Zdedef fddZ	ddedee deeee f fd	d
Z	ddedee defddZ			ddedee dee
 dee fddZdd Zdd Z  ZS )Encoderfeature_projectiontransformerc                    s   t    || _|| _d S r~   )r>   r?   r  r  )r-   r  r  rC   r   r   r?     s   

zEncoder.__init__Nfeatureslengthsr%   c                 C   s   |  |}d }|d urD|j\}}}tj||jd|||d d d f k}d||< d|d d d d d d f j|jd }||d||}||fS )Nr   r   g     rZ   r
   )r  r   rJ   rr   r   expandr   r[   )r-   r  r  r.   rv   r   max_lenr   r   r   r   r    s   
(&zEncoder._preprocessc                 C   s"   |  ||\}}| j||d}|S )Nr   )r  r  )r-   r  r  r.   rv   r   r   r   r/     s   zEncoder.forwardr  c                 C   s,   |  ||\}}| jj|||d}|g| S )N)r   r  )r  r  r  )r-   r  r  r  r.   masksintermr   r   r   extract_features  s   
zEncoder.extract_featuresc                 C   s   | j |}| j }|| S )z!Calculate the current model size.)r  r   r  )r-   r{   feature_projection_sizetransformer_sizer   r   r   r     s   
zEncoder.get_num_paramsc                 C   s,   t | jj| t| jj|d | j }|S )zIn-place pruning of submodules.r$   )r   r  r*   r   r   r  rx   )r-   conv_out_indextransformer_configr   r   r   rx     s   
zEncoder.pruner~   r  )r0   r1   r2   r   r?   r   r   r   r  r/   rU   r   r!  r   rx   rW   r   r   rC   r   r    sF    



r  F	norm_modeshapesr   r9   r%   c                 C   s   | dvrt dg }d}t|D ]4\}\}}}	d}
| dkr*|dkr*tj||dd}
n
| d	kr4t|dd
}
|t||||	||
|d |}qtt|S )a  
    Args:
        norm_mode (str):
            Either "group_norm" or "layer_norm".
            If "group_norm", then a single normalization is applied
            in the first convolution block. Otherwise, all the convolution
            blocks will have layer normalization.
            This option corresponds to "extractor_mode" from fairseq.
            Expected values are "group_norm" for Base arch, and
            "layer_norm" for Large arch.
        shapes (list of tuple of int):
            Configuration of convolution layers. List of convolution configuration,
            i.e. ``[(output_channel, kernel_size, stride), ...]``
            This option corresponds to "conv_feature_layers" from fairseq.
            Expected values are
            ``[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2``
            for all the architectures.
        bias (bool):
            Whether to include bias term to each convolution operation.
            This option corresponds to "conv_bias" from fairseq.
            Expected values are False for Base arch, and True for Large arch.

    See Also:
        * Original implementation
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L666-L733
        * "extractor_mode"
          - Def and base:
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L38-L45
          - Large:
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L52
        * "conv_feature_layers"
          - Def, base and large:
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L94-L100
        * "conv_bias"
          - Def and base:
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L101-L103
          - Large:
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L61
    )
group_normr*   zInvalid norm moder
   Nr(  r   T)
num_groupsnum_channelsaffiner*   )r+   elementwise_affine)r5   r6   r7   r8   r   r*   r9   )	rb   ri   r   	GroupNormr#   rn   r4   rX   ry   )r&  r'  r   r9   blocksr5   ir6   r7   r8   normalizationr   r   r   _get_feature_extractor  s<   -r1  r{   r   dropout_inputpos_conv_kernelpos_conv_groupsr  r   r   r   r   attention_dropoutr   ff_interm_dropoutr}   r   r  prune_attention_headsprune_attention_layerprune_feed_forward_intermediateprune_feed_forward_layerc              
   C   s   t | ||}t|||}t }t|D ]4}|| r't||| |	|
||d}nd}|| r:t||| ||||d}nd}|t|||||d qt	|||| |d}t
||S )a  
    Args:
        in_features (int): The number of input features.
        embed_dim (int):
            The dimension of embedding.
            This option corresponds to "encoder_embed_dim" from fairseq.
            Expected values are 768 for Base arch, and 1024 for Large arch.
        dropout_input (float):
            The dropout probability applied after the input feature is projected
            to ``embed_dim``.
            This option corresponds to "dropout_input" from fairseq.
            Expected values are 0.1 for both Base and Large arch.
        pos_conv_kernel (int):
            The kernel size of convolutional positional embeddings.
            This option corresponds to "conv_pos" from fairseq.
            Expected values are 128 for both Base and Large arch.
        pos_conv_groups (int):
            The number of groups of convolutional positional embeddings.
            This option corresponds to "conv_pos_groups" from fairseq.
            Expected values are 16 for both Base and Large arch.
        num_layers (int):
            The number of self attention layers in transformer block.
            This option corresponds to "encoder_layers" from fairseq.
            Expected values are 12 for Base and 24 for Large arch.
        num_heads (int):
            The number of heads in self attention layers.
            This option corresponds to "encoder_attention_heads" from fairseq.
            Expected values are 12 for Base and 16 for Large arch.
        attention_dropout (float):
            The dropout probability applied after softmax in self-attention layer.
            This option corresponds to "attention_dropout" from fairseq.
            Expected values are 0.1 for Base and 0.0 for Large arch.
        ff_interm_features (int):
            The dimension of hidden features in feed forward layer.
            This option corresponds to "encoder_ffn_embed_dim" from fairseq.
            Expected values are 3072 for Base and 4096 for Large arch.
        ff_interm_dropout (float):
            The dropout probability applied in feedforward layer.
            This option correspinds to "activation_dropout" from fairseq.
            Expected values are 0.1 for both Base and Large arch.
        dropout (float):
            The dropout probability applied at the end of feed forward layer.
            This option corresponds to "dropout" from fairseq.
            Expected values are 0.1 for Base and 0.0 for Large arch.
        layer_norm_first (bool):
            Control the order of layer norm in transformer layer and each encoder layer.
            If True, in transformer layer, layer norm is applied before features are fed
            to encoder layers. In encoder layer, two layer norms are applied before and after
            self attention.
            If False, in transformer layer, layer norm is applied after features are fed
            to encoder layers. In encoder layer, two layer norms are applied after self
            attention, before and after feed forward.
            This option corresponds to "layer_norm_first" from fairseq.
            Expected values are False for Base and True for Large arch.
        layer_drop (float):
            Probability to drop each encoder layer during training.
            This option corresponds to "layerdrop" from fairseq.
            Expected values are 0.1 for both Base and Large arch.

    See Also:
        * "encoder_embed_dim"
          - Def and base
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L49-L51
          - Large
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L64
        * "dropout_input"
          - Def, base and large
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L75-L78
        * "conv_pos"
          - Def, base and large
            NOTE: The description is wrong.
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L204-L207
          - Usage
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L756
        * "conv_pos_groups"
          - Def, base and large
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L208-L211
        * "encoder_layers"
          - Def and base
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L46-L48
          - Large
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L63
        * "encoder_attention_heads"
          - Def and base
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L55-L57
          - Large
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L66
        * "attention_dropout"
          - Def and base
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L66-L68
          - Large
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L60
        * "encoder_ffn_embed_dim"
          - Def and base
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L52-L54
          - Large
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L65
        * "activation_dropout"
          - Def
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L69-L71
          - Base
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/finetuning/base_960h.yaml#L55
          - Large
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/finetuning/vox_960h.yaml#L55
        * "dropout"
          - Def and base
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L63-L65
          - Large
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L59
        * "layer_norm_first"
          - Def and base
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L91-L93
          - Large
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L53
        * "layerdrop"
          - Def
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L72-L74
          - Base
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/finetuning/base_960h.yaml#L54
          - Large
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/finetuning/vox_960h.yaml#L54
    )r   r   r   r}   r   r   Nr   r   r   r   r   r   r   r}   r   r   r   r  r}   r  r   r  )rz   r   r   ry   r   r   r   rn   r   r  r  )r{   r   r2  r3  r4  r  r   r   r   r   r5  r   r6  r}   r   r  r7  r8  r9  r:  r  pos_convencoder_layersru   r   r   r  r   r   r   _get_encoderB  sT    			
r@  r   r   r   r   c                 C   s   t | ||}t|||}t }t|D ];}|| r.t||| |	| ||dk|
|||d	}nd}|| rAt||| ||||d}nd}|t|||||d qt	|||| |d}t
||S )a  
    Construct encoder for WavLM model :cite:`chen2022wavlm`. The structure of the encoder and most of the argments are
    the same as in :py:func:`_get_encoder` so refer there for documentation. The only difference from Wav2Vec2 encoder
    is usage of `WavLMSelfAttention` instead of `SelfAttention` and two additional parameters: `num_buckets` and
    `max_distance`.
    Args:
        in_features (int): See :py:func:`_get_encoder`.
        embed_dim (int): See :py:func:`_get_encoder`.
        dropout_input (float): See :py:func:`_get_encoder`.
        pos_conv_kernel (int): See :py:func:`_get_encoder`.
        pos_conv_groups (int): See :py:func:`_get_encoder`.
        num_layers (int): See :py:func:`_get_encoder`.
        num_heads (int): See :py:func:`_get_encoder`.
        num_buckets (int): Number of buckets for relative position embedding.
        max_distance (int): Maximum distance for relative position embedding.
        attention_dropout (float): See :py:func:`_get_encoder`.
        ff_interm_features (int): See :py:func:`_get_encoder`.
        ff_interm_dropout (float): See :py:func:`_get_encoder`.
        dropout (float): See :py:func:`_get_encoder`.
        layer_norm_first (bool): See :py:func:`_get_encoder`.
        layer_drop (float): See :py:func:`_get_encoder`.

    r   )	r   r   r   r}   r   r   r   r   r   Nr;  r<  r=  )rz   r   r   ry   r   r   r   rn   r   r  r  )r{   r   r2  r3  r4  r  r   r   r   r   r   r   r5  r   r6  r}   r   r  r7  r8  r9  r:  r  r>  r?  r/  r   r   r  r   r   r   _get_wavlm_encoder  sX   /		
rA  r$   r  c                 C   s8   | j \}}}tj||jd|||dddf k}|S )a&  Generate the padding mask given the padded input and the lengths Tensors.
    Args:
        input (Tensor): The padded Tensor of dimension `[batch, max_len, frequency]`.
        lengths (Tensor): The lengths Tensor of dimension `[batch,]`.

    Returns:
        (Tensor): The padding mask.
    r  N)r   rJ   rr   r   r  )r$   r  r   r  r   rv   r   r   r   _get_padding_maskd  s   	(rB  c                   @   s$   e Zd Zedd Zedd ZdS )GradMultiplyc                 C   s   || _ ||}|S r~   )scalenew)ctxr.   rD  resr   r   r   r/   s  s   
zGradMultiply.forwardc                 C   s   || j  d fS r~   )rD  )rF  gradr   r   r   backwardy  s   zGradMultiply.backwardN)r0   r1   r2   staticmethodr/   rI  r   r   r   r   rC  r  s
    
rC  rT   )FFFF)*collectionsr   typingr   r   r   r   rJ   r   r   torch.nnr   r	   hardconcreter   pruning_utilsr   r   r   r"   r#   r4   rX   rz   r   r   r   r   r   r  r  strrU   rV   r1  r   r@  rA  rB  autogradFunctionrC  r   r   r   r   <module>   s   
Ie*8  SbJZB
^	

 T	


b