o
    eiZ                     @   s  d dl Z d dlZd dlmZ d dlm  mZ ddlmZ	 ddl
mZ ddlmZ ddlmZ ddlmZmZ ddlmZ dd	lmZ d
dlmZmZmZmZmZmZmZmZm Z  ddl!m"Z" e#e$Z%G dd deZ&G dd deZ'G dd dej(Z)G dd deZ*G dd deZ+G dd deZ,G dd dej(Z-G dd dej(Z.G dd dej(Z/G d d! d!ee Z0eZ1G d"d# d#eZ2G d$d% d%eZ3G d&d' d'eZ4G d(d) d)eZ5G d*d+ d+eZ6g d,Z7dS )-    N   )initialization)is_deepspeed_zero3_enabled)is_fsdp_managed_module)GradientCheckpointingLayer)BaseModelOutputWav2Vec2BaseModelOutput)PreTrainedModel)logging   )	Wav2Vec2FeatureProjectionWav2Vec2FeedForward#Wav2Vec2ForAudioFrameClassificationWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2ForXVectorWav2Vec2ModelWav2Vec2PositionalConvEmbeddingWav2Vec2PreTrainedModel   )WavLMConfigc                   @      e Zd ZdS )WavLMPositionalConvEmbeddingN__name__
__module____qualname__ r   r   e/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/wavlm/modular_wavlm.pyr          r   c                   @   r   )WavLMFeatureProjectionNr   r   r   r   r   r    #   r   r    c                       s   e Zd ZdZ				d"dededed	ed
edef fddZ				d#dej	dej	dB dej	dB dede
ej	ej	dB e
ej	 dB f f
ddZdejdejejB dejdede
ejejf f
ddZdededejfddZdejdejfd d!Z  ZS )$WavLMAttentionz=Multi-headed attention from 'Attention Is All You Need' paper        @     T	embed_dim	num_headsdropoutnum_bucketsmax_distancehas_relative_position_biasc                    s   t    || _|| _|| _|| | _| j| | jkr'td| j d| d| jd | _t	||| _
t	||| _t	||| _t	||| _|| _|| _ttd| jdd| _t	| jd| _|rqt| j| j| _d S d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      r      )super__init__r%   r&   r'   head_dim
ValueErrorscalingnnLineark_projv_projq_projout_projr(   r)   	Parametertorchonesgru_rel_pos_constgru_rel_pos_linear	Embeddingrel_attn_embed)selfr%   r&   r'   r(   r)   r*   	__class__r   r   r-   *   s.   
	

zWavLMAttention.__init__NFr   hidden_statesattention_maskposition_biasoutput_attentionsreturnc                 C   s  |  \}}}|du r$| ||}|d|ddd|| j ||}||jdd | jdf }	|	dddd}	| |	}
|
|	jdd d 	d}
t
|
jddd\}}||| j d	  d
 }||| j dd| }|d||f}| ||||\}}|||fS )z'Attention layer with relative attentionNr   r   r   r   )r      dim      ?g       @)sizecompute_bias	unsqueezerepeatviewr&   shapepermuter;   sumr8   sigmoidchunkr:   torch_multi_head_self_attention)r>   rA   rB   rC   rD   indexbsztgt_len_gated_hidden_statesrelative_position_projgate_agate_bgate_outputgated_position_biasattn_outputattn_weightsr   r   r   forwardN   s"   	$

zWavLMAttention.forwardr_   c                 C   s   | dd } }}|dur|dnd}d }	}
d}tj|||| j| jtdgt| j	j
| jj
| jj
f|	|
|| j| jj| jj
| j|||d| j	j| jj| jjd\}}| dd}|durz|dddf |jdd | jf |jdd  }||fS )zCsimple wrapper around torch's multi_head_attention_forward functionr   r   NFT)use_separate_proj_weightq_proj_weightk_proj_weightv_proj_weight)	transposeneFmulti_head_attention_forwardr%   r&   r8   emptycatr5   biasr3   r4   r'   r6   weighttrainingbroadcast_torP   )r>   rA   rB   r_   rD   querykeyvaluekey_padding_maskbias_kbias_vadd_zero_attnr`   ra   r   r   r   rU   w   sB   	

"z.WavLMAttention.torch_multi_head_self_attentionquery_length
key_lengthc                 C   sv   t j|t jdd d d f }t j|t jdd d d f }|| }| |}|| jjj}| |}|g d}|S )N)dtype)r   r   r   )	r8   arangelong_relative_positions_buckettor=   rn   devicerQ   )r>   rx   ry   context_positionmemory_positionrelative_positionrelative_position_bucketvaluesr   r   r   rL      s   

zWavLMAttention.compute_biasrelative_positionsc                 C   s   | j d }|dktj| }t|}|d }||k }t| | }|t| j|  }|||  }|| tj}t	|t
||d }|t|||7 }|S )Nr   r   r   )r(   r~   r8   r|   abslogfloatmathr)   min	full_likewhere)r>   r   r(   relative_buckets	max_exactis_smallrelative_positions_if_largerelative_position_if_larger   r   r   r}      s   

z)WavLMAttention._relative_positions_bucket)r"   r#   r$   TNNFr   )r   r   r   __doc__intr   boolr-   r8   Tensortuplerb   FloatTensor
LongTensor
BoolTensorrU   rL   r}   __classcell__r   r   r?   r   r!   '   s^    '
)

7
r!   c                   @   r   )WavLMFeedForwardNr   r   r   r   r   r      r   r   c                       s2   e Zd Zddedef fddZdd	d
Z  ZS )WavLMEncoderLayerTconfigr*   c                    n   t    t|j|j|j|j|j|d| _t	
|j| _t	j|j|jd| _t|| _t	j|j|jd| _d S N)r%   r&   r'   r(   r)   r*   epsr,   r-   r!   hidden_sizenum_attention_headsattention_dropoutr(   max_bucket_distance	attentionr1   Dropouthidden_dropoutr'   	LayerNormlayer_norm_eps
layer_normr   feed_forwardfinal_layer_normr>   r   r*   r?   r   r   r-         

zWavLMEncoderLayer.__init__NFr   c           	      C   sl   |}| j |||||d\}}}| |}|| }| |}|| | }| |}||f}|r4||f7 }|S )NrB   rC   rD   rV   )r   r'   r   r   r   )	r>   rA   rB   rC   rD   rV   attn_residualra   outputsr   r   r   rb      s"   



zWavLMEncoderLayer.forwardTr   r   r   r   r   r   r-   rb   r   r   r   r?   r   r          r   c                       s2   e Zd Zd
dedef fddZddd	Z  ZS ) WavLMEncoderLayerStableLayerNormTr   r*   c                    r   r   r   r   r?   r   r   r-      r   z)WavLMEncoderLayerStableLayerNorm.__init__NFc                 C   sf   |}|  |}| j||||d\}}}| |}|| }|| | | }||f}|r1||f7 }|S )N)rB   rC   rD   )r   r   r'   r   r   )r>   rA   rB   rC   rD   r   ra   r   r   r   r   rb   
  s   


z(WavLMEncoderLayerStableLayerNorm.forwardr   )NNFr   r   r   r?   r   r      r   r   c                       .   e Zd Z fddZ				dddZ  ZS )	WavLMEncoderc                    f   t     | _t | _tj j jd| _	t
 j| _t fddt jD | _d| _d S )Nr   c                       g | ]
}t  |d kdqS r   )r*   )r   .0ir   r   r   
<listcomp>'  s    z)WavLMEncoder.__init__.<locals>.<listcomp>Fr,   r-   r   r   pos_conv_embedr1   r   r   r   r   r   r   r'   
ModuleListrangenum_hidden_layerslayersgradient_checkpointingr>   r   r?   r   r   r-      s   


zWavLMEncoder.__init__NFTc                 C   s>  |rdnd }|r
dnd }|d ur"| ddd|jd }d|| < | |}	||	 }| |}| |}t p;t| }
d }t| j	D ]?\}}|rN||f }t
g }| jo_|dko_|| jjk }|rd|
ru||||||d}|d d \}}|ryd}|r||d f }qC|r||f }|stdd	 |||fD S t|||d
S )Nr   rF   r   r   r   r   NNNc                 s       | ]	}|d ur|V  qd S Nr   r   vr   r   r   	<genexpr>a      z'WavLMEncoder.forward.<locals>.<genexpr>last_hidden_staterA   
attentions)rM   rN   rP   r   r   r'   r   r   	enumerater   r8   randro   r   	layerdropr   r   r>   rA   rB   rD   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsexpand_attention_maskposition_embeddingssynced_gpusrC   r   layerdropout_probabilityskip_the_layerlayer_outputsr   r   r   rb   +  sN   






zWavLMEncoder.forwardNFFTr   r   r   r-   rb   r   r   r   r?   r   r     s    r   c                       r   )	WavLMEncoderStableLayerNormc                    r   )Nr   c                    r   r   )r   r   r   r   r   r   q  s    z8WavLMEncoderStableLayerNorm.__init__.<locals>.<listcomp>Fr   r   r?   r   r   r-   j  s   



z$WavLMEncoderStableLayerNorm.__init__NFTc                 C   s<  |rdnd }|r
dnd }|d ur"| ddd|jd }d|| < | |}	||	 }| |}t p6t| }
d }t| jD ]>\}}|rI||f }t	
g }| joZ|dkoZ|| jjk }|r_|
ro|||||d}|d d \}}|rsd}|r|||d f }q>| |}|r||f }|stdd	 |||fD S t|||d
S )Nr   rF   r   r   r   )rB   rD   rC   r   c                 s   r   r   r   r   r   r   r   r     r   z6WavLMEncoderStableLayerNorm.forward.<locals>.<genexpr>r   )rM   rN   rP   r   r'   r   r   r   r   r8   r   ro   r   r   r   r   r   r   r   r   r   rb   x  sH   






z#WavLMEncoderStableLayerNorm.forwardr   r   r   r   r?   r   r   i  s    r   c                       s4   e Zd ZdZ fddZedd Zdd Z  ZS )WavLMGumbelVectorQuantizerz
    Vector quantization using gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH
    GUMBEL-SOFTMAX](https://huggingface.co/papers/1611.01144) for more information.
    c                    s   t    |j| _|j| _|j| j dkr"td|j d| j dt	t
d| j| j |j| j | _t|jd | j| j | _d| _d S )Nr   z`config.codevector_dim z5 must be divisible by `config.num_codevector_groups` z for concatenation.r   rF   r   )r,   r-   num_codevector_groups
num_groupsnum_codevectors_per_groupnum_varscodevector_dimr/   r1   r7   r8   r   codevectorsr2   conv_dimweight_projtemperaturer   r?   r   r   r-     s   


z#WavLMGumbelVectorQuantizer.__init__c                 C   s2   | j dd}ttjt||dd  }|S )Nr   rH   rF   )meanr8   exprR   xlogy)probsmarginal_probs
perplexityr   r   r   _compute_perplexity  s   "z.WavLMGumbelVectorQuantizer._compute_perplexityc                 C   s  |j \}}}| |}||| | j d}| jrAtjj| | j	dd}|
|}tj||| | jd dd}| |}n$|jdd}|j|j  d|ddd}||| | jd}| |}||| d}|d| j }	|	|| | j| jd}
|
d||d}
|
|fS )NrF   T)tauhardrH   r   rJ   )rP   r   rO   r   ro   r1   
functionalgumbel_softmaxr   r   type_asr8   softmaxr   argmax	new_zerosscatter_rM   r   r   rR   )r>   rA   
batch_sizesequence_lengthr   codevector_probscodevector_soft_distr   codevector_idxcodevectors_per_groupr   r   r   r   rb     s*   


z"WavLMGumbelVectorQuantizer.forward)	r   r   r   r   r-   staticmethodr   rb   r   r   r   r?   r   r     s    
r   c                   @   sZ   e Zd ZU eed< dZdZdZdZdZ	dZ
dZe dd Zd	d
 Zdd Zdd ZdS )WavLMPreTrainedModelr   wavlminput_valuesaudioTFc              	   C   s  t |trtj|jjddd t|jj t|j	 dS t |t
rFtj|jjddtd|jjd |jj   d t|jjd dS t |trltd|jj }tj|jj| |d tj|jj| |d dS t |tjrtj|jd| jjd |jdurt|j dS dS t |tjtjfrt|j t|j dS t |tjrt|j |jdurt|j|j|jd   }tj|j| |d dS dS dS )zInitialize the weightsr"   r   )r   stdr   r   )abN)
isinstancer   initnormal_r   rn   zeros_rm   uniform_r   r   convr   sqrtkernel_sizein_channels	constant_r    
projectionin_featuresr1   r2   r   initializer_ranger   	GroupNormones_Conv1dkaiming_normal_groups)r>   modulekr   r   r   _init_weights  s<   

 


z"WavLMPreTrainedModel._init_weightsc                 C      t dNzNot needed for WavLMAttributeErrorr>   r   r   r   _get_adapters&     z"WavLMPreTrainedModel._get_adaptersc                 C   r%  r&  r'  r)  r   r   r   init_adapter_layers)  r+  z(WavLMPreTrainedModel.init_adapter_layersc                 C   r%  r&  r'  r)  r   r   r   load_adapter,  r+  z!WavLMPreTrainedModel.load_adapterN)r   r   r   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr8   no_gradr$  r*  r,  r-  r   r   r   r   r	    s   
 
!r	  c                   @   r   )
WavLMModelNr   r   r   r   r   r7  3  r   r7  c                   @   r   )WavLMForCTCNr   r   r   r   r   r8  7  r   r8  c                   @   r   )WavLMForSequenceClassificationNr   r   r   r   r   r9  ;  r   r9  c                   @   r   ) WavLMForAudioFrameClassificationNr   r   r   r   r   r:  ?  r   r:  c                   @   r   )WavLMForXVectorNr   r   r   r   r   r;  C  r   r;  )r:  r8  r9  r;  r7  r	  )8r   r8   torch.nnr1   torch.nn.functionalr   ri    r   r  integrations.deepspeedr   integrations.fsdpr   modeling_layersr   modeling_outputsr   r   modeling_utilsr	   utilsr
   wav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   r   r   configuration_wavlmr   
get_loggerr   loggerr   r    Moduler!   r   r   r   r   r   r   r	  WavLMBaseModelOutputr7  r8  r9  r:  r;  __all__r   r   r   r   <module>   s@    ,
 ')%JKF6