o
    iZ                     @   s  d dl Z d dlmZmZ d dlZd dlmZ d dlm  mZ	 ddl
mZ ddlmZ ddlmZ ddlmZmZ ddlmZ dd	lmZ d
dlmZmZmZmZmZmZmZmZm Z  ddl!m"Z" e#e$Z%G dd deZ&G dd deZ'G dd dej(Z)G dd deZ*G dd deZ+G dd deZ,G dd dej(Z-G dd dej(Z.G dd dej(Z/G d d! d!ee Z0eZ1G d"d# d#eZ2G d$d% d%eZ3G d&d' d'eZ4G d(d) d)eZ5G d*d+ d+eZ6g d,Z7dS )-    N)OptionalUnion   )is_deepspeed_zero3_enabled)is_fsdp_managed_module)GradientCheckpointingLayer)BaseModelOutputWav2Vec2BaseModelOutput)PreTrainedModel)logging   )	Wav2Vec2FeatureProjectionWav2Vec2FeedForward#Wav2Vec2ForAudioFrameClassificationWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2ForXVectorWav2Vec2ModelWav2Vec2PositionalConvEmbeddingWav2Vec2PreTrainedModel   )WavLMConfigc                   @      e Zd ZdS )WavLMPositionalConvEmbeddingN__name__
__module____qualname__ r   r   d/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/wavlm/modular_wavlm.pyr          r   c                   @   r   )WavLMFeatureProjectionNr   r   r   r   r   r!   #   r    r!   c                       s   e Zd ZdZ				d"dededed	ed
edef fddZ				d#dej	de
ej	 de
ej	 dedeej	e
ej	 e
eej	  f f
ddZdejdeejejf dejdedeejejf f
ddZdededejfddZdejdejfd d!Z  ZS )$WavLMAttentionz=Multi-headed attention from 'Attention Is All You Need' paper        @     T	embed_dim	num_headsdropoutnum_bucketsmax_distancehas_relative_position_biasc                    s   t    || _|| _|| _|| | _| j| | jkr'td| j d| d| jd | _t	||| _
t	||| _t	||| _t	||| _|| _|| _ttd| jdd| _t	| jd| _|rqt| j| j| _d S d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      r      )super__init__r&   r'   r(   head_dim
ValueErrorscalingnnLineark_projv_projq_projout_projr)   r*   	Parametertorchonesgru_rel_pos_constgru_rel_pos_linear	Embeddingrel_attn_embed)selfr&   r'   r(   r)   r*   r+   	__class__r   r   r.   *   s.   
	

zWavLMAttention.__init__NFr   hidden_statesattention_maskposition_biasoutput_attentionsreturnc                 C   s  |  \}}}|du r$| ||}|d|ddd|| j ||}||jdd | jdf }	|	dddd}	| |	}
|
|	jdd d 	d}
t
|
jddd\}}||| j d	  d
 }||| j dd| }|d||f}| ||||\}}|||fS )z'Attention layer with relative attentionNr   r   r   r   )r      dim      ?g       @)sizecompute_bias	unsqueezerepeatviewr'   shapepermuter<   sumr9   sigmoidchunkr;   torch_multi_head_self_attention)r?   rB   rC   rD   rE   indexbsztgt_len_gated_hidden_statesrelative_position_projgate_agate_bgate_outputgated_position_biasattn_outputattn_weightsr   r   r   forwardN   s"   	$

zWavLMAttention.forwardr`   c                 C   s   | dd } }}|dur|dnd}d }	}
d}tj|||| j| jtdgt| j	j
| jj
| jj
f|	|
|| j| jj| jj
| j|||d| j	j| jj| jjd\}}| dd}|durz|dddf |jdd | jf |jdd  }||fS )zCsimple wrapper around torch's multi_head_attention_forward functionr   r   NFT)use_separate_proj_weightq_proj_weightk_proj_weightv_proj_weight)	transposeneFmulti_head_attention_forwardr&   r'   r9   emptycatr6   biasr4   r5   r(   r7   weighttrainingbroadcast_torQ   )r?   rB   rC   r`   rE   querykeyvaluekey_padding_maskbias_kbias_vadd_zero_attnra   rb   r   r   r   rV   w   sB   	

"z.WavLMAttention.torch_multi_head_self_attentionquery_length
key_lengthc                 C   sv   t j|t jdd d d f }t j|t jdd d d f }|| }| |}|| jjj}| |}|g d}|S )N)dtype)r   r   r   )	r9   arangelong_relative_positions_buckettor>   ro   devicerR   )r?   ry   rz   context_positionmemory_positionrelative_positionrelative_position_bucketvaluesr   r   r   rM      s   

zWavLMAttention.compute_biasrelative_positionsc                 C   s   | j d }|dktj| }t|}|d }||k }t| | }|t| j|  }|||  }|| tj}t	|t
||d }|t|||7 }|S )Nr   r   r   )r)   r   r9   r}   abslogfloatmathr*   min	full_likewhere)r?   r   r)   relative_buckets	max_exactis_smallrelative_positions_if_largerelative_position_if_larger   r   r   r~      s   

z)WavLMAttention._relative_positions_bucket)r#   r$   r%   TNNFr   )r   r   r   __doc__intr   boolr.   r9   Tensorr   tuplerc   FloatTensorr   
LongTensor
BoolTensorrV   rM   r~   __classcell__r   r   r@   r   r"   '   s^    '
)
7
r"   c                   @   r   )WavLMFeedForwardNr   r   r   r   r   r      r    r   c                       s2   e Zd Zddedef fddZdd	d
Z  ZS )WavLMEncoderLayerTconfigr+   c                    n   t    t|j|j|j|j|j|d| _t	
|j| _t	j|j|jd| _t|| _t	j|j|jd| _d S N)r&   r'   r(   r)   r*   r+   epsr-   r.   r"   hidden_sizenum_attention_headsattention_dropoutr)   max_bucket_distance	attentionr2   Dropouthidden_dropoutr(   	LayerNormlayer_norm_eps
layer_normr   feed_forwardfinal_layer_normr?   r   r+   r@   r   r   r.         

zWavLMEncoderLayer.__init__NFr   c           	      C   sl   |}| j |||||d\}}}| |}|| }| |}|| | }| |}||f}|r4||f7 }|S )NrC   rD   rE   rW   )r   r(   r   r   r   )	r?   rB   rC   rD   rE   rW   attn_residualrb   outputsr   r   r   rc      s"   



zWavLMEncoderLayer.forwardTr   r   r   r   r   r   r.   rc   r   r   r   r@   r   r          r   c                       s2   e Zd Zd
dedef fddZddd	Z  ZS ) WavLMEncoderLayerStableLayerNormTr   r+   c                    r   r   r   r   r@   r   r   r.      r   z)WavLMEncoderLayerStableLayerNorm.__init__NFc                 C   sf   |}|  |}| j||||d\}}}| |}|| }|| | | }||f}|r1||f7 }|S )N)rC   rD   rE   )r   r   r(   r   r   )r?   rB   rC   rD   rE   r   rb   r   r   r   r   rc   
  s   


z(WavLMEncoderLayerStableLayerNorm.forwardr   )NNFr   r   r   r@   r   r      r   r   c                       .   e Zd Z fddZ				dddZ  ZS )	WavLMEncoderc                    f   t     | _t | _tj j jd| _	t
 j| _t fddt jD | _d| _d S )Nr   c                       g | ]
}t  |d kdqS r   )r+   )r   .0ir   r   r   
<listcomp>'  s    z)WavLMEncoder.__init__.<locals>.<listcomp>Fr-   r.   r   r   pos_conv_embedr2   r   r   r   r   r   r   r(   
ModuleListrangenum_hidden_layerslayersgradient_checkpointingr?   r   r@   r   r   r.      s   


zWavLMEncoder.__init__NFTc                 C   s>  |rdnd }|r
dnd }|d ur"| ddd|jd }d|| < | |}	||	 }| |}| |}t p;t| }
d }t| j	D ]?\}}|rN||f }t
g }| jo_|dko_|| jjk }|rd|
ru||||||d}|d d \}}|ryd}|r||d f }qC|r||f }|stdd	 |||fD S t|||d
S )Nr   rG   r   r   r   r   NNNc                 s       | ]	}|d ur|V  qd S Nr   r   vr   r   r   	<genexpr>a      z'WavLMEncoder.forward.<locals>.<genexpr>last_hidden_staterB   
attentions)rN   rO   rQ   r   r   r(   r   r   	enumerater   r9   randrp   r   	layerdropr   r   r?   rB   rC   rE   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsexpand_attention_maskposition_embeddingssynced_gpusrD   r   layerdropout_probabilityskip_the_layerlayer_outputsr   r   r   rc   +  sN   






zWavLMEncoder.forwardNFFTr   r   r   r.   rc   r   r   r   r@   r   r     s    r   c                       r   )	WavLMEncoderStableLayerNormc                    r   )Nr   c                    r   r   )r   r   r   r   r   r   q  s    z8WavLMEncoderStableLayerNorm.__init__.<locals>.<listcomp>Fr   r   r@   r   r   r.   j  s   



z$WavLMEncoderStableLayerNorm.__init__NFTc                 C   s<  |rdnd }|r
dnd }|d ur"| ddd|jd }d|| < | |}	||	 }| |}t p6t| }
d }t| jD ]>\}}|rI||f }t	
g }| joZ|dkoZ|| jjk }|r_|
ro|||||d}|d d \}}|rsd}|r|||d f }q>| |}|r||f }|stdd	 |||fD S t|||d
S )Nr   rG   r   r   r   )rC   rE   rD   r   c                 s   r   r   r   r   r   r   r   r     r   z6WavLMEncoderStableLayerNorm.forward.<locals>.<genexpr>r   )rN   rO   rQ   r   r(   r   r   r   r   r9   r   rp   r   r   r   r   r   r   r   r   r   rc   x  sH   






z#WavLMEncoderStableLayerNorm.forwardr   r   r   r   r@   r   r   i  s    r   c                       s4   e Zd ZdZ fddZedd Zdd Z  ZS )WavLMGumbelVectorQuantizerz
    Vector quantization using gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH
    GUMBEL-SOFTMAX](https://huggingface.co/papers/1611.01144) for more information.
    c                    s   t    |j| _|j| _|j| j dkr"td|j d| j dt	t
d| j| j |j| j | _t|jd | j| j | _d| _d S )Nr   z`config.codevector_dim z5 must be divisible by `config.num_codevector_groups` z for concatenation.r   rG   r   )r-   r.   num_codevector_groups
num_groupsnum_codevectors_per_groupnum_varscodevector_dimr0   r2   r8   r9   r   codevectorsr3   conv_dimweight_projtemperaturer   r@   r   r   r.     s   


z#WavLMGumbelVectorQuantizer.__init__c                 C   s8   | j dd}ttj|t|d  dd  }|S )Nr   rI   gHz>rG   )meanr9   exprS   r   )probsmarginal_probs
perplexityr   r   r   _compute_perplexity  s   (z.WavLMGumbelVectorQuantizer._compute_perplexityc                 C   s  |j \}}}| |}||| | j d}| jrAtjj| | j	dd}|
|}tj||| | jd dd}| |}n$|jdd}|j|j  d|ddd}||| | jd}| |}||| d}|d| j }	|	|| | j| jd}
|
d||d}
|
|fS )NrG   T)tauhardrI   r   rK   )rQ   r   rP   r   rp   r2   
functionalgumbel_softmaxr   r   type_asr9   softmaxr   argmax	new_zerosscatter_rN   r   r   rS   )r?   rB   
batch_sizesequence_lengthr   codevector_probscodevector_soft_distr   codevector_idxcodevectors_per_groupr   r   r   r   rc     s*   


z"WavLMGumbelVectorQuantizer.forward)	r   r   r   r   r.   staticmethodr   rc   r   r   r   r@   r   r     s    
r   c                   @   sN   e Zd ZU eed< dZdZdZdZdZ	dZ
dd Zdd	 Zd
d Zdd ZdS )WavLMPreTrainedModelr   wavlminput_valuesTFc              	   C   s  t |tr|jjjjddd |jjj  tj	
|j dS t |trItj	j|jjddtd|jjd |jj   d tj	|jjd dS t |trqtd|jj }tj	j
|jj| |d tj	j
|jj| |d dS t |tjr|jjjd| jjd |jdur|jj  dS dS t |tjtjfr|jj  |jjd dS t |tjrtj	|j |jdurt|j|j|jd   }tj	j
|j| |d dS dS dS )	zInitialize the weightsr#   r   )r   stdr   r   )abNrK   )
isinstancer   r   ro   datanormal_rn   zero_r2   inituniform_r   r   convr   sqrtkernel_sizein_channels	constant_r!   
projectionin_featuresr3   r   initializer_ranger   	GroupNormfill_Conv1dkaiming_normal_groups)r?   modulekr   r   r   _init_weights  s<   

 


z"WavLMPreTrainedModel._init_weightsc                 C      t dNzNot needed for WavLMAttributeErrorr?   r   r   r   _get_adapters$     z"WavLMPreTrainedModel._get_adaptersc                 C   r%  r&  r'  r)  r   r   r   init_adapter_layers'  r+  z(WavLMPreTrainedModel.init_adapter_layersc                 C   r%  r&  r'  r)  r   r   r   load_adapter*  r+  z!WavLMPreTrainedModel.load_adapterN)r   r   r   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr$  r*  r,  r-  r   r   r   r   r	    s   
 !r	  c                   @   r   )
WavLMModelNr   r   r   r   r   r5  1  r    r5  c                   @   r   )WavLMForCTCNr   r   r   r   r   r6  5  r    r6  c                   @   r   )WavLMForSequenceClassificationNr   r   r   r   r   r7  9  r    r7  c                   @   r   ) WavLMForAudioFrameClassificationNr   r   r   r   r   r8  =  r    r8  c                   @   r   )WavLMForXVectorNr   r   r   r   r   r9  A  r    r9  )r8  r6  r7  r9  r5  r	  )8r   typingr   r   r9   torch.nnr2   torch.nn.functionalr   rj   integrations.deepspeedr   integrations.fsdpr   modeling_layersr   modeling_outputsr   r	   modeling_utilsr
   utilsr   wav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   r   r   configuration_wavlmr   
get_loggerr   loggerr   r!   Moduler"   r   r   r   r   r   r   r	  WavLMBaseModelOutputr5  r6  r7  r8  r9  __all__r   r   r   r   <module>   s@    ,
 ')%JKF4