o
    iq                    @   s  d dl Z d dlZd dlmZmZ d dlZd dlZd dlm	Z	 d dl
m	  mZ d dlmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZ ddlm Z m!Z!m"Z" ddl#m$Z$ e"%e&Z'G dd de	j(Z)G dd de	j(Z*G dd de	j(Z+G dd de	j(Z,G dd de	j(Z-G dd deZ.G dd deZ/G dd de	j(Z0G dd de	j(Z1G d d! d!e	j(Z2e G d"d# d#eZ3G d$d% d%eZ4G d&d' d'eZ5G d(d) d)eZ6G d*d+ d+e	j(Z7G d,d- d-e	j(Z8G d.d/ d/e	j(Z9		 dLd0e:e;e;f d1e<d2e;d3eej= d4e;d5ej>fd6d7Z?eZ@e G d8d9 d9e3ZAd:ZBe d;d<G d=d> d>e3ZCe d?d<G d@dA dAe3ZDe G dBdC dCe3ZEG dDdE dEe	j(ZFG dFdG dGe	j(ZGe dHd<G dIdJ dJe3ZHg dKZIdS )M    N)OptionalUnion)CrossEntropyLoss   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)GradientCheckpointingLayer)BaseModelOutputCausalLMOutputSequenceClassifierOutputTokenClassifierOutputWav2Vec2BaseModelOutputXVectorOutput)PreTrainedModel)auto_docstringis_peft_availablelogging   )WavLMConfigc                       $   e Zd Z fddZdd Z  ZS )WavLMSamePadLayerc                    s*   t    |d dkrd| _d S d| _d S N   r   r   )super__init__num_pad_remove)selfnum_conv_pos_embeddings	__class__ e/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/wavlm/modeling_wavlm.pyr   &   s   
 zWavLMSamePadLayer.__init__c                 C   s,   | j dkr|d d d d d | j  f }|S Nr   )r   r   hidden_statesr!   r!   r"   forward*   s   
zWavLMSamePadLayer.forward__name__
__module____qualname__r   r&   __classcell__r!   r!   r   r"   r   %   s    r   c                       r   )WavLMPositionalConvEmbeddingc                    s$  t    tj|j|j|j|jd |jd| _tjj	}t
tjjdr'tjjj	}t r{dd l}|jj| jjdd || jddd| _W d    n1 sLw   Y  t
| jdrd| jjjj}| jjjj}n| jj}| jj}|j| | |j| | n	|| jddd| _t|j| _t|j | _d S )	Nr   )kernel_sizepaddinggroupsweight_normr   )modifier_rankweight)namedimparametrizations)r   r   nnConv1dhidden_sizer   num_conv_pos_embedding_groupsconvutilsr0   hasattrr5   r   	deepspeedzeroGatheredParametersr2   	original0	original1weight_gweight_vregister_external_parameterr   r.   r   feat_extract_activation
activation)r   configr0   r=   rB   rC   r   r!   r"   r   1   s4   

z%WavLMPositionalConvEmbedding.__init__c                 C   s:   | dd}| |}| |}| |}| dd}|S Nr   r   )	transposer:   r.   rF   r$   r!   r!   r"   r&   R   s   


z$WavLMPositionalConvEmbedding.forwardr'   r!   r!   r   r"   r,   0   s    !r,   c                       r   )WavLMFeatureProjectionc                    sJ   t    tj|jd |jd| _t|jd |j| _	t
|j| _d S )Neps)r   r   r6   	LayerNormconv_dimlayer_norm_eps
layer_normLinearr8   
projectionDropoutfeat_proj_dropoutdropoutr   rG   r   r!   r"   r   ^   s   
zWavLMFeatureProjection.__init__c                 C   s&   |  |}| |}| |}||fS N)rQ   rS   rV   )r   r%   norm_hidden_statesr!   r!   r"   r&   d   s   


zWavLMFeatureProjection.forwardr'   r!   r!   r   r"   rJ   ]   s    rJ   c                       s   e Zd ZdZ				d"dededed	ed
edef fddZ				d#dej	de
ej	 de
ej	 dedeej	e
ej	 e
eej	  f f
ddZdejdeejejf dejdedeejejf f
ddZdededejfddZdejdejfd d!Z  ZS )$WavLMAttentionz=Multi-headed attention from 'Attention Is All You Need' paper        @     T	embed_dim	num_headsrV   num_bucketsmax_distancehas_relative_position_biasc                    s   t    || _|| _|| _|| | _| j| | jkr'td| j d| d| jd | _t	||| _
t	||| _t	||| _t	||| _|| _|| _ttd| jdd| _t	| jd| _|rqt| j| j| _d S d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      r      )r   r   r^   r_   rV   head_dim
ValueErrorscalingr6   rR   k_projv_projq_projout_projr`   ra   	Parametertorchonesgru_rel_pos_constgru_rel_pos_linear	Embeddingrel_attn_embed)r   r^   r_   rV   r`   ra   rb   r   r!   r"   r   o   s.   
	

zWavLMAttention.__init__NFr   r%   attention_maskposition_biasoutput_attentionsreturnc                 C   s  |  \}}}|du r$| ||}|d|ddd|| j ||}||jdd | jdf }	|	dddd}	| |	}
|
|	jdd d 	d}
t
|
jddd\}}||| j d	  d
 }||| j dd| }|d||f}| ||||\}}|||fS )z'Attention layer with relative attentionNr   r   rK   r   r   )r      r4         ?g       @)sizecompute_bias	unsqueezerepeatviewr_   shapepermutero   sumrl   sigmoidchunkrn   torch_multi_head_self_attention)r   r%   rr   rs   rt   indexbsztgt_len_gated_hidden_statesrelative_position_projgate_agate_bgate_outputgated_position_biasattn_outputattn_weightsr!   r!   r"   r&      s"   	$

zWavLMAttention.forwardr   c                 C   s   | dd } }}|dur|dnd}d }	}
d}tj|||| j| jtdgt| j	j
| jj
| jj
f|	|
|| j| jj| jj
| j|||d| j	j| jj| jjd\}}| dd}|durz|dddf |jdd | jf |jdd  }||fS )zCsimple wrapper around torch's multi_head_attention_forward functionr   r   NFT)use_separate_proj_weightq_proj_weightk_proj_weightv_proj_weight)rI   neFmulti_head_attention_forwardr^   r_   rl   emptycatri   biasrg   rh   rV   rj   r2   trainingbroadcast_tor~   )r   r%   rr   r   rt   querykeyvaluekey_padding_maskbias_kbias_vadd_zero_attnr   r   r!   r!   r"   r      sB   	

"z.WavLMAttention.torch_multi_head_self_attentionquery_length
key_lengthc                 C   sv   t j|t jdd d d f }t j|t jdd d d f }|| }| |}|| jjj}| |}|g d}|S )Ndtype)r   r   r   )	rl   arangelong_relative_positions_buckettorq   r2   devicer   )r   r   r   context_positionmemory_positionrelative_positionrelative_position_bucketvaluesr!   r!   r"   rz      s   

zWavLMAttention.compute_biasrelative_positionsc                 C   s   | j d }|dktj| }t|}|d }||k }t| | }|t| j|  }|||  }|| tj}t	|t
||d }|t|||7 }|S r   )r`   r   rl   r   abslogfloatmathra   min	full_likewhere)r   r   r`   relative_buckets	max_exactis_smallrelative_positions_if_largerelative_position_if_larger!   r!   r"   r      s   

z)WavLMAttention._relative_positions_bucket)r[   r\   r]   TNNFr   )r(   r)   r*   __doc__intr   boolr   rl   Tensorr   tupler&   FloatTensorr   
LongTensor
BoolTensorr   rz   r   r+   r!   r!   r   r"   rZ   l   s^    '
)
7
rZ   c                       r   )WavLMFeedForwardc                    sp   t    t|j| _t|j|j| _	t
|jtr"t|j | _n|j| _t|j|j| _t|j| _d S rX   )r   r   r6   rT   activation_dropoutintermediate_dropoutrR   r8   intermediate_sizeintermediate_dense
isinstance
hidden_actstrr   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutrW   r   r!   r"   r     s   
zWavLMFeedForward.__init__c                 C   s6   |  |}| |}| |}| |}| |}|S rX   )r   r   r   r   r   r$   r!   r!   r"   r&      s   




zWavLMFeedForward.forwardr'   r!   r!   r   r"   r         r   c                       s2   e Zd Zddedef fddZdd	d
Z  ZS )WavLMEncoderLayerTrG   rb   c                    n   t    t|j|j|j|j|j|d| _t	
|j| _t	j|j|jd| _t|| _t	j|j|jd| _d S N)r^   r_   rV   r`   ra   rb   rL   r   r   rZ   r8   num_attention_headsattention_dropoutr`   max_bucket_distance	attentionr6   rT   r   rV   rN   rP   rQ   r   feed_forwardfinal_layer_normr   rG   rb   r   r!   r"   r   +     

zWavLMEncoderLayer.__init__NFr   c           	      C   sl   |}| j |||||d\}}}| |}|| }| |}|| | }| |}||f}|r4||f7 }|S )Nrr   rs   rt   r   )r   rV   rQ   r   r   )	r   r%   rr   rs   rt   r   attn_residualr   outputsr!   r!   r"   r&   :  s"   



zWavLMEncoderLayer.forwardTr   r(   r)   r*   r   r   r   r&   r+   r!   r!   r   r"   r   *      r   c                       s2   e Zd Zd
dedef fddZddd	Z  ZS ) WavLMEncoderLayerStableLayerNormTrG   rb   c                    r   r   r   r   r   r!   r"   r   T  r   z)WavLMEncoderLayerStableLayerNorm.__init__NFc                 C   sf   |}|  |}| j||||d\}}}| |}|| }|| | | }||f}|r1||f7 }|S )N)rr   rs   rt   )rQ   r   rV   r   r   )r   r%   rr   rs   rt   r   r   r   r!   r!   r"   r&   c  s   


z(WavLMEncoderLayerStableLayerNorm.forwardr   )NNFr   r!   r!   r   r"   r   S  r   r   c                       .   e Zd Z fddZ				dddZ  ZS )	WavLMEncoderc                    f   t     | _t | _tj j jd| _	t
 j| _t fddt jD | _d| _d S )NrL   c                       g | ]
}t  |d kdqS r   )rb   )r   .0irG   r!   r"   
<listcomp>  s    z)WavLMEncoder.__init__.<locals>.<listcomp>Fr   r   rG   r,   pos_conv_embedr6   rN   r8   rP   rQ   rT   r   rV   
ModuleListrangenum_hidden_layerslayersgradient_checkpointingrW   r   r   r"   r   y  s   


zWavLMEncoder.__init__NFTc                 C   s>  |rdnd }|r
dnd }|d ur"| ddd|jd }d|| < | |}	||	 }| |}| |}t p;t| }
d }t| j	D ]?\}}|rN||f }t
g }| jo_|dko_|| jjk }|rd|
ru||||||d}|d d \}}|ryd}|r||d f }qC|r||f }|stdd	 |||fD S t|||d
S )Nr!   rK   r   r   r   r   NNNc                 s       | ]	}|d ur|V  qd S rX   r!   r   vr!   r!   r"   	<genexpr>      z'WavLMEncoder.forward.<locals>.<genexpr>last_hidden_stater%   
attentions)r{   r|   r~   r   rQ   rV   r   r   	enumerater   rl   randr   rG   	layerdropr   r
   r   r%   rr   rt   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsexpand_attention_maskposition_embeddingssynced_gpusrs   r   layerdropout_probabilityskip_the_layerlayer_outputsr!   r!   r"   r&     sN   






zWavLMEncoder.forwardNFFTr'   r!   r!   r   r"   r   x  s    r   c                       r   )	WavLMEncoderStableLayerNormc                    r   )NrL   c                    r   r   )r   r   r   r!   r"   r     s    z8WavLMEncoderStableLayerNorm.__init__.<locals>.<listcomp>Fr   rW   r   r   r"   r     s   



z$WavLMEncoderStableLayerNorm.__init__NFTc                 C   s<  |rdnd }|r
dnd }|d ur"| ddd|jd }d|| < | |}	||	 }| |}t p6t| }
d }t| jD ]>\}}|rI||f }t	
g }| joZ|dkoZ|| jjk }|r_|
ro|||||d}|d d \}}|rsd}|r|||d f }q>| |}|r||f }|stdd	 |||fD S t|||d
S )Nr!   rK   r   r   r   )rr   rt   rs   r   c                 s   r   rX   r!   r   r!   r!   r"   r     r   z6WavLMEncoderStableLayerNorm.forward.<locals>.<genexpr>r   )r{   r|   r~   r   rV   r   r   r  r   rl   r  r   rG   r  rQ   r   r
   r  r!   r!   r"   r&     sH   






z#WavLMEncoderStableLayerNorm.forwardr  r'   r!   r!   r   r"   r    s    r  c                       s4   e Zd ZdZ fddZedd Zdd Z  ZS )WavLMGumbelVectorQuantizerz
    Vector quantization using gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH
    GUMBEL-SOFTMAX](https://huggingface.co/papers/1611.01144) for more information.
    c                    s   t    |j| _|j| _|j| j dkr"td|j d| j dt	t
d| j| j |j| j | _t|jd | j| j | _d| _d S )Nr   z`config.codevector_dim z5 must be divisible by `config.num_codevector_groups` z for concatenation.r   rK   r   )r   r   num_codevector_groups
num_groupsnum_codevectors_per_groupnum_varscodevector_dimre   r6   rk   rl   r   codevectorsrR   rO   weight_projtemperaturerW   r   r!   r"   r     s   


z#WavLMGumbelVectorQuantizer.__init__c                 C   s8   | j dd}ttj|t|d  dd  }|S )Nr   rw   gHz>rK   )meanrl   expr   r   )probsmarginal_probs
perplexityr!   r!   r"   _compute_perplexity(  s   (z.WavLMGumbelVectorQuantizer._compute_perplexityc                 C   s  |j \}}}| |}||| | j d}| jrAtjj| | j	dd}|
|}tj||| | jd dd}| |}n$|jdd}|j|j  d|ddd}||| | jd}| |}||| d}|d| j }	|	|| | j| jd}
|
d||d}
|
|fS )NrK   T)tauhardrw   r   rx   )r~   r  r}   r  r   r6   
functionalgumbel_softmaxr   r  type_asrl   softmaxr   argmax	new_zerosscatter_r{   r  r  r   )r   r%   
batch_sizesequence_lengthr8   codevector_probscodevector_soft_distr  codevector_idxcodevectors_per_groupr  r!   r!   r"   r&   .  s*   


z"WavLMGumbelVectorQuantizer.forward)	r(   r)   r*   r   r   staticmethodr   r&   r+   r!   r!   r   r"   r    s    
r  c                   @   sr   e Zd ZU eed< dZdZdZdZdZ	dZ
dd Z	dd	eejef d
ee fddZ	ddedejfddZdS )WavLMPreTrainedModelrG   wavlminput_valuesTFc              	   C   s  t |tr|jjjjddd |jjj  tj	
|j dS t |trItj	j|jjddtd|jjd |jj   d tj	|jjd dS t |trqtd|jj }tj	j
|jj| |d tj	j
|jj| |d dS t |tjr|jjjd| jjd |jdur|jj  dS dS t |tjtjfr|jj  |jjd dS t |tjrtj	|j |jdurt|j|j|jd   }tj	j
|j| |d dS dS dS )	zInitialize the weightsr[   r   )r  stdr   r   )abNrx   )r   r  r  r2   datanormal_r   zero_r6   inituniform_r  r,   r:   r   sqrtr-   in_channels	constant_rJ   rS   in_featuresrR   rG   initializer_rangerN   	GroupNormfill_r7   kaiming_normal_r/   )r   modulekr!   r!   r"   _init_weights]  s<   

 


z"WavLMPreTrainedModel._init_weightsNinput_lengthsadd_adapterc                 C   sn   |du r| j jn|}dd }t| j j| j jD ]
\}}||||}q|r5t| j jD ]
}||d| j j}q*|S )zH
        Computes the output length of the convolutional layers
        Nc                 S   s   t j| | |ddd S )Nfloor)rounding_moder   )rl   divinput_lengthr-   strider!   r!   r"   _conv_out_length  s   zOWavLMPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_lengthr   )rG   rI  zipconv_kernelconv_strider   num_adapter_layersadapter_stride)r   rH  rI  rP  r-   rO  r   r!   r!   r"    _get_feat_extract_output_lengths~  s   z5WavLMPreTrainedModel._get_feat_extract_output_lengthsfeature_vector_lengthrr   c                 C   s   |j ddd d df }| j||d}|tj}|jd }tj||f|j|jd}d|tj	|jd |jd|d f< |
dg d
dg }|S )NrK   rw   rI  r   )r   r   r   )r   )cumsumrV  r   rl   r   r~   zerosr   r   r   flipr   )r   rW  rr   rI  non_padded_lengthsoutput_lengthsr+  r!   r!   r"   "_get_feature_vector_attention_mask  s   
"z7WavLMPreTrainedModel._get_feature_vector_attention_maskrX   )r(   r)   r*   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnrG  r   rl   r   r   r   r   rV  r^  r!   r!   r!   r"   r2  S  s*   
 "
r2  c                       &   e Zd Zd fdd	Zdd Z  ZS )WavLMNoLayerNormConvLayerr   c                    sj   t    |dkr|j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
t|j | _d S )Nr   r   r-   rO  r   )r   r   rO   in_conv_dimout_conv_dimr6   r7   rR  rS  	conv_biasr:   r   rE   rF   r   rG   layer_idr   r!   r"   r     s   
z"WavLMNoLayerNormConvLayer.__init__c                 C   s   |  |}| |}|S rX   )r:   rF   r$   r!   r!   r"   r&     s   

z!WavLMNoLayerNormConvLayer.forwardr   r'   r!   r!   r   r"   rg    s    rg  c                       rf  )WavLMLayerNormConvLayerr   c                    s|   t    |dkr|j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
tj| jdd| _t|j | _d S )Nr   r   rh  T)elementwise_affine)r   r   rO   ri  rj  r6   r7   rR  rS  rk  r:   rN   rQ   r   rE   rF   rl  r   r!   r"   r     s   
z WavLMLayerNormConvLayer.__init__c                 C   s:   |  |}|dd}| |}|dd}| |}|S )Nr#  rK   )r:   rI   rQ   rF   r$   r!   r!   r"   r&     s   


zWavLMLayerNormConvLayer.forwardrn  r'   r!   r!   r   r"   ro    s    ro  c                       rf  )WavLMGroupNormConvLayerr   c                    s   t    |dkr|j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
t|j | _tj| j| jdd| _d S )Nr   r   rh  T)r  num_channelsaffine)r   r   rO   ri  rj  r6   r7   rR  rS  rk  r:   r   rE   rF   rB  rQ   rl  r   r!   r"   r     s   
z WavLMGroupNormConvLayer.__init__c                 C   s"   |  |}| |}| |}|S rX   )r:   rQ   rF   r$   r!   r!   r"   r&     s   


zWavLMGroupNormConvLayer.forwardrn  r'   r!   r!   r   r"   rq    s    rq  c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )WavLMFeatureEncoderz.Construct the features from raw audio waveformc                    s   t     jdkr t ddg fddt jd D  }n jdkr2 fddt jD }n	td	 j d
t|| _	d| _
d| _d S )Ngroupr   rm  c                    s   g | ]
}t  |d  dqS )r   rv  )rg  r   r   r!   r"   r     s    z0WavLMFeatureEncoder.__init__.<locals>.<listcomp>r   r  c                    s   g | ]}t  |d qS )rv  )ro  r   r   r!   r"   r     s    z`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)r   r   feat_extract_normrq  r   num_feat_extract_layersre   r6   r   conv_layersr   _requires_grad)r   rG   ry  r   r   r"   r     s   




zWavLMFeatureEncoder.__init__c                 C   s   |   D ]}d|_qd| _d S )NF)
parametersrequires_gradrz  r   paramr!   r!   r"   _freeze_parameters  s   
z&WavLMFeatureEncoder._freeze_parametersc                 C   s:   |d d d f }| j r| jrd|_| jD ]}||}q|S )NT)rz  r   r|  ry  )r   r4  r%   
conv_layerr!   r!   r"   r&     s   

zWavLMFeatureEncoder.forward)r(   r)   r*   r   r   r  r&   r+   r!   r!   r   r"   rt    s
    rt  c                       r   )WavLMAdapterLayerc                    s0   t    tj|jd|j |j|jdd| _d S )Nr   r   )rO  r.   )r   r   r6   r7   output_hidden_sizeadapter_kernel_sizerU  r:   rW   r   r!   r"   r     s   
zWavLMAdapterLayer.__init__c                 C   s   |  |}tjj|dd}|S )Nr   rw   )r:   r6   r$  glur$   r!   r!   r"   r&   #  s   
zWavLMAdapterLayer.forwardr'   r!   r!   r   r"   r    s    
r  c                       r   )WavLMAdapterc                    sp   t     j jkrt j j| _t j| _nd  | _| _t	 fddt
 jD | _ j| _d S )Nc                 3   s    | ]}t  V  qd S rX   )r  r   r   r   r!   r"   r   5  s    z(WavLMAdapter.__init__.<locals>.<genexpr>)r   r   r  r8   r6   rR   projrN   proj_layer_normr   r   rT  r   r  rW   r   r   r"   r   +  s   
 zWavLMAdapter.__init__c                 C   sr   | j d ur| jd ur|  |}| |}|dd}| jD ]}tj }| jr,|| jkr0||}q|dd}|S rH   )r  r  rI   r   nprandomr   r  )r   r%   r  layerdrop_probr!   r!   r"   r&   8  s   



zWavLMAdapter.forwardr'   r!   r!   r   r"   r  *  r   r  r~   	mask_probmask_lengthrr   	min_masksru   c                    s  | \}dk rt dkrt d d dtjd   fdd}|dur:| d	 n
fd
dt|D }tj	|ft
d}g }	|}
|
dkrZ|S |D ];}||}tjjt|d  |dd}t|dkr}d }n|d }t|tj|
| tjd| g}|	| q\t|	}	t|	dddddf ||
f}	|	||
 }	tddddf }t|||
f||
 }|	| }	|	 d krd |	|	d k< t||	dd	 |S )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                    sX   t |     }t|}| kr }| d  |k r*t| d  d}|S )z;Given input length, compute how many spans should be maskedr   r   )r   max)rN  num_masked_spanepsilonr  r  r  r,  r!   r"   compute_num_masked_spano  s   
z6_compute_mask_indices.<locals>.compute_num_masked_spanNrK   c                    s   g | ]} qS r!   r!   r  )r,  r!   r"   r     s    z)_compute_mask_indices.<locals>.<listcomp>r   r   F)replace)re   r  r  r  itemdetachr   tolistr   rZ  r   choicer   lenconcatenaterm   int32appendarrayr   reshaper  put_along_axis)r~   r  r  rr   r  r+  r  rH  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanrN  r  spec_aug_mask_idxdummy_mask_idxoffsetsr!   r  r"   _compute_mask_indicesI  s\   

r  c                       s   e Zd Zdef fddZdd Zdd Z		dd	ejd
e	ej de	ej
 fddZe					dde	ej de	ej d
e	ej de	e de	e de	e deeef fddZ  ZS )
WavLMModelrG   c                    s   t  | || _t|| _t|| _|jdks|jdkr)t	
t|j | _|jr2t|| _nt|| _|jr>t|nd | _|   d S )Nr[   )r   r   rG   rt  feature_extractorrJ   feature_projectionmask_time_probmask_feature_probr6   rk   rl   r   r8   r<  masked_spec_embeddo_stable_layer_normr  encoderr   rI  r  adapter	post_initrW   r   r!   r"   r     s   


zWavLMModel.__init__c                 C      t dt |   dS z
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.NwarningswarnFutureWarningfreeze_feature_encoderr   r!   r!   r"   freeze_feature_extractor  
   z#WavLMModel.freeze_feature_extractorc                 C   s   | j   dS 
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)r  r  r  r!   r!   r"   r    s   z!WavLMModel.freeze_feature_encoderNr%   mask_time_indicesrr   c                 C   s  t | jdds	|S | \}}}|dur| j|j||< n-| jjdkrK| jrKt||f| jj| jj	|| jj
d}tj||jtjd}| j|j||< | jjdkr| jrt||f| jj| jj| jjd}tj||jtjd}|dddf d|d}d||< |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTNr   )r  r  rr   r  )r   r   )r  r  r  rK   )getattrrG   ry   r  r   r   r  r   r  mask_time_lengthmask_time_min_masksrl   tensorr   r   r  mask_feature_lengthmask_feature_min_masksexpand)r   r%   r  rr   r+  r,  r8   mask_feature_indicesr!   r!   r"   _mask_hidden_states  s4   zWavLMModel._mask_hidden_statesr4  rt   r  r  ru   c           
      C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| |}|dd}|dur8| j|jd |dd}| |\}}| j	|||d}| j
|||||d}	|	d }| jdur_| |}|sk||f|	dd  S t|||	j|	jd	S )
a/  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        Nr   r   FrX  )r  rr   rr   rt   r  r  r   )r   extract_featuresr%   r   )rG   rt   r  use_return_dictr  rI   r^  r~   r  r  r  r  WavLMBaseModelOutputr%   r   )
r   r4  rr   r  rt   r  r  r  r%   encoder_outputsr!   r!   r"   r&     s@   


zWavLMModel.forward)NNNNNNN)r(   r)   r*   r   r   r  r  rl   r   r   r   r  r   r   r   r   r   r  r&   r+   r!   r!   r   r"   r    sD    

.
r  r   zm
    WavLM Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    )custom_introc                       s   e Zd Zddee f fddZdd Zdd Zd	d
 Zdd Z	e
					ddeej deej dee dee dee deej deeef fddZ  ZS )WavLMForCTCNtarget_langc                    s~   t  | t|| _t|j| _|| _|j	du r#t
d| j dt|dr.|jr.|jn|j}t||j	| _|   dS )a/  
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`WavLMForCTC`] with adapters. Uses 'eng' by
            default.
        NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `WavLMForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.rI  )r   r   r  r3  r6   rT   final_dropoutrV   r  
vocab_sizere   r    r<   rI  r  r8   rR   lm_headr  )r   rG   r  r  r   r!   r"   r   ^  s   

zWavLMForCTC.__init__c                 C   sv   | j }|durt| jdddu rtd| d|du r,t| jdddur,td dS |dur9| j|dd dS dS )a'  
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        Nadapter_attn_dimzCannot pass `target_lang`: z- if `config.adapter_attn_dim` is not defined.z)By default `target_lang` is set to 'eng'.T)
force_load)r  r  rG   re   loggerinfoload_adapter)r   r  r!   r!   r"   tie_weights{  s   zWavLMForCTC.tie_weightsc                 C   r  r  r  Nr  r  r!   r!   r"   r    r  z$WavLMForCTC.freeze_feature_extractorc                 C      | j j  dS r  r3  r  r  r  r!   r!   r"   r       z"WavLMForCTC.freeze_feature_encoderc                 C      | j  D ]}d|_qdS z
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        FNr3  r{  r|  r}  r!   r!   r"   freeze_base_model     zWavLMForCTC.freeze_base_modelr4  rr   rt   r  r  labelsru   c              
   C   s|  |dur|n| j j}|dur| | j jkrtd| j j | j|||||d}|d }| |}| |}	d}
|dur|durC|ntj	|tj
d}| |dtj
}|dk}|d}||}tjj|	dtjddd}tjjjd	d
 tjj||||| j j| j j| j jd}
W d   n1 sw   Y  |s|	f|td  }|
dur|
f| S |S t|
|	|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: r  r   r   rK   )r4   r   r   F)enabled)blank	reductionzero_infinitylosslogitsr%   r   )rG   r  r  r  re   r3  rV   r  rl   	ones_liker   rV  r   r   masked_selectr6   r$  log_softmaxfloat32rI   backendscudnnflagsctc_losspad_token_idctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   r%   r   )r   r4  rr   rt   r  r  r  r   r%   r  r  rH  labels_masktarget_lengthsflattened_targets	log_probsoutputr!   r!   r"   r&     sN   



zWavLMForCTC.forwardrX   r  )r(   r)   r*   r   r   r   r  r  r  r  r   rl   r   r   r   r   r   r&   r+   r!   r!   r   r"   r  X  s6    
r  z
    WavLM Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                       s   e Zd Z fddZdd Zdd Zdd Ze										dd
ee	j
 dee	j
 dee dee dee dee	j
 deeef fddZ  ZS )WavLMForSequenceClassificationc                    s   t  | t|dr|jrtdt|| _|jd }|jr*t	
t|| | _t	|j|j| _t	|j|j| _|   d S )NrI  z\Sequence classification does not support the use of WavLM adapters (config.add_adapter=True)r   )r   r   r<   rI  re   r  r3  r   use_weighted_layer_sumr6   rk   rl   rm   layer_weightsrR   r8   classifier_proj_size	projector
num_labels
classifierr  r   rG   
num_layersr   r!   r"   r     s   

z'WavLMForSequenceClassification.__init__c                 C   r  r  r  r  r!   r!   r"   r    r  z7WavLMForSequenceClassification.freeze_feature_extractorc                 C   r  r  r  r  r!   r!   r"   r    r  z5WavLMForSequenceClassification.freeze_feature_encoderc                 C   r  r  r  r}  r!   r!   r"   r    r  z0WavLMForSequenceClassification.freeze_base_modelNr4  rr   rt   r  r  r  ru   c                 C   sz  |dur|n| j j}| j jrdn|}| j|||||d}| j jrB|t }tj|dd}tjj	| j
dd}	||	ddd jdd}n|d }| |}|du rV|jdd}
n+| |jd |}|ddd|jd }d	|| < |jdd|jdddd }
| |
}d}|durt }||d| j j|d}|s|f|td  }|dur|f| S |S t|||j|jd
S )	  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
            into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTr  r   rw   rK   r   r   r[   r  )rG   r  r  r3  r  rl   stackr6   r$  r'  r   r}   r   r  r  r^  r~   r{   r|   r  r   r  r   r%   r   )r   r4  rr   rt   r  r  r  r   r%   norm_weightspooled_outputpadding_maskexpand_padding_maskr  r  loss_fctr  r!   r!   r"   r&   &  sH   

 
z&WavLMForSequenceClassification.forwardr  )r(   r)   r*   r   r  r  r  r   r   rl   r   r   r   r   r   r&   r+   r!   r!   r   r"   r    s4    
r  c                       s   e Zd Z fddZdd Zdd Zdd Ze										dd
ee	j
 dee	j
 dee	j
 dee dee dee deeef fddZ  ZS ) WavLMForAudioFrameClassificationc                    sz   t  | t|dr|jrtdt|| _|jd }|jr*t	
t|| | _t	|j|j| _|j| _|   d S )NrI  z_Audio frame classification does not support the use of WavLM adapters (config.add_adapter=True)r   )r   r   r<   rI  re   r  r3  r   r  r6   rk   rl   rm   r   rR   r8   r  r  init_weightsr  r   r!   r"   r   n  s   

z)WavLMForAudioFrameClassification.__init__c                 C   r  r  r  r  r!   r!   r"   r  ~  r  z9WavLMForAudioFrameClassification.freeze_feature_extractorc                 C   r  r  r  r  r!   r!   r"   r    r  z7WavLMForAudioFrameClassification.freeze_feature_encoderc                 C   r  r  r  r}  r!   r!   r"   r    r  z2WavLMForAudioFrameClassification.freeze_base_modelNr4  rr   r  rt   r  r  ru   c                 C   s   |dur|n| j j}| j jrdn|}| j|||||d}| j jrB|t }tj|dd}tjj	| j
dd}	||	ddd jdd}n|d }| |}
d}|durht }||
d| jtj|d| jdd}|su|
f|td  }|S t||
|j|jd	S )
r  NTr  r   rw   rK   r   )axisr  )rG   r  r  r3  r  rl   r  r6   r$  r'  r   r}   r   r  r   r  r(  r   r%   r   )r   r4  rr   r  rt   r  r  r   r%   r	  r  r  r  r  r!   r!   r"   r&     s:   
(z(WavLMForAudioFrameClassification.forwardr  )r(   r)   r*   r   r  r  r  r   r   rl   r   r   r   r   r   r&   r+   r!   r!   r   r"   r  l  s4    
r  c                       s&   e Zd Zd fdd	Zdd Z  ZS )AMSoftmaxLoss      >@皙?c                    sB   t    || _|| _|| _tjt||dd| _	t
 | _d S )NT)r|  )r   r   scalemarginr  r6   rk   rl   randnr2   r   r  )r   	input_dimr  r  r  r   r!   r"   r     s   
zAMSoftmaxLoss.__init__c           	      C   sx   |  }tjj| jdd}tjj|dd}t||}|| j }tj|| j	}| j
t| || }| ||}|S )Nr   rw   r   )flattenr6   r$  	normalizer2   rl   mmr  one_hotr  r  r   r   r  )	r   r%   r  r2   	cos_thetapsionehotr  r  r!   r!   r"   r&     s   
zAMSoftmaxLoss.forward)r  r  r'   r!   r!   r   r"   r    s    r  c                       s4   e Zd Zd fdd	ZdejdejfddZ  ZS )		TDNNLayerr   c                    sv   t    |dkr|j|d  n|j| | _|j| | _|j| | _|j| | _t	
| j| j | j| _t	 | _d S )Nr   r   )r   r   tdnn_dimri  rj  tdnn_kernelr-   tdnn_dilationdilationr6   rR   kernelReLUrF   rl  r   r!   r"   r     s   
"zTDNNLayer.__init__r%   ru   c                 C   s   t  r	ddlm} t  rt| j|rtd |dd}| jj	| j
| j| jdd}tjj||| jj| jd}|dd}| |}|S )Nr   )	LoraLayerzDetected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. You should exclude TDNNLayer from LoRA's target modules.r   r   )r#  )r   peft.tuners.lorar&  r   r$  r  r  rI   r2   r}   rj  r-   ri  r6   r$  conv1dr   r#  rF   )r   r%   r&  r2   r!   r!   r"   r&     s    
zTDNNLayer.forwardrn  )r(   r)   r*   r   rl   r   r&   r+   r!   r!   r   r"   r    s    
r  zi
    WavLM Model with an XVector feature extraction head on top for tasks like Speaker Verification.
    c                       s   e Zd Z fddZdd Zdd Zdd Zd	eej	e
f fd
dZe					ddeej deej dee dee dee deej deeef fddZ  ZS )WavLMForXVectorc                    s   t    t | _ jd } jrtt	|| | _
t j jd | _ fddtt jD }t|| _t jd d  j| _t j j| _t j j| _|   d S )Nr   r   c                    s   g | ]}t  |qS r!   )r  r   r   r!   r"   r     s    z,WavLMForXVector.__init__.<locals>.<listcomp>rK   r   )r   r   r  r3  r   r  r6   rk   rl   rm   r   rR   r8   r   r  r   r  r   tdnnxvector_output_dimr  r  r  r  	objectiver  )r   rG   r  tdnn_layersr   r   r"   r     s   

zWavLMForXVector.__init__c                 C   r  r  r  r  r!   r!   r"   r  &  r  z(WavLMForXVector.freeze_feature_extractorc                 C   r  r  r  r  r!   r!   r"   r  2  r  z&WavLMForXVector.freeze_feature_encoderc                 C   r  r  r  r}  r!   r!   r"   r  9  r  z!WavLMForXVector.freeze_base_modelrH  c                 C   s&   dd }| j jD ]}|||d}q|S )z?
        Computes the output length of the TDNN layers
        c                 S   s   | | | d S )Nr   r!   rM  r!   r!   r"   rP  F  s   zBWavLMForXVector._get_tdnn_output_lengths.<locals>._conv_out_lengthr   )rG   r!  )r   rH  rP  r-   r!   r!   r"   _get_tdnn_output_lengthsA  s   z(WavLMForXVector._get_tdnn_output_lengthsNr4  rr   rt   r  r  r  ru   c                 C   s  |dur|n| j j}| j jrdn|}| j|||||d}| j jrB|t }tj|dd}tjj	| j
dd}	||	ddd jdd}n|d }| |}| jD ]}
|
|}qN|du rf|jdd}|jdd}nC| |jdd}| |}g }g }t|D ]"\}}|||d|f jdd |||d|f jdd q|t|}t|}tj||gdd}| |}| |}d}|dur| ||}|s||f|td  }|dur|f| S |S t||||j|jdS )	r  NTr  r   rw   rK   r   )r  r  
embeddingsr%   r   )rG   r  r  r3  r  rl   r  r6   r$  r'  r   r}   r   r  r*  r  r5  rV  r.  r  r  r   r  r  r,  r   r%   r   )r   r4  rr   rt   r  r  r  r   r%   r	  
tdnn_layermean_featuresstd_featuresfeat_extract_output_lengthstdnn_output_lengthsr   lengthstatistic_poolingoutput_embeddingsr  r  r  r!   r!   r"   r&   P  s\   



 



zWavLMForXVector.forwardr  )r(   r)   r*   r   r  r  r  r   rl   r   r   r.  r   r   r   r   r   r   r&   r+   r!   r!   r   r"   r)    s6    
r)  )r  r  r  r)  r  r2  r#   )Jr   r  typingr   r   numpyr  rl   torch.nnr6   torch.nn.functionalr$  r   r   activationsr   integrations.deepspeedr   integrations.fsdpr   modeling_layersr	   modeling_outputsr
   r   r   r   r   r   modeling_utilsr   r;   r   r   r   configuration_wavlmr   
get_loggerr(   r  Moduler   r,   rJ   rZ   r   r   r   r   r  r  r2  rg  ro  rq  rt  r  r  r   r   r   r   ndarrayr  r  r  r  r  r  r  r  r  r)  __all__r!   r!   r!   r"   <module>   s    
- ')%JKFV&#

w  si  