o
    i                     @   s  d dl Z d dlZd dlmZmZmZ d dlZd dlZd dlm	Z	 d dl
mZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZmZmZmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z* e( rddl+m,Z, G dd deZ-G dd de	j.Z/G dd de	j.Z0G dd de	j.Z1G dd de	j.Z2G dd de	j.Z3			dSd e	j.d!ej4d"ej4d#ej4d$eej4 d%ee5 d&e5d'eej4 fd(d)Z6G d*d+ d+e	j.Z7G d,d- d-e	j.Z8G d.d/ d/eZ9G d0d1 d1e	j.Z:G d2d3 d3e	j.Z;G d4d5 d5e	j.Z<e&G d6d7 d7e"Z=		 dTd8e>e?e?f d9e5d:e?d$eej@ d;e?d<ejAfd=d>ZBeZCe&G d?d@ d@e=ZDdAZEe&dBdCG dDdE dEe=ZFe&dFdCG dGdH dHe=ZGe&G dIdJ dJe=ZHG dKdL dLe	j.ZIG dMdN dNe	j.ZJe&dOdCG dPdQ dQe=ZKg dRZLdS )U    N)CallableOptionalUnion)nn)CrossEntropyLoss   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputCausalLMOutputSequenceClassifierOutputTokenClassifierOutputWav2Vec2BaseModelOutputXVectorOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringis_peft_availableis_torch_flex_attn_available   )Data2VecAudioConfig)make_flex_block_causal_maskc                       s&   e Zd Zd fdd	Zdd Z  ZS )Data2VecAudioConvLayerr   c                    s|   t    |dkr|j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
tj| jdd| _t|j | _d S )Nr   r   )kernel_sizestridebiasTelementwise_affine)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconv	LayerNorm
layer_normr   feat_extract_activation
activationselfconfiglayer_id	__class__ h/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/data2vec/modeling_data2vec_audio.pyr%   8   s   
zData2VecAudioConvLayer.__init__c                 C   s:   |  |}|dd}| |}|dd}| |}|S )N)r-   	transposer/   r1   r3   hidden_statesr8   r8   r9   forwardG   s   


zData2VecAudioConvLayer.forwardr   __name__
__module____qualname__r%   r?   __classcell__r8   r8   r6   r9   r   7   s    r   c                       $   e Zd Z fddZdd Z  ZS )Data2VecAudioPadLayerc                    s*   t    |d dkrd| _d S d| _d S )N   r   r   )r$   r%   num_pad_remove)r3   num_conv_pos_embeddingsr6   r8   r9   r%   S   s   
 zData2VecAudioPadLayer.__init__c                 C   s,   | j dkr|d d d d d | j  f }|S Nr   )rI   r=   r8   r8   r9   r?   W   s   
zData2VecAudioPadLayer.forwardrA   r8   r8   r6   r9   rG   R   s    rG   c                       rF   ) Data2VecAudioPositionalConvLayerc                    s\   t    tj|j|j|j|jd |jd| _t|j| _	t
|j | _tj|jdd| _d S )NrH   )r   paddinggroupsFr"   )r$   r%   r   r)   hidden_sizeconv_pos_kernel_sizenum_conv_pos_embedding_groupsr-   rG   rM   r   r0   r1   r.   r/   r3   r4   r6   r8   r9   r%   ^   s   
z)Data2VecAudioPositionalConvLayer.__init__c                 C   sD   |  |}| |}|dd}| |}|dd}| |}|S Nr   rH   )r-   rM   r<   r/   r1   r=   r8   r8   r9   r?   m   s   



z(Data2VecAudioPositionalConvLayer.forwardrA   r8   r8   r6   r9   rL   ]   s    rL   c                       rF   )$Data2VecAudioPositionalConvEmbeddingc                    s.   t    t fddt jD | _d S )Nc                       g | ]}t  qS r8   )rL   .0_r4   r8   r9   
<listcomp>|       zAData2VecAudioPositionalConvEmbedding.__init__.<locals>.<listcomp>)r$   r%   r   
ModuleListrangerJ   layersrR   r6   rY   r9   r%   y   s   

z-Data2VecAudioPositionalConvEmbedding.__init__c                 C   s0   | dd}| jD ]}||}q	| dd}|S rS   )r<   r^   )r3   r>   layerr8   r8   r9   r?      s
   

z,Data2VecAudioPositionalConvEmbedding.forwardrA   r8   r8   r6   r9   rT   x       rT   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )Data2VecAudioFeatureEncoderz.Construct the features from raw audio waveformc                    s:   t    t fddt jD | _d| _d| _d S )Nc                    s   g | ]}t  |d qS ))r5   )r   rW   irY   r8   r9   rZ      s    z8Data2VecAudioFeatureEncoder.__init__.<locals>.<listcomp>FT)	r$   r%   r   r\   r]   num_feat_extract_layersconv_layersgradient_checkpointing_requires_gradrR   r6   rY   r9   r%      s   

z$Data2VecAudioFeatureEncoder.__init__c                 C   s   |   D ]}d|_qd| _d S NF)
parametersrequires_gradrg   r3   paramr8   r8   r9   _freeze_parameters   s   
z.Data2VecAudioFeatureEncoder._freeze_parametersc                 C   s:   |d d d f }| j r| jrd|_| jD ]}||}q|S )NT)rg   trainingrj   re   )r3   input_valuesr>   
conv_layerr8   r8   r9   r?      s   

z#Data2VecAudioFeatureEncoder.forward)rB   rC   rD   __doc__r%   rm   r?   rE   r8   r8   r6   r9   ra      s
    ra   c                       rF   )Data2VecAudioFeatureProjectionc                    sJ   t    tj|jd |jd| _t|jd |j| _	t
|j| _d S )Nr;   eps)r$   r%   r   r.   r&   layer_norm_epsr/   LinearrO   
projectionDropoutfeat_proj_dropoutdropoutrR   r6   r8   r9   r%      s   
z'Data2VecAudioFeatureProjection.__init__c                 C   s&   |  |}| |}| |}||fS N)r/   rw   rz   )r3   r>   norm_hidden_statesr8   r8   r9   r?      s   


z&Data2VecAudioFeatureProjection.forwardrA   r8   r8   r6   r9   rr      r`   rr           modulequerykeyvalueattention_maskscalingrz   	head_maskc                 K   s   |d u r| dd }t||dd| }	|d ur|	| }	tjj|	dd}	|d ur5|	|dddd }	tjj|	|| j	d}	t|	|}
|
dd
 }
|
|	fS )Nr;         rH   r   dimr   )prn   )sizetorchmatmulr<   r   
functionalsoftmaxviewrz   rn   
contiguous)r~   r   r   r   r   r   rz   r   kwargsattn_weightsattn_outputr8   r8   r9   eager_attention_forward   s   r   c                       s   e Zd ZdZ					ddededed	ed
ededee f fddZ					dde
jdee
j dee
j dee
j dee dee dee
jee
j eee
j  f fddZ  ZS )Data2VecAudioAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr}   FTN	embed_dim	num_headsrz   
is_decoderr!   	is_causalr4   c                    s   t    || _|| _|| _|| | _|| _| j| | jkr*td| j d| d| jd | _|| _	|| _
tj|||d| _tj|||d| _tj|||d| _tj|||d| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   )r!   )r$   r%   r   r   rz   head_dimr4   
ValueErrorr   r   r   r   rv   k_projv_projq_projout_proj)r3   r   r   rz   r   r!   r   r4   r6   r8   r9   r%      s&   



zData2VecAudioAttention.__init__r>   key_value_statesr   layer_head_maskoutput_attentionsr   returnc                 K   s  |du}|j dd \}}	|r|j d n|	}
||	d| jf}||
d| jf}| |j| dd}|r4|n|}| |j| dd}| |j| dd}t}| jj	dkr\t
| jj	 }|| ||||f| jshdn| j| j||d|\}}|||	d }| |}||dfS )z#Input shape: Batch x Time x ChannelNr;   r   rH   eagerr}   )rz   r   r   r   )shaper   r   r   r<   r   r   r   r4   _attn_implementationr   rn   rz   r   reshaper   r   )r3   r>   r   r   r   r   r   is_cross_attentionbsztgt_lensrc_lenq_input_shapekv_input_shapequery_statescurrent_states
key_statesvalue_statesattention_interfacer   r   r8   r8   r9   r?      s:   



zData2VecAudioAttention.forward)r}   FTFN)NNNF)rB   rC   rD   rq   intfloatboolr   r   r%   r   Tensorr   r   tupler?   rE   r8   r8   r6   r9   r      sR    "	
r   c                       rF   )Data2VecAudioFeedForwardc                    sp   t    t|j| _t|j|j| _	t
|jtr"t|j | _n|j| _t|j|j| _t|j| _d S r{   )r$   r%   r   rx   activation_dropoutintermediate_dropoutrv   rO   intermediate_sizeintermediate_dense
isinstance
hidden_actstrr   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutrR   r6   r8   r9   r%   *  s   
z!Data2VecAudioFeedForward.__init__c                 C   s6   |  |}| |}| |}| |}| |}|S r{   )r   r   r   r   r   r=   r8   r8   r9   r?   7  s   




z Data2VecAudioFeedForward.forwardrA   r8   r8   r6   r9   r   )      r   c                       s&   e Zd Z fddZdddZ  ZS )Data2VecAudioEncoderLayerc                    sh   t    t|j|j|jd|d| _t|j	| _
tj|j|jd| _t|| _tj|j|jd| _d S )NF)r   r   rz   r   r4   rs   )r$   r%   r   rO   num_attention_headsattention_dropout	attentionr   rx   r   rz   r.   ru   r/   r   feed_forwardfinal_layer_normrR   r6   r8   r9   r%   B  s   

z"Data2VecAudioEncoderLayer.__init__NFc                 C   sf   |}| j |||d\}}}| |}|| }| |}|| | }| |}|f}|r1||f7 }|S )Nr   r   )r   rz   r/   r   r   )r3   r>   r   r   attn_residualr   rX   outputsr8   r8   r9   r?   Q  s   



z!Data2VecAudioEncoderLayer.forwardrh   rA   r8   r8   r6   r9   r   A  s    r   c                       sj   e Zd Z fddZ				ddejdeej ded	ed
ef
ddZ	de
ejdf dejfddZ  ZS )Data2VecAudioEncoderc                    sf   t     | _t | _tj j jd| _	t
 j| _t fddt jD | _d| _d S )Nrs   c                    rU   r8   )r   rV   rY   r8   r9   rZ   l  r[   z1Data2VecAudioEncoder.__init__.<locals>.<listcomp>F)r$   r%   r4   rT   pos_conv_embedr   r.   rO   ru   r/   rx   r   rz   r\   r]   num_hidden_layersr^   rf   rR   r6   rY   r9   r%   f  s   

 
zData2VecAudioEncoder.__init__NFTr>   r   r   output_hidden_statesreturn_dictc                 C   s*  |rdnd }|r
dnd }|d ur"| ddd|jd }d|| < | ||}| |}	||	 }| |}| |}t pAt| }
| j	D ]3}|rN||f }t
g }| jo[|| jjk }|r`|
rk||||d}|d }|rod}|rx||d f }qE|r||f }|stdd	 |||fD S t|||d
S )Nr8   r;   r   rH   r   r   NNc                 s   s    | ]	}|d ur|V  qd S r{   r8   )rW   vr8   r8   r9   	<genexpr>  s    z/Data2VecAudioEncoder.forward.<locals>.<genexpr>)last_hidden_stater>   
attentions)	unsqueezerepeatr   _update_full_maskr   r/   rz   r	   r
   r^   r   randrn   r4   	layerdropr   r   )r3   r>   r   r   r   r   all_hidden_statesall_self_attentionsexpand_attention_maskposition_embeddingssynced_gpusr_   dropout_probabilityskip_the_layerlayer_outputsr8   r8   r9   r?   o  sL   







zData2VecAudioEncoder.forwardinputs_embedsc                 C   s   |d ur>| j jdkrd|v r|}|S d }|S | j jdkr$t||j}|S | j jdkr8t|tjr6t|dd}|S t||j}|S )Nflash_attention_2r   sdpaflex_attentionF)r   )	r4   r   r   dtyper   r   r   r   r   )r3   r   r   r8   r8   r9   r     s   z&Data2VecAudioEncoder._update_full_mask)NFFT)rB   rC   rD   r%   r   tensorr   r   r   r?   r   r   rE   r8   r8   r6   r9   r   e  s,    
<r   c                       rF   )Data2VecAudioAdapterLayerc                    s0   t    tj|jd|j |j|jdd| _d S )NrH   r   )r    rM   )r$   r%   r   r)   output_hidden_sizeadapter_kernel_sizeadapter_strider-   rR   r6   r8   r9   r%     s   
z"Data2VecAudioAdapterLayer.__init__c                 C   s   |  |}tjj|dd}|S )Nr   r   )r-   r   r   glur=   r8   r8   r9   r?     s   
z!Data2VecAudioAdapterLayer.forwardrA   r8   r8   r6   r9   r     s    
r   c                       rF   )Data2VecAudioAdapterc                    sp   t     j jkrt j j| _t j| _nd  | _| _t	 fddt
 jD | _ j| _d S )Nc                 3   s    | ]}t  V  qd S r{   )r   rV   rY   r8   r9   r     s    z0Data2VecAudioAdapter.__init__.<locals>.<genexpr>)r$   r%   r   rO   r   rv   projr.   proj_layer_normr\   r]   num_adapter_layersr^   r   rR   r6   rY   r9   r%     s   
 zData2VecAudioAdapter.__init__c                 C   sr   | j d ur| jd ur|  |}| |}|dd}| jD ]}tj }| jr,|| jkr0||}q|dd}|S rS   )r   r   r<   r^   nprandomrn   r   )r3   r>   r_   layerdrop_probr8   r8   r9   r?     s   



zData2VecAudioAdapter.forwardrA   r8   r8   r6   r9   r     r   r   c                   @   sr   e Zd ZU eed< dZdZdZdZdZ	dZ
dd Z	ddeejef d	ee fd
dZ	ddedejfddZdS )Data2VecAudioPreTrainedModelr4   data2vec_audioro   Tc                 C   sZ  t |tr(td|jj }tjj|jj	| |d tjj|jj
| |d dS t |tr8tj|jj
d dS t |tjrX|j	jjd| jjd |j
durV|j
j  dS dS t |tjtjfr||j
durl|j
j  |j	durz|j	jd dS dS t |tjrtj|j	 |j
durt|j|j|jd   }tjj|j
| |d dS dS dS )zInitialize the weightsr   )abr   r}   )meanstdNg      ?)r   rr   mathsqrtrw   in_featuresr   inituniform_weightr!   rL   	constant_r-   rv   datanormal_r4   initializer_rangezero_r.   	GroupNormfill_r)   kaiming_normal_rN   in_channelsr   )r3   r~   kr8   r8   r9   _init_weights  s0   





z*Data2VecAudioPreTrainedModel._init_weightsNinput_lengthsadd_adapterc                 C   sn   |du r| j jn|}dd }t| j j| j jD ]
\}}||||}q|r5t| j jD ]
}||d| j j}q*|S )zH
        Computes the output length of the convolutional layers
        Nc                 S   s   t j| | |ddd S )Nfloor)rounding_moder   )r   divinput_lengthr   r    r8   r8   r9   _conv_out_length  s   zWData2VecAudioPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_lengthr   )r4   r  zipr*   r+   r]   r   r   )r3   r  r  r  r   r    rX   r8   r8   r9    _get_feat_extract_output_lengths  s   z=Data2VecAudioPreTrainedModel._get_feat_extract_output_lengthsfeature_vector_lengthr   c                 C   s   |j ddd d df }| j||d}|tj}|jd }tj||f|j|jd}d|tj	|jd |jd|d f< |
dg d
dg }|S )Nr;   r   r  r   )r   devicer   )r  )cumsumr  tor   longr   zerosr   r  arangeflipr   )r3   r  r   r  non_padded_lengthsoutput_lengths
batch_sizer8   r8   r9   "_get_feature_vector_attention_mask-  s   
"z?Data2VecAudioPreTrainedModel._get_feature_vector_attention_maskr{   )rB   rC   rD   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr  r   r   
LongTensorr   r   r   r  r#  r8   r8   r8   r9   r     s*   
 
r   r   	mask_probmask_length	min_masksr   c                    s  | \}dk rt dkrt d d dtjd   fdd}|dur:| d	 n
fd
dt|D }tj	|ft
d}g }	|}
|
dkrZ|S |D ];}||}tjjt|d  |dd}t|dkr}d }n|d }t|tj|
| tjd| g}|	| q\t|	}	t|	dddddf ||
f}	|	||
 }	tddddf }t|||
f||
 }|	| }	|	 d krd |	|	d k< t||	dd	 |S )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                    sX   t |     }t|}| kr }| d  |k r*t| d  d}|S )z;Given input length, compute how many spans should be maskedr   r   )r   max)r  num_masked_spanepsilonr-  r,  r.  sequence_lengthr8   r9   compute_num_masked_spanh  s   
z6_compute_mask_indices.<locals>.compute_num_masked_spanNr;   c                    s   g | ]} qS r8   r8   rV   )r4  r8   r9   rZ   {  s    z)_compute_mask_indices.<locals>.<listcomp>r   r   F)replace)r   r   r   r   itemdetachsumtolistr]   r  r   choicer  lenconcatenateonesint32appendarraybroadcast_tor   r0  put_along_axis)r   r,  r-  r   r.  r"  r5  r  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr  r1  spec_aug_mask_idxdummy_mask_idxoffsetsr8   r2  r9   _compute_mask_indicesB  s\   

rK  c                       s   e Zd Zdef fddZdd Z		ddejdeej d	eej	 fd
dZ
e					ddeej d	eej deej dee dee dee deeef fddZ  ZS )Data2VecAudioModelr4   c                    s|   t  | || _t|| _t|| _|jdks|jdkr)t	
t|j | _t|| _|jr5t|nd | _|   d S )Nr}   )r$   r%   r4   ra   feature_extractorrr   feature_projectionmask_time_probmask_feature_probr   	Parameterr   r   rO   r   masked_spec_embedr   encoderr  r   adapter	post_initrR   r6   r8   r9   r%     s   


zData2VecAudioModel.__init__c                 C   s   | j   dS 
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)rM  rm   r3   r8   r8   r9   freeze_feature_encoder  s   z)Data2VecAudioModel.freeze_feature_encoderNr>   mask_time_indicesr   c                 C   s  t | jdds	|S | \}}}|dur| j|j||< n-| jjdkrK| jrKt||f| jj| jj	|| jj
d}tj||jtjd}| j|j||< | jjdkr| jrt||f| jj| jj| jjd}tj||jtjd}|dddf d|d}d||< |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTNr   )r,  r-  r   r.  )r  r   )r,  r-  r.  r;   )getattrr4   r   rR  r  r   rO  rn   rK  mask_time_lengthmask_time_min_masksr   r   r  r   rP  mask_feature_lengthmask_feature_min_masksexpand)r3   r>   rZ  r   r"  r4  rO   mask_feature_indicesr8   r8   r9   _mask_hidden_states  s4   z&Data2VecAudioModel._mask_hidden_statesro   r   r   r   r   c           
      C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| |}|dd}|dur8| j|jd |dd}| |\}}| j	|||d}| j
|||||d}	|	d }| jdur_| |}|sk||f|	dd  S t|||	j|	jd	S )
a/  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        Nr   rH   Fr  )rZ  r   r   r   r   r   r   )r   extract_featuresr>   r   )r4   r   r   use_return_dictrM  r<   r#  r   rN  rc  rS  rT  Data2VecAudioBaseModelOutputr>   r   )
r3   ro   r   rZ  r   r   r   re  r>   encoder_outputsr8   r8   r9   r?     s@   


zData2VecAudioModel.forwardr   NNNNN)rB   rC   rD   r   r%   rY  r   FloatTensorr   r+  rc  r   r   r   r   r   rg  r?   rE   r8   r8   r6   r9   rL    sB    

.
rL  rH   zu
    Data2VecAudio Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    )custom_introc                       s   e Zd Z fddZdd Zdd Ze					ddeej	 d	eej	 d
ee
 dee
 dee
 deej	 deeef fddZ  ZS )Data2VecAudioForCTCc                    sx   t  | t|| _t|j| _|jdu r t	d| j
 dt|dr+|jr+|jn|j}t||j| _|   dS )a7  
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`Data2VecAudioForCTC`] with adapters. Uses 'eng' by
            default.
        NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `Data2VecAudioForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.r  )r$   r%   rL  r   r   rx   final_dropoutrz   
vocab_sizer   r7   hasattrr  r   rO   rv   lm_headrU  )r3   r4   r   r6   r8   r9   r%   H  s   

zData2VecAudioForCTC.__init__c                 C      t dt |   dS rW  The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.NwarningswarnFutureWarningrY  rX  r8   r8   r9   freeze_feature_extractorc  
   z,Data2VecAudioForCTC.freeze_feature_extractorc                 C      | j j  dS rV  r   rM  rm   rX  r8   r8   r9   rY  o     z*Data2VecAudioForCTC.freeze_feature_encoderNro   r   r   r   r   labelsr   c              
   C   s|  |dur|n| j j}|dur| | j jkrtd| j j | j|||||d}|d }| |}| |}	d}
|dur|durC|ntj	|tj
d}| |dtj
}|dk}|d}||}tjj|	dtjddd}tjjjd	d
 tjj||||| j j| j j| j jd}
W d   n1 sw   Y  |s|	f|td  }|
dur|
f| S |S t|
|	|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: rd  r   r6  r;   )r   r   r   F)enabled)blank	reductionzero_infinitylosslogitsr>   r   )r4   rf  r0  rn  r   r   rz   rp  r   	ones_liker  r  r:  r  masked_selectr   r   log_softmaxfloat32r<   backendscudnnflagsctc_losspad_token_idctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   r>   r   )r3   ro   r   r   r   r   r}  r   r>   r  r  r  labels_masktarget_lengthsflattened_targets	log_probsoutputr8   r8   r9   r?   v  sN   



zData2VecAudioForCTC.forwardri  )rB   rC   rD   r%   rx  rY  r   r   r   r   r   r   r   r   r?   rE   r8   r8   r6   r9   rl  B  s2    
rl  z
    Data2VecAudio Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                       s   e Zd Z fddZdd Zdd Zdd Ze										dd
ee	j
 dee	j
 dee dee dee dee	j
 deeef fddZ  ZS )&Data2VecAudioForSequenceClassificationc                    s   t  | t|dr|jrtdt|| _|jd }|jr*t	
t|| | _t	|j|j| _t	|j|j| _|   d S )Nr  zdSequence classification does not support the use of Data2VecAudio adapters (config.add_adapter=True)r   )r$   r%   ro  r  r   rL  r   r   use_weighted_layer_sumr   rQ  r   r?  layer_weightsrv   rO   classifier_proj_size	projector
num_labels
classifierrU  r3   r4   
num_layersr6   r8   r9   r%     s   

z/Data2VecAudioForSequenceClassification.__init__c                 C   rq  )z
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        rs  Nrt  rX  r8   r8   r9   rx    ry  z?Data2VecAudioForSequenceClassification.freeze_feature_extractorc                 C   rz  rV  r{  rX  r8   r8   r9   rY    r|  z=Data2VecAudioForSequenceClassification.freeze_feature_encoderc                 C      | j  D ]}d|_qdS z
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        FNr   ri   rj   rk   r8   r8   r9   freeze_base_model     z8Data2VecAudioForSequenceClassification.freeze_base_modelNro   r   r   r   r   r}  r   c                 C   sz  |dur|n| j j}| j jrdn|}| j|||||d}| j jrB|t }tj|dd}tjj	| j
dd}	||	ddd jdd}n|d }| |}|du rV|jdd}
n+| |jd |}|ddd|jd }d	|| < |jdd|jdddd }
| |
}d}|durt }||d| j j|d}|s|f|td  }|dur|f| S |S t|||j|jd
S )  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
            into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTrd  r   r   r;   r   rH   r}   r  )r4   rf  r  r   r  r   stackr   r   r   r  r   r:  r  r   r#  r   r   r   r  r   r  r   r>   r   )r3   ro   r   r   r   r   r}  r   r>   norm_weightspooled_outputpadding_maskexpand_padding_maskr  r  loss_fctr  r8   r8   r9   r?     sH   

 
z.Data2VecAudioForSequenceClassification.forwardri  )rB   rC   rD   r%   rx  rY  r  r   r   r   r   r   r   r   r   r?   rE   r8   r8   r6   r9   r    s4    
r  c                       s   e Zd Z fddZdd Zdd Zdd Ze										dd
ee	j
 dee	j
 dee	j
 dee dee dee deeef fddZ  ZS )(Data2VecAudioForAudioFrameClassificationc                    sz   t  | t|dr|jrtdt|| _|jd }|jr*t	
t|| | _t	|j|j| _|j| _|   d S )Nr  zgAudio frame classification does not support the use of Data2VecAudio adapters (config.add_adapter=True)r   )r$   r%   ro  r  r   rL  r   r   r  r   rQ  r   r?  r  rv   rO   r  r  init_weightsr  r6   r8   r9   r%   9  s   

z1Data2VecAudioForAudioFrameClassification.__init__c                 C   rq  rr  rt  rX  r8   r8   r9   rx  I  ry  zAData2VecAudioForAudioFrameClassification.freeze_feature_extractorc                 C   rz  rV  r{  rX  r8   r8   r9   rY  U  r|  z?Data2VecAudioForAudioFrameClassification.freeze_feature_encoderc                 C   r  r  r  rk   r8   r8   r9   r  \  r  z:Data2VecAudioForAudioFrameClassification.freeze_base_modelNro   r   r}  r   r   r   r   c                 C   s   |dur|n| j j}| j jrdn|}| j|||||d}| j jrB|t }tj|dd}tjj	| j
dd}	||	ddd jdd}n|d }| |}
d}|durht }||
d| jtj|d| jdd}|su|
f|td  }|S t||
|j|jd	S )
r  NTrd  r   r   r;   r   )axisr  )r4   rf  r  r   r  r   r  r   r   r   r  r   r:  r  r   r  argmaxr   r>   r   )r3   ro   r   r}  r   r   r   r   r>   r  r  r  r  r  r8   r8   r9   r?   d  s:   
(z0Data2VecAudioForAudioFrameClassification.forwardri  )rB   rC   rD   r%   rx  rY  r  r   r   r   r   r   r   r   r   r?   rE   r8   r8   r6   r9   r  7  s4    
r  c                       s&   e Zd Zd fdd	Zdd Z  ZS )AMSoftmaxLoss      >@皙?c                    sB   t    || _|| _|| _tjt||dd| _	t
 | _d S )NT)rj   )r$   r%   scalemarginr  r   rQ  r   randnr  r   r  )r3   	input_dimr  r  r  r6   r8   r9   r%     s   
zAMSoftmaxLoss.__init__c           	      C   sx   |  }tjj| jdd}tjj|dd}t||}|| j }tj|| j	}| j
t| || }| ||}|S )Nr   r   r   )flattenr   r   	normalizer  r   mmr  one_hotr  r  wherer   r  )	r3   r>   r}  r  	cos_thetapsionehotr  r  r8   r8   r9   r?     s   
zAMSoftmaxLoss.forward)r  r  rA   r8   r8   r6   r9   r    s    r  c                       s4   e Zd Zd fdd	ZdejdejfddZ  ZS )		TDNNLayerr   c                    sv   t    |dkr|j|d  n|j| | _|j| | _|j| | _|j| | _t	
| j| j | j| _t	 | _d S )Nr   r   )r$   r%   tdnn_dimr'   r(   tdnn_kernelr   tdnn_dilationdilationr   rv   kernelReLUr1   r2   r6   r8   r9   r%     s   
"zTDNNLayer.__init__r>   r   c                 C   s   t  r	ddlm} t  rt| j|rtd |dd}| jj	| j
| j| jdd}tjj||| jj| jd}|dd}| |}|S )Nr   )	LoraLayerzDetected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. You should exclude TDNNLayer from LoRA's target modules.r   rH   )r  )r   peft.tuners.lorar  r   r  ru  rv  r<   r  r   r(   r   r'   r   r   conv1dr!   r  r1   )r3   r>   r  r  r8   r8   r9   r?     s    
zTDNNLayer.forwardr@   )rB   rC   rD   r%   r   r   r?   rE   r8   r8   r6   r9   r    s    
r  zq
    Data2VecAudio Model with an XVector feature extraction head on top for tasks like Speaker Verification.
    c                       s   e Zd Z fddZdd Zdd Zdd Zd	eej	e
f fd
dZe					ddeej deej dee dee dee deej deeef fddZ  ZS )Data2VecAudioForXVectorc                    s   t    t | _ jd } jrtt	|| | _
t j jd | _ fddtt jD }t|| _t jd d  j| _t j j| _t j j| _|   d S )Nr   r   c                    s   g | ]}t  |qS r8   )r  rb   rY   r8   r9   rZ     s    z4Data2VecAudioForXVector.__init__.<locals>.<listcomp>r;   rH   )r$   r%   rL  r   r   r  r   rQ  r   r?  r  rv   rO   r  r  r]   r=  r\   tdnnxvector_output_dimrM  r  r  r  	objectiver  )r3   r4   r  tdnn_layersr6   rY   r9   r%     s   

z Data2VecAudioForXVector.__init__c                 C   rq  rr  rt  rX  r8   r8   r9   rx    ry  z0Data2VecAudioForXVector.freeze_feature_extractorc                 C   rz  rV  r{  rX  r8   r8   r9   rY    r|  z.Data2VecAudioForXVector.freeze_feature_encoderc                 C   r  r  r  rk   r8   r8   r9   r    r  z)Data2VecAudioForXVector.freeze_base_modelr  c                 C   s&   dd }| j jD ]}|||d}q|S )z?
        Computes the output length of the TDNN layers
        c                 S   s   | | | d S )Nr   r8   r  r8   r8   r9   r    s   zJData2VecAudioForXVector._get_tdnn_output_lengths.<locals>._conv_out_lengthr   )r4   r  )r3   r  r  r   r8   r8   r9   _get_tdnn_output_lengths  s   z0Data2VecAudioForXVector._get_tdnn_output_lengthsNro   r   r   r   r   r}  r   c                 C   s  |dur|n| j j}| j jrdn|}| j|||||d}| j jrB|t }tj|dd}tjj	| j
dd}	||	ddd jdd}n|d }| |}| jD ]}
|
|}qN|du rf|jdd}|jdd}nC| |jdd}| |}g }g }t|D ]"\}}|||d|f jdd |||d|f jdd q|t|}t|}tj||gdd}| |}| |}d}|dur| ||}|s||f|td  }|dur|f| S |S t||||j|jdS )	r  NTrd  r   r   r;   r   )r  r  
embeddingsr>   r   )r4   rf  r  r   r  r   r  r   r   r   r  r   r:  r  r  r   r   r  r  	enumeraterA  catrM  r  r  r   r>   r   )r3   ro   r   r   r   r   r}  r   r>   r  
tdnn_layermean_featuresstd_featuresfeat_extract_output_lengthstdnn_output_lengthsrc   lengthstatistic_poolingoutput_embeddingsr  r  r  r8   r8   r9   r?     s\   



 



zData2VecAudioForXVector.forwardri  )rB   rC   rD   r%   rx  rY  r  r   r   r+  r   r  r   r   r   r   r   r   r?   rE   r8   r8   r6   r9   r    s6    
r  )r  rl  r  r  rL  r   )Nr}   NrK   )Mr   ru  typingr   r   r   numpyr   r   r   torch.nnr   activationsr   integrations.deepspeedr	   integrations.fsdpr
   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   configuration_data2vec_audior   integrations.flex_attentionr   r   ModulerG   rL   rT   ra   rr   r   r   r   r   r   r   r   r   r   r   r   r   r+  ndarrayrK  rg  rL  r  rl  r  r  r  r  r  __all__r8   r8   r8   r9   <module>   s    
X$]R

w wsi  