o
    i$                     @   s  d Z ddlZddlZddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ d	d
lmZmZmZmZmZmZmZmZmZmZmZ ddlmZ G dd deZG dd deZG dd dejZG dd dejZG dd deZG dd deZ G dd deZ!G dd deZ"G dd deeZ#e	Z$G dd  d e#eZ%G d!d" d"e#eZ&G d#d$ d$eZ'G d%d& d&eZ(G d'd( d(eZ)g d)Z*dS )*zPyTorch Data2VecText model.    N)nn   )ACT2FN)GradientCheckpointingLayer)Wav2Vec2BaseModelOutput)PreTrainedModel   )Wav2Vec2AdapterWav2Vec2EncoderWav2Vec2FeatureEncoderWav2Vec2FeatureProjection#Wav2Vec2ForAudioFrameClassificationWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2ForXVectorWav2Vec2ModelWav2Vec2PreTrainedModelWav2Vec2SamePadLayer   )Data2VecAudioConfigc                       s&   e Zd Zd fdd	Zdd Z  ZS )Data2VecAudioConvLayerr   c                    s|   t    |dkr|j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
tj| jdd| _t|j | _d S )Nr   r   )kernel_sizestridebiasTelementwise_affine)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconv	LayerNorm
layer_normr   feat_extract_activation
activation)selfconfiglayer_id	__class__ g/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/data2vec/modular_data2vec_audio.pyr   +   s   
zData2VecAudioConvLayer.__init__c                 C   s:   |  |}|dd}| |}|dd}| |}|S )N)r%   	transposer'   r)   r*   hidden_statesr/   r/   r0   forward:   s   


zData2VecAudioConvLayer.forward)r   __name__
__module____qualname__r   r6   __classcell__r/   r/   r-   r0   r   *   s    r   c                   @      e Zd ZdS )Data2VecAudioPadLayerNr8   r9   r:   r/   r/   r/   r0   r=   E       r=   c                       $   e Zd Z fddZdd Z  ZS ) Data2VecAudioPositionalConvLayerc                    s\   t    tj|j|j|j|jd |jd| _t|j| _	t
|j | _tj|jdd| _d S )Nr   )r   paddinggroupsFr   )r   r   r   r!   hidden_sizeconv_pos_kernel_sizenum_conv_pos_embedding_groupsr%   r=   rB   r   r(   r)   r&   r'   r*   r+   r-   r/   r0   r   J   s   
z)Data2VecAudioPositionalConvLayer.__init__c                 C   sD   |  |}| |}|dd}| |}|dd}| |}|S Nr   r   )r%   rB   r3   r'   r)   r4   r/   r/   r0   r6   Y   s   



z(Data2VecAudioPositionalConvLayer.forwardr7   r/   r/   r-   r0   rA   I   s    rA   c                       r@   )$Data2VecAudioPositionalConvEmbeddingc                    s.   t    t fddt jD | _d S )Nc                    s   g | ]}t  qS r/   )rA   ).0_r+   r/   r0   
<listcomp>h   s    zAData2VecAudioPositionalConvEmbedding.__init__.<locals>.<listcomp>)r   r   r   
ModuleListrangenum_conv_pos_embeddingslayersrG   r-   rL   r0   r   e   s   

z-Data2VecAudioPositionalConvEmbedding.__init__c                 C   s0   | dd}| jD ]}||}q	| dd}|S rH   )r3   rQ   )r*   r5   layerr/   r/   r0   r6   k   s
   

z,Data2VecAudioPositionalConvEmbedding.forwardr7   r/   r/   r-   r0   rI   d   s    rI   c                   @   s   e Zd Zdd ZdS )Data2VecAudioFeatureEncoderc                    s<   t j|  t  fddt jD | _d| _d| _d S )Nc                    s   g | ]}t  |d qS ))r,   )r   )rJ   irL   r/   r0   rM   w   s    z8Data2VecAudioFeatureEncoder.__init__.<locals>.<listcomp>FT)	r   Moduler   rN   rO   num_feat_extract_layersconv_layersgradient_checkpointing_requires_gradrG   r/   rL   r0   r   t   s   
z$Data2VecAudioFeatureEncoder.__init__N)r8   r9   r:   r   r/   r/   r/   r0   rS   s   s    rS   c                   @   r<   )Data2VecAudioFeatureProjectionNr>   r/   r/   r/   r0   rZ   }   r?   rZ   c                   @   r<   )Data2VecAudioEncoderNr>   r/   r/   r/   r0   r[      r?   r[   c                   @   r<   )Data2VecAudioAdapterNr>   r/   r/   r/   r0   r\      r?   r\   c                   @   sN   e Zd ZU eed< dZdZdZdZdZ	dZ
dd Zdd Zd	d
 Zdd ZdS )Data2VecAudioPreTrainedModelr+   data2vec_audioinput_valuesTc                 C   sZ  t |tr(td|jj }tjj|jj	| |d tjj|jj
| |d dS t |tr8tj|jj
d dS t |tjrX|j	jjd| jjd |j
durV|j
j  dS dS t |tjtjfr||j
durl|j
j  |j	durz|j	jd dS dS t |tjrtj|j	 |j
durt|j|j|jd   }tjj|j
| |d dS dS dS )zInitialize the weightsr   )abr           )meanstdNg      ?)
isinstancerZ   mathsqrt
projectionin_featuresr   inituniform_weightr   rA   	constant_r%   Lineardatanormal_r+   initializer_rangezero_r&   	GroupNormfill_r!   kaiming_normal_rC   in_channelsr   )r*   modulekr/   r/   r0   _init_weights   s0   





z*Data2VecAudioPreTrainedModel._init_weightsc                 C      t dNzNot needed for Data2VecAudioAttributeErrorr*   r/   r/   r0   _get_adapters      z*Data2VecAudioPreTrainedModel._get_adaptersc                 C   rz   r{   r|   r~   r/   r/   r0   init_adapter_layers   r   z0Data2VecAudioPreTrainedModel.init_adapter_layersc                 C   rz   r{   r|   r~   r/   r/   r0   load_adapter   r   z)Data2VecAudioPreTrainedModel.load_adapterN)r8   r9   r:   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnry   r   r   r   r/   r/   r/   r0   r]      s   
 r]   c                       s:   e Zd ZdefddZdd Zdd Z fdd	Z  ZS )
Data2VecAudioModelr+   c                 C   s|   t | | || _t|| _t|| _|jdks|jdkr)t	
t|j | _t|| _|jr5t|nd | _|   d S )Nrb   )r]   r   r+   rS   feature_extractorrZ   feature_projectionmask_time_probmask_feature_probr   	ParametertorchTensorrD   rk   masked_spec_embedr[   encoderadd_adapterr\   adapter	post_initrG   r/   r/   r0   r      s   


zData2VecAudioModel.__init__c                 C   rz   r{   r|   r~   r/   r/   r0   freeze_feature_extractor   r   z+Data2VecAudioModel.freeze_feature_extractorc                 C   s   | j   dS )z
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)r   _freeze_parametersr~   r/   r/   r0   freeze_feature_encoder   s   z)Data2VecAudioModel.freeze_feature_encoderc                       t  jdi |S Nr/   r   r6   r*   super_kwargsr-   r/   r0   r6         zData2VecAudioModel.forward)	r8   r9   r:   r   r   r   r   r6   r;   r/   r/   r-   r0   r      s
    r   c                       s4   e Zd Zdd Zdd Zdd Z fddZ  ZS )	Data2VecAudioForCTCc                 C   sx   t | | t|| _t|j| _|jd u r t	d| j
 dt|dr+|jr+|jn|j}t||j| _|   d S )NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `Data2VecAudioForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.r   )r]   r   r   r^   r   Dropoutfinal_dropoutdropout
vocab_size
ValueErrorr.   hasattrr   output_hidden_sizerD   rn   lm_headr   )r*   r+   r   r/   r/   r0   r      s   

zData2VecAudioForCTC.__init__c                 C   rz   r{   r|   r~   r/   r/   r0   freeze_base_model   r   z%Data2VecAudioForCTC.freeze_base_modelc                 C   rz   r{   r|   r~   r/   r/   r0   tie_weights   r   zData2VecAudioForCTC.tie_weightsc                    r   r   r   r   r-   r/   r0   r6      r   zData2VecAudioForCTC.forward)r8   r9   r:   r   r   r   r6   r;   r/   r/   r-   r0   r      s
    r   c                   @   r<   )&Data2VecAudioForSequenceClassificationNr>   r/   r/   r/   r0   r      r?   r   c                   @   r<   )(Data2VecAudioForAudioFrameClassificationNr>   r/   r/   r/   r0   r      r?   r   c                   @   r<   )Data2VecAudioForXVectorNr>   r/   r/   r/   r0   r      r?   r   )r   r   r   r   r   r]   )+__doc__rf   r   r   activationsr   modeling_layersr   modeling_outputsr   modeling_utilsr   wav2vec2.modeling_wav2vec2r	   r
   r   r   r   r   r   r   r   r   r   configuration_data2vec_audior   r   r=   rU   rA   rI   rS   rZ   r[   r\   r]   Data2VecAudioBaseModelOutputr   r   r   r   r   __all__r/   r/   r/   r0   <module>   s4   4
,  