o
    i_                     @   s  d dl Z d dlmZmZmZ d dlZd dlZd dlm	Z	 d dlm
Z
 ddlmZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z& e# ryddl'm(Z( e$)e*Z+G dd de	j,Z-G dd de	j,Z.G dd deZ/G dd deZ0G dd deZ1G dd de	j,Z2G dd de	j,Z3		 	dLd!e	j,d"ej4d#ej4d$ej4d%eej4 d&ee5 d'e5d(eej4 fd)d*Z6G d+d, d,e	j,Z7G d-d. d.e	j,Z8G d/d0 d0eZ9G d1d2 d2e	j,Z:G d3d4 d4e	j,Z;G d5d6 d6eZ<G d7d8 d8e	j,Z=e"G d9d: d:eZ>		 dMd;e?e@e@f d<e5d=e@d%eejA d>e@d?ejBfd@dAZCe"G dBdC dCe>ZDdZEe"dDdEG dFdG dGe>ZFe"dHdEG dIdJ dJe>ZGg dKZHdS )N    N)CallableOptionalUnion)CrossEntropyLoss   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputCausalLMOutputSequenceClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringis_torch_flex_attn_availablelogging   )HubertConfig)make_flex_block_causal_maskc                       $   e Zd Z fddZdd Z  ZS )HubertPositionalConvEmbeddingc                    s@  t    tj|j|j|j|jd |jd| _d | _|j	r%t
|j| _nmtjj}ttjjdr5tjjj}t rdd l}|jj| jjdd || jddd| _W d    n1 sZw   Y  t| jdrr| jjjj}| jjjj}n| jj}| jj}|j| | |j| | n	|| jddd| _t|j| _t|j | _d S )	N   )kernel_sizepaddinggroupsweight_normr   modifier_rankweight)namedimparametrizations)super__init__nnConv1dhidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupsconv
batch_normconv_pos_batch_normBatchNorm1dutilsr    hasattrr&   r   	deepspeedzeroGatheredParametersr#   	original0	original1weight_gweight_vregister_external_parameterHubertSamePadLayerr   r   feat_extract_activation
activation)selfconfigr    r4   r9   r:   	__class__ g/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/hubert/modeling_hubert.pyr(   3   s:   

z&HubertPositionalConvEmbedding.__init__c                 C   sN   | dd}| jd ur| |}| |}| |}| |}| dd}|S )Nr   r   )	transposer/   r.   r   r>   r?   hidden_statesrC   rC   rD   forwardX   s   




z%HubertPositionalConvEmbedding.forward__name__
__module____qualname__r(   rH   __classcell__rC   rC   rA   rD   r   2   s    %r   c                       r   )r<   c                    s*   t    |d dkrd| _d S d| _d S )Nr   r   r   )r'   r(   num_pad_remove)r?   r,   rA   rC   rD   r(   e   s   
 zHubertSamePadLayer.__init__c                 C   s,   | j dkr|d d d d d | j  f }|S Nr   )rN   rF   rC   rC   rD   rH   i   s   
zHubertSamePadLayer.forwardrI   rC   rC   rA   rD   r<   d   s    r<   c                       &   e Zd Zd fdd	Zdd Z  ZS )HubertNoLayerNormConvLayerr   c                    sj   t    |dkr|j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
t|j | _d S )Nr   r   r   stridebias)r'   r(   conv_dimin_conv_dimout_conv_dimr)   r*   conv_kernelconv_stride	conv_biasr.   r   r=   r>   r?   r@   layer_idrA   rC   rD   r(   p   s   
z#HubertNoLayerNormConvLayer.__init__c                 C   s   |  |}| |}|S N)r.   r>   rF   rC   rC   rD   rH   ~   s   

z"HubertNoLayerNormConvLayer.forwardr   rI   rC   rC   rA   rD   rQ   o   s    rQ   c                       rP   )HubertLayerNormConvLayerr   c                    s|   t    |dkr|j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
tj| jdd| _t|j | _d S )Nr   r   rR   T)elementwise_affine)r'   r(   rU   rV   rW   r)   r*   rX   rY   rZ   r.   	LayerNorm
layer_normr   r=   r>   r[   rA   rC   rD   r(      s   
z!HubertLayerNormConvLayer.__init__c                 C   s:   |  |}|dd}| |}|dd}| |}|S )N)r.   rE   rb   r>   rF   rC   rC   rD   rH      s   


z HubertLayerNormConvLayer.forwardr^   rI   rC   rC   rA   rD   r_      s    r_   c                       rP   )HubertGroupNormConvLayerr   c                    s   t    |dkr|j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
t|j | _tj| j| jdd| _d S )Nr   r   rR   T)
num_groupsnum_channelsaffine)r'   r(   rU   rV   rW   r)   r*   rX   rY   rZ   r.   r   r=   r>   	GroupNormrb   r[   rA   rC   rD   r(      s   
z!HubertGroupNormConvLayer.__init__c                 C   s"   |  |}| |}| |}|S r]   )r.   rb   r>   rF   rC   rC   rD   rH      s   


z HubertGroupNormConvLayer.forwardr^   rI   rC   rC   rA   rD   re      s    re   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )HubertFeatureEncoderz.Construct the features from raw audio waveformc                    s   t     jdkr t ddg fddt jd D  }n jdkr2 fddt jD }n	td	 j d
t|| _	d| _
d| _d S )Ngroupr   r\   c                    s   g | ]
}t  |d  dqS )r   rl   )rQ   .0ir@   rC   rD   
<listcomp>   s    z1HubertFeatureEncoder.__init__.<locals>.<listcomp>r   layerc                    s   g | ]}t  |d qS )rl   )r_   rm   rp   rC   rD   rq      s    z`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)r'   r(   feat_extract_normre   rangenum_feat_extract_layers
ValueErrorr)   
ModuleListconv_layersgradient_checkpointing_requires_grad)r?   r@   rx   rA   rp   rD   r(      s   




zHubertFeatureEncoder.__init__c                 C   s   |   D ]}d|_qd| _d S NF)
parametersrequires_gradrz   r?   paramrC   rC   rD   _freeze_parameters   s   
z'HubertFeatureEncoder._freeze_parametersc                 C   s:   |d d d f }| j r| jrd|_| jD ]}||}q|S )NT)rz   trainingr}   rx   )r?   input_valuesrG   
conv_layerrC   rC   rD   rH      s   

zHubertFeatureEncoder.forward)rJ   rK   rL   __doc__r(   r   rH   rM   rC   rC   rA   rD   rj      s
    rj   c                       r   )HubertFeatureProjectionc                    sX   t    |j| _| jrtj|jd |jd| _t|jd |j	| _
t|j| _d S )Nrd   eps)r'   r(   feat_proj_layer_normr)   ra   rU   layer_norm_epsrb   Linearr+   
projectionDropoutfeat_proj_dropoutdropoutr?   r@   rA   rC   rD   r(      s   
z HubertFeatureProjection.__init__c                 C   s(   | j r| |}| |}| |}|S r]   )r   rb   r   r   rF   rC   rC   rD   rH      s
   


zHubertFeatureProjection.forwardrI   rC   rC   rA   rD   r      s    r           modulequerykeyvalueattention_maskscalingr   	head_maskc                 K   s   |d u r| dd }t||dd| }	|d ur|	| }	tjj|	dd}	|d ur5|	|dddd }	tjj|	|| j	d}	t|	|}
|
dd
 }
|
|	fS )Nrd         r   r   r%   r   )pr   )sizetorchmatmulrE   r)   
functionalsoftmaxviewr   r   
contiguous)r   r   r   r   r   r   r   r   kwargsattn_weightsattn_outputrC   rC   rD   eager_attention_forward   s   r   c                       s   e Zd ZdZ					ddededed	ed
ededee f fddZ					dde
jdee
j dee
j dee
j dee dee dee
jee
j eee
j  f fddZ  ZS )HubertAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   FTN	embed_dim	num_headsr   
is_decoderrT   	is_causalr@   c                    s   t    || _|| _|| _|| | _|| _| j| | jkr*td| j d| d| jd | _|| _	|| _
tj|||d| _tj|||d| _tj|||d| _tj|||d| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   )rT   )r'   r(   r   r   r   head_dimr@   rv   r   r   r   r)   r   k_projv_projq_projout_proj)r?   r   r   r   r   rT   r   r@   rA   rC   rD   r(     s&   



zHubertAttention.__init__rG   key_value_statesr   layer_head_maskoutput_attentionsr   returnc                 K   s  |du}|j dd \}}	|r|j d n|	}
||	d| jf}||
d| jf}| |j| dd}|r4|n|}| |j| dd}| |j| dd}t}| jj	dkr\t
| jj	 }|| ||||f| jshdn| j| j||d|\}}|||	d }| |}||dfS )z#Input shape: Batch x Time x ChannelNrd   r   r   eagerr   )r   r   r   r   )shaper   r   r   rE   r   r   r   r@   _attn_implementationr   r   r   r   reshaper   r   )r?   rG   r   r   r   r   r   is_cross_attentionbsztgt_lensrc_lenq_input_shapekv_input_shapequery_statescurrent_states
key_statesvalue_statesattention_interfacer   r   rC   rC   rD   rH   /  s:   



zHubertAttention.forward)r   FTFN)NNNF)rJ   rK   rL   r   intfloatboolr   r   r(   r   Tensorr   r   tuplerH   rM   rC   rC   rA   rD   r     sR    "	
r   c                       r   )HubertFeedForwardc                    sp   t    t|j| _t|j|j| _	t
|jtr"t|j | _n|j| _t|j|j| _t|j| _d S r]   )r'   r(   r)   r   activation_dropoutintermediate_dropoutr   r+   intermediate_sizeintermediate_dense
isinstance
hidden_actstrr   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutr   rA   rC   rD   r(   f  s   
zHubertFeedForward.__init__c                 C   s6   |  |}| |}| |}| |}| |}|S r]   )r   r   r   r   r   rF   rC   rC   rD   rH   s  s   




zHubertFeedForward.forwardrI   rC   rC   rA   rD   r   e  s    r   c                       s&   e Zd Z fddZdddZ  ZS )HubertEncoderLayerc                    sh   t    t|j|j|jd|d| _t|j	| _
tj|j|jd| _t|| _tj|j|jd| _d S )NFr   r   r   r   r@   r   )r'   r(   r   r+   num_attention_headsattention_dropout	attentionr)   r   r   r   ra   r   rb   r   feed_forwardfinal_layer_normr   rA   rC   rD   r(   ~  s   

zHubertEncoderLayer.__init__NFc                 C   sf   |}| j |||d\}}}| |}|| }| |}|| | }| |}|f}|r1||f7 }|S Nr   r   )r   r   rb   r   r   r?   rG   r   r   attn_residualr   _outputsrC   rC   rD   rH     s   



zHubertEncoderLayer.forwardr{   rI   rC   rC   rA   rD   r   }  s    r   c                       sj   e Zd Z fddZ				ddejdeej ded	ed
ef
ddZ	de
ejdf dejfddZ  ZS )HubertEncoderc                    f   t     | _t | _tj j jd| _	t
 j| _t fddt jD | _d| _d S )Nr   c                       g | ]}t  qS rC   )r   rn   r   rp   rC   rD   rq         z*HubertEncoder.__init__.<locals>.<listcomp>Fr'   r(   r@   r   pos_conv_embedr)   ra   r+   r   rb   r   r   r   rw   rt   num_hidden_layerslayersry   r   rA   rp   rD   r(     s   

 
zHubertEncoder.__init__NFTrG   r   r   output_hidden_statesreturn_dictc                 C   s*  |rdnd }|r
dnd }|d ur"| ddd|jd }d|| < | ||}| |}	||	 }| |}| |}t pAt| }
| j	D ]3}|rN||f }t
g }| jo[|| jjk }|r`|
rk||||d}|d }|rod}|rx||d f }qE|r||f }|stdd	 |||fD S t|||d
S )NrC   rd   r   r   r   r   NNc                 s       | ]	}|d ur|V  qd S r]   rC   rn   vrC   rC   rD   	<genexpr>      z(HubertEncoder.forward.<locals>.<genexpr>last_hidden_staterG   
attentions)	unsqueezerepeatr   _update_full_maskr   rb   r   r   r	   r   r   randr   r@   	layerdropr   r   r?   rG   r   r   r   r   all_hidden_statesall_self_attentionsexpand_attention_maskposition_embeddingssynced_gpusrr   dropout_probabilityskip_the_layerlayer_outputsrC   rC   rD   rH     sL   







zHubertEncoder.forwardinputs_embedsc                 C      |d ur>| j jdkrd|v r|}|S d }|S | j jdkr$t||j}|S | j jdkr8t|tjr6t|dd}|S t||j}|S Nflash_attention_2r   sdpaflex_attentionF)r   	r@   r   r   dtyper   r   r   r   r
   r?   r   r  rC   rC   rD   r        zHubertEncoder._update_full_maskNFFT)rJ   rK   rL   r(   r   tensorr   r   r   rH   r   r   rM   rC   rC   rA   rD   r     s,    
<r   c                       s,   e Zd Z fddZdejfddZ  ZS )HubertAttnAdapterLayerc                    sZ   t    |j| _|j| _t| j| _t	| j| j| _
t | _t	| j| j| _dS )z
        Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed
        up training throughput.
        N)r'   r(   adapter_attn_dim	input_dimr+   
hidden_dimr)   ra   normr   linear_1ReLUact_fnlinear_2r   rA   rC   rD   r(     s   

zHubertAttnAdapterLayer.__init__rG   c                 C   s,   |  |}| |}| |}| |}|S r]   )r  r  r  r  rF   rC   rC   rD   rH     s
   



zHubertAttnAdapterLayer.forward)rJ   rK   rL   r(   r   FloatTensorrH   rM   rC   rC   rA   rD   r    s    r  c                       s@   e Zd Z fddZ		d
dejdeej defdd	Z  Z	S )!HubertEncoderLayerStableLayerNormc                    s   t    t|j|j|jd|d| _t|j	| _
tj|j|jd| _t|| _tj|j|jd| _t|dd d urAt|| _d S d | _d S )NFr   r   r  )r'   r(   r   r+   r   r   r   r)   r   r   r   ra   r   rb   r   r   r   getattrr  adapter_layerr   rA   rC   rD   r(     s   


z*HubertEncoderLayerStableLayerNorm.__init__NFrG   r   r   c                 C   sz   |}|  |}| j|||d\}}}| |}|| }|| | | }| jd ur1|| | }|f}|r;||f7 }|S r   )rb   r   r   r   r   r  r   rC   rC   rD   rH   +  s   



z)HubertEncoderLayerStableLayerNorm.forwardr{   )
rJ   rK   rL   r(   r   r   r   r   rH   rM   rC   rC   rA   rD   r    s    r  c                       sL   e Zd Z fddZ				dddZdeejdf d	ejfd
dZ  Z	S )HubertEncoderStableLayerNormc                    r   )Nr   c                    r   rC   )r  r   rp   rC   rD   rq   M  r   z9HubertEncoderStableLayerNorm.__init__.<locals>.<listcomp>Fr   r   rA   rp   rD   r(   F  s   


z%HubertEncoderStableLayerNorm.__init__NFTc                 C   s*  |rdnd }|r
dnd }|d ur"| ddd|jd }d|| < | ||}| |}	||	 }| |}t p<t| }
| jD ]3}|rI||f }t	
g }| joV|| jjk }|r[|
rf||||d}|d }|rjd}|rs||d f }q@| |}|r||f }|stdd	 |||fD S t|||d
S )NrC   rd   r   r   r   r   r   c                 s   r   r]   rC   r   rC   rC   rD   r     r   z7HubertEncoderStableLayerNorm.forward.<locals>.<genexpr>r   )r   r   r   r   r   r   r   r	   r   r   r   r   r@   r   rb   r   r   r   rC   rC   rD   rH   Q  sL   







z$HubertEncoderStableLayerNorm.forwardr   r  c                 C   r  r  r	  r  rC   rC   rD   r     r  z.HubertEncoderStableLayerNorm._update_full_maskr  )
rJ   rK   rL   r(   rH   r   r   r   r   rM   rC   rC   rA   rD   r  E  s    
>r  c                   @   sb   e Zd ZU eed< dZdZdZdZdZ	dZ
dd Zdeejef fdd	Zd
edejfddZdS )HubertPreTrainedModelr@   hubertr   Tc                 C   s  t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
tjtjfr:|jj	  |jjd dS t |tjrt rddl}t|drvt|drv|jj|j|jgdd tj|jj W d   n1 spw   Y  n*|jj|jdd tj|jj W d   n1 sw   Y  ntj|jj |jdur|jj	  dS dS t |trt|d	r|jj  dS dS t |trt|d
r|jjd| jjd   dS dS dS )zInitialize the weightsr   )meanstdNg      ?r   r:   r9   r!   masked_spec_embedlayer_weightsr   )r   r)   r   r#   datanormal_r@   initializer_rangerT   zero_ra   ri   r1   fill_r*   r   r4   r3   r5   r6   r:   r9   initkaiming_normal_HubertModelr!  uniform_HubertForSequenceClassificationr"  r   )r?   r   r4   rC   rC   rD   _init_weights  sB   





z#HubertPreTrainedModel._init_weightsinput_lengthsc                 C   s4   dd }t | jj| jjD ]
\}}||||}q|S )zH
        Computes the output length of the convolutional layers
        c                 S   s   t j| | |ddd S )Nfloor)rounding_moder   )r   div)input_lengthr   rS   rC   rC   rD   _conv_out_length  s   zPHubertPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length)zipr@   rX   rY   )r?   r.  r3  r   rS   rC   rC   rD    _get_feat_extract_output_lengths  s   z6HubertPreTrainedModel._get_feat_extract_output_lengthsfeature_vector_lengthr   c                 C   s~   |  |dtj}|jd }tj||f|j|jd}d|tj	|jd |jd|d f< |
dgd
dg }|S )Nrd   r   )r
  devicer   )r7  )r5  sumtor   longr   zerosr
  r7  arangeflipcumsumr   )r?   r6  r   output_lengths
batch_sizerC   rC   rD   "_get_feature_vector_attention_mask  s   
"z8HubertPreTrainedModel._get_feature_vector_attention_maskN)rJ   rK   rL   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr-  r   r   
LongTensorr   r5  rA  rC   rC   rC   rD   r    s   
 !r  r   	mask_probmask_length	min_masksr   c                    s  | \}dk rt dkrt d d dtjd   fdd}|dur:| d	 n
fd
dt|D }tj	|ft
d}g }	|}
|
dkrZ|S |D ];}||}tjjt|d  |dd}t|dkr}d }n|d }t|tj|
| tjd| g}|	| q\t|	}	t|	dddddf ||
f}	|	||
 }	tddddf }t|||
f||
 }|	| }	|	 d krd |	|	d k< t||	dd	 |S )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                    sX   t |     }t|}| kr }| d  |k r*t| d  d}|S )z;Given input length, compute how many spans should be maskedr   r   )r   max)r2  num_masked_spanepsilonrK  rJ  rL  sequence_lengthrC   rD   compute_num_masked_span  s   
z6_compute_mask_indices.<locals>.compute_num_masked_spanNrd   c                    s   g | ]} qS rC   rC   r   )rR  rC   rD   rq   &  s    z)_compute_mask_indices.<locals>.<listcomp>r
  r   F)replace)rv   nprandomr   itemdetachr8  tolistrt   r;  r   choicer<  lenconcatenateonesint32appendarraybroadcast_tor   rN  put_along_axis)r   rJ  rK  r   rL  r@  rS  r.  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr2  rO  spec_aug_mask_idxdummy_mask_idxoffsetsrC   rP  rD   _compute_mask_indices  s\   

rj  c                       s   e Zd Zdef fddZ		ddejdeej deej fdd	Z	e
					dd
eej deej deej dee dee dee deeef fddZ  ZS )r*  r@   c                    sz   t  | || _t|| _t|| _|jdks|jdkr)t	
t|j | _|jr2t|| _nt|| _|   d S )Nr   )r'   r(   r@   rj   feature_extractorr   feature_projectionmask_time_probmask_feature_probr)   	Parameterr   r   r+   r+  r!  do_stable_layer_normr  encoderr   	post_initr   rA   rC   rD   r(   f  s   


zHubertModel.__init__NrG   mask_time_indicesr   c                 C   s  t | jdds	|S | \}}}|dur| j|j||< n-| jjdkrK| jrKt||f| jj| jj	|| jj
d}tj||jtjd}| j|j||< | jjdkr| jrt||f| jj| jj| jjd}tj||jtjd}|dddf d|d}d||< |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTNr   )rJ  rK  r   rL  )r7  r
  )rJ  rK  rL  rd   )r  r@   r   r!  r9  r
  rm  r   rj  mask_time_lengthmask_time_min_masksr   r  r7  r   rn  mask_feature_lengthmask_feature_min_masksexpand)r?   rG   rs  r   r@  rR  r+   mask_feature_indicesrC   rC   rD   _mask_hidden_statesx  s4   zHubertModel._mask_hidden_statesr   r   r   r   r   c           
      C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| |}|dd}|dur6| |jd |}| |}| j	||d}| j
|||||d}	|	d }|s[|f|	dd  S t||	j|	jdS )a1  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.

        Example:

        ```python
        >>> from transformers import AutoProcessor, HubertModel
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
        >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")


        >>> def map_to_array(example):
        ...     example["speech"] = example["audio"]["array"]
        ...     return example


        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.map(map_to_array)

        >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
        >>> hidden_states = model(input_values).last_hidden_state
        ```Nr   r   )rs  r   r   r   r   r   r   )r@   r   r   use_return_dictrk  rE   rA  r   rl  r{  rq  r   rG   r   )
r?   r   r   rs  r   r   r   extract_featuresrG   encoder_outputsrC   rC   rD   rH     s2   $

zHubertModel.forwardr   NNNNN)rJ   rK   rL   r   r(   r   r  r   rI  r{  r   r   r   r   r   r   rH   rM   rC   rC   rA   rD   r*  d  s@    
.
r*  zn
    Hubert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    )custom_introc                       s   e Zd Zddee f fddZdd Zdd Zd	d
 Zdd Z	e
					ddeej deej dee dee dee deej deeef fddZ  ZS )HubertForCTCNtarget_langc                    s~   t  | t|| _t|j| _|| _|j	du r#t
d| j dt|dr.|jr.|jn|j}t||j	| _|   dS )a0  
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`HubertForCTC`] with adapters. Uses 'eng' by
            default.
        NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `HubertForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.add_adapter)r'   r(   r*  r  r)   r   final_dropoutr   r  
vocab_sizerv   rB   r3   r  output_hidden_sizer+   r   lm_headrr  )r?   r@   r  r  rA   rC   rD   r(     s   

zHubertForCTC.__init__c                 C   sv   | j }|durt| jdddu rtd| d|du r,t| jdddur,td dS |dur9| j|dd dS dS )a'  
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        Nr  zCannot pass `target_lang`: z- if `config.adapter_attn_dim` is not defined.z)By default `target_lang` is set to 'eng'.T)
force_load)r  r  r@   rv   loggerinfoload_adapter)r?   r  rC   rC   rD   tie_weights  s   zHubertForCTC.tie_weightsc                 C      t dt |   dS )
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.NwarningswarnFutureWarningfreeze_feature_encoderr?   rC   rC   rD   freeze_feature_extractor)  
   z%HubertForCTC.freeze_feature_extractorc                 C      | j j  dS r  Nr  rk  r   r  rC   rC   rD   r  5     z#HubertForCTC.freeze_feature_encoderc                 C      | j  D ]}d|_qdS z
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        FNr  r|   r}   r~   rC   rC   rD   freeze_base_model<     zHubertForCTC.freeze_base_modelr   r   r   r   r   labelsr   c              
   C   s|  |dur|n| j j}|dur| | j jkrtd| j j | j|||||d}|d }| |}| |}	d}
|dur|durC|ntj	|tj
d}| |dtj
}|dk}|d}||}tjj|	dtjddd}tjjjd	d
 tjj||||| j j| j j| j jd}
W d   n1 sw   Y  |s|	f|td  }|
dur|
f| S |S t|
|	|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: r|  r   rT  rd   )r%   r
  r   F)enabled)blank	reductionzero_infinitylosslogitsrG   r   )r@   r}  rN  r  rv   r  r   r  r   	ones_liker:  r5  r8  r9  masked_selectr)   r   log_softmaxfloat32rE   backendscudnnflagsctc_losspad_token_idctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   rG   r   )r?   r   r   r   r   r   r  r   rG   r  r  r.  labels_masktarget_lengthsflattened_targets	log_probsoutputrC   rC   rD   rH   D  sN   



zHubertForCTC.forwardr]   r  )rJ   rK   rL   r   r   r(   r  r  r  r  r   r   r   r   r   r   r   rH   rM   rC   rC   rA   rD   r    s6    
r  z
    Hubert Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                       s   e Zd Z fddZdd Zdd Zdd Ze										dd
ee	j
 dee	j
 dee dee dee dee	j
 deeef fddZ  ZS )r,  c                    s   t  | t|dr|jrtdt|| _|jd }|jr*t	
t|| | _t	|j|j| _t	|j|j| _|   d S )Nr  z]Sequence classification does not support the use of Hubert adapters (config.add_adapter=True)r   )r'   r(   r3   r  rv   r*  r  r   use_weighted_layer_sumr)   ro  r   r^  r"  r   r+   classifier_proj_size	projector
num_labels
classifierrr  )r?   r@   
num_layersrA   rC   rD   r(     s   

z(HubertForSequenceClassification.__init__c                 C   r  )z
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        r  Nr  r  rC   rC   rD   r    r  z8HubertForSequenceClassification.freeze_feature_extractorc                 C   r  r  r  r  rC   rC   rD   r    r  z6HubertForSequenceClassification.freeze_feature_encoderc                 C   r  r  r  r~   rC   rC   rD   r    r  z1HubertForSequenceClassification.freeze_base_modelNr   r   r   r   r   r  r   c                 C   sz  |dur|n| j j}| j jrdn|}| j|||||d}| j jrB|t }tj|dd}tjj	| j
dd}	||	ddd jdd}n|d }| |}|du rV|jdd}
n+| |jd |}|ddd|jd }d	|| < |jdd|jdddd }
| |
}d}|durt }||d| j j|d}|s|f|td  }|dur|f| S |S t|||j|jd
S )a
  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
            into a tensor of type `torch.FloatTensor`. See [`HubertProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTr|  r   r   rd   r   r   r   r  )r@   r}  r  r  r  r   stackr)   r   r   r"  r   r8  r  r  rA  r   r   r   r  r   r  r   rG   r   )r?   r   r   r   r   r   r  r   rG   norm_weightspooled_outputpadding_maskexpand_padding_maskr  r  loss_fctr  rC   rC   rD   rH     sH   

 
z'HubertForSequenceClassification.forwardr  )rJ   rK   rL   r(   r  r  r  r   r   r   r   r   r   r   r   rH   rM   rC   rC   rA   rD   r,    s4    
r,  )r  r,  r*  r  )Nr   NrO   )Ir  typingr   r   r   numpyrV  r   torch.nnr)   r   activationsr   integrations.deepspeedr   integrations.fsdpr	   modeling_attn_mask_utilsr
   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   r2   r   r   r   configuration_hubertr   integrations.flex_attentionr   
get_loggerrJ   r  Moduler   r<   rQ   r_   re   rj   r   r   r   r   r   r   r   r   r  r  r  r  r   r   rI  ndarrayrj  r*  r  r  r,  __all__rC   rC   rC   rD   <module>   s   
2&
X$].aJ

w 
 s