o
    ei                     @   s  d dl mZ d dlZd dlZd dlmZ d dlmZ ddlm	Z
 ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z& e$'e(Z)G dd dej*Z+G dd dej*Z,G dd deZ-G dd deZ.G dd deZ/G dd dej*Z0G dd dej*Z1		 dLd!ej*d"ej2d#ej2d$ej2d%ej2dB d&e3dB d'e3d(e e" fd)d*Z4G d+d, d,ej*Z5G d-d. d.ej*Z6G d/d0 d0eZ7G d1d2 d2ej*Z8G d3d4 d4ej*Z9G d5d6 d6eZ:G d7d8 d8ej*Z;e#G d9d: d:eZ<		 dMd;e=e>e>f d<e3d=e>d%ej?dB d>e>d?ej@fd@dAZAe#G dBdC dCe<ZBdZCe#dDdEG dFdG dGe<ZDe#dHdEG dIdJ dJe<ZEg dKZFdS )N    )CallableN)CrossEntropyLoss   )initialization)ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)create_bidirectional_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputCausalLMOutputSequenceClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel*get_torch_context_manager_or_global_device)Unpack)TransformersKwargsauto_docstringlogging   )HubertConfigc                       $   e Zd Z fddZdd Z  ZS )HubertPositionalConvEmbeddingc                    s@  t    tj|j|j|j|jd |jd| _d | _|j	r%t
|j| _nmtjj}ttjjdr5tjjj}t rdd l}|jj| jjdd || jddd| _W d    n1 sZw   Y  t| jdrr| jjjj}| jjjj}n| jj}| jj}|j| | |j| | n	|| jddd| _t|j| _t|j | _d S )	N   )kernel_sizepaddinggroupsweight_normr   modifier_rankweight)namedimparametrizations)super__init__nnConv1dhidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupsconv
batch_normconv_pos_batch_normBatchNorm1dutilsr   hasattrr$   r   	deepspeedzeroGatheredParametersr!   	original0	original1weight_gweight_vregister_external_parameterHubertSamePadLayerr   r   feat_extract_activation
activation)selfconfigr   r2   r7   r8   	__class__ h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/hubert/modeling_hubert.pyr&   .   s:   

z&HubertPositionalConvEmbedding.__init__c                 C   sN   | dd}| jd ur| |}| |}| |}| |}| dd}|S )Nr   r   )	transposer-   r,   r   r<   r=   hidden_statesrA   rA   rB   forwardS   s   




z%HubertPositionalConvEmbedding.forward__name__
__module____qualname__r&   rF   __classcell__rA   rA   r?   rB   r   -   s    %r   c                       r   )r:   c                    s*   t    |d dkrd| _d S d| _d S )Nr   r   r   )r%   r&   num_pad_remove)r=   r*   r?   rA   rB   r&   `   s   
 zHubertSamePadLayer.__init__c                 C   s,   | j dkr|d d d d d | j  f }|S Nr   )rL   rD   rA   rA   rB   rF   d   s   
zHubertSamePadLayer.forwardrG   rA   rA   r?   rB   r:   _   s    r:   c                       &   e Zd Zd fdd	Zdd Z  ZS )HubertNoLayerNormConvLayerr   c                    sj   t    |dkr|j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
t|j | _d S )Nr   r   r   stridebias)r%   r&   conv_dimin_conv_dimout_conv_dimr'   r(   conv_kernelconv_stride	conv_biasr,   r   r;   r<   r=   r>   layer_idr?   rA   rB   r&   k   s   
z#HubertNoLayerNormConvLayer.__init__c                 C   s   |  |}| |}|S N)r,   r<   rD   rA   rA   rB   rF   y   s   

z"HubertNoLayerNormConvLayer.forwardr   rG   rA   rA   r?   rB   rO   j   s    rO   c                       rN   )HubertLayerNormConvLayerr   c                    s|   t    |dkr|j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
tj| jdd| _t|j | _d S )Nr   r   rP   T)elementwise_affine)r%   r&   rS   rT   rU   r'   r(   rV   rW   rX   r,   	LayerNorm
layer_normr   r;   r<   rY   r?   rA   rB   r&      s   
z!HubertLayerNormConvLayer.__init__c                 C   s:   |  |}|dd}| |}|dd}| |}|S )N)r,   rC   r`   r<   rD   rA   rA   rB   rF      s   


z HubertLayerNormConvLayer.forwardr\   rG   rA   rA   r?   rB   r]      s    r]   c                       rN   )HubertGroupNormConvLayerr   c                    s   t    |dkr|j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
t|j | _tj| j| jdd| _d S )Nr   r   rP   T)
num_groupsnum_channelsaffine)r%   r&   rS   rT   rU   r'   r(   rV   rW   rX   r,   r   r;   r<   	GroupNormr`   rY   r?   rA   rB   r&      s   
z!HubertGroupNormConvLayer.__init__c                 C   s"   |  |}| |}| |}|S r[   )r,   r`   r<   rD   rA   rA   rB   rF      s   


z HubertGroupNormConvLayer.forwardr\   rG   rA   rA   r?   rB   rc      s    rc   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )HubertFeatureEncoderz.Construct the features from raw audio waveformc                    s   t     jdkr t ddg fddt jd D  }n jdkr2 fddt jD }n	td	 j d
t|| _	d| _
d| _d S )Ngroupr   rZ   c                    s   g | ]
}t  |d  dqS )r   rj   )rO   .0ir>   rA   rB   
<listcomp>   s    z1HubertFeatureEncoder.__init__.<locals>.<listcomp>r   layerc                    s   g | ]}t  |d qS )rj   )r]   rk   rn   rA   rB   ro      s    z`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)r%   r&   feat_extract_normrc   rangenum_feat_extract_layers
ValueErrorr'   
ModuleListconv_layersgradient_checkpointing_requires_grad)r=   r>   rv   r?   rn   rB   r&      s   




zHubertFeatureEncoder.__init__c                 C   s   |   D ]}d|_qd| _d S NF)
parametersrequires_gradrx   r=   paramrA   rA   rB   _freeze_parameters   s   
z'HubertFeatureEncoder._freeze_parametersc                 C   s:   |d d d f }| j r| jrd|_| jD ]}||}q|S )NT)rx   trainingr{   rv   )r=   input_valuesrE   
conv_layerrA   rA   rB   rF      s   

zHubertFeatureEncoder.forward)rH   rI   rJ   __doc__r&   r~   rF   rK   rA   rA   r?   rB   rh      s
    rh   c                       r   )HubertFeatureProjectionc                    sX   t    |j| _| jrtj|jd |jd| _t|jd |j	| _
t|j| _d S )Nrb   eps)r%   r&   feat_proj_layer_normr'   r_   rS   layer_norm_epsr`   Linearr)   
projectionDropoutfeat_proj_dropoutdropoutr=   r>   r?   rA   rB   r&      s   
z HubertFeatureProjection.__init__c                 C   s(   | j r| |}| |}| |}|S r[   )r   r`   r   r   rD   rA   rA   rB   rF      s
   


zHubertFeatureProjection.forwardrG   rA   rA   r?   rB   r      s    r           modulequerykeyvalueattention_maskscalingr   kwargsc           
      K   s   |d u r| dd }t||dd| }|d ur|| }tjj|dd}tjj||| jd}t||}	|	dd	 }	|	|fS )Nrb         r   r   r#   )pr   r   )
sizetorchmatmulrC   r'   
functionalsoftmaxr   r   
contiguous)
r   r   r   r   r   r   r   r   attn_weightsattn_outputrA   rA   rB   eager_attention_forward   s   
r   c                       s   e Zd ZdZ					ddededed	ed
edededB f fddZ			dde	j
de	j
dB de	j
dB dedB dee dee	j
e	j
dB ee	j
 dB f fddZ  ZS )HubertAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   FTN	embed_dim	num_headsr   
is_decoderrR   	is_causalr>   c                    s   t    || _|| _|| _|| | _|| _| j| | jkr*td| j d| d| jd | _|| _	|| _
tj|||d| _tj|||d| _tj|||d| _tj|||d| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   )rR   )r%   r&   r   r   r   head_dimr>   rt   r   r   r   r'   r   k_projv_projq_projout_proj)r=   r   r   r   r   rR   r   r>   r?   rA   rB   r&   	  s&   



zHubertAttention.__init__rE   key_value_statesr   output_attentionsr   returnc                 K   s  |du}|j dd \}}|r|j d n|}	||d| jf}
||	d| jf}| |j|
 dd}|r4|n|}| |j| dd}| |j| dd}t| j	j
t}|| ||||f| jsbdn| j| j|d|\}}|||d }| |}||dfS )z#Input shape: Batch x Time x ChannelNrb   r   r   r   )r   r   r   )shaper   r   viewrC   r   r   r   get_interfacer>   _attn_implementationr   r   r   r   reshaper   r   )r=   rE   r   r   r   r   is_cross_attentionbsztgt_lensrc_lenq_input_shapekv_input_shapequery_statescurrent_states
key_statesvalue_statesattention_interfacer   r   rA   rA   rB   rF   (  s8   	


zHubertAttention.forward)r   FTFN)NNF)rH   rI   rJ   r   intfloatboolr   r&   r   Tensorr   r
   tuplerF   rK   rA   rA   r?   rB   r     sL    "	r   c                       r   )HubertFeedForwardc                    sp   t    t|j| _t|j|j| _	t
|jtr"t|j | _n|j| _t|j|j| _t|j| _d S r[   )r%   r&   r'   r   activation_dropoutintermediate_dropoutr   r)   intermediate_sizeintermediate_dense
isinstance
hidden_actstrr   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutr   r?   rA   rB   r&   ]  s   
zHubertFeedForward.__init__c                 C   s6   |  |}| |}| |}| |}| |}|S r[   )r   r   r   r   r   rD   rA   rA   rB   rF   j  s   




zHubertFeedForward.forwardrG   rA   rA   r?   rB   r   \  s    r   c                       s&   e Zd Z fddZdddZ  ZS )HubertEncoderLayerc                    sh   t    t|j|j|jd|d| _t|j	| _
tj|j|jd| _t|| _tj|j|jd| _d S )NFr   r   r   r   r>   r   )r%   r&   r   r)   num_attention_headsattention_dropout	attentionr'   r   r   r   r_   r   r`   r   feed_forwardfinal_layer_normr   r?   rA   rB   r&   u  s   

zHubertEncoderLayer.__init__NFc                 C   sf   |}| j |||d\}}}| |}|| }| |}|| | }| |}|f}|r1||f7 }|S Nr   r   )r   r   r`   r   r   r=   rE   r   r   attn_residualr   _outputsrA   rA   rB   rF     s   



zHubertEncoderLayer.forwardry   rG   rA   rA   r?   rB   r   t  s    r   c                       sL   e Zd Z fddZ				ddejdejdB ded	ed
ef
ddZ  Z	S )HubertEncoderc                    f   t     | _t | _tj j jd| _	t
 j| _t fddt jD | _d| _d S )Nr   c                       g | ]}t  qS rA   )r   rl   r   rn   rA   rB   ro         z*HubertEncoder.__init__.<locals>.<listcomp>Fr%   r&   r>   r   pos_conv_embedr'   r_   r)   r   r`   r   r   r   ru   rr   num_hidden_layerslayersrw   r   r?   rn   rB   r&     s   

 
zHubertEncoder.__init__NFTrE   r   r   output_hidden_statesreturn_dictc                 C   s.  |rdnd }|r
dnd }|d ur"| ddd|jd }d|| < t| j||d}| |}	||	 }| |}| |}t pCt	| }
| j
D ]3}|rP||f }tg }| jo]|| jjk }|rb|
rm||||d}|d }|rqd}|rz||d f }qG|r||f }|std	d
 |||fD S t|||dS )NrA   rb   r   r   r   r>   inputs_embedsr   r   NNc                 s       | ]	}|d ur|V  qd S r[   rA   rl   vrA   rA   rB   	<genexpr>      z(HubertEncoder.forward.<locals>.<genexpr>last_hidden_staterE   
attentions)	unsqueezerepeatr   r	   r>   r   r`   r   r   r   r   r   randr   	layerdropr   r   r=   rE   r   r   r   r   all_hidden_statesall_self_attentionsexpand_attention_maskposition_embeddingssynced_gpusrp   dropout_probabilityskip_the_layerlayer_outputsrA   rA   rB   rF     sN   







zHubertEncoder.forwardNFFT)
rH   rI   rJ   r&   r   tensorr   r   rF   rK   rA   rA   r?   rB   r     s"    r   c                       s,   e Zd Z fddZdejfddZ  ZS )HubertAttnAdapterLayerc                    sZ   t    |j| _|j| _t| j| _t	| j| j| _
t | _t	| j| j| _dS )z
        Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed
        up training throughput.
        N)r%   r&   adapter_attn_dim	input_dimr)   
hidden_dimr'   r_   normr   linear_1ReLUact_fnlinear_2r   r?   rA   rB   r&     s   

zHubertAttnAdapterLayer.__init__rE   c                 C   s,   |  |}| |}| |}| |}|S r[   )r  r  r	  r
  rD   rA   rA   rB   rF     s
   



zHubertAttnAdapterLayer.forward)rH   rI   rJ   r&   r   FloatTensorrF   rK   rA   rA   r?   rB   r    s    r  c                       s@   e Zd Z fddZ		d
dejdejdB defdd	Z  ZS )!HubertEncoderLayerStableLayerNormc                    s   t    t|j|j|jd|d| _t|j	| _
tj|j|jd| _t|| _tj|j|jd| _t|dd d urAt|| _d S d | _d S )NFr   r   r  )r%   r&   r   r)   r   r   r   r'   r   r   r   r_   r   r`   r   r   r   getattrr  adapter_layerr   r?   rA   rB   r&     s   


z*HubertEncoderLayerStableLayerNorm.__init__NFrE   r   r   c                 C   sz   |}|  |}| j|||d\}}}| |}|| }|| | | }| jd ur1|| | }|f}|r;||f7 }|S r   )r`   r   r   r   r   r  r   rA   rA   rB   rF     s   



z)HubertEncoderLayerStableLayerNorm.forwardry   )	rH   rI   rJ   r&   r   r   r   rF   rK   rA   rA   r?   rB   r    s    r  c                       s.   e Zd Z fddZ				dddZ  ZS )	HubertEncoderStableLayerNormc                    r   )Nr   c                    r   rA   )r  r   rn   rA   rB   ro   /  r   z9HubertEncoderStableLayerNorm.__init__.<locals>.<listcomp>Fr   r   r?   rn   rB   r&   (  s   


z%HubertEncoderStableLayerNorm.__init__NFTc                 C   s.  |rdnd }|r
dnd }|d ur"| ddd|jd }d|| < t| j||d}| |}	||	 }| |}t p>t| }
| j	D ]3}|rK||f }t
g }| joX|| jjk }|r]|
rh||||d}|d }|rld}|ru||d f }qB| |}|r||f }|std	d
 |||fD S t|||dS )NrA   rb   r   r   r   r   r   r   c                 s   r   r[   rA   r   rA   rA   rB   r   k  r   z7HubertEncoderStableLayerNorm.forward.<locals>.<genexpr>r   )r   r   r   r	   r>   r   r   r   r   r   r   r   r   r   r`   r   r   r   rA   rA   rB   rF   3  sN   







z$HubertEncoderStableLayerNorm.forwardr   rG   rA   rA   r?   rB   r  '  s    r  c                   @   sj   e Zd ZU eed< dZdZdZdZdZ	dZ
dZe dd ZdejeB fd	d
ZdedejfddZdS )HubertPreTrainedModelr>   hubertr   audioTc                 C   s  t |tjr tj|jd| jjd |jdurt	|j dS dS t |tj
tjtjfrUt	|j t|j t|dddurSt	|j t|j t	|j dS dS t |tjrt rddl}t|drt|dr|jj|j|jgdd t|j W d   n1 sw   Y  n&|jj|jdd t|j W d   n1 sw   Y  nt|j |jdurt	|j dS dS t |trt|d	rt|j dS dS t |trt|d
rt|j d| jj!d   dS dS dS )zInitialize the weightsr   )meanstdNrunning_meanr   r8   r7   r   masked_spec_embedlayer_weightsg      ?r   )"r   r'   r   initnormal_r!   r>   initializer_rangerR   zeros_r_   rg   r/   ones_r  r  running_varnum_batches_trackedr(   r   r2   r1   r3   r4   r8   r7   kaiming_normal_HubertModeluniform_r  HubertForSequenceClassification	constant_r  r   )r=   r   r2   rA   rA   rB   _init_weights~  sL   





z#HubertPreTrainedModel._init_weightsinput_lengthsc                 C   s4   dd }t | jj| jjD ]
\}}||||}q|S )zH
        Computes the output length of the convolutional layers
        c                 S   s   t j| | |ddd S )Nfloor)rounding_moder   )r   div)input_lengthr   rQ   rA   rA   rB   _conv_out_length  s   zPHubertPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length)zipr>   rV   rW   )r=   r%  r*  r   rQ   rA   rA   rB    _get_feat_extract_output_lengths  s   z6HubertPreTrainedModel._get_feat_extract_output_lengthsfeature_vector_lengthr   c                 C   s~   |  |dtj}|jd }tj||f|j|jd}d|tj	|jd |jd|d f< |
dgd
dg }|S )Nrb   r   )dtypedevicer   )r/  )r,  sumtor   longr   zerosr.  r/  arangeflipcumsumr   )r=   r-  r   output_lengths
batch_sizerA   rA   rB   "_get_feature_vector_attention_mask  s   
"z8HubertPreTrainedModel._get_feature_vector_attention_maskN)rH   rI   rJ   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr   no_gradr$  
LongTensorr   r,  r9  rA   rA   rA   rB   r  s  s   
 
#r  r   	mask_probmask_length	min_masksr   c                    s  | \}dk rt dkrt d d dtjd   fdd}|dur:| d	 n
fd
dt|D }tj	|ft
d}g }	|}
|
dkrZ|S |D ];}||}tjjt|d  |dd}t|dkr}d }n|d }t|tj|
| tjd| g}|	| q\t|	}	t|	dddddf ||
f}	|	||
 }	tddddf }t|||
f||
 }|	| }	|	 d krd |	|	d k< t||	dd	 |S )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                    sX   t |     }t|}| kr }| d  |k r*t| d  d}|S )z;Given input length, compute how many spans should be maskedr   r   )r   max)r)  num_masked_spanepsilonrE  rD  rF  sequence_lengthrA   rB   compute_num_masked_span  s   
z6_compute_mask_indices.<locals>.compute_num_masked_spanNrb   c                    s   g | ]} qS rA   rA   r   )rL  rA   rB   ro     s    z)_compute_mask_indices.<locals>.<listcomp>r.  r   F)replace)rt   nprandomr   itemdetachr0  tolistrr   r3  r   choicer4  lenconcatenateonesint32appendarraybroadcast_tor   rH  put_along_axis)r   rD  rE  r   rF  r8  rM  r%  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr)  rI  spec_aug_mask_idxdummy_mask_idxoffsetsrA   rJ  rB   _compute_mask_indices  s\   

rd  c                       s   e Zd Zdef fddZ		ddejdejdB dejdB fdd	Ze						dd
ej
dB dej
dB dejdB dedB dedB dedB deeB fddZ  ZS )r   r>   c                    sz   t  | || _t|| _t|| _|jdks|jdkr)t	
t|j | _|jr2t|| _nt|| _|   d S Nr   )r%   r&   r>   rh   feature_extractorr   feature_projectionmask_time_probmask_feature_probr'   	Parameterr   r   r)   r!  r  do_stable_layer_normr  encoderr   	post_initr   r?   rA   rB   r&   7  s   


zHubertModel.__init__NrE   mask_time_indicesr   c                 C   s  t | jdds	|S | \}}}|dur| j|j||< n-| jjdkrK| jrKt||f| jj| jj	|| jj
d}tj||jtjd}| j|j||< | jjdkr| jrt||f| jj| jj| jjd}tj||jtjd}|dddf d|d}d||< |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTNr   )rD  rE  r   rF  )r/  r.  )rD  rE  rF  rb   )r  r>   r   r  r1  r.  rh  r   rd  mask_time_lengthmask_time_min_masksr   r  r/  r   ri  mask_feature_lengthmask_feature_min_masksexpand)r=   rE   rn  r   r8  rL  r)   mask_feature_indicesrA   rA   rB   _mask_hidden_statesI  s4   zHubertModel._mask_hidden_statesr   r   r   r   r   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| |}|dd}|dur6| |jd |}| |}	| j	|	|d}	| j
|	||||d}
|
d }	|s[|	f|
dd  S t|	|
j|
jdS )a1  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.

        Example:

        ```python
        >>> from transformers import AutoProcessor, HubertModel
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
        >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")


        >>> def map_to_array(example):
        ...     example["speech"] = example["audio"]["array"]
        ...     return example


        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.map(map_to_array)

        >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
        >>> hidden_states = model(input_values).last_hidden_state
        ```Nr   r   )rn  r   r   r   r   r   r   )r>   r   r   use_return_dictrf  rC   r9  r   rg  rv  rl  r   rE   r   )r=   r   r   rn  r   r   r   r   extract_featuresrE   encoder_outputsrA   rA   rB   rF   w  s2   %

zHubertModel.forwardr   NNNNN)rH   rI   rJ   r   r&   r   r  rC  rv  r   r   r   r   r   rF   rK   rA   rA   r?   rB   r   5  s@    
.	r   zn
    Hubert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    )custom_introc                       s   e Zd ZddedB f fddZdd Zdd Zd	d
 Ze					dde	j
dB de	j
dB dedB dedB dedB de	j
dB deeB fddZ  ZS )HubertForCTCNtarget_langc                    s~   t  | t|| _t|j| _|| _|j	du r#t
d| j dt|dr.|jr.|jn|j}t||j	| _|   dS )a0  
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`HubertForCTC`] with adapters. Uses 'eng' by
            default.
        NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `HubertForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.add_adapter)r%   r&   r   r  r'   r   final_dropoutr   r~  
vocab_sizert   r@   r1   r  output_hidden_sizer)   r   lm_headrm  )r=   r>   r~  r  r?   rA   rB   r&     s   

zHubertForCTC.__init__c                 K   s   t  tdkr
dS | j}|dur"t| jdddu r"td| d|du r6t| jdddur6td dS |durC| j	|dd dS dS )	a'  
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        metaNr  zCannot pass `target_lang`: z- if `config.adapter_attn_dim` is not defined.z)By default `target_lang` is set to 'eng'.T)
force_load)
r   r   r/  r~  r  r>   rt   loggerinfoload_adapter)r=   r   r~  rA   rA   rB   tie_weights  s   zHubertForCTC.tie_weightsc                 C      | j j  dS z
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        Nr  rf  r~   r=   rA   rA   rB   freeze_feature_encoder     z#HubertForCTC.freeze_feature_encoderc                 C      | j  D ]}d|_qdS z
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        FNr  rz   r{   r|   rA   rA   rB   freeze_base_model     zHubertForCTC.freeze_base_modelr   r   r   r   r   labelsr   c              
   K   s|  |dur|n| j j}|dur| | j jkrtd| j j | j|||||d}|d }	| |	}	| |	}
d}|dur|durC|ntj	|tj
d}| |dtj
}|dk}|d}||}tjj|
dtjddd}tjjjd	d
 tjj||||| j j| j j| j jd}W d   n1 sw   Y  |s|
f|td  }|dur|f| S |S t||
|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: rw  r   rN  rb   )r#   r.  r   F)enabled)blank	reductionzero_infinitylosslogitsrE   r   )r>   rx  rH  r  rt   r  r   r  r   	ones_liker2  r,  r0  r1  masked_selectr'   r   log_softmaxfloat32rC   backendscudnnflagsctc_losspad_token_idctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   rE   r   )r=   r   r   r   r   r   r  r   r   rE   r  r  r%  labels_masktarget_lengthsflattened_targets	log_probsoutputrA   rA   rB   rF     sN   



zHubertForCTC.forwardr[   r{  )rH   rI   rJ   r   r&   r  r  r  r   r   r   r   r   r   rF   rK   rA   rA   r?   rB   r}    s4    	r}  z
    Hubert Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                       s   e Zd Z fddZdd Zdd Ze					ddejdB d	ejdB d
e	dB de	dB de	dB dejdB de
eB fddZ  ZS )r"  c                    s   t  | t|dr|jrtdt|| _|jd }|jr*t	
t|| | _t	|j|j| _t	|j|j| _|   d S )Nr  z]Sequence classification does not support the use of Hubert adapters (config.add_adapter=True)r   )r%   r&   r1   r  rt   r   r  r   use_weighted_layer_sumr'   rj  r   rX  r  r   r)   classifier_proj_size	projector
num_labels
classifierrm  )r=   r>   
num_layersr?   rA   rB   r&   ]  s   

z(HubertForSequenceClassification.__init__c                 C   r  r  r  r  rA   rA   rB   r  n  r  z6HubertForSequenceClassification.freeze_feature_encoderc                 C   r  r  r  r|   rA   rA   rB   r  u  r  z1HubertForSequenceClassification.freeze_base_modelNr   r   r   r   r   r  r   c                 K   sz  |dur|n| j j}| j jrdn|}| j|||||d}| j jrB|t }	tj|	dd}	tjj	| j
dd}
|	|
ddd jdd}	n|d }	| |	}	|du rV|	jdd}n+| |	jd |}|ddd|	jd }d	|	| < |	jdd|jdddd }| |}d}|durt }||d| j j|d}|s|f|td  }|dur|f| S |S t|||j|jd
S )a
  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
            into a tensor of type `torch.FloatTensor`. See [`HubertProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTrw  r   r   rb   r   r   r   r  )r>   rx  r  r  r  r   stackr'   r   r   r  r   r0  r  r  r9  r   r   r   r  r   r  r   rE   r   )r=   r   r   r   r   r   r  r   r   rE   norm_weightspooled_outputpadding_maskexpand_padding_maskr  r  loss_fctr  rA   rA   rB   rF   }  sH   

 
z'HubertForSequenceClassification.forwardr{  )rH   rI   rJ   r&   r  r  r   r   r   r   r   r   rF   rK   rA   rA   r?   rB   r"  V  s2    	r"  )r}  r"  r   r  re  rM   )Gcollections.abcr   numpyrP  r   torch.nnr'   r    r   r  activationsr   integrations.deepspeedr   integrations.fsdpr   masking_utilsr	   modeling_flash_attention_utilsr
   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   r   processing_utilsr   r0   r   r   r   configuration_hubertr   
get_loggerrH   r  Moduler   r:   rO   r]   rc   rh   r   r   r   r   r   r   r   r   r  r  r  r  r   r   rC  ndarrayrd  r   r  r}  r"  __all__rA   rA   rA   rB   <module>   s   
2&
V$H.LN

w  h