o
    	۷i.                     @   sN  d Z ddlmZmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZ d
dlmZmZmZmZmZmZmZ ddlmZ dZG dd dejZG dd deZG dd deZG dd dejZ G dd deZ!G dd deZ"eG dd deZ#G dd dee#Z$G dd deZ%G d d! d!eZ&g d"Z'dS )#zPyTorch Hubert model.    )OptionalUnionN   )ACT2FN)is_deepspeed_zero3_enabled)BaseModelOutput)PreTrainedModel)auto_docstring   )Wav2Vec2EncoderWav2Vec2EncoderStableLayerNormWav2Vec2FeatureEncoderWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2ModelWav2Vec2SamePadLayer   )HubertConfigc                       $   e Zd Z fddZdd Z  ZS )HubertPositionalConvEmbeddingc                    s@  t    tj|j|j|j|jd |jd| _d | _|j	r%t
|j| _nmtjj}ttjjdr5tjjj}t rdd l}|jj| jjdd || jddd| _W d    n1 sZw   Y  t| jdrr| jjjj}| jjjj}n| jj}| jj}|j| | |j| | n	|| jddd| _t|j| _t|j | _d S )	Nr
   )kernel_sizepaddinggroupsweight_normr   modifier_rankweight)namedimparametrizations)super__init__nnConv1dhidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupsconv
batch_normconv_pos_batch_normBatchNorm1dutilsr   hasattrr   r   	deepspeedzeroGatheredParametersr   	original0	original1weight_gweight_vregister_external_parameterHubertSamePadLayerr   r   feat_extract_activation
activation)selfconfigr   r-   r2   r3   	__class__ _/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/hubert/modular_hubert.pyr!   +   s:   

z&HubertPositionalConvEmbedding.__init__c                 C   sN   | dd}| jd ur| |}| |}| |}| |}| dd}|S )Nr   r
   )	transposer(   r'   r   r7   r8   hidden_statesr<   r<   r=   forwardP   s   




z%HubertPositionalConvEmbedding.forward__name__
__module____qualname__r!   rA   __classcell__r<   r<   r:   r=   r   *   s    %r   c                   @      e Zd ZdS )r5   NrC   rD   rE   r<   r<   r<   r=   r5   \       r5   c                   @   rG   )HubertFeatureEncoderNrH   r<   r<   r<   r=   rJ   `   rI   rJ   c                       r   )HubertFeatureProjectionc                    sX   t    |j| _| jrtj|jd |jd| _t|jd |j	| _
t|j| _d S )N)eps)r    r!   feat_proj_layer_normr"   	LayerNormconv_dimlayer_norm_eps
layer_normLinearr$   
projectionDropoutfeat_proj_dropoutdropoutr8   r9   r:   r<   r=   r!   e   s   
z HubertFeatureProjection.__init__c                 C   s(   | j r| |}| |}| |}|S )N)rN   rR   rT   rW   r?   r<   r<   r=   rA   m   s
   


zHubertFeatureProjection.forwardrB   r<   r<   r:   r=   rK   d   s    rK   c                   @   rG   )HubertEncoderNrH   r<   r<   r<   r=   rY   v   rI   rY   c                   @   rG   )HubertEncoderStableLayerNormNrH   r<   r<   r<   r=   rZ   z   rI   rZ   c                   @   sb   e Zd ZU eed< dZdZdZdZdZ	dZ
dd Zdeejef fdd	Zd
edejfddZdS )HubertPreTrainedModelr9   hubertinput_valuesTc                 C   s  t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
tjtjfr:|jj	  |jjd dS t |tjrt rddl}t|drvt|drv|jj|j|jgdd tj|jj W d   n1 spw   Y  n*|jj|jdd tj|jj W d   n1 sw   Y  ntj|jj |jdur|jj	  dS dS t |trt|d	r|jj  dS dS t |trt|d
r|jjd| jjd   dS dS dS )zInitialize the weights        )meanstdNg      ?r   r3   r2   r   masked_spec_embedlayer_weightsr   )
isinstancer"   rS   r   datanormal_r9   initializer_rangebiaszero_rO   	GroupNormr*   fill_r#   r   r-   r,   r.   r/   r3   r2   initkaiming_normal_HubertModelra   uniform_HubertForSequenceClassificationrb   num_hidden_layers)r8   moduler-   r<   r<   r=   _init_weights   sB   





z#HubertPreTrainedModel._init_weightsinput_lengthsc                 C   s4   dd }t | jj| jjD ]
\}}||||}q|S )zH
        Computes the output length of the convolutional layers
        c                 S   s   t j| | |ddd S )Nfloor)rounding_moder   )torchdiv)input_lengthr   strider<   r<   r=   _conv_out_length   s   zPHubertPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length)zipr9   conv_kernelconv_stride)r8   rs   rz   r   ry   r<   r<   r=    _get_feat_extract_output_lengths   s   z6HubertPreTrainedModel._get_feat_extract_output_lengthsfeature_vector_lengthattention_maskc                 C   s~   |  |dtj}|jd }tj||f|j|jd}d|tj	|jd |jd|d f< |
dgd
dg }|S )NrL   r   )dtypedevicer   )r   )r~   sumtorv   longshapezerosr   r   arangeflipcumsumbool)r8   r   r   output_lengths
batch_sizer<   r<   r=   "_get_feature_vector_attention_mask   s   
"z8HubertPreTrainedModel._get_feature_vector_attention_maskN)rC   rD   rE   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnrr   r   rv   
LongTensorintr~   r   r<   r<   r<   r=   r[   ~   s   
 !r[   c                       s   e Zd Zdef fddZdd Zdd Z					dd	eej	 d
eej	 deej
 dee dee dee deeef fddZ  ZS )rm   r9   c                    s~   t  | || _t|| _t|| _|jdks|jdkr)t	
t|j | _|jr2t|| _nt|| _|   | `d S )Nr^   )r    r!   r9   rJ   feature_extractorrK   feature_projectionmask_time_probmask_feature_probr"   	Parameterrv   Tensorr$   rn   ra   do_stable_layer_normrZ   encoderrY   	post_initadapterrX   r:   r<   r=   r!      s   


zHubertModel.__init__c                 C      t dNzNot needed for HubertAttributeErrorr8   r<   r<   r=   freeze_feature_extractor      z$HubertModel.freeze_feature_extractorc                 C   r   r   r   r   r<   r<   r=   freeze_feature_encoder   r   z"HubertModel.freeze_feature_encoderNr]   r   mask_time_indicesoutput_attentionsoutput_hidden_statesreturn_dictreturnc           
      C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| |}|dd}|dur6| |jd |}| |}| j	||d}| j
|||||d}	|	d }|s[|f|	dd  S t||	j|	jdS )a1  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.

        Example:

        ```python
        >>> from transformers import AutoProcessor, HubertModel
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
        >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")


        >>> def map_to_array(example):
        ...     example["speech"] = example["audio"]["array"]
        ...     return example


        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.map(map_to_array)

        >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
        >>> hidden_states = model(input_values).last_hidden_state
        ```Nr   r
   )r   )r   r   r   r   r   )last_hidden_stater@   
attentions)r9   r   r   use_return_dictr   r>   r   r   r   _mask_hidden_statesr   r   r@   r   )
r8   r]   r   r   r   r   r   extract_featuresr@   encoder_outputsr<   r<   r=   rA      s2   #

zHubertModel.forward)NNNNN)rC   rD   rE   r   r!   r   r   r   rv   r   FloatTensorr   r   tupler   rA   rF   r<   r<   r:   r=   rm      s0    
rm   c                   @   rG   )HubertForCTCNrH   r<   r<   r<   r=   r   &  rI   r   c                   @   rG   )ro   NrH   r<   r<   r<   r=   ro   *  rI   ro   )r   ro   rm   r[   )(__doc__typingr   r   rv   torch.nnr"   activationsr   integrations.deepspeedr   modeling_outputsr   modeling_utilsr   r+   r	   wav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   configuration_hubertr   _HIDDEN_STATES_START_POSITIONModuler   r5   rJ   rK   rY   rZ   r[   rm   r   ro   __all__r<   r<   r<   r=   <module>   s0   $	2Fa