o
    ei.                     @   sJ  d Z ddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZ d
dlmZmZmZmZmZmZmZ ddlmZ dZG dd dejZG dd deZG dd deZG dd dejZ G dd deZ!G dd deZ"eG dd deZ#G dd dee#Z$G dd deZ%G d d! d!eZ&g d"Z'dS )#zPyTorch Hubert model.    N   )initialization)ACT2FN)is_deepspeed_zero3_enabled)BaseModelOutput)PreTrainedModel)auto_docstring   )Wav2Vec2EncoderWav2Vec2EncoderStableLayerNormWav2Vec2FeatureEncoderWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2ModelWav2Vec2SamePadLayer   )HubertConfigc                       $   e Zd Z fddZdd Z  ZS )HubertPositionalConvEmbeddingc                    s@  t    tj|j|j|j|jd |jd| _d | _|j	r%t
|j| _nmtjj}ttjjdr5tjjj}t rdd l}|jj| jjdd || jddd| _W d    n1 sZw   Y  t| jdrr| jjjj}| jjjj}n| jj}| jj}|j| | |j| | n	|| jddd| _t|j| _t|j | _d S )	Nr	   )kernel_sizepaddinggroupsweight_normr   modifier_rankweight)namedimparametrizations)super__init__nnConv1dhidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupsconv
batch_normconv_pos_batch_normBatchNorm1dutilsr   hasattrr   r   	deepspeedzeroGatheredParametersr   	original0	original1weight_gweight_vregister_external_parameterHubertSamePadLayerr   r   feat_extract_activation
activation)selfconfigr   r,   r1   r2   	__class__ g/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/hubert/modular_hubert.pyr    )   s:   

z&HubertPositionalConvEmbedding.__init__c                 C   sN   | dd}| jd ur| |}| |}| |}| |}| dd}|S )Nr   r	   )	transposer'   r&   r   r6   r7   hidden_statesr;   r;   r<   forwardN   s   




z%HubertPositionalConvEmbedding.forward__name__
__module____qualname__r    r@   __classcell__r;   r;   r9   r<   r   (   s    %r   c                   @      e Zd ZdS )r4   NrB   rC   rD   r;   r;   r;   r<   r4   Z       r4   c                   @   rF   )HubertFeatureEncoderNrG   r;   r;   r;   r<   rI   ^   rH   rI   c                       r   )HubertFeatureProjectionc                    sX   t    |j| _| jrtj|jd |jd| _t|jd |j	| _
t|j| _d S )N)eps)r   r    feat_proj_layer_normr!   	LayerNormconv_dimlayer_norm_eps
layer_normLinearr#   
projectionDropoutfeat_proj_dropoutdropoutr7   r8   r9   r;   r<   r    c   s   
z HubertFeatureProjection.__init__c                 C   s(   | j r| |}| |}| |}|S )N)rM   rQ   rS   rV   r>   r;   r;   r<   r@   k   s
   


zHubertFeatureProjection.forwardrA   r;   r;   r9   r<   rJ   b   s    rJ   c                   @   rF   )HubertEncoderNrG   r;   r;   r;   r<   rX   t   rH   rX   c                   @   rF   )HubertEncoderStableLayerNormNrG   r;   r;   r;   r<   rY   x   rH   rY   c                   @   sj   e Zd ZU eed< dZdZdZdZdZ	dZ
dZe dd ZdejeB fd	d
ZdedejfddZdS )HubertPreTrainedModelr8   hubertinput_valuesaudioTc                 C   s  t |tjr tj|jd| jjd |jdurt	|j dS dS t |tj
tjtjfrUt	|j t|j t|dddurSt	|j t|j t	|j dS dS t |tjrt rddl}t|drt|dr|jj|j|jgdd t|j W d   n1 sw   Y  n&|jj|jdd t|j W d   n1 sw   Y  nt|j |jdurt	|j dS dS t |trt|d	rt|j dS dS t |trt|d
rt|j d| jj!d   dS dS dS )zInitialize the weights        )meanstdNrunning_meanr   r2   r1   r   masked_spec_embedlayer_weightsg      ?r   )"
isinstancer!   rR   initnormal_r   r8   initializer_rangebiaszeros_rN   	GroupNormr)   ones_getattrra   running_varnum_batches_trackedr"   r   r,   r+   r-   r.   r2   r1   kaiming_normal_HubertModeluniform_rb   HubertForSequenceClassification	constant_rc   num_hidden_layers)r7   moduler,   r;   r;   r<   _init_weights   sL   





z#HubertPreTrainedModel._init_weightsinput_lengthsc                 C   s4   dd }t | jj| jjD ]
\}}||||}q|S )zH
        Computes the output length of the convolutional layers
        c                 S   s   t j| | |ddd S )Nfloor)rounding_moder   )torchdiv)input_lengthr   strider;   r;   r<   _conv_out_length   s   zPHubertPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length)zipr8   conv_kernelconv_stride)r7   rw   r~   r   r}   r;   r;   r<    _get_feat_extract_output_lengths   s   z6HubertPreTrainedModel._get_feat_extract_output_lengthsfeature_vector_lengthattention_maskc                 C   s~   |  |dtj}|jd }tj||f|j|jd}d|tj	|jd |jd|d f< |
dgd
dg }|S )NrK   r   )dtypedevicer   )r   )r   sumtorz   longshapezerosr   r   arangeflipcumsumbool)r7   r   r   output_lengths
batch_sizer;   r;   r<   "_get_feature_vector_attention_mask   s   
"z8HubertPreTrainedModel._get_feature_vector_attention_maskN)rB   rC   rD   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnrz   no_gradrv   
LongTensorintr   r   r;   r;   r;   r<   rZ   |   s   
 
#rZ   c                       s~   e Zd Zdef fddZdd Z					ddejdB dejdB d	ejdB d
e	dB de	dB de	dB de
eB fddZ  ZS )rp   r8   c                    s~   t  | || _t|| _t|| _|jdks|jdkr)t	
t|j | _|jr2t|| _nt|| _|   | `d S )Nr^   )r   r    r8   rI   feature_extractorrJ   feature_projectionmask_time_probmask_feature_probr!   	Parameterrz   Tensorr#   rq   rb   do_stable_layer_normrY   encoderrX   	post_initadapterrW   r9   r;   r<   r       s   


zHubertModel.__init__c                 C   s   t d)NzNot needed for Hubert)AttributeError)r7   r;   r;   r<   freeze_feature_encoder   s   z"HubertModel.freeze_feature_encoderNr\   r   mask_time_indicesoutput_attentionsoutput_hidden_statesreturn_dictreturnc                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| |}|dd}|dur6| |jd |}| |}	| j	|	|d}	| j
|	||||d}
|
d }	|s[|	f|
dd  S t|	|
j|
jdS )a1  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.

        Example:

        ```python
        >>> from transformers import AutoProcessor, HubertModel
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
        >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")


        >>> def map_to_array(example):
        ...     example["speech"] = example["audio"]["array"]
        ...     return example


        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.map(map_to_array)

        >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
        >>> hidden_states = model(input_values).last_hidden_state
        ```Nr   r	   )r   )r   r   r   r   r   )last_hidden_stater?   
attentions)r8   r   r   use_return_dictr   r=   r   r   r   _mask_hidden_statesr   r   r?   r   )r7   r\   r   r   r   r   r   kwargsextract_featuresr?   encoder_outputsr;   r;   r<   r@      s2   $

zHubertModel.forward)NNNNN)rB   rC   rD   r   r    r   rz   r   FloatTensorr   tupler   r@   rE   r;   r;   r9   r<   rp      s.    	rp   c                   @   rF   )HubertForCTCNrG   r;   r;   r;   r<   r   &  rH   r   c                   @   rF   )rr   NrG   r;   r;   r;   r<   rr   *  rH   rr   )r   rr   rp   rZ   )(__doc__rz   torch.nnr!    r   re   activationsr   integrations.deepspeedr   modeling_outputsr   modeling_utilsr   r*   r   wav2vec2.modeling_wav2vec2r
   r   r   r   r   r   r   configuration_hubertr   _HIDDEN_STATES_START_POSITIONModuler   r4   rI   rJ   rX   rY   rZ   rp   r   rr   __all__r;   r;   r;   r<   <module>   s0   $	2J_