o
    iG                     @   s  d Z ddlZddlZddlmZ ddlmZmZ ddlZddl	m
Z
 ddlmZmZ ddlmZ ddlmZmZ d	d
lmZmZmZmZmZmZmZmZmZ ddlmZ ee Z!eeddG dd deZ"G dd deZ#G dd deZ$G dd deZ%G dd deZ&G dd deZ'G dd deZ(eG dd deZ)eZ*G dd  d e)eZ+ed!dG d"d# d#e)Z,G d$d% d%eZ-G d&d' d'eZ.g d(Z/dS ))zPyTorch UniSpeech model.    N)	dataclass)OptionalUnion   )ModelOutputWav2Vec2BaseModelOutput)PreTrainedModel)auto_docstringlogging   )	Wav2Vec2EncoderWav2Vec2EncoderStableLayerNormWav2Vec2FeatureEncoderWav2Vec2FeatureProjectionWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2GumbelVectorQuantizerWav2Vec2ModelWav2Vec2PositionalConvEmbedding   )UniSpeechConfigzh
    Output type of [`UniSpeechForPreTrainingOutput`], with potential hidden states and attentions.
    )custom_introc                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeeej  ed< dZeeej  ed< dS )	UniSpeechForPreTrainingOutputa  
    loss (*optional*, returned when model is in train mode, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
        paper](https://huggingface.co/papers/2006.11477).
    projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
        Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
        projected quantized states.
    projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
        Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
        target vectors for contrastive loss.
    codevector_perplexity (`torch.FloatTensor` of shape `(1,)`):
        The perplexity of the codevector distribution, used to measure the diversity of the codebook.
    Nlossprojected_statesprojected_quantized_statescodevector_perplexityhidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r   r   tupler    r'   r'   l/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/unispeech/modular_unispeech.pyr   -   s   
 r   c                   @      e Zd ZdS ) UniSpeechPositionalConvEmbeddingNr   r    r!   r'   r'   r'   r(   r*   J       r*   c                   @   r)   )UniSpeechFeatureEncoderNr+   r'   r'   r'   r(   r-   N   r,   r-   c                   @   r)   )UniSpeechFeatureProjectionNr+   r'   r'   r'   r(   r.   R   r,   r.   c                   @   r)   )UniSpeechEncoderNr+   r'   r'   r'   r(   r/   V   r,   r/   c                   @   r)   )UniSpeechEncoderStableLayerNormNr+   r'   r'   r'   r(   r0   Z   r,   r0   c                   @   s    e Zd Zedd Zdd ZdS )UniSpeechGumbelVectorQuantizerc                 C   s8   | j dd}ttj|t|d  dd  }|S )Nr   dimgHz>)meanr#   expsumlog)probsmarginal_probs
perplexityr'   r'   r(   _compute_perplexity_   s   (z2UniSpeechGumbelVectorQuantizer._compute_perplexityc                 C   s  |j \}}}| |}||| | j d}| jr?tjj| | j	dd
|}tj||| | jd dd}| |}n$|jdd}|j|j  d|ddd}||| | jd}| |}||| d}|d| j }	|	|| | j| jd}
|
d||d}
|
|fS )Nr4   T)tauhardr2   r         ?)shapeweight_projview
num_groupstrainingnn
functionalgumbel_softmaxfloattemperaturetype_asr#   softmaxr<   argmax	new_zerosscatter_	unsqueezecodevectorsnum_varsr7   )selfr   
batch_sizesequence_lengthhidden_sizecodevector_probscodevector_soft_distr;   codevector_idxcodevectors_per_grouprQ   r'   r'   r(   forwarde   s0   

z&UniSpeechGumbelVectorQuantizer.forwardN)r   r    r!   staticmethodr<   r[   r'   r'   r'   r(   r1   ^   s    
r1   c                   @   sb   e Zd ZU eed< dZdZdZdZdZ	dZ
dd Zdeejef fdd	Zd
edejfddZdS )UniSpeechPreTrainedModelconfig	unispeechinput_valuesTc              	   C   s  t |tr|jjjjddd |jjj  tj	
|j dS t |trItj	j|jjddtd|jjd |jj   d tj	|jjd dS t |trqtd|jj }tj	j
|jj| |d tj	j
|jj| |d dS t |tjr|jjjd| jjd |jdur|jj  dS dS t |tjtjfr|jj  |jjd dS t |tjrtj	|j |jdurt|j|j|jd   }tj	j
|j| |d dS dS dS )	zInitialize the weights        r   )r5   stdr   r   )abNr?   )
isinstancer1   rB   weightdatanormal_biaszero_rF   inituniform_rQ   r*   convmathsqrtkernel_sizein_channels	constant_r.   
projectionin_featuresLinearr^   initializer_range	LayerNorm	GroupNormfill_Conv1dkaiming_normal_groups)rS   modulekr'   r'   r(   _init_weights   s<   

 


z&UniSpeechPreTrainedModel._init_weightsinput_lengthsc                 C   s4   dd }t | jj| jjD ]
\}}||||}q|S )zH
        Computes the output length of the convolutional layers
        c                 S   s   t j| | |ddd S )Nfloor)rounding_moder   )r#   div)input_lengthrp   strider'   r'   r(   _conv_out_length   s   zSUniSpeechPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length)zipr^   conv_kernelconv_stride)rS   r   r   rp   r   r'   r'   r(    _get_feat_extract_output_lengths   s   z9UniSpeechPreTrainedModel._get_feat_extract_output_lengthsfeature_vector_lengthattention_maskc                 C   s   |j ddd d df }| |tj}|jd }tj||f|j|jd}d|tj	|jd |jd|d f< |
dg d
dg }|S )Nr4   r2   r   )dtypedevicer   )r   )cumsumr   tor#   longrA   zerosr   r   arangeflipbool)rS   r   r   non_padded_lengthsoutput_lengthsrT   r'   r'   r(   "_get_feature_vector_attention_mask   s   
"z;UniSpeechPreTrainedModel._get_feature_vector_attention_maskN)r   r    r!   r   r%   base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr   r   r#   
LongTensorintr   r   r'   r'   r'   r(   r]      s   
 !r]   c                   @   s   e Zd ZdefddZdd Zdd Z					dd	eej	 d
eej	 deej
 dee dee dee deeef fddZdS )UniSpeechModelr^   c                 C   sz   t | | || _t|| _t|| _|jdks|jdkr)t	
t|j | _|jr2t|| _nt|| _|   d S )Nra   )r]   __init__r^   r-   feature_extractorr.   feature_projectionmask_time_probmask_feature_probrF   	Parameterr#   TensorrV   rl   masked_spec_embeddo_stable_layer_normr0   encoderr/   	post_initrS   r^   r'   r'   r(   r      s   


zUniSpeechModel.__init__c                 C      t dNzNot needed for UniSpeechAttributeErrorrS   r'   r'   r(   freeze_feature_extractor      z'UniSpeechModel.freeze_feature_extractorc                 C   r   r   r   r   r'   r'   r(   freeze_feature_encoder   r   z%UniSpeechModel.freeze_feature_encoderNr`   r   mask_time_indicesoutput_attentionsoutput_hidden_statesreturn_dictreturnc           
      C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| |}|dd}|dur6| |jd |}| |\}}| j	|||d}| j
|||||d}	|	d }|s_||f|	dd  S t|||	j|	jdS )a/  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        Nr   r   )r   r   r   r   r   r   r   )last_hidden_stateextract_featuresr   r   )r^   r   r   use_return_dictr   	transposer   rA   r   _mask_hidden_statesr   UniSpeechBaseModelOutputr   r   )
rS   r`   r   r   r   r   r   r   r   encoder_outputsr'   r'   r(   r[      s8   
zUniSpeechModel.forward)NNNNN)r   r    r!   r   r   r   r   r   r#   r   r$   r   r   r&   r   r[   r'   r'   r'   r(   r      s0    
r   zZ
    UniSpeech Model with a vector-quantization module and ctc loss for pre-training.
    c                       s   e Zd Zdef fddZdefddZdd Zd	d
 Ze		dde
jde
jde
jdefddZe				ddee
j dee
j dee dee dee deeef fddZ  ZS )UniSpeechForPreTrainingr^   c                    s~   t  | t|| _t|j| _t|| _	t
|j|j| _t
|j|j| _t
|j|j| _t|j| _|   d S )N)superr   r   r_   rF   Dropoutfeat_quantizer_dropoutdropout_featuresr1   	quantizerru   codevector_dimproj_codevector_dim	project_qrV   project_hidnum_ctc_classesctc_projfinal_dropoutdropoutr   r   	__class__r'   r(   r   +  s   

z UniSpeechForPreTraining.__init__rJ   c                 C   s   || j _dS )zb
        Set the Gumbel softmax temperature to a given value. Only necessary for training
        N)r   rJ   )rS   rJ   r'   r'   r(   set_gumbel_temperature:  s   z.UniSpeechForPreTraining.set_gumbel_temperaturec                 C   s   t dt |   dS )z
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        zThe method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.N)warningswarnFutureWarningr   r   r'   r'   r(   r   @  s
   z0UniSpeechForPreTraining.freeze_feature_extractorc                 C   s   | j j  dS )z
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)r_   r   _freeze_parametersr   r'   r'   r(   r   L  s   z.UniSpeechForPreTraining.freeze_feature_encoderr   target_featuresnegative_featurespredicted_featuresc                 C   s@   t j| |gdd} t j| |  dd}|| }|| }|S )z
        Compute logits for contrastive loss based using cosine similarity as the distance measure between
        `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
        r   r2   r4   )r#   catcosine_similarityrI   rK   )r   r   r   rJ   logitsr'   r'   r(   compute_contrastive_logitsS  s
   
z2UniSpeechForPreTraining.compute_contrastive_logitsNr`   r   r   r   r   r   c                 C   sJ  |dur|n| j j}| j|||||d}|d }| |d }| |\}	}
| |	| jjj}	| 	|	}	t
|d|d| j j}|dd}t
| |j}|dd}|d}||d|	| d }| |}| |}d}|s|dur|||	|
f|dd  S ||	|
f|dd  S t|||	|
|j|jdS )	a  
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, UniSpeechForPreTraining

        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> # TODO: Add full pretraining example
        ```Nr   r   r   r4   ra   r   )r   r   r   r   r   r   )r^   r   r_   r   r   r   r   rf   r   r   r#   emptysizery   replace_probr   	bernoullir   r   rP   masked_fillr   r   r   r   r   )rS   r`   r   r   r   r   outputstransformer_featuresr   quantized_featuresr   prob_replace_matrixsampled_replace_matrixr   r   r'   r'   r(   r[   g  sL   




zUniSpeechForPreTraining.forward)r   )NNNN)r   r    r!   r   r   r   r   r   r   r\   r#   r$   r   r	   r   r   r   r   r&   r   r[   __classcell__r'   r'   r   r(   r   %  sD    
r   c                   @   r)   )UniSpeechForCTCNr+   r'   r'   r'   r(   r     r,   r   c                   @   r)   )"UniSpeechForSequenceClassificationNr+   r'   r'   r'   r(   r     r,   r   )r   r   r   r   r]   )0r"   rn   r   dataclassesr   typingr   r   r#   torch.nnrF   modeling_outputsr   r   modeling_utilsr   utilsr	   r
   wav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   r   r   configuration_unispeechr   
get_loggerr   loggerr   r*   r-   r.   r/   r0   r1   r]   r   r   r   r   r   __all__r'   r'   r'   r(   <module>   sH   ,
-IM 