o
    eiD                     @   s  d Z ddlZddlmZ ddlZddlmZ ddlmZ	 ddl
mZmZ ddlmZ ddlmZmZ d	d
lmZmZmZmZmZmZmZmZmZ ddlmZ eeZ eeddG dd deZ!G dd deZ"G dd deZ#G dd deZ$G dd deZ%G dd deZ&G dd deZ'eG dd deZ(eZ)G dd  d e(eZ*ed!dG d"d# d#e(Z+G d$d% d%eZ,G d&d' d'eZ-g d(Z.dS ))zPyTorch UniSpeech model.    N)	dataclass   )initialization)ModelOutputWav2Vec2BaseModelOutput)PreTrainedModel)auto_docstringlogging   )	Wav2Vec2EncoderWav2Vec2EncoderStableLayerNormWav2Vec2FeatureEncoderWav2Vec2FeatureProjectionWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2GumbelVectorQuantizerWav2Vec2ModelWav2Vec2PositionalConvEmbedding   )UniSpeechConfigzh
    Output type of [`UniSpeechForPreTrainingOutput`], with potential hidden states and attentions.
    )custom_introc                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	ejdB ed< dZ
ejdB ed< dZeej dB ed< dZeej dB ed< dS )	UniSpeechForPreTrainingOutputa  
    loss (*optional*, returned when model is in train mode, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
        paper](https://huggingface.co/papers/2006.11477).
    projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
        Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
        projected quantized states.
    projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
        Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
        target vectors for contrastive loss.
    codevector_perplexity (`torch.FloatTensor` of shape `(1,)`):
        The perplexity of the codevector distribution, used to measure the diversity of the codebook.
    Nlossprojected_statesprojected_quantized_statescodevector_perplexityhidden_states
attentions)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   r   r   tupler    r&   r&   m/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/unispeech/modular_unispeech.pyr   +   s   
 r   c                   @      e Zd ZdS ) UniSpeechPositionalConvEmbeddingNr   r   r    r&   r&   r&   r'   r)   H       r)   c                   @   r(   )UniSpeechFeatureEncoderNr*   r&   r&   r&   r'   r,   L   r+   r,   c                   @   r(   )UniSpeechFeatureProjectionNr*   r&   r&   r&   r'   r-   P   r+   r-   c                   @   r(   )UniSpeechEncoderNr*   r&   r&   r&   r'   r.   T   r+   r.   c                   @   r(   )UniSpeechEncoderStableLayerNormNr*   r&   r&   r&   r'   r/   X   r+   r/   c                   @   s    e Zd Zedd Zdd ZdS )UniSpeechGumbelVectorQuantizerc                 C   s2   | j dd}ttjt||dd  }|S )Nr   dim)meanr"   expsumxlogy)probsmarginal_probs
perplexityr&   r&   r'   _compute_perplexity]   s   "z2UniSpeechGumbelVectorQuantizer._compute_perplexityc                 C   s  |j \}}}| |}||| | j d}| jr?tjj| | j	dd
|}tj||| | jd dd}| |}n$|jdd}|j|j  d|ddd}||| | jd}| |}||| d}|d| j }	|	|| | j| jd}
|
d||d}
|
|fS )Nr3   T)tauhardr1   r   g      ?)shapeweight_projview
num_groupstrainingnn
functionalgumbel_softmaxfloattemperaturetype_asr"   softmaxr;   argmax	new_zerosscatter_	unsqueezecodevectorsnum_varsr6   )selfr   
batch_sizesequence_lengthhidden_sizecodevector_probscodevector_soft_distr:   codevector_idxcodevectors_per_grouprO   r&   r&   r'   forwardc   s0   

z&UniSpeechGumbelVectorQuantizer.forwardN)r   r   r    staticmethodr;   rY   r&   r&   r&   r'   r0   \   s    
r0   c                   @   sj   e Zd ZU eed< dZdZdZdZdZ	dZ
dZe dd ZdejeB fd	d
ZdedejfddZdS )UniSpeechPreTrainedModelconfig	unispeechinput_valuesaudioTc              	   C   s  t |trtj|jjddd t|jj t|j	 dS t |t
rFtj|jjddtd|jjd |jj   d t|jjd dS t |trltd|jj }tj|jj| |d tj|jj| |d dS t |tjrtj|jd| jjd |jdurt|j dS dS t |tjtjfrt|j t|j dS t |tjrt|j |jdurt|j|j|jd   }tj|j| |d dS dS dS )zInitialize the weights        r   )r4   stdr   r
   )abN)
isinstancer0   initnormal_r@   weightzeros_biasuniform_rO   r)   convmathsqrtkernel_sizein_channels	constant_r-   
projectionin_featuresrD   Linearr\   initializer_range	LayerNorm	GroupNormones_Conv1dkaiming_normal_groups)rQ   modulekr&   r&   r'   _init_weights   s<   

 


z&UniSpeechPreTrainedModel._init_weightsinput_lengthsc                 C   s4   dd }t | jj| jjD ]
\}}||||}q|S )zH
        Computes the output length of the convolutional layers
        c                 S   s   t j| | |ddd S )Nfloor)rounding_moder   )r"   div)input_lengthrn   strider&   r&   r'   _conv_out_length   s   zSUniSpeechPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length)zipr\   conv_kernelconv_stride)rQ   r~   r   rn   r   r&   r&   r'    _get_feat_extract_output_lengths   s   z9UniSpeechPreTrainedModel._get_feat_extract_output_lengthsfeature_vector_lengthattention_maskc                 C   s   |j ddd d df }| |tj}|jd }tj||f|j|jd}d|tj	|jd |jd|d f< |
dg d
dg }|S )Nr3   r1   r   )dtypedevicer   )r   )cumsumr   tor"   longr?   zerosr   r   arangeflipbool)rQ   r   r   non_padded_lengthsoutput_lengthsrR   r&   r&   r'   "_get_feature_vector_attention_mask   s   
"z;UniSpeechPreTrainedModel._get_feature_vector_attention_maskN)r   r   r    r   r$   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr"   no_gradr}   
LongTensorintr   r   r&   r&   r&   r'   r[      s   
 
!r[   c                   @   sv   e Zd ZdefddZdd Z					ddejdB dejdB d	ejdB d
e	dB de	dB de	dB de
eB fddZdS )UniSpeechModelr\   c                 C   sz   t | | || _t|| _t|| _|jdks|jdkr)t	
t|j | _|jr2t|| _nt|| _|   d S )Nr`   )r[   __init__r\   r,   feature_extractorr-   feature_projectionmask_time_probmask_feature_probrD   	Parameterr"   TensorrT   rj   masked_spec_embeddo_stable_layer_normr/   encoderr.   	post_initrQ   r\   r&   r&   r'   r      s   


zUniSpeechModel.__init__c                 C   s   t d)NzNot needed for UniSpeech)AttributeErrorrQ   r&   r&   r'   freeze_feature_encoder   s   z%UniSpeechModel.freeze_feature_encoderNr^   r   mask_time_indicesoutput_attentionsoutput_hidden_statesreturn_dictreturnc                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| |}|dd}|dur6| |jd |}| |\}	}| j	|	||d}	| j
|	||||d}
|
d }	|s_|	|f|
dd  S t|	||
j|
jdS )a/  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        Nr   r
   )r   r   r   r   r   r   r   )last_hidden_stateextract_featuresr   r   )r\   r   r   use_return_dictr   	transposer   r?   r   _mask_hidden_statesr   UniSpeechBaseModelOutputr   r   )rQ   r^   r   r   r   r   r   kwargsr   r   encoder_outputsr&   r&   r'   rY      s8   
zUniSpeechModel.forward)NNNNN)r   r   r    r   r   r   r"   r   r#   r   r%   r   rY   r&   r&   r&   r'   r      s.    	r   zZ
    UniSpeech Model with a vector-quantization module and ctc loss for pre-training.
    c                       s   e Zd Zdef fddZdefddZdd Ze		dd
e	j
de	j
de	j
defddZe				dde	jdB de	jdB dedB dedB dedB deeB fddZ  ZS )UniSpeechForPreTrainingr\   c                    s~   t  | t|| _t|j| _t|| _	t
|j|j| _t
|j|j| _t
|j|j| _t|j| _|   d S )N)superr   r   r]   rD   Dropoutfeat_quantizer_dropoutdropout_featuresr0   	quantizerrs   codevector_dimproj_codevector_dim	project_qrT   project_hidnum_ctc_classesctc_projfinal_dropoutdropoutr   r   	__class__r&   r'   r   )  s   

z UniSpeechForPreTraining.__init__rH   c                 C   s   || j _dS )zb
        Set the Gumbel softmax temperature to a given value. Only necessary for training
        N)r   rH   )rQ   rH   r&   r&   r'   set_gumbel_temperature8  s   z.UniSpeechForPreTraining.set_gumbel_temperaturec                 C   s   | j j  dS )z
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)r]   r   _freeze_parametersr   r&   r&   r'   r   >  s   z.UniSpeechForPreTraining.freeze_feature_encoderr   target_featuresnegative_featurespredicted_featuresc                 C   s@   t j| |gdd} t j| |  dd}|| }|| }|S )z
        Compute logits for contrastive loss based using cosine similarity as the distance measure between
        `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
        r   r1   r3   )r"   catcosine_similarityrG   rI   )r   r   r   rH   logitsr&   r&   r'   compute_contrastive_logitsE  s
   
z2UniSpeechForPreTraining.compute_contrastive_logitsNr^   r   r   r   r   r   c                 K   sJ  |dur|n| j j}| j|||||d}|d }| |d }	| |	\}
}| |
| jjj}
| 	|
}
t
|d|d| j j}|dd}t
| |j}|dd}|d}||d|
| d }| |}| |}d}|s|dur|||
|f|dd  S ||
|f|dd  S t|||
||j|jdS )	a  
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, UniSpeechForPreTraining

        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> # TODO: Add full pretraining example
        ```Nr   r   r   r3   r`   r
   )r   r   r   r   r   r   )r\   r   r]   r   r   r   r   rg   r   r   r"   emptysizefill_replace_probr   	bernoullir   r   rN   masked_fillr   r   r   r   r   )rQ   r^   r   r   r   r   r   outputstransformer_featuresr   quantized_featuresr   prob_replace_matrixsampled_replace_matrixr   r   r&   r&   r'   rY   Y  sL   




zUniSpeechForPreTraining.forward)r   )NNNN)r   r   r    r   r   r   r   r   rZ   r"   r#   r   r   r   r   r%   r   rY   __classcell__r&   r&   r   r'   r   #  sB    r   c                   @   r(   )UniSpeechForCTCNr*   r&   r&   r&   r'   r     r+   r   c                   @   r(   )"UniSpeechForSequenceClassificationNr*   r&   r&   r&   r'   r     r+   r   )r   r   r   r   r[   )/r!   rl   dataclassesr   r"   torch.nnrD    r   re   modeling_outputsr   r   modeling_utilsr   utilsr   r	   wav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   r   r   configuration_unispeechr   
get_loggerr   loggerr   r)   r,   r-   r.   r/   r0   r[   r   r   r   r   r   __all__r&   r&   r&   r'   <module>   sD   ,
-KKz