o
    ei                     @   s,  d dl Z d dlmZ d dlmZ d dlZd dlZd dlm	Z	 d dlm
Z
 ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZmZmZ ddl m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+ e),e-Z.ee(ddG dd deZ/G dd de	j0Z1G dd de	j0Z2G dd deZ3G dd deZ4G dd  d eZ5G d!d" d"e	j0Z6G d#d$ d$e	j0Z7		%dVd&e	j0d'ej8d(ej8d)ej8d*ej8dB d+e9dB d,e9d-e%e' fd.d/Z:G d0d1 d1e	j0Z;G d2d3 d3e	j0Z<G d4d5 d5eZ=G d6d7 d7e	j0Z>G d8d9 d9e	j0Z?G d:d; d;eZ@G d<d= d=e	j0ZAG d>d? d?e	j0ZBe(G d@dA dAe"ZC		 dWdBeDeEeEf dCe9dDeEd*ejFdB dEeEdFejGfdGdHZHeZIe(G dIdJ dJeCZJe(dKdG dLdM dMeCZKdNZLe(dOdG dPdQ dQeCZMe(dRdG dSdT dTeCZNg dUZOdS )X    N)Callable)	dataclass)CrossEntropyLoss   )initialization)ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)create_bidirectional_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputCausalLMOutputModelOutputSequenceClassifierOutputWav2Vec2BaseModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel*get_torch_context_manager_or_global_device)Unpack)TransformersKwargsauto_docstringlogging   )UniSpeechConfigzh
    Output type of [`UniSpeechForPreTrainingOutput`], with potential hidden states and attentions.
    )custom_introc                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	ejdB ed< dZ
ejdB ed< dZeej dB ed< dZeej dB ed< dS )	UniSpeechForPreTrainingOutputa  
    loss (*optional*, returned when model is in train mode, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
        paper](https://huggingface.co/papers/2006.11477).
    projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
        Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
        projected quantized states.
    projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
        Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
        target vectors for contrastive loss.
    codevector_perplexity (`torch.FloatTensor` of shape `(1,)`):
        The perplexity of the codevector distribution, used to measure the diversity of the codebook.
    Nlossprojected_statesprojected_quantized_statescodevector_perplexityhidden_states
attentions)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   r    r!   tupler"    r+   r+   n/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/unispeech/modeling_unispeech.pyr   5   s   
 r   c                       $   e Zd Z fddZdd Z  ZS )UniSpeechSamePadLayerc                    s*   t    |d dkrd| _d S d| _d S )N   r   r   )super__init__num_pad_remove)selfnum_conv_pos_embeddings	__class__r+   r,   r1   S   s   
 zUniSpeechSamePadLayer.__init__c                 C   s,   | j dkr|d d d d d | j  f }|S Nr   )r2   r3   r!   r+   r+   r,   forwardW   s   
zUniSpeechSamePadLayer.forwardr#   r$   r%   r1   r9   __classcell__r+   r+   r5   r,   r.   R   s    r.   c                       r-   ) UniSpeechPositionalConvEmbeddingc                    s$  t    tj|j|j|j|jd |jd| _tjj	}t
tjjdr'tjjj	}t r{dd l}|jj| jjdd || jddd| _W d    n1 sLw   Y  t
| jdrd| jjjj}| jjjj}n| jj}| jj}|j| | |j| | n	|| jddd| _t|j| _t|j | _d S )	Nr/   )kernel_sizepaddinggroupsweight_normr   )modifier_rankweight)namedimparametrizations)r0   r1   nnConv1dhidden_sizer4   num_conv_pos_embedding_groupsconvutilsr@   hasattrrE   r   	deepspeedzeroGatheredParametersrB   	original0	original1weight_gweight_vregister_external_parameterr.   r>   r   feat_extract_activation
activation)r3   configr@   rM   rR   rS   r5   r+   r,   r1   ^   s4   

z)UniSpeechPositionalConvEmbedding.__init__c                 C   s:   | dd}| |}| |}| |}| dd}|S )Nr   r/   )	transposerJ   r>   rV   r8   r+   r+   r,   r9      s   


z(UniSpeechPositionalConvEmbedding.forwardr:   r+   r+   r5   r,   r<   ]   s    !r<   c                       &   e Zd Zd fdd	Zdd Z  ZS )UniSpeechNoLayerNormConvLayerr   c                    sj   t    |dkr|j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
t|j | _d S )Nr   r   r=   stridebias)r0   r1   conv_dimin_conv_dimout_conv_dimrF   rG   conv_kernelconv_stride	conv_biasrJ   r   rU   rV   r3   rW   layer_idr5   r+   r,   r1      s   
z&UniSpeechNoLayerNormConvLayer.__init__c                 C   s   |  |}| |}|S N)rJ   rV   r8   r+   r+   r,   r9      s   

z%UniSpeechNoLayerNormConvLayer.forwardr   r:   r+   r+   r5   r,   rZ      s    rZ   c                       rY   )UniSpeechLayerNormConvLayerr   c                    s|   t    |dkr|j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
tj| jdd| _t|j | _d S )Nr   r   r[   T)elementwise_affine)r0   r1   r^   r_   r`   rF   rG   ra   rb   rc   rJ   	LayerNorm
layer_normr   rU   rV   rd   r5   r+   r,   r1      s   
z$UniSpeechLayerNormConvLayer.__init__c                 C   s:   |  |}|dd}| |}|dd}| |}|S )N)rJ   rX   rk   rV   r8   r+   r+   r,   r9      s   


z#UniSpeechLayerNormConvLayer.forwardrg   r:   r+   r+   r5   r,   rh      s    rh   c                       rY   )UniSpeechGroupNormConvLayerr   c                    s   t    |dkr|j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
t|j | _tj| j| jdd| _d S )Nr   r   r[   T)
num_groupsnum_channelsaffine)r0   r1   r^   r_   r`   rF   rG   ra   rb   rc   rJ   r   rU   rV   	GroupNormrk   rd   r5   r+   r,   r1      s   
z$UniSpeechGroupNormConvLayer.__init__c                 C   s"   |  |}| |}| |}|S rf   )rJ   rk   rV   r8   r+   r+   r,   r9      s   


z#UniSpeechGroupNormConvLayer.forwardrg   r:   r+   r+   r5   r,   rn      s    rn   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )UniSpeechFeatureEncoderz.Construct the features from raw audio waveformc                    s   t     jdkr t ddg fddt jd D  }n jdkr2 fddt jD }n	td	 j d
t|| _	d| _
d| _d S )Ngroupr   re   c                    s   g | ]
}t  |d  dqS )r   ru   )rZ   .0irW   r+   r,   
<listcomp>   s    z4UniSpeechFeatureEncoder.__init__.<locals>.<listcomp>r   layerc                    s   g | ]}t  |d qS )ru   )rh   rv   ry   r+   r,   rz      s    z`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)r0   r1   feat_extract_normrn   rangenum_feat_extract_layers
ValueErrorrF   
ModuleListconv_layersgradient_checkpointing_requires_grad)r3   rW   r   r5   ry   r,   r1      s   





z UniSpeechFeatureEncoder.__init__c                 C   s   |   D ]}d|_qd| _d S NF)
parametersrequires_gradr   r3   paramr+   r+   r,   _freeze_parameters   s   
z*UniSpeechFeatureEncoder._freeze_parametersc                 C   s:   |d d d f }| j r| jrd|_| jD ]}||}q|S )NT)r   trainingr   r   )r3   input_valuesr!   
conv_layerr+   r+   r,   r9      s   

zUniSpeechFeatureEncoder.forward)r#   r$   r%   r&   r1   r   r9   r;   r+   r+   r5   r,   rs      s
    rs   c                       r-   )UniSpeechFeatureProjectionc                    sJ   t    tj|jd |jd| _t|jd |j| _	t
|j| _d S )Nrm   eps)r0   r1   rF   rj   r^   layer_norm_epsrk   LinearrH   
projectionDropoutfeat_proj_dropoutdropoutr3   rW   r5   r+   r,   r1      s   
z#UniSpeechFeatureProjection.__init__c                 C   s&   |  |}| |}| |}||fS rf   )rk   r   r   )r3   r!   norm_hidden_statesr+   r+   r,   r9     s   


z"UniSpeechFeatureProjection.forwardr:   r+   r+   r5   r,   r      s    r           modulequerykeyvalueattention_maskscalingr   kwargsc           
      K   s   |d u r| dd }t||dd| }|d ur|| }tjj|dd}tjj||| jd}t||}	|	dd	 }	|	|fS )Nrm         r/   r   rD   )pr   r   )
sizer'   matmulrX   rF   
functionalsoftmaxr   r   
contiguous)
r   r   r   r   r   r   r   r   attn_weightsattn_outputr+   r+   r,   eager_attention_forward
  s   
r   c                       s   e Zd ZdZ					ddededed	ed
edededB f fddZ			dde	j
de	j
dB de	j
dB dedB dee dee	j
e	j
dB ee	j
 dB f fddZ  ZS )UniSpeechAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   FTN	embed_dim	num_headsr   
is_decoderr]   	is_causalrW   c                    s   t    || _|| _|| _|| | _|| _| j| | jkr*td| j d| d| jd | _|| _	|| _
tj|||d| _tj|||d| _tj|||d| _tj|||d| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   )r]   )r0   r1   r   r   r   head_dimrW   r   r   r   r   rF   r   k_projv_projq_projout_proj)r3   r   r   r   r   r]   r   rW   r5   r+   r,   r1   )  s&   



zUniSpeechAttention.__init__r!   key_value_statesr   output_attentionsr   returnc                 K   s  |du}|j dd \}}|r|j d n|}	||d| jf}
||	d| jf}| |j|
 dd}|r4|n|}| |j| dd}| |j| dd}t| j	j
t}|| ||||f| jsbdn| j| j|d|\}}|||d }| |}||dfS )z#Input shape: Batch x Time x ChannelNrm   r   r/   r   )r   r   r   )shaper   r   viewrX   r   r   r   get_interfacerW   _attn_implementationr   r   r   r   reshaper   r   )r3   r!   r   r   r   r   is_cross_attentionbsztgt_lensrc_lenq_input_shapekv_input_shapequery_statescurrent_states
key_statesvalue_statesattention_interfacer   r   r+   r+   r,   r9   H  s8   	


zUniSpeechAttention.forward)r   FTFN)NNF)r#   r$   r%   r&   intfloatboolr   r1   r'   Tensorr   r   r*   r9   r;   r+   r+   r5   r,   r   &  sL    "	r   c                       r-   )UniSpeechFeedForwardc                    sp   t    t|j| _t|j|j| _	t
|jtr"t|j | _n|j| _t|j|j| _t|j| _d S rf   )r0   r1   rF   r   activation_dropoutintermediate_dropoutr   rH   intermediate_sizeintermediate_dense
isinstance
hidden_actstrr   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutr   r5   r+   r,   r1   }  s   
zUniSpeechFeedForward.__init__c                 C   s6   |  |}| |}| |}| |}| |}|S rf   )r   r   r   r   r   r8   r+   r+   r,   r9     s   




zUniSpeechFeedForward.forwardr:   r+   r+   r5   r,   r   |  s    r   c                       s&   e Zd Z fddZdddZ  ZS )UniSpeechEncoderLayerc                    sh   t    t|j|j|jd|d| _t|j	| _
tj|j|jd| _t|| _tj|j|jd| _d S )NFr   r   r   r   rW   r   )r0   r1   r   rH   num_attention_headsattention_dropout	attentionrF   r   r   r   rj   r   rk   r   feed_forwardfinal_layer_normr   r5   r+   r,   r1     s   

zUniSpeechEncoderLayer.__init__NFc                 C   sf   |}| j |||d\}}}| |}|| }| |}|| | }| |}|f}|r1||f7 }|S Nr   r   )r   r   rk   r   r   r3   r!   r   r   attn_residualr   _outputsr+   r+   r,   r9     s   



zUniSpeechEncoderLayer.forwardr   r:   r+   r+   r5   r,   r     s    r   c                       sL   e Zd Z fddZ				ddejdejdB ded	ed
ef
ddZ  Z	S )UniSpeechEncoderc                    f   t     | _t | _tj j jd| _	t
 j| _t fddt jD | _d| _d S )Nr   c                       g | ]}t  qS r+   )r   rw   r   ry   r+   r,   rz         z-UniSpeechEncoder.__init__.<locals>.<listcomp>Fr0   r1   rW   r<   pos_conv_embedrF   rj   rH   r   rk   r   r   r   r   r}   num_hidden_layerslayersr   r   r5   ry   r,   r1     s   

 
zUniSpeechEncoder.__init__NFTr!   r   r   output_hidden_statesreturn_dictc                 C   s.  |rdnd }|r
dnd }|d ur"| ddd|jd }d|| < t| j||d}| |}	||	 }| |}| |}t pCt	| }
| j
D ]3}|rP||f }tg }| jo]|| jjk }|rb|
rm||||d}|d }|rqd}|rz||d f }qG|r||f }|std	d
 |||fD S t|||dS )Nr+   rm   r   r/   r   rW   inputs_embedsr   r   NNc                 s       | ]	}|d ur|V  qd S rf   r+   rw   vr+   r+   r,   	<genexpr>      z+UniSpeechEncoder.forward.<locals>.<genexpr>last_hidden_stater!   r"   )	unsqueezerepeatr   r
   rW   r   rk   r   r   r	   r   r'   randr   	layerdropr*   r   r3   r!   r   r   r   r   all_hidden_statesall_self_attentionsexpand_attention_maskposition_embeddingssynced_gpusr{   dropout_probabilityskip_the_layerlayer_outputsr+   r+   r,   r9     sN   







zUniSpeechEncoder.forwardNFFT)
r#   r$   r%   r1   r'   tensorr   r   r9   r;   r+   r+   r5   r,   r     s"    r   c                       s,   e Zd Z fddZdejfddZ  ZS )UniSpeechAttnAdapterLayerc                    sZ   t    |j| _|j| _t| j| _t	| j| j| _
t | _t	| j| j| _dS )z
        Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed
        up training throughput.
        N)r0   r1   adapter_attn_dim	input_dimrH   
hidden_dimrF   rj   normr   linear_1ReLUact_fnlinear_2r   r5   r+   r,   r1     s   

z"UniSpeechAttnAdapterLayer.__init__r!   c                 C   s,   |  |}| |}| |}| |}|S rf   )r  r  r  r  r8   r+   r+   r,   r9     s
   



z!UniSpeechAttnAdapterLayer.forward)r#   r$   r%   r1   r'   r(   r9   r;   r+   r+   r5   r,   r	     s    r	  c                       s@   e Zd Z fddZ		d
dejdejdB defdd	Z  ZS )$UniSpeechEncoderLayerStableLayerNormc                    s   t    t|j|j|jd|d| _t|j	| _
tj|j|jd| _t|| _tj|j|jd| _t|dd d urAt|| _d S d | _d S )NFr   r   r
  )r0   r1   r   rH   r   r   r   rF   r   r   r   rj   r   rk   r   r   r   getattrr	  adapter_layerr   r5   r+   r,   r1     s   


z-UniSpeechEncoderLayerStableLayerNorm.__init__NFr!   r   r   c                 C   sz   |}|  |}| j|||d\}}}| |}|| }|| | | }| jd ur1|| | }|f}|r;||f7 }|S r   )rk   r   r   r   r   r  r   r+   r+   r,   r9   -  s   



z,UniSpeechEncoderLayerStableLayerNorm.forwardr   )	r#   r$   r%   r1   r'   r   r   r9   r;   r+   r+   r5   r,   r    s    r  c                       s.   e Zd Z fddZ				dddZ  ZS )	UniSpeechEncoderStableLayerNormc                    r   )Nr   c                    r   r+   )r  r   ry   r+   r,   rz   O  r   z<UniSpeechEncoderStableLayerNorm.__init__.<locals>.<listcomp>Fr   r   r5   ry   r,   r1   H  s   


z(UniSpeechEncoderStableLayerNorm.__init__NFTc                 C   s.  |rdnd }|r
dnd }|d ur"| ddd|jd }d|| < t| j||d}| |}	||	 }| |}t p>t| }
| j	D ]3}|rK||f }t
g }| joX|| jjk }|r]|
rh||||d}|d }|rld}|ru||d f }qB| |}|r||f }|std	d
 |||fD S t|||dS )Nr+   rm   r   r/   r   r   r   r   c                 s   r   rf   r+   r   r+   r+   r,   r     r   z:UniSpeechEncoderStableLayerNorm.forward.<locals>.<genexpr>r   )r   r   r   r
   rW   r   r   r   r	   r   r'   r   r   r   rk   r*   r   r   r+   r+   r,   r9   S  sN   







z'UniSpeechEncoderStableLayerNorm.forwardr  r:   r+   r+   r5   r,   r  G  s    r  c                       s4   e Zd ZdZ fddZedd Zdd Z  ZS )UniSpeechGumbelVectorQuantizerz
    Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH
    GUMBEL-SOFTMAX](https://huggingface.co/papers/1611.01144) for more information.
    c                    s   t    |j| _|j| _|j| j dkr"td|j d| j dt	t
d| j| j |j| j | _t|jd | j| j | _d| _d S )Nr   z`config.codevector_dim z5 must be divisible by `config.num_codevector_groups` z for concatenationr   rm   r/   )r0   r1   num_codevector_groupsro   num_codevectors_per_groupnum_varscodevector_dimr   rF   	Parameterr'   r(   codevectorsr   r^   weight_projtemperaturer   r5   r+   r,   r1     s   


z'UniSpeechGumbelVectorQuantizer.__init__c                 C   s2   | j dd}ttjt||dd  }|S )Nr   r   rm   )meanr'   expsumxlogy)probsmarginal_probs
perplexityr+   r+   r,   _compute_perplexity  s   "z2UniSpeechGumbelVectorQuantizer._compute_perplexityc                 C   s  |j \}}}| |}||| | j d}| jr?tjj| | j	dd
|}tj||| | jd dd}| |}n$|jdd}|j|j  d|ddd}||| | jd}| |}||| d}|d| j }	|	|| | j| jd}
|
d||d}
|
|fS )Nrm   T)tauhardr   r   g      ?rl   )r   r  r   ro   r   rF   r   gumbel_softmaxr   r  type_asr'   r   r&  argmax	new_zerosscatter_r   r  r  r!  )r3   r!   
batch_sizesequence_lengthrH   codevector_probscodevector_soft_distr%  codevector_idxcodevectors_per_groupr  r+   r+   r,   r9     s0   

z&UniSpeechGumbelVectorQuantizer.forward)	r#   r$   r%   r&   r1   staticmethodr&  r9   r;   r+   r+   r5   r,   r    s    
r  c                   @   sj   e Zd ZU eed< dZdZdZdZdZ	dZ
dZe dd ZdejeB fd	d
ZdedejfddZdS )UniSpeechPreTrainedModelrW   	unispeechr   audioTc              	   C   s  t |trtj|jjddd t|jj t|j	 dS t |t
rFtj|jjddtd|jjd |jj   d t|jjd dS t |trltd|jj }tj|jj| |d tj|jj| |d dS t |tjrtj|jd| jjd |jdurt|j dS dS t |tjtjfrt|j t|j dS t |tjrt|j |jdurt|j|j|jd   }tj|j| |d dS dS dS )zInitialize the weightsr   r   )r  stdr   r/   )abN)r   r  initnormal_r  rB   zeros_r]   uniform_r  r<   rJ   mathsqrtr=   in_channels	constant_r   r   in_featuresrF   r   rW   initializer_rangerj   rr   ones_rG   kaiming_normal_r?   )r3   r   kr+   r+   r,   _init_weights  s<   

 


z&UniSpeechPreTrainedModel._init_weightsinput_lengthsc                 C   s4   dd }t | jj| jjD ]
\}}||||}q|S )zH
        Computes the output length of the convolutional layers
        c                 S   s   t j| | |ddd S )Nfloor)rounding_moder   )r'   div)input_lengthr=   r\   r+   r+   r,   _conv_out_length  s   zSUniSpeechPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length)ziprW   ra   rb   )r3   rI  rN  r=   r\   r+   r+   r,    _get_feat_extract_output_lengths  s   z9UniSpeechPreTrainedModel._get_feat_extract_output_lengthsfeature_vector_lengthr   c                 C   s   |j ddd d df }| |tj}|jd }tj||f|j|jd}d|tj	|jd |jd|d f< |
dg d
dg }|S )Nrm   r   r   )dtypedevicer   )rS  )cumsumrP  tor'   longr   zerosrR  rS  arangeflipr   )r3   rQ  r   non_padded_lengthsoutput_lengthsr.  r+   r+   r,   "_get_feature_vector_attention_mask  s   
"z;UniSpeechPreTrainedModel._get_feature_vector_attention_maskN)r#   r$   r%   r   r)   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr'   no_gradrH  
LongTensorr   rP  r\  r+   r+   r+   r,   r5    s   
 
!r5  r   	mask_probmask_length	min_masksr   c                    s  | \}dk rt dkrt d d dtjd   fdd}|dur:| d	 n
fd
dt|D }tj	|ft
d}g }	|}
|
dkrZ|S |D ];}||}tjjt|d  |dd}t|dkr}d }n|d }t|tj|
| tjd| g}|	| q\t|	}	t|	dddddf ||
f}	|	||
 }	tddddf }t|||
f||
 }|	| }	|	 d krd |	|	d k< t||	dd	 |S )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                    sX   t |     }t|}| kr }| d  |k r*t| d  d}|S )z;Given input length, compute how many spans should be maskedr   r   )r   max)rM  num_masked_spanepsilonrg  rf  rh  r/  r+   r,   compute_num_masked_spanK  s   
z6_compute_mask_indices.<locals>.compute_num_masked_spanNrm   c                    s   g | ]} qS r+   r+   r   )r/  r+   r,   rz   ^  s    z)_compute_mask_indices.<locals>.<listcomp>rR  r   F)replace)r   nprandomr   itemdetachr!  tolistr}   rW  r   choicerX  lenconcatenateonesint32appendarraybroadcast_tor   rj  put_along_axis)r   rf  rg  r   rh  r.  rn  rI  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanrM  rk  spec_aug_mask_idxdummy_mask_idxoffsetsr+   rl  r,   _compute_mask_indices%  s\   

r  c                       s   e Zd Zdef fddZ		ddejdejdB dejdB fdd	Ze						dd
ej
dB dej
dB dejdB dedB dedB dedB deeB fddZ  ZS )UniSpeechModelrW   c                    sz   t  | || _t|| _t|| _|jdks|jdkr)t	
t|j | _|jr2t|| _nt|| _|   d S Nr   )r0   r1   rW   rs   feature_extractorr   feature_projectionmask_time_probmask_feature_probrF   r  r'   r   rH   r>  masked_spec_embeddo_stable_layer_normr  encoderr   	post_initr   r5   r+   r,   r1     s   


zUniSpeechModel.__init__Nr!   mask_time_indicesr   c                 C   s  t | jdds	|S | \}}}|dur| j|j||< n-| jjdkrK| jrKt||f| jj| jj	|| jj
d}tj||jtjd}| j|j||< | jjdkr| jrt||f| jj| jj| jjd}tj||jtjd}|dddf d|d}d||< |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTNr   )rf  rg  r   rh  )rS  rR  )rf  rg  rh  rm   )r  rW   r   r  rU  rR  r  r   r  mask_time_lengthmask_time_min_masksr'   r  rS  r   r  mask_feature_lengthmask_feature_min_masksexpand)r3   r!   r  r   r.  r/  rH   mask_feature_indicesr+   r+   r,   _mask_hidden_states  s4   z"UniSpeechModel._mask_hidden_statesr   r   r   r   r   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| |}|dd}|dur6| |jd |}| |\}	}| j	|	||d}	| j
|	||||d}
|
d }	|s_|	|f|
dd  S t|	||
j|
jdS )a/  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        Nr   r/   )r  r   r   r   r   r   r   )r   extract_featuresr!   r"   )rW   r   r   use_return_dictr  rX   r\  r   r  r  r  UniSpeechBaseModelOutputr!   r"   )r3   r   r   r  r   r   r   r   r  r!   encoder_outputsr+   r+   r,   r9     s8   
zUniSpeechModel.forwardr   NNNNN)r#   r$   r%   r   r1   r'   r(   re  r  r   r   r   r*   r  r9   r;   r+   r+   r5   r,   r    s@    
.	r  zZ
    UniSpeech Model with a vector-quantization module and ctc loss for pre-training.
    c                       s   e Zd Zdef fddZdefddZdd Ze		dd
e	j
de	j
de	j
defddZe				dde	jdB de	jdB dedB dedB dedB deeB fddZ  ZS )UniSpeechForPreTrainingrW   c                    s~   t  | t|| _t|j| _t|| _	t
|j|j| _t
|j|j| _t
|j|j| _t|j| _|   d S rf   )r0   r1   r  r6  rF   r   feat_quantizer_dropoutdropout_featuresr  	quantizerr   r  proj_codevector_dim	project_qrH   project_hidnum_ctc_classesctc_projfinal_dropoutr   r  r   r5   r+   r,   r1     s   

z UniSpeechForPreTraining.__init__r  c                 C   s   || j _dS )zb
        Set the Gumbel softmax temperature to a given value. Only necessary for training
        N)r  r  )r3   r  r+   r+   r,   set_gumbel_temperature,  s   z.UniSpeechForPreTraining.set_gumbel_temperaturec                 C      | j j  dS z
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        Nr6  r  r   r3   r+   r+   r,   freeze_feature_encoder2     z.UniSpeechForPreTraining.freeze_feature_encoderr   target_featuresnegative_featurespredicted_featuresc                 C   s@   t j| |gdd} t j| |  dd}|| }|| }|S )z
        Compute logits for contrastive loss based using cosine similarity as the distance measure between
        `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
        r   r   rm   )r'   catcosine_similarityr   r*  )r  r  r  r  logitsr+   r+   r,   compute_contrastive_logits9  s
   
z2UniSpeechForPreTraining.compute_contrastive_logitsNr   r   r   r   r   r   c                 K   sJ  |dur|n| j j}| j|||||d}|d }| |d }	| |	\}
}| |
| jjj}
| 	|
}
t
|d|d| j j}|dd}t
| |j}|dd}|d}||d|
| d }| |}| |}d}|s|dur|||
|f|dd  S ||
|f|dd  S t|||
||j|jdS )	a  
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, UniSpeechForPreTraining

        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> # TODO: Add full pretraining example
        ```Nr  r   r   rm   r   r/   )r   r   r   r    r!   r"   )rW   r  r6  r  r  r  rU  rB   rR  r  r'   emptyr   fill_replace_probrX   	bernoullir   rS  r   masked_fillr   r  r   r!   r"   )r3   r   r   r   r   r   r   r   transformer_featuresr  quantized_featuresr    prob_replace_matrixsampled_replace_matrixr  r   r+   r+   r,   r9   M  sL   




zUniSpeechForPreTraining.forward)r   )NNNN)r#   r$   r%   r   r1   r   r  r  r4  r'   r(   r  r   r   r   r*   r   r9   r;   r+   r+   r5   r,   r    sB    r  r/   zq
    UniSpeech Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    c                       s   e Zd ZddedB f fddZdd Zdd Zd	d
 Ze					dde	j
dB de	j
dB dedB dedB dedB de	j
dB deeB fddZ  ZS )UniSpeechForCTCNtarget_langc                    s~   t  | t|| _t|j| _|| _|j	du r#t
d| j dt|dr.|jr.|jn|j}t||j	| _|   dS )a3  
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`UniSpeechForCTC`] with adapters. Uses 'eng' by
            default.
        NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `UniSpeechForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.add_adapter)r0   r1   r  r6  rF   r   r  r   r  
vocab_sizer   r6   rL   r  output_hidden_sizerH   r   lm_headr  )r3   rW   r  r  r5   r+   r,   r1     s   

zUniSpeechForCTC.__init__c                 K   s   t  tdkr
dS | j}|dur"t| jdddu r"td| d|du r6t| jdddur6td dS |durC| j	|dd dS dS )	a'  
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        metaNr
  zCannot pass `target_lang`: z- if `config.adapter_attn_dim` is not defined.z)By default `target_lang` is set to 'eng'.T)
force_load)
r   r'   rS  r  r  rW   r   loggerinfoload_adapter)r3   r   r  r+   r+   r,   tie_weights  s   zUniSpeechForCTC.tie_weightsc                 C   r  r  r  r  r+   r+   r,   r    r  z&UniSpeechForCTC.freeze_feature_encoderc                 C      | j  D ]}d|_qdS z
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        FNr6  r   r   r   r+   r+   r,   freeze_base_model     z!UniSpeechForCTC.freeze_base_modelr   r   r   r   r   labelsr   c              
   K   s|  |dur|n| j j}|dur| | j jkrtd| j j | j|||||d}|d }	| |	}	| |	}
d}|dur|durC|ntj	|tj
d}| |dtj
}|dk}|d}||}tjj|
dtjddd}tjjjd	d
 tjj||||| j j| j j| j jd}W d   n1 sw   Y  |s|
f|td  }|dur|f| S |S t||
|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: r  r   ro  rm   )rD   rR  r   F)enabled)blank	reductionzero_infinityr   r  r!   r"   )rW   r  rj  r  r   r6  r   r  r'   	ones_likerV  rP  r!  rU  masked_selectrF   r   log_softmaxfloat32rX   backendscudnnflagsctc_losspad_token_idctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   r!   r"   )r3   r   r   r   r   r   r  r   r   r!   r  r   rI  labels_masktarget_lengthsflattened_targets	log_probsoutputr+   r+   r,   r9     sN   



zUniSpeechForCTC.forwardrf   r  )r#   r$   r%   r   r1   r  r  r  r   r'   r   r   r*   r   r9   r;   r+   r+   r5   r,   r    s4    	r  z
    UniSpeech Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                       s   e Zd Z fddZdd Zdd Ze					ddejdB d	ejdB d
e	dB de	dB de	dB dejdB de
eB fddZ  ZS )"UniSpeechForSequenceClassificationc                    s   t  | t|dr|jrtdt|| _|jd }|jr*t	
t|| | _t	|j|j| _t	|j|j| _|   d S )Nr  z`Sequence classification does not support the use of UniSpeech adapters (config.add_adapter=True)r   )r0   r1   rL   r  r   r  r6  r   use_weighted_layer_sumrF   r  r'   ry  layer_weightsr   rH   classifier_proj_size	projector
num_labels
classifierr  )r3   rW   
num_layersr5   r+   r,   r1   3  s   

z+UniSpeechForSequenceClassification.__init__c                 C   r  r  r  r  r+   r+   r,   r  D  r  z9UniSpeechForSequenceClassification.freeze_feature_encoderc                 C   r  r  r  r   r+   r+   r,   r  K  r  z4UniSpeechForSequenceClassification.freeze_base_modelNr   r   r   r   r   r  r   c                 K   sz  |dur|n| j j}| j jrdn|}| j|||||d}| j jrB|t }	tj|	dd}	tjj	| j
dd}
|	|
ddd jdd}	n|d }	| |	}	|du rV|	jdd}n+| |	jd |}|ddd|	jd }d	|	| < |	jdd|jdddd }| |}d}|durt }||d| j j|d}|s|f|td  }|dur|f| S |S t|||j|jd
S )a  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
            into a tensor of type `torch.FloatTensor`. See [`UniSpeechProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTr  r   r   rm   r   r/   r   r  )rW   r  r  r6  r  r'   stackrF   r   r   r  r   r!  r  r  r\  r   r   r   r  r   r  r   r!   r"   )r3   r   r   r   r   r   r  r   r   r!   norm_weightspooled_outputpadding_maskexpand_padding_maskr  r   loss_fctr  r+   r+   r,   r9   S  sH   

 
z*UniSpeechForSequenceClassification.forwardr  )r#   r$   r%   r1   r  r  r   r'   r   r   r*   r   r9   r;   r+   r+   r5   r,   r  ,  s2    	r  )r  r  r  r  r5  r  r7   )Pr?  collections.abcr   dataclassesr   numpyrq  r'   torch.nnrF   r    r   r;  activationsr   integrations.deepspeedr   integrations.fsdpr	   masking_utilsr
   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   r   processing_utilsr   rK   r   r   r   configuration_unispeechr   
get_loggerr#   r  r   Moduler.   r<   rZ   rh   rn   rs   r   r   r   r   r   r   r   r   r	  r  r  r  r5  r*   r   re  ndarrayr  r  r  r  r  r  r  __all__r+   r+   r+   r,   <module>   s   
-)
V$H.LFO

wwz h