o
    i                     @   s  d Z ddlZddlmZmZ ddlZddlmZ ddlmZ ddl	m
Z
mZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlmZ ddl m!Z! e"e#Z$dZ%dZ&dZ'g dZ(dZ)dZ*G dd dej+Z,G dd dej+Z-G dd dej+Z.G dd dej+Z/G dd  d ej+Z0G d!d" d"ej+Z1G d#d$ d$ej+Z2G d%d& d&ej+Z3G d'd( d(eZ4G d)d* d*eZ5d+Z6d,Z7G d-d. d.e5Z8ed/e6G d0d1 d1e5Z9ed2e6G d3d4 d4e5Z:g d5Z;dS )6zPyTorch M-CTC-T model.    N)OptionalUnion)nn   )ACT2FN)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forward)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputCausalLMOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)logging   )MCTCTConfigr   zspeechbrain/m-ctc-t-large)r      i   zY"Mr. Quilter is the apostle of the middle classes, and we're glad to welcome his gospel."gv@c                       s(   e Zd ZdZ fddZdd Z  ZS )MCTCTConv1dSubsamplerz
    Convolutional subsampler: a stack of 1D convolution (along temporal dimension) followed by non-linear activation
    via gated linear units (https://huggingface.co/papers/1911.08460)
    c                    s   t    | _|j _t|j _|j	 _
|j|j  _ j
dkr1|jd u r,td|j _nd  _|jd  _|j _|j _t fddt jD  _d S )Nr   zbNeed to specify `conv_channels` configuration in `MCTCTConfig` to use multiple convolution layers.   c                 3   s\    | ])\}}t j|d kr jn j| | jd k r j| n j| j| ddV  qdS )r   r   valid)kernel_sizestridepaddingN)r   Conv1din_channelsmid_channels
num_layersout_channelsr   ).0ikself g/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/deprecated/mctct/modeling_mctct.py	<genexpr>U   s    
z1MCTCTConv1dSubsampler.__init__.<locals>.<genexpr>)super__init__configconv_glu_dimglu_dimr   Dropoutconv_dropoutdropoutnum_conv_layersr!   input_feat_per_channelinput_channelsr   conv_channels
ValueErrorr    hidden_sizer"   conv_kernelr   conv_strider   
ModuleList	enumerateconv_layersr'   r-   	__class__r&   r)   r,   9   s&   



zMCTCTConv1dSubsampler.__init__c                 C   s   t dd | jD }tjj|dd||fdd}|dd }| jD ]}||}tjj	|| j
d}| |}q#|dd }|S )Nc                 s   s    | ]}|d  V  qdS )r   Nr(   )r#   sizer(   r(   r)   r*   c   s    z0MCTCTConv1dSubsampler.forward.<locals>.<genexpr>r   constantr   r   dim)sumr   torchr   
functionalpad	transpose
contiguousr=   glur/   r2   )r'   input_featuresr   hidden_statesconvr(   r(   r)   forward`   s   
zMCTCTConv1dSubsampler.forward__name__
__module____qualname____doc__r,   rO   __classcell__r(   r(   r?   r)   r   3   s    'r   c                       s,   e Zd ZdZ fddZ	dddZ  ZS )	MCTCTEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    tj|j|j|jd| _t|j|j| _	t|j
|j| _t | _t|j| _| jdt|jddd | jdtj| j tj| jjddd d S )N)padding_idxposition_ids)r   F)
persistenttoken_type_idsdtypedevice)r+   r,   r   	Embedding
vocab_sizer8   pad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddingsMCTCTLayerNorm	LayerNormr0   hidden_dropout_probr2   register_bufferrF   arangeexpandzerosrX   rA   longr^   r>   r?   r(   r)   r,   s   s   

zMCTCTEmbeddings.__init__Nr   c                 C   s   |d ur|  n|  d d }|d }|d u r%| jd d ||| f }|d u rOt| drD| jd d d |f }||d |}	|	}ntj|tj| jjd}|d u rX| 	|}| 
|}
||
 }| |}| |}|S )NrY   r   r[   r   r\   )rA   rX   hasattrr[   rl   rF   rm   rn   r^   rb   rf   rh   r2   )r'   rL   r[   rX   inputs_embedspast_key_values_lengthinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedrf   
embeddingsr(   r(   r)   rO      s"    




zMCTCTEmbeddings.forward)NNNNr   rP   r(   r(   r?   r)   rV   p   s
    rV   c                       sD   e Zd Z fddZdd Zdd Zdd Z					
dddZ  ZS )MCTCTSelfAttentionc                    s   t    |j|j dkrt|dstd|j d|j d|j| _|j| _| j| j | _t	j
|j| jdd| _t	j
|j| jdd| _t	j
|j| jdd| _t	|j| _|j| _t	d|j d	 | j| _|j| _d S )
Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()Fbiasr   r   )r+   r,   r8   num_attention_headsro   r7   attention_head_dimattention_head_sizeall_head_sizer   Linearquerykeyvaluer0   attention_probs_dropout_probr2   rc   r_   distance_embedding
is_decoderr>   r?   r(   r)   r,      s"   

zMCTCTSelfAttention.__init__c                 C   s6   |  d d | j| jf }|j| }|ddddS )NrY   r   r   r      )rA   r|   r~   viewpermute)r'   xnew_x_shaper(   r(   r)   transpose_for_scores   s   
z'MCTCTSelfAttention.transpose_for_scoresc                 C   sF   t |jdkr|jttt |j }|jt| jttt | S )Nr   )lenshaper   reversedrangereshape)r'   r   r   r(   r(   r)   reshape_fortran   s    z"MCTCTSelfAttention.reshape_fortranc                 C   s   | dddd}|j\}}}}tj|tj||||f|jdfdd}| |||| | d|g}|d d d || d | f }| |||| d ||g}|d }|d d ||| f dd}| ddddS )Nr   r   r   r   r^   rC   )r   r   rF   catrm   r^   r   rI   )r'   scoresbatchhidden_stateseq_lenheads	halfpointr(   r(   r)   "relative_position_embedding_rotate   s   &  z5MCTCTSelfAttention.relative_position_embedding_rotateNFc                 C   s   |  |}|t| j }| | |}| | |}| |}t||	dd}	| j
j}
td|
|	dd}| |}|	| }	|d urL|	| }	tjj|	dd}| |}|d ura|| }t||}|ddddjdd	}|r{||f}|S |f}|S )
NrY   zlh, bche -> bcler   r   rC   r   r   )	start_dim)r   mathsqrtr~   r   r   r   rF   matmulrI   r   weighteinsumr   r   rG   softmaxr2   r   flatten)r'   rM   attention_mask	head_maskoutput_attentionsmixed_query_layer	key_layervalue_layerquery_layerattention_scorespositional_embeddingrelative_position_scoresattention_probscontext_layeroutputsr(   r(   r)   rO      s,   



zMCTCTSelfAttention.forwardNNF)	rQ   rR   rS   r,   r   r   r   rO   rU   r(   r(   r?   r)   rw      s    rw   c                       $   e Zd Z fddZdd Z  ZS )rg   c                    s2   t    ttd| _ttd| _d S Nr   )	r+   r,   r   	ParameterrF   onessingleton_weightrm   singleton_biasr&   r?   r(   r)   r,     s   
zMCTCTLayerNorm.__init__c                 C   s   || j  | j S N)r   r   r'   rM   r(   r(   r)   rO     s   zMCTCTLayerNorm.forwardrQ   rR   rS   r,   rO   rU   r(   r(   r?   r)   rg     s    rg   c                       r   )MCTCTSelfOutputc                    sL   t    || _tj|j|jdd| _tj|j|jd| _t	|j
| _d S NFrz   )eps)r+   r,   r-   r   r   r8   denserh   layer_norm_epsr0   ri   r2   r>   r?   r(   r)   r,   !  s
   
zMCTCTSelfOutput.__init__c                 C   &   |  |}| |}| || }|S r   r   r2   rh   r'   rM   input_tensorr(   r(   r)   rO   (     

zMCTCTSelfOutput.forwardr   r(   r(   r?   r)   r      s    r   c                       s4   e Zd Z fddZdd Z			d	ddZ  ZS )
MCTCTAttentionc                    s*   t    t|| _t|| _t | _d S r   )r+   r,   rw   r'   r   outputsetpruned_headsr>   r?   r(   r)   r,   0  s   


zMCTCTAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   rC   )r   r   r'   r|   r~   r   r   r   r   r   r   r   r   union)r'   r   indexr(   r(   r)   prune_heads6  s   zMCTCTAttention.prune_headsNFc                 C   s6   |  ||||}| |d |}|f|dd   }|S )Nr   r   )r'   r   )r'   rM   r   r   r   self_outputsattention_outputr   r(   r(   r)   rO   H  s   zMCTCTAttention.forwardr   )rQ   rR   rS   r,   r   rO   rU   r(   r(   r?   r)   r   /  s    r   c                       r   )MCTCTIntermediatec                    sH   t    tj|j|jdd| _t|jt	rt
|j | _d S |j| _d S )NFrz   )r+   r,   r   r   r8   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr>   r?   r(   r)   r,   \  s
   
zMCTCTIntermediate.__init__c                 C   s   |  |}| |}|S r   )r   r   r   r(   r(   r)   rO   d  s   

zMCTCTIntermediate.forwardr   r(   r(   r?   r)   r   [  s    r   c                       r   )MCTCTOutputc                    sF   t    tj|j|jdd| _tj|j|jd| _t	|j
| _d S r   )r+   r,   r   r   r   r8   r   rh   r   r0   ri   r2   r>   r?   r(   r)   r,   k  s   
zMCTCTOutput.__init__c                 C   r   r   r   r   r(   r(   r)   rO   q  r   zMCTCTOutput.forwardr   r(   r(   r?   r)   r   j  s    r   c                       s:   e Zd Zdef fddZ			d
ddZdd	 Z  ZS )
MCTCTLayerr-   c                    sB   t    d| _|j| _t|| _t|| _|j| _t	|| _
d S r   )r+   r,   seq_len_dimchunk_size_feed_forwardr   intermediater   	attentionr   r   r   r>   r?   r(   r)   r,   y  s   


zMCTCTLayer.__init__NFc           	      C   sH   | j ||||d}|d }|dd  }t| j| j| j|}|f| }|S )N)r   r   r   )r   r   feed_forward_chunkr   r   )	r'   rM   r   r   r   self_attention_outputsr   r   layer_outputr(   r(   r)   rO     s   
zMCTCTLayer.forwardc                 C   s   |  |}| ||}|S r   )r   r   )r'   r   intermediate_outputr   r(   r(   r)   r     s   
zMCTCTLayer.feed_forward_chunkr   )rQ   rR   rS   r   r,   rO   r   rU   r(   r(   r?   r)   r   x  s    
r   c                   @   sF   e Zd ZU dZeed< dZdZdZdd Z	de
jfd	d
Zdd ZdS )MCTCTPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    r-   mctctrL   Tc                 C   s  | j j}t|tjr|jjjd|d |jdur|jj	  nDt|tj
r=|jjjd|d |jdur<|jj|j 	  n&t|tjrQ|jj	  |jjd nt|trc|jjd |jj	  t|tjtjfr|jjjd|d |jdur|jj	  dS dS dS )zInitialize the weightsg        )meanstdNg      ?)r-   initializer_ranger   r   r   r   datanormal_r{   zero_r_   rW   rh   fill_rg   r   r   r   )r'   moduler   r(   r(   r)   _init_weights  s.   



z"MCTCTPreTrainedModel._init_weightsinput_lengthsc                 C   sh   d}t t| jj| jj| jjD ]!\}}}|d }|d|  ||d   d }tj||ddd }q|S )zH
        Computes the output length of the convolutional layers
        r   r   trunc)rounding_mode)zipr   r-   r3   r9   r:   rF   div)r'   r   dilation_	kernel_szr   r   r(   r(   r)    _get_feat_extract_output_lengths  s   z5MCTCTPreTrainedModel._get_feat_extract_output_lengthsc                 C   s   t |jdkr|d d d d df }| |d}| d }tj||f|j|jd}d|tj	||jd|d f< |
dgd
dg }|S )Nr   rY   r   r\   r   r   )r   r   r   rE   rA   rF   rm   r]   r^   rk   flipcumsumrn   )r'   feature_vector_lengthr   subsampled_lengthsbszr(   r(   r)   "_get_feature_vector_attention_mask  s   z7MCTCTPreTrainedModel._get_feature_vector_attention_maskN)rQ   rR   rS   rT   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointingr   rF   
LongTensorr   r   r(   r(   r(   r)   r     s   
 r   aH  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`MCTCTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_features (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`Wav2Vec2CTCTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
c                       s^   e Zd Zdef fddZ			ddejdejdejd	ed
ededee	e
f fddZ  ZS )MCTCTEncoderr-   c                    sP   t     j| _t | _t | _t fddt	 j
D | _d| _d S )Nc                    s   g | ]}t  qS r(   )r   )r#   r   r-   r(   r)   
<listcomp>  s    z)MCTCTEncoder.__init__.<locals>.<listcomp>F)r+   r,   ri   rg   
layer_normr   rN   r   r;   r   num_hidden_layerslayersgradient_checkpointingr>   r?   r   r)   r,     s   
 
zMCTCTEncoder.__init__FTrL   r   r   r   output_hidden_statesreturn_dictreturnc                 C   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}| |}| |}|d ur5| |jd |}tj	j
|| j| jd}|d urJt||j}|rNdnd }	|rTdnd }
|d urw| d t| jkrwtdt| j d| d  dt p}t| }t| jD ]5\}}|r|	|f }	tg }| jo|| j jk }|r|r||||d}|d }|rd	}|r|
|d f }
q|r|	|f }	|std
d ||	|
fD S t||	|
dS )Nr   )ptrainingr(   r   z&The head_mask should be specified for z layers, but it is for .)rM   r   r   )NNc                 s   s    | ]	}|d ur|V  qd S r   r(   )r#   vr(   r(   r)   r*   `  s    z'MCTCTEncoder.forward.<locals>.<genexpr>last_hidden_staterM   
attentions)r-   r   r  use_return_dictr  rN   r   r   r   rG   r2   ri   r	  r   r]   rA   r   r  r7   r
   r   r<   rF   rand	layerdroptupler   )r'   rL   r   r   r   r  r  rp   rM   encoder_statesall_attentionssynced_gpusidxencoder_layerdropout_probabilityskip_the_layerlayer_outputsr(   r(   r)   rO     sZ   	





zMCTCTEncoder.forward)FFT)rQ   rR   rS   r   r,   rF   Tensorboolr   r  r   rO   rU   r(   r(   r?   r)   r     s(    
r   zaThe bare M-CTC-T Model transformer outputting raw hidden-states without any specific head on top.c                       s   e Zd Z fddZeedeee	e
ded					ddejdeej d	eej d
ee dee dee deee	f fddZ  ZS )
MCTCTModelc                    s(   t  | || _t|| _|   d S r   )r+   r,   r-   r   encoder	post_initr>   r?   r(   r)   r,   k  s   
zMCTCTModel.__init__zbatch_size, sequence_lengthaudio)
checkpointoutput_typeconfig_classmodalityexpected_outputNrL   r   r   r   r  r  r  c           	      C   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r&td| j||||||d}|d }|s@|f|dd   S t||j|jdS )Nz#You have to specify input_features.r   r   r   r  r  r   r   r  )	r-   r   r  r  r7   r  r   rM   r  )	r'   rL   r   r   r   r  r  encoder_outputssequence_outputr(   r(   r)   rO   t  s,   zMCTCTModel.forward)NNNNN)rQ   rR   rS   r,   r	   MCTCT_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPErF   r  r   r  r   r  rO   rU   r(   r(   r?   r)   r  f  s<    	

r  zcMCTCT Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).c                       s   e Zd Z fddZeeeeee	e
ed						ddejdeej deej dee d	ee d
ee deej deeef fddZ  ZS )MCTCTForCTCc                    sT   t  | t|| _|jd u rtd| j d|j}t	||j| _
|   d S )NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `MCTCTForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.)r+   r,   r  r   r`   r7   r@   r8   r   r   ctc_headr  )r'   r-   output_hidden_sizer?   r(   r)   r,     s   

zMCTCTForCTC.__init__)r!  r"  r#  r%  expected_lossNrL   r   r   r   r  r  labelsr  c              
   C   s~  |dur|  | jjkrtd| jj |dur|n| jj}| j||||||d}|d }	| |	}
d}|dur|dur?|ntj|j	dd tj
d}| |dtj
}|dk}|d}||}tjj|
dtjddd}tjjjd	d
 tjj||||| jj| jj| jjd}W d   n1 sw   Y  |s|
f|td  }|dur|f| S |S t||
|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: r&  r   rY   )r]   )rD   r]   r   F)enabled)blank	reductionzero_infinity)losslogitsrM   r  )maxr-   r`   r7   r  r   r/  rF   r   r   rn   r   rE   tomasked_selectr   rG   log_softmaxfloat32rI   backendscudnnflagsctc_lossra   ctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   rM   r  )r'   rL   r   r   r   r  r  r2  r   rM   r8  r7  r   labels_masktarget_lengthsflattened_targets	log_probsr   r(   r(   r)   rO     sR   	


zMCTCTForCTC.forward)NNNNNN)rQ   rR   rS   r,   r	   r)  r   r+  r   r,  _CTC_EXPECTED_OUTPUT_CTC_EXPECTED_LOSSrF   r  r   r  r   r   r  rO   rU   r(   r(   r?   r)   r.    sB    

	r.  )r.  r  r   )<rT   r   typingr   r   rF   r   activationsr   
file_utilsr   r   r	   integrations.deepspeedr
   integrations.fsdpr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   configuration_mctctr   
get_loggerrQ   loggerrD  r,  r+  r-  rI  rJ  Moduler   rV   rw   rg   r   r   r   r   r   r   MCTCT_START_DOCSTRINGr)  r   r  r.  __all__r(   r(   r(   r)   <module>   s^   
=:l
,'E W8d