o
    eiG                     @   s  d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZmZmZmZm Z  ddl!m"Z" dZ#G dd deZ$G dd deZ%G dd deZ&G dd dej'Z(G dd deZ)G dd dej'Z*G dd deZ+G dd  d eZ,G d!d" d"eZ-G d#d$ d$eZ.G d%d& d&ej'Z/eG d'd( d(eZ0eG d)d* d*e0Z1G d+d, d,eZ2G d-d. d.eZ3g d/Z4dS )0zPyTorch SEW model.    N)nn   )initialization)ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)BaseModelOutput)PreTrainedModel)auto_docstring)is_flash_attention_requested   )Wav2Vec2AttentionWav2Vec2EncoderLayerWav2Vec2FeatureEncoderWav2Vec2FeedForwardWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2GroupNormConvLayerWav2Vec2LayerNormConvLayerWav2Vec2NoLayerNormConvLayerWav2Vec2SamePadLayer_compute_mask_indices   )	SEWConfigc                   @      e Zd ZdS )SEWNoLayerNormConvLayerN__name__
__module____qualname__ r    r    a/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/sew/modular_sew.pyr   0       r   c                   @   r   )SEWLayerNormConvLayerNr   r    r    r    r!   r#   4   r"   r#   c                   @   r   )SEWGroupNormConvLayerNr   r    r    r    r!   r$   8   r"   r$   c                       $   e Zd Z fddZdd Z  ZS )SEWPositionalConvEmbeddingc                    s(  t    tj|j|j|j|jd |j|jd| _tj	j
}ttj	jdr)tj	jj
}t r}dd l}|jj| jjdd || jddd| _W d    n1 sNw   Y  t| jdrf| jjjj}| jjjj}n| jj}| jj}|j| | |j| | n	|| jddd| _t|j| _t|j | _d S )	Nr   )kernel_sizepaddinggroupsstrideweight_normr   modifier_rankweight)namedimparametrizations)super__init__r   Conv1dhidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupssqueeze_factorconvutilsr+   hasattrr1   r   	deepspeedzeroGatheredParametersr.   	original0	original1weight_gweight_vregister_external_parameterSEWSamePadLayerr(   r   feat_extract_activation
activation)selfconfigr+   r<   rA   rB   	__class__r    r!   r3   =   s6   
	
z#SEWPositionalConvEmbedding.__init__c                 C   s"   |  |}| |}| |}|S N)r9   r(   rF   )rG   hidden_statesr    r    r!   forward_   s   


z"SEWPositionalConvEmbedding.forwardr   r   r   r3   rM   __classcell__r    r    rI   r!   r&   <   s    "r&   c                   @   r   )rD   Nr   r    r    r    r!   rD   g   r"   rD   c                       r%   )SEWUpsamplingc                    s:   t    t|j|j|j | _t|j | _	|j| _d S rK   )
r2   r3   r   Linearr5   r8   
projectionr   rE   rF   rG   rH   rI   r    r!   r3   l   s   
zSEWUpsampling.__init__c                 C   sd   |  |}| |}| jdkr0| \}}}|| j }|| j }|||| j|}||||}|S )Nr   )rR   rF   r8   sizereshape)rG   rL   bszsrc_lensrc_embed_dimtgt_lentgt_embed_dimr    r    r!   rM   r   s   




zSEWUpsampling.forwardrN   r    r    rI   r!   rP   k   s    rP   c                   @   r   )SEWFeatureEncoderNr   r    r    r    r!   r[      r"   r[   c                   @   r   )SEWAttentionNr   r    r    r    r!   r\      r"   r\   c                   @   r   )SEWFeedForwardNr   r    r    r    r!   r]      r"   r]   c                   @   r   )SEWEncoderLayerNr   r    r    r    r!   r^      r"   r^   c                       s.   e Zd Z fddZ				dddZ  ZS )	
SEWEncoderc                    s   t     | _t | _t j j| _tj	 j
 jd| _t j| _t fddt jD | _t | _d| _d S )Nepsc                    s   g | ]}t  qS r    )r^   ).0_rH   r    r!   
<listcomp>   s    z'SEWEncoder.__init__.<locals>.<listcomp>F)r2   r3   rH   r&   pos_conv_embedr   	AvgPool1dr8   pool	LayerNormr5   layer_norm_eps
layer_normDropouthidden_dropoutdropout
ModuleListrangenum_hidden_layerslayersrP   upsamplegradient_checkpointingrS   rI   rd   r!   r3      s   

 

zSEWEncoder.__init__NFTc              	   C   s  |rdnd }|r
dnd }|d ur| ddd|jd }t| jr4d|| < |d ur1d|v r1|nd }nfd|| < | d}	|	| jj }
|jd | jj }tj	d||
j
ddd|
jd d}||
ddk  }d|d d d d d d f j|jd	 }|t|jj }||jd d|jd |jd }|jd }|dd}| |}| |}t|d|d}|d
d |f |d
d |f  }|dd}| |}| |}t pt| }| jD ]7}|r||f }tg }| jo|| jjk }|r|r||||d}|d }|rd}|r||d f }q|r$||f }| |}|jd |k rAtj |ddd||jd  f}|sPt!dd |||fD S t"|||dS )Nr    r   r           r   deviceg      ?)dtype.)attention_maskoutput_attentionsNNc                 s   s    | ]	}|d ur|V  qd S rK   r    )rb   vr    r    r!   	<genexpr>   s    z%SEWEncoder.forward.<locals>.<genexpr>last_hidden_staterL   
attentions)#	unsqueezerepeatshaper   rH   longsumr8   torcharangerx   viewexpandtory   finfomin	transposerf   rh   rT   rk   rn   r   r   rr   randtraining	layerdroprs   r   
functionalpadtupler   )rG   rL   rz   r{   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsexpand_attention_maskinput_lengthsoutput_lengthsmax_encoder_lengthattention_idsn_input_timestepsposition_embeddingspooled_hidden_states
min_lengthsynced_gpuslayerdropout_probabilityskip_the_layerlayer_outputsr    r    r!   rM      st   


&


 






 zSEWEncoder.forward)NFFTrN   r    r    rI   r!   r_      s    r_   c                   @   sj   e Zd ZU eed< dZdZdZdZdZ	dZ
dZe dd Zd	ejeB fd
dZdedejfddZdS )SEWPreTrainedModelrH   sewinput_valuesaudioTFc              	   C   s  t |tr'tj|jjddtd|jjd |jj	   d t
|jjd nt |tjr9tj|jd| jjd nvt |tjtjfrOt|j t|j n`t |tjrt rddl}t|drt|dr|jj|j|jgdd	 t|j W d   n1 sw   Y  n&|jj|jdd	 t|j W d   n1 sw   Y  nt|j t |tjtjfr|jdurt|j dS dS dS )
zInitialize the weightsr   r   r   )meanstdrv   NrB   rA   r,   )
isinstancer&   initnormal_r9   r.   mathsqrtr'   in_channels	constant_biasr   rQ   rH   initializer_rangeri   	GroupNormzeros_ones_r4   r   r<   r;   r=   r>   rB   rA   kaiming_normal_)rG   moduler<   r    r    r!   _init_weights  s8   
 z SEWPreTrainedModel._init_weightsr   c                 C   s4   dd }t | jj| jjD ]
\}}||||}q|S )zH
        Computes the output length of the convolutional layers
        c                 S   s   t j| | |ddd S )Nfloor)rounding_moder   )r   div)input_lengthr'   r*   r    r    r!   _conv_out_length&  s   zMSEWPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length)ziprH   conv_kernelconv_stride)rG   r   r   r'   r*   r    r    r!    _get_feat_extract_output_lengths!  s   z3SEWPreTrainedModel._get_feat_extract_output_lengthsfeature_vector_lengthrz   c                 C   s~   |  |dtj}|jd }tj||f|j|jd}d|tj	|jd |jd|d f< |
dgd
dg }|S )Nru   r   )ry   rx   r   rw   )r   r   r   r   r   r   zerosry   rx   r   flipcumsumbool)rG   r   rz   r   
batch_sizer    r    r!   "_get_feature_vector_attention_mask0  s   
"z5SEWPreTrainedModel._get_feature_vector_attention_maskN)r   r   r   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr   no_gradr   
LongTensorintr   r   r    r    r    r!   r      s   
 
r   c                       s   e Zd Zdef fddZ		ddejdejdB dejdB fdd	Ze						dd
ej
dB dej
dB dejdB dedB dedB dedB deeB fddZ  ZS )SEWModelrH   c                    s   t  | || _t|| _tj|jd |jd| _	|jd |j
k| _| jr1t|jd |j
| _t|j| _|jdksB|jdkrNtt|j
 | _t|| _|   d S )Nru   r`   rv   )r2   r3   rH   r[   feature_extractorr   ri   conv_dimrj   rk   r5   project_featuresrQ   feature_projectionrl   feat_proj_dropoutfeature_dropoutmask_time_probmask_feature_prob	Parameterr   Tensoruniform_masked_spec_embedr_   encoder	post_initrS   rI   r    r!   r3   ?  s   

zSEWModel.__init__NrL   mask_time_indicesrz   c                 C   s  t | jdds	|S | \}}}|dur| j|j||< n-| jjdkrK| jrKt||f| jj| jj	|| jj
d}tj||jtjd}| j|j||< | jjdkr| jrt||f| jj| jj| jjd}tj||jtjd}|dddf d|d}d||< |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTNr   )	mask_probmask_lengthrz   	min_masks)rx   ry   )r   r   r   ru   )getattrrH   rT   r   r   ry   r   r   r   mask_time_lengthmask_time_min_masksr   tensorrx   r   r   mask_feature_lengthmask_feature_min_masksr   )rG   rL   r   rz   r   sequence_lengthr5   mask_feature_indicesr    r    r!   _mask_hidden_statesS  s4   zSEWModel._mask_hidden_statesr   r{   r   r   returnc                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| |}|dd}| |}| jr6| |}| 	|}	|durH| 
|	jd |}| j|	|d}	| j|	||||d}
|
d }	|sh|	f|
dd  S t|	|
j|
jdS )a/  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        Nr   r   )r   )rz   r{   r   r   r   r   )rH   r{   r   use_return_dictr   r   rk   r   r   r   r   r   r   r   r   rL   r   )rG   r   rz   r   r{   r   r   kwargsextract_featuresrL   encoder_outputsr    r    r!   rM     s8   



zSEWModel.forwardr|   )NNNNN)r   r   r   r   r3   r   FloatTensorr   r   r
   r   r   r   r   rM   rO   r    r    rI   r!   r   =  s@    
.	r   c                   @   r   )	SEWForCTCNr   r    r    r    r!   r     r"   r   c                   @   r   )SEWForSequenceClassificationNr   r    r    r    r!   r     r"   r   )r   r   r   r   )5__doc__r   r   r    r   r   activationsr   integrations.deepspeedr   integrations.fsdpr   modeling_outputsr   modeling_utilsr	   r:   r
   utils.genericr   wav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   r   r   r   r   configuration_sewr   _HIDDEN_STATES_START_POSITIONr   r#   r$   Moduler&   rD   rP   r[   r\   r]   r^   r_   r   r   r   r   __all__r    r    r    r!   <module>   sB   4+fE{