o
    	۷iI                     @   s  d Z ddlZddlZddlmZmZ ddlZddlmZ ddlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZmZmZmZmZmZmZ ddl m!Z! dZ"G dd deZ#G dd deZ$G dd deZ%G dd dej&Z'G dd deZ(G dd dej&Z)G dd deZ*G dd de*Z+G d d! d!eZ,G d"d# d#eZ-G d$d% d%eZ.G d&d' d'ej&Z/eG d(d) d)eZ0eG d*d+ d+e0Z1G d,d- d-eZ2G d.d/ d/eZ3g d0Z4dS )1zPyTorch SEW model.    N)OptionalUnion)nn   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)BaseModelOutput)PreTrainedModel)auto_docstring   )Wav2Vec2AttentionWav2Vec2EncoderLayerWav2Vec2FeatureEncoderWav2Vec2FeedForwardWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2GroupNormConvLayerWav2Vec2LayerNormConvLayerWav2Vec2NoLayerNormConvLayerWav2Vec2SamePadLayer_compute_mask_indices   )	SEWConfigc                   @      e Zd ZdS )SEWNoLayerNormConvLayerN__name__
__module____qualname__ r    r    Y/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/sew/modular_sew.pyr   1       r   c                   @   r   )SEWLayerNormConvLayerNr   r    r    r    r!   r#   5   r"   r#   c                   @   r   )SEWGroupNormConvLayerNr   r    r    r    r!   r$   9   r"   r$   c                       $   e Zd Z fddZdd Z  ZS )SEWPositionalConvEmbeddingc                    s(  t    tj|j|j|j|jd |j|jd| _tj	j
}ttj	jdr)tj	jj
}t r}dd l}|jj| jjdd || jddd| _W d    n1 sNw   Y  t| jdrf| jjjj}| jjjj}n| jj}| jj}|j| | |j| | n	|| jddd| _t|j| _t|j | _d S )	Nr   )kernel_sizepaddinggroupsstrideweight_normr   modifier_rankweight)namedimparametrizations)super__init__r   Conv1dhidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupssqueeze_factorconvutilsr+   hasattrr1   r   	deepspeedzeroGatheredParametersr.   	original0	original1weight_gweight_vregister_external_parameterSEWSamePadLayerr(   r   feat_extract_activation
activation)selfconfigr+   r<   rA   rB   	__class__r    r!   r3   >   s6   
	
z#SEWPositionalConvEmbedding.__init__c                 C   s"   |  |}| |}| |}|S N)r9   r(   rF   )rG   hidden_statesr    r    r!   forward`   s   


z"SEWPositionalConvEmbedding.forwardr   r   r   r3   rM   __classcell__r    r    rI   r!   r&   =   s    "r&   c                   @   r   )rD   Nr   r    r    r    r!   rD   h   r"   rD   c                       r%   )SEWUpsamplingc                    s:   t    t|j|j|j | _t|j | _	|j| _d S rK   )
r2   r3   r   Linearr5   r8   
projectionr   rE   rF   rG   rH   rI   r    r!   r3   m   s   
zSEWUpsampling.__init__c                 C   sd   |  |}| |}| jdkr0| \}}}|| j }|| j }|||| j|}||||}|S )Nr   )rR   rF   r8   sizereshape)rG   rL   bszsrc_lensrc_embed_dimtgt_lentgt_embed_dimr    r    r!   rM   s   s   




zSEWUpsampling.forwardrN   r    r    rI   r!   rP   l   s    rP   c                   @   r   )SEWFeatureEncoderNr   r    r    r    r!   r[      r"   r[   c                       s   e Zd Z fddZ  ZS )SEWFeatureExtractorc                    s8   t  | td| jj d| jjd j dt d S )NzThe class `zD` has been depreciated and will be removed in Transformers v5. Use `r   z
` instead.)r2   r3   warningswarnrJ   r   	__bases__FutureWarningrS   rI   r    r!   r3      s   zSEWFeatureExtractor.__init__)r   r   r   r3   rO   r    r    rI   r!   r\      s    r\   c                   @   r   )SEWAttentionNr   r    r    r    r!   ra      r"   ra   c                   @   r   )SEWFeedForwardNr   r    r    r    r!   rb      r"   rb   c                   @   r   )SEWEncoderLayerNr   r    r    r    r!   rc      r"   rc   c                       s.   e Zd Z fddZ				dddZ  ZS )	
SEWEncoderc                    s   t     | _t | _t j j| _tj	 j
 jd| _t j| _t fddt jD | _t | _d| _d S )Nepsc                    s   g | ]}t  qS r    )rc   ).0_rH   r    r!   
<listcomp>   s    z'SEWEncoder.__init__.<locals>.<listcomp>F)r2   r3   rH   r&   pos_conv_embedr   	AvgPool1dr8   pool	LayerNormr5   layer_norm_eps
layer_normDropouthidden_dropoutdropout
ModuleListrangenum_hidden_layerslayersrP   upsamplegradient_checkpointingrS   rI   ri   r!   r3      s   

 

zSEWEncoder.__init__NFTc              	   C   s  |rdnd }|r
dnd }|d ur| ddd|jd }| jjdkr5d|| < |d ur2d|v r2|nd }nfd|| < | d}	|	| jj }
|jd | jj }tj	d||
j
ddd|
jd d}||
ddk  }d	|d d d d d d f j|jd
 }|t|jj }||jd d|jd |jd }|jd }|dd}| |}| |}t|d|d}|dd |f |dd |f  }|dd}| |}| |}t pt| }| jD ]7}|r||f }tg }| jo|| jjk }|r|r||||d}|d }|rd}|r||d f }q|r%||f }| |}|jd |k rBtj |ddd||jd  f}|sQt!dd |||fD S t"|||dS )Nr    r   r   flash_attention_2        r   device      ?)dtype.)attention_maskoutput_attentionsNNc                 s   s    | ]	}|d ur|V  qd S rK   r    )rg   vr    r    r!   	<genexpr>   s    z%SEWEncoder.forward.<locals>.<genexpr>last_hidden_staterL   
attentions)#	unsqueezerepeatshaperH   _attn_implementationlongsumr8   torcharanger~   viewexpandtor   finfomin	transposerk   rm   rT   rp   rs   r   r   rw   randtraining	layerdroprx   r   
functionalpadtupler	   )rG   rL   r   r   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsexpand_attention_maskinput_lengthsoutput_lengthsmax_encoder_lengthattention_idsn_input_timestepsposition_embeddingspooled_hidden_states
min_lengthsynced_gpuslayerdropout_probabilityskip_the_layerlayer_outputsr    r    r!   rM      st   

&


 






 zSEWEncoder.forward)NFFTrN   r    r    rI   r!   rd      s    rd   c                   @   sb   e Zd ZU eed< dZdZdZdZdZ	dZ
dd Zdeejef fd	d
ZdedejfddZdS )SEWPreTrainedModelrH   sewinput_valuesTFc              	   C   s  t |tr)tjj|jjddtd|jj	d |jj
   d tj|jjd nt |tjr;|jjjd| jjd n}t |tjtjfrR|jj  |jjd nft |tjrt rddl}t|drt|d	r|jj|j|jgdd
 tj|jj W d   n1 sw   Y  n*|jj|jdd
 tj|jj W d   n1 sw   Y  ntj|jj t |tjtjfr|jdur|jj  dS dS dS )zInitialize the weightsr   r   r   )meanstdr|   r   NrB   rA   r,   )
isinstancer&   r   initnormal_r9   r.   mathsqrtr'   in_channels	constant_biasrQ   datarH   initializer_rangern   	GroupNormzero_fill_r4   r   r<   r;   r=   r>   rB   rA   kaiming_normal_)rG   moduler<   r    r    r!   _init_weights  s8   
 z SEWPreTrainedModel._init_weightsr   c                 C   s4   dd }t | jj| jjD ]
\}}||||}q|S )zH
        Computes the output length of the convolutional layers
        c                 S   s   t j| | |ddd S )Nfloor)rounding_moder   )r   div)input_lengthr'   r*   r    r    r!   _conv_out_length2  s   zMSEWPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length)ziprH   conv_kernelconv_stride)rG   r   r   r'   r*   r    r    r!    _get_feat_extract_output_lengths-  s   z3SEWPreTrainedModel._get_feat_extract_output_lengthsfeature_vector_lengthr   c                 C   s~   |  |dtj}|jd }tj||f|j|jd}d|tj	|jd |jd|d f< |
dgd
dg }|S )Nrz   r   )r   r~   r   r}   )r   r   r   r   r   r   zerosr   r~   r   flipcumsumbool)rG   r   r   r   
batch_sizer    r    r!   "_get_feature_vector_attention_mask<  s   
"z5SEWPreTrainedModel._get_feature_vector_attention_maskN)r   r   r   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr   r   r   
LongTensorintr   r   r    r    r    r!   r     s   
  r   c                       s   e Zd Zdef fddZ		ddejdeej deej fdd	Z	e
					dd
eej deej deej dee dee dee deeef fddZ  ZS )SEWModelrH   c                    s   t  | || _t|| _tj|jd |jd| _	|jd |j
k| _| jr1t|jd |j
| _t|j| _|jdksB|jdkrNtt|j
 | _t|| _|   d S )Nrz   re   r|   )r2   r3   rH   r[   feature_extractorr   rn   conv_dimro   rp   r5   project_featuresrQ   feature_projectionrq   feat_proj_dropoutfeature_dropoutmask_time_probmask_feature_prob	Parameterr   Tensoruniform_masked_spec_embedrd   encoder	post_initrS   rI   r    r!   r3   K  s   

zSEWModel.__init__NrL   mask_time_indicesr   c                 C   s  t | jdds	|S | \}}}|dur| j|j||< n-| jjdkrK| jrKt||f| jj| jj	|| jj
d}tj||jtjd}| j|j||< | jjdkr| jrt||f| jj| jj| jjd}tj||jtjd}|dddf d|d}d||< |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTNr   )	mask_probmask_lengthr   	min_masks)r~   r   )r   r   r   rz   )getattrrH   rT   r   r   r   r   r   r   mask_time_lengthmask_time_min_masksr   tensorr~   r   r   mask_feature_lengthmask_feature_min_masksr   )rG   rL   r   r   r   sequence_lengthr5   mask_feature_indicesr    r    r!   _mask_hidden_states_  s4   zSEWModel._mask_hidden_statesr   r   r   r   returnc           
      C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| |}|dd}| |}| jr6| |}| 	|}|durH| 
|jd |}| j||d}| j|||||d}	|	d }|sh|f|	dd  S t||	j|	jdS )a/  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        Nr   r   )r   )r   r   r   r   r   r   )rH   r   r   use_return_dictr   r   rp   r   r   r   r   r   r   r   r	   rL   r   )
rG   r   r   r   r   r   r   extract_featuresrL   encoder_outputsr    r    r!   rM     s8   



zSEWModel.forwardr   )NNNNN)r   r   r   r   r3   r   FloatTensorr   r   r   r   r   r   r   r   r	   rM   rO   r    r    rI   r!   r   I  s@    
.
r   c                   @   r   )	SEWForCTCNr   r    r    r    r!   r     r"   r   c                   @   r   )SEWForSequenceClassificationNr   r    r    r    r!   r     r"   r   )r   r   r   r   )5__doc__r   r]   typingr   r   r   r   activationsr   integrations.deepspeedr   integrations.fsdpr   modeling_outputsr	   modeling_utilsr
   r:   r   wav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   r   r   r   r   configuration_sewr   _HIDDEN_STATES_START_POSITIONr   r#   r$   Moduler&   rD   rP   r[   r\   ra   rb   rc   rd   r   r   r   r   __all__r    r    r    r!   <module>   sD   4+fEz