o
    	۷in`                  	   @   s  d Z ddlZddlZddlmZ ddlmZmZ ddlZddl	m
  mZ ddlm
Z
 ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZ eeZd/dejde de!dejfddZ"G dd de
j#Z$G dd de
j#Z%G dd de
j#Z&G dd de
j#Z'G dd de
j#Z(G d d! d!e
j#Z)G d"d# d#e
j#Z*G d$d% d%e
j#Z+eG d&d' d'eZ,eG d(d) d)e,Z-ed*d+G d,d- d-e,Z.g d.Z/dS )0zPyTorch PVT model.    N)Iterable)OptionalUnion)nn   )ACT2FN)BaseModelOutputImageClassifierOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )	PvtConfig        Finput	drop_probtrainingreturnc                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r   r   r   )r   )dtypedevice)shapendimtorchrandr   r   floor_div)r   r   r   	keep_probr   random_tensoroutput r!   Z/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/pvt/modeling_pvt.py	drop_path(   s   
r#   c                       sT   e Zd ZdZddee ddf fddZdejdejfdd	Z	de
fd
dZ  ZS )PvtDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r   c                    s   t    || _d S N)super__init__r   )selfr   	__class__r!   r"   r'   @   s   

zPvtDropPath.__init__hidden_statesc                 C   s   t || j| jS r%   )r#   r   r   r(   r+   r!   r!   r"   forwardD   s   zPvtDropPath.forwardc                 C   s   d| j  S )Nzp=)r   )r(   r!   r!   r"   
extra_reprG   s   zPvtDropPath.extra_reprr%   )__name__
__module____qualname____doc__r   floatr'   r   Tensorr-   strr.   __classcell__r!   r!   r)   r"   r$   =   s
    r$   c                       s   e Zd ZdZ	ddedeeee f deeee f dededed	ef fd
dZ	de
jdedede
jfddZde
jdee
jeef fddZ  ZS )PvtPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    Fconfig
image_size
patch_sizestridenum_channelshidden_size	cls_tokenc           	         s   t    || _t|tjjr|n||f}t|tjjr|n||f}|d |d  |d |d   }|| _|| _|| _	|| _
ttd|rI|d n||| _|r[ttdd|nd | _tj||||d| _tj||jd| _tj|jd| _d S )Nr   r   kernel_sizer;   eps)p)r&   r'   r8   
isinstancecollectionsabcr   r9   r:   r<   num_patchesr   	Parameterr   randnposition_embeddingszerosr>   Conv2d
projection	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropout)	r(   r8   r9   r:   r;   r<   r=   r>   rG   r)   r!   r"   r'   R   s    

 zPvtPatchEmbeddings.__init__
embeddingsheightwidthr   c                 C   s|   || }t j s|| jj| jj kr| jS |d||ddddd}tj	|||fdd}|dd|| ddd}|S )Nr   r   r      bilinear)sizemode)
r   jit
is_tracingr8   r9   rJ   reshapepermuteFinterpolate)r(   rT   rU   rV   rG   interpolated_embeddingsr!   r!   r"   interpolate_pos_encodingn   s   z+PvtPatchEmbeddings.interpolate_pos_encodingpixel_valuesc                 C   s   |j \}}}}|| jkrtd| |}|j ^ }}}|ddd}| |}| jd urc| j|dd}	t	j
|	|fdd}| | jd d dd f ||}
t	j
| jd d d df |
fdd}
n| | j||}
| ||
 }|||fS )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.rX   r   rW   dim)r   r<   
ValueErrorrM   flatten	transposerP   r>   expandr   catrc   rJ   rS   )r(   rd   
batch_sizer<   rU   rV   patch_embed_rT   r>   rJ   r!   r!   r"   r-   y   s"   



 &
zPvtPatchEmbeddings.forwardF)r/   r0   r1   r2   r   r   intr   boolr'   r   r4   rc   tupler-   r6   r!   r!   r)   r"   r7   K   s(    (r7   c                       s<   e Zd Zdedef fddZdejdejfddZ  Z	S )	PvtSelfOutputr8   r=   c                    s*   t    t||| _t|j| _d S r%   )r&   r'   r   LineardenserQ   rR   rS   )r(   r8   r=   r)   r!   r"   r'      s   
zPvtSelfOutput.__init__r+   r   c                 C   s   |  |}| |}|S r%   )ru   rS   r,   r!   r!   r"   r-      s   

zPvtSelfOutput.forward)
r/   r0   r1   r   rp   r'   r   r4   r-   r6   r!   r!   r)   r"   rs      s    rs   c                       sp   e Zd ZdZdedededef fddZded	ej	fd
dZ
	ddej	dededed	eej	 f
ddZ  ZS )PvtEfficientSelfAttentionzxEfficient self-attention mechanism with reduction of the sequence [PvT paper](https://huggingface.co/papers/2102.12122).r8   r=   num_attention_headssequences_reduction_ratioc                    s   t    || _|| _| j| j dkr td| j d| j dt| j| j | _| j| j | _tj	| j| j|j
d| _tj	| j| j|j
d| _tj	| j| j|j
d| _t|j| _|| _|dkrwtj||||d| _tj||jd| _d S d S )	Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ())biasr   r?   rA   )r&   r'   r=   rw   rg   rp   attention_head_sizeall_head_sizer   rt   qkv_biasquerykeyvaluerQ   attention_probs_dropout_probrS   rx   rL   sequence_reductionrN   rO   rP   r(   r8   r=   rw   rx   r)   r!   r"   r'      s,   

z"PvtEfficientSelfAttention.__init__r+   r   c                 C   s6   |  d d | j| jf }||}|ddddS )NrW   r   rX   r   r   )rZ   rw   r{   viewr_   )r(   r+   	new_shaper!   r!   r"   transpose_for_scores   s   
z.PvtEfficientSelfAttention.transpose_for_scoresFrU   rV   output_attentionsc                 C   s&  |  | |}| jdkr6|j\}}}|ddd||||}| |}|||dddd}| |}|  | |}	|  | 	|}
t
||	dd}|t| j }tjj|dd}| |}t
||
}|dddd }| d d | jf }||}|r||f}|S |f}|S )Nr   r   rX   rW   re   r   )r   r~   rx   r   r_   r^   r   rP   r   r   r   matmulri   mathsqrtr{   r   
functionalsoftmaxrS   
contiguousrZ   r|   r   )r(   r+   rU   rV   r   query_layerrl   seq_lenr<   	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputsr!   r!   r"   r-      s*   




z!PvtEfficientSelfAttention.forwardro   )r/   r0   r1   r2   r   rp   r3   r'   r   r4   r   rq   rr   r-   r6   r!   r!   r)   r"   rv      s0    
rv   c                       s`   e Zd Zdedededef fddZdd Z		dd
ej	dedede
deej	 f
ddZ  ZS )PvtAttentionr8   r=   rw   rx   c                    s6   t    t||||d| _t||d| _t | _d S )N)r=   rw   rx   )r=   )r&   r'   rv   r(   rs   r    setpruned_headsr   r)   r!   r"   r'      s   
zPvtAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   re   )lenr   r(   rw   r{   r   r   r~   r   r   r    ru   r|   union)r(   headsindexr!   r!   r"   prune_heads   s   zPvtAttention.prune_headsFr+   rU   rV   r   r   c                 C   s4   |  ||||}| |d }|f|dd   }|S )Nr   r   )r(   r    )r(   r+   rU   rV   r   self_outputsattention_outputr   r!   r!   r"   r-     s   zPvtAttention.forwardro   )r/   r0   r1   r   rp   r3   r'   r   r   r4   rq   rr   r-   r6   r!   r!   r)   r"   r      s.    r   c                
       sR   e Zd Z		ddededee dee f fddZdejd	ejfd
dZ	  Z
S )PvtFFNNr8   in_featureshidden_featuresout_featuresc                    sj   t    |d ur|n|}t||| _t|jtr!t|j | _	n|j| _	t||| _
t|j| _d S r%   )r&   r'   r   rt   dense1rD   
hidden_actr5   r   intermediate_act_fndense2rQ   rR   rS   )r(   r8   r   r   r   r)   r!   r"   r'     s   
zPvtFFN.__init__r+   r   c                 C   s6   |  |}| |}| |}| |}| |}|S r%   )r   r   rS   r   r,   r!   r!   r"   r-   )  s   




zPvtFFN.forward)NN)r/   r0   r1   r   rp   r   r'   r   r4   r-   r6   r!   r!   r)   r"   r     s    r   c                       sT   e Zd Zdedededededef fddZdd
ejdedede	fddZ
  ZS )PvtLayerr8   r=   rw   r#   rx   	mlp_ratioc                    sz   t    tj||jd| _t||||d| _|dkrt|nt	 | _
tj||jd| _t|| }t|||d| _d S )NrA   )r8   r=   rw   rx   r   )r8   r   r   )r&   r'   r   rN   rO   layer_norm_1r   	attentionr$   Identityr#   layer_norm_2rp   r   mlp)r(   r8   r=   rw   r#   rx   r   mlp_hidden_sizer)   r!   r"   r'   3  s   
	zPvtLayer.__init__Fr+   rU   rV   r   c           
      C   sn   | j | ||||d}|d }|dd  }| |}|| }| | |}| |}|| }	|	f| }|S )N)r+   rU   rV   r   r   r   )r   r   r#   r   r   )
r(   r+   rU   rV   r   self_attention_outputsr   r   
mlp_outputlayer_outputr!   r!   r"   r-   I  s   


zPvtLayer.forwardro   )r/   r0   r1   r   rp   r3   r'   r   r4   rq   r-   r6   r!   r!   r)   r"   r   2  s    &r   c                       s^   e Zd Zdef fddZ			ddejdee dee d	ee d
e	e
ef f
ddZ  ZS )
PvtEncoderr8   c           	         sx  t    || _tjd|jt|jdd }g }t	|j
D ]9}|t||dkr+|jn	| jjd|d   |j| |j| |dkrC|jn|j|d  |j| ||j
d kd qt|| _g }d}t	|j
D ]@}g }|dkrx||j|d  7 }t	|j| D ]}|t||j| |j| |||  |j| |j| d q|t| qgt|| _tj|jd |jd	| _d S )
Nr   cpu)r   rX   r   )r8   r9   r:   r;   r<   r=   r>   )r8   r=   rw   r#   rx   r   rW   rA   )r&   r'   r8   r   linspacedrop_path_ratesumdepthstolistrangenum_encoder_blocksappendr7   r9   patch_sizesstridesr<   hidden_sizesr   
ModuleListpatch_embeddingsr   rw   sequence_reduction_ratios
mlp_ratiosblockrN   rO   rP   )	r(   r8   drop_path_decaysrT   iblockscurlayersjr)   r!   r"   r'   a  sJ   
 

zPvtEncoder.__init__FTrd   r   output_hidden_statesreturn_dictr   c                 C   s  |rdnd }|r
dnd }|j d }t| j}|}	tt| j| jD ]C\}
\}}||	\}	}}|D ]}||	|||}|d }	|rF||d f }|rM||	f }q0|
|d krd|	|||ddddd }	q!| 	|	}	|rq||	f }|st
dd |	||fD S t|	||d	S )
Nr!   r   r   rW   r   rX   c                 s   s    | ]	}|d ur|V  qd S r%   r!   ).0vr!   r!   r"   	<genexpr>  s    z%PvtEncoder.forward.<locals>.<genexpr>last_hidden_stater+   
attentions)r   r   r   	enumeratezipr   r^   r_   r   rP   rr   r   )r(   rd   r   r   r   all_hidden_statesall_self_attentionsrl   
num_blocksr+   idxembedding_layerblock_layerrU   rV   r   layer_outputsr!   r!   r"   r-     s8   


 

zPvtEncoder.forward)FFT)r/   r0   r1   r   r'   r   FloatTensorr   rq   r   rr   r   r-   r6   r!   r!   r)   r"   r   `  s     5
r   c                   @   s6   e Zd ZU eed< dZdZg Zdej	ddfddZ
dS )	PvtPreTrainedModelr8   pvtrd   moduler   Nc                 C   s   | j j}t|tjtjfr'tjj|jj	d|d |j
dur%|j
j	  dS dS t|tjr<|j
j	  |jj	d dS t|trbtjj|jj	d|d|j_	|jdurdtjj|jj	d|d|j_	dS dS dS )zInitialize the weightsr   )meanstdNg      ?)r8   initializer_rangerD   r   rt   rL   inittrunc_normal_weightdatarz   zero_rN   fill_r7   rJ   r>   )r(   r   r   r!   r!   r"   _init_weights  s.   



z PvtPreTrainedModel._init_weights)r/   r0   r1   r   __annotations__base_model_prefixmain_input_name_no_split_modulesr   Moduler   r!   r!   r!   r"   r     s   
 r   c                       sj   e Zd Zdef fddZdd Ze			ddejde	e
 d	e	e
 d
e	e
 deeef f
ddZ  ZS )PvtModelr8   c                    s(   t  | || _t|| _|   d S r%   )r&   r'   r8   r   encoder	post_initr(   r8   r)   r!   r"   r'     s   
zPvtModel.__init__c                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   layerr   r   )r(   heads_to_pruner   r   r!   r!   r"   _prune_heads  s   zPvtModel._prune_headsNrd   r   r   r   r   c                 C   s~   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}| j||||d}|d }|s6|f|dd   S t||j|jdS )Nrd   r   r   r   r   r   r   )r8   r   r   use_return_dictr   r   r+   r   )r(   rd   r   r   r   encoder_outputssequence_outputr!   r!   r"   r-     s$   zPvtModel.forward)NNN)r/   r0   r1   r   r'   r   r   r   r   r   rq   r   rr   r   r-   r6   r!   r!   r)   r"   r     s$    

r   z
    Pvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    )custom_introc                       sv   e Zd Zdeddf fddZe				ddeej deej dee	 d	ee	 d
ee	 de
eef fddZ  ZS )PvtForImageClassificationr8   r   Nc                    sR   t  | |j| _t|| _|jdkrt|jd |jnt | _	| 
  d S )Nr   rW   )r&   r'   
num_labelsr   r   r   rt   r   r   
classifierr   r   r)   r!   r"   r'     s   
$z"PvtForImageClassification.__init__rd   labelsr   r   r   c                 C   s   |dur|n| j j}| j||||d}|d }| |dddddf }d}	|dur3| ||| j }	|sI|f|dd  }
|	durG|	f|
 S |
S t|	||j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   r   )losslogitsr+   r   )r8   r   r   r   loss_functionr	   r+   r   )r(   rd   r  r   r   r   r   r   r  r  r    r!   r!   r"   r-   #  s*   z!PvtForImageClassification.forward)NNNN)r/   r0   r1   r   r'   r   r   r   r4   rq   r   rr   r	   r-   r6   r!   r!   r)   r"   r     s(    
r   )r   r   r   )r   F)0r2   rE   r   collections.abcr   typingr   r   r   torch.nn.functionalr   r   r`   activationsr   modeling_outputsr   r	   modeling_utilsr
   pytorch_utilsr   r   utilsr   r   configuration_pvtr   
get_loggerr/   loggerr4   r3   rq   r#   r   r$   r7   rs   rv   r   r   r   r   r   r   r   __all__r!   r!   r!   r"   <module>   sB   
 DR*.Y 3;