o
    eiW                  	   @   s  d Z ddlZddlZddlmZ ddlZddlm  mZ	 ddlmZ ddl
mZ ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZ eeZd.dejdededejfddZG dd dej Z!G dd dej Z"G dd dej Z#G dd dej Z$G dd dej Z%G dd  d ej Z&G d!d" d"ej Z'G d#d$ d$ej Z(eG d%d& d&eZ)eG d'd( d(e)Z*ed)d*G d+d, d,e)Z+g d-Z,dS )/zPyTorch PVT model.    N)Iterable)nn   )initialization)ACT2FN)BaseModelOutputImageClassifierOutput)PreTrainedModel)auto_docstringlogging   )	PvtConfig        Finput	drop_probtrainingreturnc                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    r   r   r   )r   )dtypedevice)shapendimtorchrandr   r   floor_div)r   r   r   	keep_probr   random_tensoroutput r   b/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/pvt/modeling_pvt.py	drop_path&   s   r    c                       sT   e Zd ZdZddedB ddf fddZdejdejfdd	Zde	fd
dZ
  ZS )PvtDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r   c                    s   t    || _d S N)super__init__r   )selfr   	__class__r   r   r$   9   s   

zPvtDropPath.__init__hidden_statesc                 C   s   t || j| jS r"   )r    r   r   r%   r(   r   r   r   forward=   s   zPvtDropPath.forwardc                 C   s   d| j  S )Nzp=)r   )r%   r   r   r   
extra_repr@   s   zPvtDropPath.extra_reprr"   )__name__
__module____qualname____doc__floatr$   r   Tensorr*   strr+   __classcell__r   r   r&   r   r!   6   s
    r!   c                       s   e Zd ZdZ	ddedeee B deee B dededed	ef fd
dZde	j
dedede	j
fddZde	j
dee	j
eef fddZ  ZS )PvtPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    Fconfig
image_size
patch_sizestridenum_channelshidden_size	cls_tokenc           	         s   t    || _t|tjjr|n||f}t|tjjr|n||f}|d |d  |d |d   }|| _|| _|| _	|| _
ttd|rI|d n||| _|r[ttdd|nd | _tj||||d| _tj||jd| _tj|jd| _d S )Nr   r   kernel_sizer8   eps)p)r#   r$   r5   
isinstancecollectionsabcr   r6   r7   r9   num_patchesr   	Parameterr   randnposition_embeddingszerosr;   Conv2d
projection	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropout)	r%   r5   r6   r7   r8   r9   r:   r;   rD   r&   r   r   r$   K   s    

 zPvtPatchEmbeddings.__init__
embeddingsheightwidthr   c                 C   s|   || }t j s|| jj| jj kr| jS |d||ddddd}tj	|||fdd}|dd|| ddd}|S )Nr   r   r      bilinear)sizemode)
r   jit
is_tracingr5   r6   rG   reshapepermuteFinterpolate)r%   rQ   rR   rS   rD   interpolated_embeddingsr   r   r   interpolate_pos_encodingg   s   z+PvtPatchEmbeddings.interpolate_pos_encodingpixel_valuesc                 C   s   |j \}}}}|| jkrtd| |}|j ^ }}}|ddd}| |}| jd urc| j|dd}	t	j
|	|fdd}| | jd d dd f ||}
t	j
| jd d d df |
fdd}
n| | j||}
| ||
 }|||fS )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.rU   r   rT   dim)r   r9   
ValueErrorrJ   flatten	transposerM   r;   expandr   catr`   rG   rP   )r%   ra   
batch_sizer9   rR   rS   patch_embed_rQ   r;   rG   r   r   r   r*   r   s"   



 &
zPvtPatchEmbeddings.forwardF)r,   r-   r.   r/   r   intr   boolr$   r   r1   r`   tupler*   r3   r   r   r&   r   r4   D   s(    

(r4   c                       s<   e Zd Zdedef fddZdejdejfddZ  Z	S )	PvtSelfOutputr5   r:   c                    s*   t    t||| _t|j| _d S r"   )r#   r$   r   LineardenserN   rO   rP   )r%   r5   r:   r&   r   r   r$      s   
zPvtSelfOutput.__init__r(   r   c                 C   s   |  |}| |}|S r"   )rr   rP   r)   r   r   r   r*      s   

zPvtSelfOutput.forward
r,   r-   r.   r   rm   r$   r   r1   r*   r3   r   r   r&   r   rp      s    rp   c                       sp   e Zd ZdZdedededef fddZded	ej	fd
dZ
	ddej	dededed	eej	 f
ddZ  ZS )PvtEfficientSelfAttentionzxEfficient self-attention mechanism with reduction of the sequence [PvT paper](https://huggingface.co/papers/2102.12122).r5   r:   num_attention_headssequences_reduction_ratioc                    s   t    || _|| _| j| j dkr td| j d| j dt| j| j | _| j| j | _tj	| j| j|j
d| _tj	| j| j|j
d| _tj	| j| j|j
d| _t|j| _|| _|dkrwtj||||d| _tj||jd| _d S d S )	Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ())biasr   r<   r>   )r#   r$   r:   ru   rd   rm   attention_head_sizeall_head_sizer   rq   qkv_biasquerykeyvaluerN   attention_probs_dropout_probrP   rv   rI   sequence_reductionrK   rL   rM   r%   r5   r:   ru   rv   r&   r   r   r$      s,   

z"PvtEfficientSelfAttention.__init__r(   r   c                 C   s6   |  d d | j| jf }||}|ddddS )NrT   r   rU   r   r   )rW   ru   ry   viewr\   )r%   r(   	new_shaper   r   r   transpose_for_scores   s   
z.PvtEfficientSelfAttention.transpose_for_scoresFrR   rS   output_attentionsc                 C   s&  |  | |}| jdkr6|j\}}}|ddd||||}| |}|||dddd}| |}|  | |}	|  | 	|}
t
||	dd}|t| j }tjj|dd}| |}t
||
}|dddd }| d d | jf }||}|r||f}|S |f}|S )Nr   r   rU   rT   rb   r   )r   r|   rv   r   r\   r[   r   rM   r}   r~   r   matmulrf   mathsqrtry   r   
functionalsoftmaxrP   
contiguousrW   rz   r   )r%   r(   rR   rS   r   query_layerri   seq_lenr9   	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputsr   r   r   r*      s*   




z!PvtEfficientSelfAttention.forwardrl   )r,   r-   r.   r/   r   rm   r0   r$   r   r1   r   rn   ro   r*   r3   r   r   r&   r   rt      s0    
rt   c                       sX   e Zd Zdedededef fddZ	ddejd	ed
ede	de
ej f
ddZ  ZS )PvtAttentionr5   r:   ru   rv   c                    s.   t    t||||d| _t||d| _d S )N)r:   ru   rv   )r:   )r#   r$   rt   r%   rp   r   r   r&   r   r   r$      s   
zPvtAttention.__init__Fr(   rR   rS   r   r   c                 C   s4   |  ||||}| |d }|f|dd   }|S )Nr   r   )r%   r   )r%   r(   rR   rS   r   self_outputsattention_outputr   r   r   r   r*      s   zPvtAttention.forwardrl   )r,   r-   r.   r   rm   r0   r$   r   r1   rn   ro   r*   r3   r   r   r&   r   r      s,    r   c                
       sR   e Zd Z		ddedededB dedB f fddZdejd	ejfd
dZ  Z	S )PvtFFNNr5   in_featureshidden_featuresout_featuresc                    sj   t    |d ur|n|}t||| _t|jtr!t|j | _	n|j| _	t||| _
t|j| _d S r"   )r#   r$   r   rq   dense1rA   
hidden_actr2   r   intermediate_act_fndense2rN   rO   rP   )r%   r5   r   r   r   r&   r   r   r$      s   
zPvtFFN.__init__r(   r   c                 C   s6   |  |}| |}| |}| |}| |}|S r"   )r   r   rP   r   r)   r   r   r   r*     s   




zPvtFFN.forward)NNrs   r   r   r&   r   r      s    r   c                       sT   e Zd Zdedededededef fddZdd
ejdedede	fddZ
  ZS )PvtLayerr5   r:   ru   r    rv   	mlp_ratioc                    sz   t    tj||jd| _t||||d| _|dkrt|nt	 | _
tj||jd| _t|| }t|||d| _d S )Nr>   )r5   r:   ru   rv   r   )r5   r   r   )r#   r$   r   rK   rL   layer_norm_1r   	attentionr!   Identityr    layer_norm_2rm   r   mlp)r%   r5   r:   ru   r    rv   r   mlp_hidden_sizer&   r   r   r$     s   
	zPvtLayer.__init__Fr(   rR   rS   r   c           
      C   sn   | j | ||||d}|d }|dd  }| |}|| }| | |}| |}|| }	|	f| }|S )N)r(   rR   rS   r   r   r   )r   r   r    r   r   )
r%   r(   rR   rS   r   self_attention_outputsr   r   
mlp_outputlayer_outputr   r   r   r*   /  s   


zPvtLayer.forwardrl   )r,   r-   r.   r   rm   r0   r$   r   r1   rn   r*   r3   r   r   r&   r   r     s    &r   c                       sZ   e Zd Zdef fddZ			ddejdedB d	edB d
edB dee	B f
ddZ
  ZS )
PvtEncoderr5   c           	         sx  t    || _tjd|jt|jdd }g }t	|j
D ]9}|t||dkr+|jn	| jjd|d   |j| |j| |dkrC|jn|j|d  |j| ||j
d kd qt|| _g }d}t	|j
D ]@}g }|dkrx||j|d  7 }t	|j| D ]}|t||j| |j| |||  |j| |j| d q|t| qgt|| _tj|jd |jd	| _d S )
Nr   cpu)r   rU   r   )r5   r6   r7   r8   r9   r:   r;   )r5   r:   ru   r    rv   r   rT   r>   )r#   r$   r5   r   linspacedrop_path_ratesumdepthstolistrangenum_encoder_blocksappendr4   r6   patch_sizesstridesr9   hidden_sizesr   
ModuleListpatch_embeddingsr   ru   sequence_reduction_ratios
mlp_ratiosblockrK   rL   rM   )	r%   r5   drop_path_decaysrQ   iblockscurlayersjr&   r   r   r$   G  sJ   
 

zPvtEncoder.__init__FTra   r   Noutput_hidden_statesreturn_dictr   c                 C   s  |rdnd }|r
dnd }|j d }t| j}|}	tt| j| jD ]C\}
\}}||	\}	}}|D ]}||	|||}|d }	|rF||d f }|rM||	f }q0|
|d krd|	|||ddddd }	q!| 	|	}	|rq||	f }|st
dd |	||fD S t|	||d	S )
Nr   r   r   rT   r   rU   c                 s   s    | ]	}|d ur|V  qd S r"   r   ).0vr   r   r   	<genexpr>  s    z%PvtEncoder.forward.<locals>.<genexpr>last_hidden_stater(   
attentions)r   lenr   	enumeratezipr   r[   r\   r   rM   ro   r   )r%   ra   r   r   r   all_hidden_statesall_self_attentionsri   
num_blocksr(   idxembedding_layerblock_layerrR   rS   r   layer_outputsr   r   r   r*   y  s8   


 

zPvtEncoder.forward)FFT)r,   r-   r.   r   r$   r   FloatTensorrn   ro   r   r*   r3   r   r   r&   r   r   F  s     5r   c                   @   sB   e Zd ZU eed< dZdZdZg Ze	
 dejddfdd	ZdS )
PvtPreTrainedModelr5   pvtra   )imagemoduler   Nc                 C   s   | j j}t|tjtjfr%tj|jd|d |j	dur#t
|j	 dS dS t|tjr9t
|j	 t|j dS t|trWtj|jd|d |jdurYtj|jd|d dS dS dS )zInitialize the weightsr   )meanstdN)r5   initializer_rangerA   r   rq   rI   inittrunc_normal_weightrx   zeros_rK   ones_r4   rG   r;   )r%   r   r   r   r   r   _init_weights  s   


z PvtPreTrainedModel._init_weights)r,   r-   r.   r   __annotations__base_model_prefixmain_input_nameinput_modalities_no_split_modulesr   no_gradr   Moduler   r   r   r   r   r     s   
 r   c                       s^   e Zd Zdef fddZe			ddejdedB dedB dedB d	e	e
B f
d
dZ  ZS )PvtModelr5   c                    s(   t  | || _t|| _|   d S r"   )r#   r$   r5   r   encoder	post_initr%   r5   r&   r   r   r$     s   
zPvtModel.__init__Nra   r   r   r   r   c                 K   s~   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}| j||||d}|d }|s6|f|dd   S t||j|jdS )Nra   r   r   r   r   r   r   )r5   r   r   use_return_dictr   r   r(   r   )r%   ra   r   r   r   kwargsencoder_outputssequence_outputr   r   r   r*     s$   	zPvtModel.forward)NNN)r,   r-   r.   r   r$   r
   r   r   rn   ro   r   r*   r3   r   r   r&   r   r     s"    
r   z
    Pvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    )custom_introc                       sr   e Zd Zdeddf fddZe				ddejdB dejdB dedB d	edB d
edB de	e
B fddZ  ZS )PvtForImageClassificationr5   r   Nc                    sR   t  | |j| _t|| _|jdkrt|jd |jnt | _	| 
  d S )Nr   rT   )r#   r$   
num_labelsr   r   r   rq   r   r   
classifierr   r   r&   r   r   r$     s   
$z"PvtForImageClassification.__init__ra   labelsr   r   r   c                 K   s   |dur|n| j j}| j||||d}|d }| |dddddf }	d}
|dur3| ||	| j }
|sI|	f|dd  }|
durG|
f| S |S t|
|	|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   r   )losslogitsr(   r   )r5   r   r   r   loss_functionr   r(   r   )r%   ra   r   r   r   r   r   r   r   r   r   r   r   r   r   r*     s*   z!PvtForImageClassification.forward)NNNN)r,   r-   r.   r   r$   r
   r   r1   rn   ro   r   r*   r3   r   r   r&   r   r     s(    r   )r   r   r   )r   F)-r/   rB   r   collections.abcr   r   torch.nn.functionalr   r   r]    r   r   activationsr   modeling_outputsr   r   modeling_utilsr	   utilsr
   r   configuration_pvtr   
get_loggerr,   loggerr1   r0   rn   r    r   r!   r4   rp   rt   r   r   r   r   r   r   r   __all__r   r   r   r   <module>   s@   
 DR.Y,<