o
    wie                  	   @   s  d Z ddlZddlZddlmZ ddlmZmZ ddlZddl	m
  mZ ddlZddlm
Z
 ddlmZmZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlmZmZ ddlm Z  e!e"Z#d0dej$de%de&dej$fddZ'G dd de
j(Z)G dd de
j(Z*G dd de
j(Z+G dd de
j(Z,G dd  d e
j(Z-G d!d" d"e
j(Z.G d#d$ d$e
j(Z/G d%d& d&e
j(Z0eG d'd( d(eZ1eG d)d* d*e1Z2ed+d,G d-d. d.e1Z3g d/Z4dS )1zPyTorch PVT model.    N)Iterable)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputImageClassifierOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )	PvtConfig        Finput	drop_probtrainingreturnc                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r   r   r   )r   )dtypedevice)shapendimtorchrandr   r   floor_div)r   r   r   	keep_probr   random_tensoroutput r$   a/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/pvt/modeling_pvt.py	drop_path*   s   
r&   c                       sT   e Zd ZdZddee ddf fddZdejdejfdd	Z	de
fd
dZ  ZS )PvtDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r   c                    s   t    || _d S N)super__init__r   )selfr   	__class__r$   r%   r*   B   s   

zPvtDropPath.__init__hidden_statesc                 C   s   t || j| jS r(   )r&   r   r   r+   r.   r$   r$   r%   forwardF   s   zPvtDropPath.forwardc                 C   s   d| j  S )Nzp=)r   )r+   r$   r$   r%   
extra_reprI   s   zPvtDropPath.extra_reprr(   )__name__
__module____qualname____doc__r   floatr*   r   Tensorr0   strr1   __classcell__r$   r$   r,   r%   r'   ?   s
    r'   c                       s   e Zd ZdZ	ddedeeee f deeee f dededed	ef fd
dZ	de
jdedede
jfddZde
jdee
jeef fddZ  ZS )PvtPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    Fconfig
image_size
patch_sizestridenum_channelshidden_size	cls_tokenc           	         s   t    || _t|tjjr|n||f}t|tjjr|n||f}|d |d  |d |d   }|| _|| _|| _	|| _
ttd|rI|d n||| _|r[ttdd|nd | _tj||||d| _tj||jd| _tj|jd| _d S )Nr   r   kernel_sizer>   eps)p)r)   r*   r;   
isinstancecollectionsabcr   r<   r=   r?   num_patchesr   	Parameterr   randnposition_embeddingszerosrA   Conv2d
projection	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropout)	r+   r;   r<   r=   r>   r?   r@   rA   rJ   r,   r$   r%   r*   T   s    

 zPvtPatchEmbeddings.__init__
embeddingsheightwidthr   c                 C   s|   || }t j s|| jj| jj kr| jS |d||ddddd}tj	|||fdd}|dd|| ddd}|S )Nr   r   r	      bilinear)sizemode)
r   jit
is_tracingr;   r<   rM   reshapepermuteFinterpolate)r+   rW   rX   rY   rJ   interpolated_embeddingsr$   r$   r%   interpolate_pos_encodingp   s   z+PvtPatchEmbeddings.interpolate_pos_encodingpixel_valuesc                 C   s   |j \}}}}|| jkrtd| |}|j ^ }}}|ddd}| |}| jd urc| j|dd}	t	j
|	|fdd}| | jd d dd f ||}
t	j
| jd d d df |
fdd}
n| | j||}
| ||
 }|||fS )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r[   r   rZ   dim)r   r?   
ValueErrorrP   flatten	transposerS   rA   expandr   catrf   rM   rV   )r+   rg   
batch_sizer?   rX   rY   patch_embed_rW   rA   rM   r$   r$   r%   r0   {   s"   



 &
zPvtPatchEmbeddings.forwardF)r2   r3   r4   r5   r   r   intr   boolr*   r   r7   rf   tupler0   r9   r$   r$   r,   r%   r:   M   s(    (r:   c                       s<   e Zd Zdedef fddZdejdejfddZ  Z	S )	PvtSelfOutputr;   r@   c                    s*   t    t||| _t|j| _d S r(   )r)   r*   r   LineardenserT   rU   rV   )r+   r;   r@   r,   r$   r%   r*      s   
zPvtSelfOutput.__init__r.   r   c                 C   s   |  |}| |}|S r(   )rx   rV   r/   r$   r$   r%   r0      s   

zPvtSelfOutput.forward)
r2   r3   r4   r   rs   r*   r   r7   r0   r9   r$   r$   r,   r%   rv      s    rv   c                       sp   e Zd ZdZdedededef fddZded	ej	fd
dZ
	ddej	dededed	eej	 f
ddZ  ZS )PvtEfficientSelfAttentionzxEfficient self-attention mechanism with reduction of the sequence [PvT paper](https://huggingface.co/papers/2102.12122).r;   r@   num_attention_headssequences_reduction_ratioc                    s   t    || _|| _| j| j dkr td| j d| j dt| j| j | _| j| j | _tj	| j| j|j
d| _tj	| j| j|j
d| _tj	| j| j|j
d| _t|j| _|| _|dkrwtj||||d| _tj||jd| _d S d S )	Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ())biasr   rB   rD   )r)   r*   r@   rz   rj   rs   attention_head_sizeall_head_sizer   rw   qkv_biasquerykeyvaluerT   attention_probs_dropout_probrV   r{   rO   sequence_reductionrQ   rR   rS   r+   r;   r@   rz   r{   r,   r$   r%   r*      s,   

z"PvtEfficientSelfAttention.__init__r.   r   c                 C   s6   |  d d | j| jf }||}|ddddS )NrZ   r   r[   r   r	   )r]   rz   r~   viewrb   )r+   r.   	new_shaper$   r$   r%   transpose_for_scores   s   
z.PvtEfficientSelfAttention.transpose_for_scoresFrX   rY   output_attentionsc                 C   s&  |  | |}| jdkr6|j\}}}|ddd||||}| |}|||dddd}| |}|  | |}	|  | 	|}
t
||	dd}|t| j }tjj|dd}| |}t
||
}|dddd }| d d | jf }||}|r||f}|S |f}|S )Nr   r   r[   rZ   rh   r	   )r   r   r{   r   rb   ra   r   rS   r   r   r   matmulrl   mathsqrtr~   r   
functionalsoftmaxrV   
contiguousr]   r   r   )r+   r.   rX   rY   r   query_layerro   seq_lenr?   	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputsr$   r$   r%   r0      s*   




z!PvtEfficientSelfAttention.forwardrr   )r2   r3   r4   r5   r   rs   r6   r*   r   r7   r   rt   ru   r0   r9   r$   r$   r,   r%   ry      s0    
ry   c                       s`   e Zd Zdedededef fddZdd Z		dd
ej	dedede
deej	 f
ddZ  ZS )PvtAttentionr;   r@   rz   r{   c                    s6   t    t||||d| _t||d| _t | _d S )N)r@   rz   r{   )r@   )r)   r*   ry   r+   rv   r#   setpruned_headsr   r,   r$   r%   r*      s   
zPvtAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   rh   )lenr   r+   rz   r~   r   r   r   r   r   r#   rx   r   union)r+   headsindexr$   r$   r%   prune_heads   s   zPvtAttention.prune_headsFr.   rX   rY   r   r   c                 C   s4   |  ||||}| |d }|f|dd   }|S )Nr   r   )r+   r#   )r+   r.   rX   rY   r   self_outputsattention_outputr   r$   r$   r%   r0     s   zPvtAttention.forwardrr   )r2   r3   r4   r   rs   r6   r*   r   r   r7   rt   ru   r0   r9   r$   r$   r,   r%   r      s.    r   c                
       sR   e Zd Z		ddededee dee f fddZdejd	ejfd
dZ	  Z
S )PvtFFNNr;   in_featureshidden_featuresout_featuresc                    sj   t    |d ur|n|}t||| _t|jtr!t|j | _	n|j| _	t||| _
t|j| _d S r(   )r)   r*   r   rw   dense1rG   
hidden_actr8   r
   intermediate_act_fndense2rT   rU   rV   )r+   r;   r   r   r   r,   r$   r%   r*     s   
zPvtFFN.__init__r.   r   c                 C   s6   |  |}| |}| |}| |}| |}|S r(   )r   r   rV   r   r/   r$   r$   r%   r0   +  s   




zPvtFFN.forward)NN)r2   r3   r4   r   rs   r   r*   r   r7   r0   r9   r$   r$   r,   r%   r     s    r   c                       sT   e Zd Zdedededededef fddZdd
ejdedede	fddZ
  ZS )PvtLayerr;   r@   rz   r&   r{   	mlp_ratioc                    sz   t    tj||jd| _t||||d| _|dkrt|nt	 | _
tj||jd| _t|| }t|||d| _d S )NrD   )r;   r@   rz   r{   r   )r;   r   r   )r)   r*   r   rQ   rR   layer_norm_1r   	attentionr'   Identityr&   layer_norm_2rs   r   mlp)r+   r;   r@   rz   r&   r{   r   mlp_hidden_sizer,   r$   r%   r*   5  s   
	zPvtLayer.__init__Fr.   rX   rY   r   c           
      C   sn   | j | ||||d}|d }|dd  }| |}|| }| | |}| |}|| }	|	f| }|S )N)r.   rX   rY   r   r   r   )r   r   r&   r   r   )
r+   r.   rX   rY   r   self_attention_outputsr   r   
mlp_outputlayer_outputr$   r$   r%   r0   K  s   


zPvtLayer.forwardrr   )r2   r3   r4   r   rs   r6   r*   r   r7   rt   r0   r9   r$   r$   r,   r%   r   4  s    &r   c                       s^   e Zd Zdef fddZ			ddejdee dee d	ee d
e	e
ef f
ddZ  ZS )
PvtEncoderr;   c           	         sx  t    || _tjd|jt|jdd }g }t	|j
D ]9}|t||dkr+|jn	| jjd|d   |j| |j| |dkrC|jn|j|d  |j| ||j
d kd qt|| _g }d}t	|j
D ]@}g }|dkrx||j|d  7 }t	|j| D ]}|t||j| |j| |||  |j| |j| d q|t| qgt|| _tj|jd |jd	| _d S )
Nr   cpu)r   r[   r   )r;   r<   r=   r>   r?   r@   rA   )r;   r@   rz   r&   r{   r   rZ   rD   )r)   r*   r;   r   linspacedrop_path_ratesumdepthstolistrangenum_encoder_blocksappendr:   r<   patch_sizesstridesr?   hidden_sizesr   
ModuleListpatch_embeddingsr   rz   sequence_reduction_ratios
mlp_ratiosblockrQ   rR   rS   )	r+   r;   drop_path_decaysrW   iblockscurlayersjr,   r$   r%   r*   c  sJ   
 

zPvtEncoder.__init__FTrg   r   output_hidden_statesreturn_dictr   c                 C   s  |rdnd }|r
dnd }|j d }t| j}|}	tt| j| jD ]C\}
\}}||	\}	}}|D ]}||	|||}|d }	|rF||d f }|rM||	f }q0|
|d krd|	|||ddddd }	q!| 	|	}	|rq||	f }|st
dd |	||fD S t|	||d	S )
Nr$   r   r   rZ   r	   r[   c                 s   s    | ]	}|d ur|V  qd S r(   r$   ).0vr$   r$   r%   	<genexpr>  s    z%PvtEncoder.forward.<locals>.<genexpr>last_hidden_stater.   
attentions)r   r   r   	enumeratezipr   ra   rb   r   rS   ru   r   )r+   rg   r   r   r   all_hidden_statesall_self_attentionsro   
num_blocksr.   idxembedding_layerblock_layerrX   rY   r   layer_outputsr$   r$   r%   r0     s8   


 

zPvtEncoder.forward)FFT)r2   r3   r4   r   r*   r   FloatTensorr   rt   r   ru   r   r0   r9   r$   r$   r,   r%   r   b  s     5
r   c                   @   s>   e Zd ZeZdZdZg Zdee	j
e	je	jf ddfddZdS )PvtPreTrainedModelpvtrg   moduler   Nc                 C   s   t |tjr$tjj|jjd| jjd|j_|j	dur"|j	j
  dS dS t |tjr9|j	j
  |jjd dS t |trctjj|jjd| jjd|j_|jduretjj|jjd| jjd|j_dS dS dS )zInitialize the weightsr   )meanstdNg      ?)rG   r   rw   inittrunc_normal_weightdatar;   initializer_ranger}   zero_rQ   fill_r:   rM   rA   )r+   r   r$   r$   r%   _init_weights  s,   



z PvtPreTrainedModel._init_weights)r2   r3   r4   r   config_classbase_model_prefixmain_input_name_no_split_modulesr   r   rw   rO   rQ   r   r$   r$   r$   r%   r     s    &r   c                       sj   e Zd Zdef fddZdd Ze			ddejde	e
 d	e	e
 d
e	e
 deeef f
ddZ  ZS )PvtModelr;   c                    s(   t  | || _t|| _|   d S r(   )r)   r*   r;   r   encoder	post_initr+   r;   r,   r$   r%   r*     s   
zPvtModel.__init__c                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   layerr   r   )r+   heads_to_pruner   r   r$   r$   r%   _prune_heads  s   zPvtModel._prune_headsNrg   r   r   r   r   c                 C   s~   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}| j||||d}|d }|s6|f|dd   S t||j|jdS )Nrg   r   r   r   r   r   r   )r;   r   r   use_return_dictr   r   r.   r   )r+   rg   r   r   r   encoder_outputssequence_outputr$   r$   r%   r0     s$   zPvtModel.forward)NNN)r2   r3   r4   r   r*   r   r   r   r   r   rt   r   ru   r   r0   r9   r$   r$   r,   r%   r     s$    

r   z
    Pvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    )custom_introc                       sv   e Zd Zdeddf fddZe				ddeej deej dee	 d	ee	 d
ee	 de
eef fddZ  ZS )PvtForImageClassificationr;   r   Nc                    sR   t  | |j| _t|| _|jdkrt|jd |jnt | _	| 
  d S )Nr   rZ   )r)   r*   
num_labelsr   r   r   rw   r   r   
classifierr   r   r,   r$   r%   r*     s   
$z"PvtForImageClassification.__init__rg   labelsr   r   r   c                 C   sp  |dur|n| j j}| j||||d}|d }| |dddddf }d}	|dur| j jdu rU| jdkr;d| j _n| jdkrQ|jtjksL|jtj	krQd| j _nd| j _| j jdkrst
 }
| jdkrm|
| | }	n+|
||}	n%| j jdkrt }
|
|d| j|d}	n| j jdkrt }
|
||}	|s|f|dd  }|	dur|	f| S |S t|	||j|jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   r   
regressionsingle_label_classificationmulti_label_classificationrZ   )losslogitsr.   r   )r;   r   r   r  problem_typer  r   r   longrs   r   squeezer   r   r   r   r.   r   )r+   rg   r  r   r   r   r   r   r  r  loss_fctr#   r$   r$   r%   r0   $  sJ   

"


z!PvtForImageClassification.forward)NNNN)r2   r3   r4   r   r*   r   r   r   r7   rt   r   ru   r   r0   r9   r$   r$   r,   r%   r     s(    
r   )r   r   r   )r   F)5r5   rH   r   collections.abcr   typingr   r   r   torch.nn.functionalr   r   rc   torch.utils.checkpointtorch.nnr   r   r   activationsr
   modeling_outputsr   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   configuration_pvtr   
get_loggerr2   loggerr7   r6   rt   r&   Moduler'   r:   rv   ry   r   r   r   r   r   r   r   __all__r$   r$   r$   r%   <module>   sF   
 DR*.Y3N