o
    ie                  	   @   s*  d Z ddlZddlmZ ddlmZmZ ddlZddlm	Z	 ddl
mZmZmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZ eeZeeddG dd deZd>dejde de!dejfddZ"G dd de	j#Z$G dd de	j#Z%G dd de	j#Z&G d d! d!e	j#Z'G d"d# d#e	j#Z(G d$d% d%e	j#Z)G d&d' d'e	j#Z*G d(d) d)e	j#Z+G d*d+ d+e	j#Z,G d,d- d-e	j#Z-G d.d/ d/e	j#Z.G d0d1 d1e	j#Z/G d2d3 d3e	j#Z0G d4d5 d5e	j#Z1eG d6d7 d7eZ2eG d8d9 d9e2Z3ed:dG d;d< d<e2Z4g d=Z5dS )?zPyTorch CvT model.    N)	dataclass)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )$ImageClassifierOutputWithNoAttentionModelOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )	CvtConfigzV
    Base class for model's outputs, with potential hidden states and attentions.
    )custom_introc                   @   sP   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeejdf  ed< dS )BaseModelOutputWithCLSTokenz
    cls_token_value (`torch.FloatTensor` of shape `(batch_size, 1, hidden_size)`):
        Classification token at the output of the last layer of the model.
    Nlast_hidden_statecls_token_value.hidden_states)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   tuple r    r    a/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/cvt/modeling_cvt.pyr   #   s
   
 r           Finput	drop_probtrainingreturnc                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r"   r   r   )r   )dtypedevice)shapendimr   randr'   r(   floor_div)r#   r$   r%   	keep_probr)   random_tensoroutputr    r    r!   	drop_path5   s   
r1   c                       sT   e Zd ZdZddee ddf fddZdejdejfdd	Z	de
fd
dZ  ZS )CvtDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr$   r&   c                    s   t    || _d S N)super__init__r$   )selfr$   	__class__r    r!   r5   M   s   

zCvtDropPath.__init__r   c                 C   s   t || j| jS r3   )r1   r$   r%   )r6   r   r    r    r!   forwardQ   s   zCvtDropPath.forwardc                 C   s   d| j  S )Nzp=r$   )r6   r    r    r!   
extra_reprT   s   zCvtDropPath.extra_reprr3   )r   r   r   r   r   floatr5   r   Tensorr9   strr;   __classcell__r    r    r7   r!   r2   J   s
    r2   c                       (   e Zd ZdZ fddZdd Z  ZS )CvtEmbeddingsz'
    Construct the CvT embeddings.
    c                    s.   t    t|||||d| _t|| _d S )N)
patch_sizenum_channels	embed_dimstridepadding)r4   r5   CvtConvEmbeddingsconvolution_embeddingsr   Dropoutdropout)r6   rB   rC   rD   rE   rF   dropout_rater7   r    r!   r5   ]   s
   

zCvtEmbeddings.__init__c                 C   s   |  |}| |}|S r3   )rH   rJ   )r6   pixel_valueshidden_stater    r    r!   r9   d      

zCvtEmbeddings.forwardr   r   r   r   r5   r9   r?   r    r    r7   r!   rA   X       rA   c                       r@   )rG   z"
    Image to Conv Embedding.
    c                    sP   t    t|tjjr|n||f}|| _tj|||||d| _	t
|| _d S )N)kernel_sizerE   rF   )r4   r5   
isinstancecollectionsabcIterablerB   r   Conv2d
projection	LayerNormnormalization)r6   rB   rC   rD   rE   rF   r7   r    r!   r5   o   s
   
zCvtConvEmbeddings.__init__c                 C   sf   |  |}|j\}}}}|| }||||ddd}| jr$| |}|ddd||||}|S Nr      r   )rW   r)   viewpermuterY   )r6   rL   
batch_sizerC   heightwidthhidden_sizer    r    r!   r9   v   s   

zCvtConvEmbeddings.forwardrO   r    r    r7   r!   rG   j   rP   rG   c                       $   e Zd Z fddZdd Z  ZS )CvtSelfAttentionConvProjectionc              	      s4   t    tj|||||d|d| _t|| _d S )NF)rQ   rF   rE   biasgroups)r4   r5   r   rV   convolutionBatchNorm2drY   )r6   rD   rQ   rF   rE   r7   r    r!   r5      s   
	z'CvtSelfAttentionConvProjection.__init__c                 C      |  |}| |}|S r3   )rf   rY   r6   rM   r    r    r!   r9      rN   z&CvtSelfAttentionConvProjection.forwardr   r   r   r5   r9   r?   r    r    r7   r!   rc      s    rc   c                   @   s   e Zd Zdd ZdS ) CvtSelfAttentionLinearProjectionc                 C   s2   |j \}}}}|| }||||ddd}|S rZ   )r)   r\   r]   )r6   rM   r^   rC   r_   r`   ra   r    r    r!   r9      s   z(CvtSelfAttentionLinearProjection.forwardN)r   r   r   r9   r    r    r    r!   rk      s    rk   c                       s&   e Zd Zd fdd	Zdd Z  ZS )CvtSelfAttentionProjectiondw_bnc                    s.   t    |dkrt||||| _t | _d S )Nrm   )r4   r5   rc   convolution_projectionrk   linear_projection)r6   rD   rQ   rF   rE   projection_methodr7   r    r!   r5      s   
z#CvtSelfAttentionProjection.__init__c                 C   rh   r3   )rn   ro   ri   r    r    r!   r9      rN   z"CvtSelfAttentionProjection.forward)rm   rj   r    r    r7   r!   rl      s    rl   c                       0   e Zd Z	d fdd	Zdd Zdd Z  ZS )	CvtSelfAttentionTc                    s   t    |d | _|| _|| _|| _t|||||dkrdn|d| _t|||||d| _t|||||d| _	t
j|||	d| _t
j|||	d| _t
j|||	d| _t
|
| _d S )Ng      avglinear)rp   )rd   )r4   r5   scalewith_cls_tokenrD   	num_headsrl   convolution_projection_queryconvolution_projection_keyconvolution_projection_valuer   Linearprojection_queryprojection_keyprojection_valuerI   rJ   )r6   rw   rD   rQ   	padding_q
padding_kvstride_q	stride_kvqkv_projection_methodqkv_biasattention_drop_raterv   kwargsr7   r    r!   r5      s,   



zCvtSelfAttention.__init__c                 C   s6   |j \}}}| j| j }|||| j|ddddS )Nr   r[   r   r	   )r)   rD   rw   r\   r]   )r6   rM   r^   ra   _head_dimr    r    r!   "rearrange_for_multi_head_attention   s   z3CvtSelfAttention.rearrange_for_multi_head_attentionc                 C   sT  | j rt|d|| gd\}}|j\}}}|ddd||||}| |}| |}	| |}
| j rPtj	||	fdd}	tj	||fdd}tj	||
fdd}
| j
| j }| | |	}	| | |}| | |
}
td|	|g| j }tjjj|dd}| |}td||
g}|j\}}}}|dddd ||| j| }|S )	Nr   r   r[   dimzbhlk,bhtk->bhltzbhlt,bhtv->bhlvr	   )rv   r   splitr)   r]   r\   ry   rx   rz   catrD   rw   r   r|   r}   r~   einsumru   r   
functionalsoftmaxrJ   
contiguous)r6   rM   r_   r`   	cls_tokenr^   ra   rC   keyqueryvaluer   attention_scoreattention_probscontextr   r    r    r!   r9      s,   



$zCvtSelfAttention.forwardT)r   r   r   r5   r   r9   r?   r    r    r7   r!   rr      s
    )rr   c                       r@   )CvtSelfOutputz
    The residual connection is defined in CvtLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    c                    s(   t    t||| _t|| _d S r3   )r4   r5   r   r{   denserI   rJ   )r6   rD   	drop_rater7   r    r!   r5     s   
zCvtSelfOutput.__init__c                 C   rh   r3   r   rJ   r6   rM   input_tensorr    r    r!   r9   	  rN   zCvtSelfOutput.forwardrO   r    r    r7   r!   r      s    r   c                       rq   )	CvtAttentionTc                    s@   t    t|||||||||	|
|| _t||| _t | _d S r3   )r4   r5   rr   	attentionr   r0   setpruned_heads)r6   rw   rD   rQ   r   r   r   r   r   r   r   r   rv   r7   r    r!   r5     s    
zCvtAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r   )lenr   r   num_attention_headsattention_head_sizer   r   r   r   r   r0   r   all_head_sizeunion)r6   headsindexr    r    r!   prune_heads0  s   zCvtAttention.prune_headsc                 C   s   |  |||}| ||}|S r3   )r   r0   )r6   rM   r_   r`   self_outputattention_outputr    r    r!   r9   B  s   zCvtAttention.forwardr   )r   r   r   r5   r   r9   r?   r    r    r7   r!   r     s
     r   c                       rb   )CvtIntermediatec                    s.   t    t|t|| | _t | _d S r3   )r4   r5   r   r{   intr   GELU
activation)r6   rD   	mlp_ratior7   r    r!   r5   I  s   
zCvtIntermediate.__init__c                 C   rh   r3   )r   r   ri   r    r    r!   r9   N  rN   zCvtIntermediate.forwardrj   r    r    r7   r!   r   H      r   c                       rb   )	CvtOutputc                    s0   t    tt|| || _t|| _d S r3   )r4   r5   r   r{   r   r   rI   rJ   )r6   rD   r   r   r7   r    r!   r5   U  s   
zCvtOutput.__init__c                 C   s    |  |}| |}|| }|S r3   r   r   r    r    r!   r9   Z  s   

zCvtOutput.forwardrj   r    r    r7   r!   r   T  r   r   c                       s,   e Zd ZdZ	d fdd	Zdd Z  ZS )CvtLayerzb
    CvtLayer composed by attention layers, normalization and multi-layer perceptrons (mlps).
    Tc                    s|   t    t|||||||||	|
||| _t||| _t|||| _|dkr+t|dnt	
 | _t	|| _t	|| _d S )Nr"   r:   )r4   r5   r   r   r   intermediater   r0   r2   r   Identityr1   rX   layernorm_beforelayernorm_after)r6   rw   rD   rQ   r   r   r   r   r   r   r   r   r   drop_path_raterv   r7   r    r!   r5   f  s(   
zCvtLayer.__init__c                 C   sX   |  | |||}|}| |}|| }| |}| |}| ||}| |}|S r3   )r   r   r1   r   r   r0   )r6   rM   r_   r`   self_attention_outputr   layer_outputr    r    r!   r9     s   



zCvtLayer.forwardr   rO   r    r    r7   r!   r   a  s
    'r   c                       rb   )CvtStagec                    s   t     _|_jjj r!ttddjj	d _t
 jj  jj jdkr4 jn j	jd   j	j  jj  jj d_dd tjd jj  j| ddD tj fd	dt jj D  _d S )
Nr   r   r   )rB   rE   rC   rD   rF   rK   c                 S   s   g | ]}|  qS r    )item).0xr    r    r!   
<listcomp>  s    z%CvtStage.__init__.<locals>.<listcomp>cpu)r(   c                    s   g | ]K}t  jj  jj  jj  jj  jj  jj  jj  j	j  j
j  jj  jj j  jj  jj d qS ))rw   rD   rQ   r   r   r   r   r   r   r   r   r   r   rv   )r   rw   stagerD   
kernel_qkvr   r   r   r   r   r   r   r   r   r   )r   r   configdrop_path_ratesr6   r    r!   r     s&    












)r4   r5   r   r   r   r   	Parameterr   randnrD   rA   patch_sizespatch_striderC   patch_paddingr   	embeddinglinspacer   depth
Sequentialrangelayers)r6   r   r   r7   r   r!   r5     s*   





	
zCvtStage.__init__c           	      C   s   d }|  |}|j\}}}}||||| ddd}| jj| j r4| j|dd}tj	||fdd}| j
D ]
}||||}|}q7| jj| j rVt|d|| gd\}}|ddd||||}||fS )Nr   r[   r   r   r   )r   r)   r\   r]   r   r   r   expandr   r   r   r   )	r6   rM   r   r^   rC   r_   r`   layerlayer_outputsr    r    r!   r9     s   

zCvtStage.forwardrj   r    r    r7   r!   r     s    *r   c                       s&   e Zd Z fddZdddZ  ZS )
CvtEncoderc                    sF   t    || _tg | _tt|jD ]}| j	t
|| qd S r3   )r4   r5   r   r   
ModuleListstagesr   r   r   appendr   )r6   r   	stage_idxr7   r    r!   r5     s   
zCvtEncoder.__init__FTc           	      C   sl   |rdnd }|}d }t | jD ]\}}||\}}|r ||f }q|s/tdd |||fD S t|||dS )Nr    c                 s   s    | ]	}|d ur|V  qd S r3   r    )r   vr    r    r!   	<genexpr>  s    z%CvtEncoder.forward.<locals>.<genexpr>r   r   r   )	enumerater   r   r   )	r6   rL   output_hidden_statesreturn_dictall_hidden_statesrM   r   r   stage_moduler    r    r!   r9     s   
zCvtEncoder.forward)FTrj   r    r    r7   r!   r     s    r   c                   @   s,   e Zd ZU eed< dZdZdgZdd ZdS )CvtPreTrainedModelr   cvtrL   r   c                 C   s   t |tjtjfr'tjj|jjd| jj	d|j_|j
dur%|j
j  dS dS t |tjr<|j
j  |jjd dS t |trY| jj|j r[tjj|jjd| jj	d|j_dS dS dS )zInitialize the weightsr"   )meanstdNg      ?)rR   r   r{   rV   inittrunc_normal_weightdatar   initializer_rangerd   zero_rX   fill_r   r   r   )r6   moduler    r    r!   _init_weights  s   

z CvtPreTrainedModel._init_weightsN)	r   r   r   r   r   base_model_prefixmain_input_name_no_split_modulesr   r    r    r    r!   r     s   
 r   c                       sb   e Zd Zd fdd	Zdd Ze			ddeej dee	 d	ee	 d
e
eef fddZ  ZS )CvtModelTc                    s(   t  | || _t|| _|   dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)r4   r5   r   r   encoder	post_init)r6   r   add_pooling_layerr7   r    r!   r5     s   
zCvtModel.__init__c                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   r   r   r   )r6   heads_to_pruner   r   r    r    r!   _prune_heads!  s   zCvtModel._prune_headsNrL   r   r   r&   c                 C   sx   |d ur|n| j j}|d ur|n| j j}|d u rtd| j|||d}|d }|s3|f|dd   S t||j|jdS )Nz You have to specify pixel_valuesr   r   r   r   r   )r   r   use_return_dict
ValueErrorr   r   r   r   )r6   rL   r   r   encoder_outputssequence_outputr    r    r!   r9   )  s$   zCvtModel.forwardr   )NNN)r   r   r   r5   r   r   r   r   r=   boolr   r   r   r9   r?   r    r    r7   r!   r     s     

r   z
    Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    c                       sd   e Zd Z fddZe				ddeej deej dee dee de	e
ef f
d	d
Z  ZS )CvtForImageClassificationc                    sh   t  | |j| _t|dd| _t|jd | _|jdkr)t	|jd |jnt
 | _|   d S )NF)r   r   r   )r4   r5   
num_labelsr   r   r   rX   rD   	layernormr{   r   
classifierr   )r6   r   r7   r    r!   r5   P  s   $z"CvtForImageClassification.__init__NrL   labelsr   r   r&   c                 C   s  |dur|n| j j}| j|||d}|d }|d }| j jd r&| |}n|j\}}	}
}|||	|
| ddd}| |}|jdd}| 	|}d}|dur| j j
du r}| j jdkrbd| j _
n| j jdkry|jtjkst|jtjkryd	| j _
nd
| j _
| j j
dkrt }| j jdkr|| | }n,|||}n&| j j
d	krt }||d| j j|d}n| j j
d
krt }|||}|s|f|dd  }|dur|f| S |S t|||jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   r   r   r[   r   
regressionsingle_label_classificationmulti_label_classification)losslogitsr   )r   r   r   r   r  r)   r\   r]   r   r  problem_typer  r'   r   longr   r   squeezer   r   r
   r   )r6   rL   r  r   r   outputsr   r   r^   rC   r_   r`   sequence_output_meanr
  r	  loss_fctr0   r    r    r!   r9   ^  sL   


$

z!CvtForImageClassification.forward)NNNN)r   r   r   r5   r   r   r   r=   r   r   r   r
   r9   r?   r    r    r7   r!   r  I  s$    
r  )r  r   r   )r"   F)6r   collections.abcrS   dataclassesr   typingr   r   r   r   torch.nnr   r   r   modeling_outputsr
   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   configuration_cvtr   
get_loggerr   loggerr   r=   r<   r   r1   Moduler2   rA   rG   rc   rk   rl   rr   r   r   r   r   r   r   r   r   r   r  __all__r    r    r    r!   <module>   sT   
 	Q9B?3O