o
    eiT^                  	   @   s  d Z ddlZddlmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZ eeZeeddG dd deZd=dejdededejfddZG dd dej Z!G dd dej Z"G dd dej Z#G dd  d ej Z$G d!d" d"ej Z%G d#d$ d$ej Z&G d%d& d&ej Z'G d'd( d(ej Z(G d)d* d*ej Z)G d+d, d,ej Z*G d-d. d.ej Z+G d/d0 d0ej Z,G d1d2 d2ej Z-G d3d4 d4ej Z.eG d5d6 d6eZ/eG d7d8 d8e/Z0ed9dG d:d; d;e/Z1g d<Z2dS )>zPyTorch CvT model.    N)	dataclass)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)$ImageClassifierOutputWithNoAttentionModelOutput)PreTrainedModel)auto_docstringlogging   )	CvtConfigzV
    Base class for model's outputs, with potential hidden states and attentions.
    )custom_introc                   @   sP   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
ejdf dB ed< dS )BaseModelOutputWithCLSTokenz
    cls_token_value (`torch.FloatTensor` of shape `(batch_size, 1, hidden_size)`):
        Classification token at the output of the last layer of the model.
    Nlast_hidden_statecls_token_value.hidden_states)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   tuple r   r   b/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/cvt/modeling_cvt.pyr   !   s
   
 r           Finput	drop_probtrainingreturnc                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    r   r   r   )r   )dtypedevice)shapendimr   randr$   r%   floor_div)r    r!   r"   	keep_probr&   random_tensoroutputr   r   r   	drop_path3   s   r.   c                       sT   e Zd ZdZddedB ddf fddZdejdejfdd	Zde	fd
dZ
  ZS )CvtDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr!   r#   c                    s   t    || _d S N)super__init__r!   )selfr!   	__class__r   r   r2   F   s   

zCvtDropPath.__init__r   c                 C   s   t || j| jS r0   )r.   r!   r"   )r3   r   r   r   r   forwardJ   s   zCvtDropPath.forwardc                 C   s   d| j  S )Nzp=r!   )r3   r   r   r   
extra_reprM   s   zCvtDropPath.extra_reprr0   )r   r   r   r   floatr2   r   Tensorr6   strr8   __classcell__r   r   r4   r   r/   C   s
    r/   c                       (   e Zd ZdZ fddZdd Z  ZS )CvtEmbeddingsz'
    Construct the CvT embeddings.
    c                    s.   t    t|||||d| _t|| _d S )N)
patch_sizenum_channels	embed_dimstridepadding)r1   r2   CvtConvEmbeddingsconvolution_embeddingsr   Dropoutdropout)r3   r?   r@   rA   rB   rC   dropout_rater4   r   r   r2   V   s
   

zCvtEmbeddings.__init__c                 C   s   |  |}| |}|S r0   )rE   rG   )r3   pixel_valueshidden_stater   r   r   r6   ]      

zCvtEmbeddings.forwardr   r   r   r   r2   r6   r<   r   r   r4   r   r>   Q       r>   c                       r=   )rD   z"
    Image to Conv Embedding.
    c                    sP   t    t|tjjr|n||f}|| _tj|||||d| _	t
|| _d S )N)kernel_sizerB   rC   )r1   r2   
isinstancecollectionsabcIterabler?   r   Conv2d
projection	LayerNormnormalization)r3   r?   r@   rA   rB   rC   r4   r   r   r2   h   s
   
zCvtConvEmbeddings.__init__c                 C   sf   |  |}|j\}}}}|| }||||ddd}| jr$| |}|ddd||||}|S Nr      r   )rT   r&   viewpermuterV   )r3   rI   
batch_sizer@   heightwidthhidden_sizer   r   r   r6   o   s   

zCvtConvEmbeddings.forwardrL   r   r   r4   r   rD   c   rM   rD   c                       $   e Zd Z fddZdd Z  ZS )CvtSelfAttentionConvProjectionc              	      s4   t    tj|||||d|d| _t|| _d S )NF)rN   rC   rB   biasgroups)r1   r2   r   rS   convolutionBatchNorm2drV   )r3   rA   rN   rC   rB   r4   r   r   r2   }   s   
	z'CvtSelfAttentionConvProjection.__init__c                 C      |  |}| |}|S r0   )rc   rV   r3   rJ   r   r   r   r6      rK   z&CvtSelfAttentionConvProjection.forwardr   r   r   r2   r6   r<   r   r   r4   r   r`   |   s    r`   c                   @   s   e Zd Zdd ZdS ) CvtSelfAttentionLinearProjectionc                 C   s2   |j \}}}}|| }||||ddd}|S rW   )r&   rY   rZ   )r3   rJ   r[   r@   r\   r]   r^   r   r   r   r6      s   z(CvtSelfAttentionLinearProjection.forwardN)r   r   r   r6   r   r   r   r   rh      s    rh   c                       s&   e Zd Zd fdd	Zdd Z  ZS )CvtSelfAttentionProjectiondw_bnc                    s.   t    |dkrt||||| _t | _d S )Nrj   )r1   r2   r`   convolution_projectionrh   linear_projection)r3   rA   rN   rC   rB   projection_methodr4   r   r   r2      s   
z#CvtSelfAttentionProjection.__init__c                 C   re   r0   )rk   rl   rf   r   r   r   r6      rK   z"CvtSelfAttentionProjection.forward)rj   rg   r   r   r4   r   ri      s    ri   c                       s0   e Zd Z	d fdd	Zdd Zdd Z  ZS )	CvtSelfAttentionTc                    s   t    |d | _|| _|| _|| _t|||||dkrdn|d| _t|||||d| _t|||||d| _	t
j|||	d| _t
j|||	d| _t
j|||	d| _t
|
| _d S )Ng      avglinear)rm   )ra   )r1   r2   scalewith_cls_tokenrA   	num_headsri   convolution_projection_queryconvolution_projection_keyconvolution_projection_valuer   Linearprojection_queryprojection_keyprojection_valuerF   rG   )r3   rs   rA   rN   	padding_q
padding_kvstride_q	stride_kvqkv_projection_methodqkv_biasattention_drop_raterr   kwargsr4   r   r   r2      s,   



zCvtSelfAttention.__init__c                 C   s6   |j \}}}| j| j }|||| j|ddddS )Nr   rX   r   r   )r&   rA   rs   rY   rZ   )r3   rJ   r[   r^   _head_dimr   r   r   "rearrange_for_multi_head_attention   s   z3CvtSelfAttention.rearrange_for_multi_head_attentionc                 C   sT  | j rt|d|| gd\}}|j\}}}|ddd||||}| |}| |}	| |}
| j rPtj	||	fdd}	tj	||fdd}tj	||
fdd}
| j
| j }| | |	}	| | |}| | |
}
td|	|g| j }tjjj|dd}| |}td||
g}|j\}}}}|dddd ||| j| }|S )	Nr   r   rX   dimzbhlk,bhtk->bhltzbhlt,bhtv->bhlvr   )rr   r   splitr&   rZ   rY   ru   rt   rv   catrA   rs   r   rx   ry   rz   einsumrq   r   
functionalsoftmaxrG   
contiguous)r3   rJ   r\   r]   	cls_tokenr[   r^   r@   keyqueryvaluer   attention_scoreattention_probscontextr   r   r   r   r6      s,   



$zCvtSelfAttention.forwardT)r   r   r   r2   r   r6   r<   r   r   r4   r   rn      s
    )rn   c                       r=   )CvtSelfOutputz
    The residual connection is defined in CvtLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    c                    s(   t    t||| _t|| _d S r0   )r1   r2   r   rw   denserF   rG   )r3   rA   	drop_rater4   r   r   r2      s   
zCvtSelfOutput.__init__c                 C   re   r0   r   rG   r3   rJ   input_tensorr   r   r   r6     rK   zCvtSelfOutput.forwardrL   r   r   r4   r   r      s    r   c                       s(   e Zd Z	d fdd	Zdd Z  ZS )CvtAttentionTc                    s8   t    t|||||||||	|
|| _t||| _d S r0   )r1   r2   rn   	attentionr   r-   )r3   rs   rA   rN   r{   r|   r}   r~   r   r   r   r   rr   r4   r   r   r2   	  s   
zCvtAttention.__init__c                 C   s   |  |||}| ||}|S r0   )r   r-   )r3   rJ   r\   r]   self_outputattention_outputr   r   r   r6   (  s   zCvtAttention.forwardr   rg   r   r   r4   r   r     s    r   c                       r_   )CvtIntermediatec                    s.   t    t|t|| | _t | _d S r0   )r1   r2   r   rw   intr   GELU
activation)r3   rA   	mlp_ratior4   r   r   r2   /  s   
zCvtIntermediate.__init__c                 C   re   r0   )r   r   rf   r   r   r   r6   4  rK   zCvtIntermediate.forwardrg   r   r   r4   r   r   .      r   c                       r_   )	CvtOutputc                    s0   t    tt|| || _t|| _d S r0   )r1   r2   r   rw   r   r   rF   rG   )r3   rA   r   r   r4   r   r   r2   ;  s   
zCvtOutput.__init__c                 C   s    |  |}| |}|| }|S r0   r   r   r   r   r   r6   @  s   

zCvtOutput.forwardrg   r   r   r4   r   r   :  r   r   c                       s,   e Zd ZdZ	d fdd	Zdd Z  ZS )CvtLayerzb
    CvtLayer composed by attention layers, normalization and multi-layer perceptrons (mlps).
    Tc                    s|   t    t|||||||||	|
||| _t||| _t|||| _|dkr+t|dnt	
 | _t	|| _t	|| _d S )Nr   r7   )r1   r2   r   r   r   intermediater   r-   r/   r   Identityr.   rU   layernorm_beforelayernorm_after)r3   rs   rA   rN   r{   r|   r}   r~   r   r   r   r   r   drop_path_raterr   r4   r   r   r2   L  s(   
zCvtLayer.__init__c                 C   sX   |  | |||}|}| |}|| }| |}| |}| ||}| |}|S r0   )r   r   r.   r   r   r-   )r3   rJ   r\   r]   self_attention_outputr   layer_outputr   r   r   r6   s  s   



zCvtLayer.forwardr   rL   r   r   r4   r   r   G  s
    'r   c                       r_   )CvtStagec                    s   t     _|_jjj r!ttddjj	d _t
 jj  jj jdkr4 jn j	jd   j	j  jj  jj d_dd tjd jj  j| ddD tj fd	dt jj D  _d S )
Nr   r   r   )r?   rB   r@   rA   rC   rH   c                 S   s   g | ]}|  qS r   )item).0xr   r   r   
<listcomp>  s    z%CvtStage.__init__.<locals>.<listcomp>cpu)r%   c                    s   g | ]K}t  jj  jj  jj  jj  jj  jj  jj  j	j  j
j  jj  jj j  jj  jj d qS ))rs   rA   rN   r{   r|   r~   r}   r   r   r   r   r   r   rr   )r   rs   stagerA   
kernel_qkvr{   r|   r~   r}   r   r   r   r   r   r   )r   r   configdrop_path_ratesr3   r   r   r     s&    












)r1   r2   r   r   r   r   	Parameterr   randnrA   r>   patch_sizespatch_strider@   patch_paddingr   	embeddinglinspacer   depth
Sequentialrangelayers)r3   r   r   r4   r   r   r2     s*   





	
zCvtStage.__init__c           	      C   s   d }|  |}|j\}}}}||||| ddd}| jj| j r4| j|dd}tj	||fdd}| j
D ]
}||||}|}q7| jj| j rVt|d|| gd\}}|ddd||||}||fS )Nr   rX   r   r   r   )r   r&   rY   rZ   r   r   r   expandr   r   r   r   )	r3   rJ   r   r[   r@   r\   r]   layerlayer_outputsr   r   r   r6     s   

zCvtStage.forwardrg   r   r   r4   r   r     s    *r   c                       s&   e Zd Z fddZdddZ  ZS )
CvtEncoderc                    sF   t    || _tg | _tt|jD ]}| j	t
|| qd S r0   )r1   r2   r   r   
ModuleListstagesr   lenr   appendr   )r3   r   	stage_idxr4   r   r   r2     s   
zCvtEncoder.__init__FTc           	      C   sl   |rdnd }|}d }t | jD ]\}}||\}}|r ||f }q|s/tdd |||fD S t|||dS )Nr   c                 s   s    | ]	}|d ur|V  qd S r0   r   )r   vr   r   r   	<genexpr>  s    z%CvtEncoder.forward.<locals>.<genexpr>r   r   r   )	enumerater   r   r   )	r3   rI   output_hidden_statesreturn_dictall_hidden_statesrJ   r   r   stage_moduler   r   r   r6     s   
zCvtEncoder.forward)FTrg   r   r   r4   r   r     s    r   c                   @   s4   e Zd ZU eed< dZdZdgZe	 dd Z
dS )CvtPreTrainedModelr   cvtrI   r   c                 C   s   t |tjtjfr#tj|jd| jjd |j	dur!t
|j	 dS dS t |tjtjfrVt
|j	 t|j t|dddurTt
|j t|j t
|j dS dS t |tro| jj|j rqtj|jd| jjd dS dS dS )zInitialize the weightsr   )meanstdNrunning_mean)rO   r   rw   rS   inittrunc_normal_weightr   initializer_rangera   zeros_rU   rd   ones_getattrr   running_varnum_batches_trackedr   r   r   )r3   moduler   r   r   _init_weights  s$   

z CvtPreTrainedModel._init_weightsN)r   r   r   r   r   base_model_prefixmain_input_name_no_split_modulesr   no_gradr   r   r   r   r   r     s   
 r   c                       sV   e Zd Zd fdd	Ze			ddejdB dedB dedB dee	B fd	d
Z
  ZS )CvtModelTc                    s(   t  | || _t|| _|   dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)r1   r2   r   r   encoder	post_init)r3   r   add_pooling_layerr4   r   r   r2      s   
zCvtModel.__init__NrI   r   r   r#   c                 K   sx   |d ur|n| j j}|d ur|n| j j}|d u rtd| j|||d}|d }|s3|f|dd   S t||j|jdS )Nz You have to specify pixel_valuesr   r   r   r   r   )r   r   use_return_dict
ValueErrorr   r   r   r   )r3   rI   r   r   r   encoder_outputssequence_outputr   r   r   r6   
  s$   	zCvtModel.forwardr   )NNN)r   r   r   r2   r   r   r:   boolr   r   r6   r<   r   r   r4   r   r     s    
r   z
    Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    c                       s`   e Zd Z fddZe				ddejdB dejdB dedB dedB dee	B f
d	d
Z
  ZS )CvtForImageClassificationc                    sh   t  | |j| _t|dd| _t|jd | _|jdkr)t	|jd |jnt
 | _|   d S )NF)r   r   r   )r1   r2   
num_labelsr   r   r   rU   rA   	layernormrw   r   
classifierr   )r3   r   r4   r   r   r2   2  s   $z"CvtForImageClassification.__init__NrI   labelsr   r   r#   c                 K   s  |dur|n| j j}| j|||d}|d }|d }| j jd r&| |}n|j\}	}
}}||	|
|| ddd}| |}|jdd}| 	|}d}|dur| j j
du r}| j jdkrbd| j _
n| j jdkry|jtjkst|jtjkryd	| j _
nd
| j _
| j j
dkrt }| j jdkr|| | }n,|||}n&| j j
d	krt }||d| j j|d}n| j j
d
krt }|||}|s|f|dd  }|dur|f| S |S t|||jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   r   r   rX   r   
regressionsingle_label_classificationmulti_label_classification)losslogitsr   )r   r   r   r   r   r&   rY   rZ   r   r   problem_typer   r$   r   longr   r   squeezer   r   r	   r   )r3   rI   r   r   r   r   outputsr   r   r[   r@   r\   r]   sequence_output_meanr   r   loss_fctr-   r   r   r   r6   @  sL   


$

z!CvtForImageClassification.forward)NNNN)r   r   r   r2   r   r   r:   r   r   r	   r6   r<   r   r   r4   r   r   +  s$    r   )r   r   r   )r   F)3r   collections.abcrP   dataclassesr   r   r   torch.nnr   r   r    r   r   modeling_outputsr	   r
   modeling_utilsr   utilsr   r   configuration_cvtr   
get_loggerr   loggerr   r:   r9   r   r.   Moduler/   r>   rD   r`   rh   ri   rn   r   r   r   r   r   r   r   r   r   r   __all__r   r   r   r   <module>   sR   
 	Q&B?,P