o
    eic                     @   s  d Z ddlZddlmZ ddlZddlmZ ddlmZ ddl	m
Z
mZmZmZ ddlmZ dd	lmZmZ d
dlmZ eeZeeddG dd deZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZ G dd dejZ!G d d! d!ejZ"G d"d# d#ejZ#G d$d% d%ejZ$eG d&d' d'eZ%eG d(d) d)e%Z&ed*dG d+d, d,e%Z'ed-dG d.d/ d/e%Z(g d0Z)dS )1zPyTorch LeViT model.    N)	dataclass)nn   )initialization)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionModelOutput)PreTrainedModel)auto_docstringlogging   )LevitConfigzD
    Output type of [`LevitForImageClassificationWithTeacher`].
    )custom_introc                   @   s^   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	ejdB ed< dZ
eej dB ed< dS ),LevitForImageClassificationWithTeacherOutputan  
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores as the average of the `cls_logits` and `distillation_logits`.
    cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
        class token).
    distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
        distillation token).
    Nlogits
cls_logitsdistillation_logitshidden_states)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   r   tuple r   r   f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/levit/modeling_levit.pyr   %   s   
 r   c                       s,   e Zd ZdZ	d fdd	Zdd Z  ZS )LevitConvEmbeddingsz[
    LeViT Conv Embeddings with Batch Norm, used in the initial patch embedding layer.
    r   c	           	   
      s6   t    tj|||||||dd| _t|| _d S )NF)dilationgroupsbias)super__init__r   Conv2dconvolutionBatchNorm2d
batch_norm)	selfin_channelsout_channelskernel_sizestridepaddingr    r!   bn_weight_init	__class__r   r   r$   B   s
   
zLevitConvEmbeddings.__init__c                 C   s   |  |}| |}|S N)r&   r(   )r)   
embeddingsr   r   r   forwardK      

zLevitConvEmbeddings.forward)r   r   r   r   r   r   r   r$   r4   __classcell__r   r   r0   r   r   =   s
    	r   c                       (   e Zd ZdZ fddZdd Z  ZS )LevitPatchEmbeddingsz
    LeViT patch embeddings, for final embeddings to be passed to transformer blocks. It consists of multiple
    `LevitConvEmbeddings`.
    c                    s   t    t|j|jd d |j|j|j| _t	
 | _t|jd d |jd d |j|j|j| _t	
 | _t|jd d |jd d |j|j|j| _t	
 | _t|jd d |jd |j|j|j| _|j| _d S )Nr            )r#   r$   r   num_channelshidden_sizesr,   r-   r.   embedding_layer_1r   	Hardswishactivation_layer_1embedding_layer_2activation_layer_2embedding_layer_3activation_layer_3embedding_layer_4r)   configr0   r   r   r$   W   s"   

$
$
 zLevitPatchEmbeddings.__init__c                 C   st   |j d }|| jkrtd| |}| |}| |}| |}| |}| |}| 	|}|
dddS )Nr   zeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r<   )shaper=   
ValueErrorr?   rA   rB   rC   rD   rE   rF   flatten	transpose)r)   pixel_valuesr=   r3   r   r   r   r4   m   s   








zLevitPatchEmbeddings.forwardr6   r   r   r0   r   r9   Q   s    r9   c                       s&   e Zd Zd fdd	Zdd Z  ZS )MLPLayerWithBNr   c                    s,   t    tj||dd| _t|| _d S )NF)in_featuresout_featuresr"   )r#   r$   r   LinearlinearBatchNorm1dr(   )r)   	input_dim
output_dimr/   r0   r   r   r$   ~   s   
zMLPLayerWithBN.__init__c                 C   s&   |  |}| |dd|}|S )Nr   r   )rR   r(   rK   
reshape_asr)   hidden_stater   r   r   r4      s   
zMLPLayerWithBN.forward)r   r   r   r   r$   r4   r7   r   r   r0   r   rN   }   s    rN   c                       s$   e Zd Z fddZdd Z  ZS )LevitSubsamplec                       t    || _|| _d S r2   )r#   r$   r-   
resolution)r)   r-   r\   r0   r   r   r$         

zLevitSubsample.__init__c                 C   sL   |j \}}}||| j| j|d d d d | jd d | jf |d|}|S )N)rI   viewr\   r-   reshape)r)   rX   
batch_size_channelsr   r   r   r4      s   
zLevitSubsample.forwardrY   r   r   r0   r   rZ      s    rZ   c                       B   e Zd Z fddZe d
 fdd	Zdd Zdd	 Z  Z	S )LevitAttentionc                    sN  t    || _|d | _|| _|| _|| | || d  | _|| | | _t|| j| _	t
 | _t| j|dd| _ttt|t|}t|}|| _i g }}	|D ],}
|D ]'}t|
d |d  t|
d |d  f}||vrzt|||< |	||  qZqV|	| _i | _tj
t|t|| _| jdt|	||dd d S )	N      r<   r   )r/   r   attention_bias_idxsF
persistent)r#   r$   num_attention_headsscalekey_dimattention_ratioout_dim_keys_valuesout_dim_projectionrN   queries_keys_valuesr   r@   
activation
projectionlist	itertoolsproductrangelen
len_pointsabsappendindicesattention_bias_cacher   	Parameterzerosattention_biasesregister_buffer
LongTensorr_   )r)   r>   rl   rj   rm   r\   pointsrx   attention_offsetsr{   p1p2offsetr0   r   r   r$      s6   



(
zLevitAttention.__init__Tc                    (   t  | |r| jri | _d S d S d S r2   r#   trainr|   r)   moder0   r   r   r         

zLevitAttention.trainc                 C   P   | j r| jd d | jf S t|}|| jvr#| jd d | jf | j|< | j| S r2   trainingr   rg   strr|   r)   device
device_keyr   r   r   get_attention_biases      

z#LevitAttention.get_attention_biasesc           
      C   s   |j \}}}| |}|||| jdj| j| j| j| j gdd\}}}|dddd}|dddd}|dddd}||dd | j	 | 
|j }	|	jdd}	|	| dd||| j}| | |}|S Nr^   r   dimr   r<   r   )rI   rp   r_   rj   splitrl   rm   permuterL   rk   r   r   softmaxr`   ro   rr   rq   )
r)   rX   ra   
seq_lengthrb   rp   querykeyvalue	attentionr   r   r   r4      s   
"zLevitAttention.forwardT
r   r   r   r$   r   no_gradr   r   r4   r7   r   r   r0   r   re      s    	re   c                       rd   )LevitAttentionSubsamplec	                    s  t    || _|d | _|| _|| _|| | ||  | _|| | | _|| _t	|| j| _
t||| _t	||| | _t | _t	| j|| _i | _ttt|t|}	ttt|t|}
t|	t|
}}|| _|| _i g }}|
D ]>}|	D ]9}d}t|d | |d  |d d  t|d | |d  |d d  f}||vrt|||< |||  q~qz|| _tjt|t|| _ | j!dt"|#||dd d S )Nrf   r   r   r<   rg   Frh   )$r#   r$   rj   rk   rl   rm   rn   ro   resolution_outrN   keys_valuesrZ   queries_subsamplequeriesr   r@   rq   rr   r|   rs   rt   ru   rv   rw   len_points_rx   ry   rz   r{   r   r}   r~   r   r   r   r_   )r)   rT   rU   rl   rj   rm   r-   resolution_inr   r   points_rx   r   r   r{   r   r   sizer   r0   r   r   r$      sB   



H
z LevitAttentionSubsample.__init__Tc                    r   r2   r   r   r0   r   r   r     r   zLevitAttentionSubsample.trainc                 C   r   r2   r   r   r   r   r   r     r   z,LevitAttentionSubsample.get_attention_biasesc           	      C   s   |j \}}}| |||| jdj| j| j| j gdd\}}|dddd}|dddd}| | 	|}||| j
d | j| jdddd}||dd | j | |j }|jdd}|| dd|d| j}| | |}|S r   )rI   r   r_   rj   r   rl   rm   r   r   r   r   rL   rk   r   r   r   r`   ro   rr   rq   )	r)   rX   ra   r   rb   r   r   r   r   r   r   r   r4     s"   "zLevitAttentionSubsample.forwardr   r   r   r   r0   r   r      s    0	r   c                       r8   )LevitMLPLayerzE
    MLP Layer with `2X` expansion in contrast to ViT with `4X`.
    c                    s0   t    t||| _t | _t||| _d S r2   )r#   r$   rN   	linear_upr   r@   rq   linear_down)r)   rT   
hidden_dimr0   r   r   r$   2  s   

zLevitMLPLayer.__init__c                 C   s"   |  |}| |}| |}|S r2   )r   rq   r   rW   r   r   r   r4   8  s   


zLevitMLPLayer.forwardr6   r   r   r0   r   r   -  s    r   c                       r8   )LevitResidualLayerz"
    Residual Block for LeViT
    c                    r[   r2   )r#   r$   module	drop_rate)r)   r   r   r0   r   r   r$   D  r]   zLevitResidualLayer.__init__c                 C   sn   | j r.| jdkr.tj|ddd|jd}|| jd| j  }|| 	||  }|S || 	| }|S )Nr   r   )r   )
r   r   r   randr   r   ge_divdetachr   )r)   rX   rndr   r   r   r4   I  s   zLevitResidualLayer.forwardr6   r   r   r0   r   r   ?      r   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )
LevitStagezP
    LeViT Stage consisting of `LevitMLPLayer` and `LevitAttention` layers.
    c                    sD  t    g | _|| _|
| _t|D ])}| jtt|||||
| jj	 |dkr;|| }| jtt
||| jj	 q|	d dkr| jd |	d  d | _| jt| jj||d  |	d |	d |	d |	d |
| jd | j| _|	d dkr| jj|d  |	d  }| jtt
| jj|d  || jj	 t| j| _d S )	Nr   	Subsampler      r<   r   )rl   rj   rm   r-   r   r   r;   )r#   r$   layersrH   r   rv   rz   r   re   drop_path_rater   r   r   r>   r   
ModuleList)r)   rH   idxr>   rl   depthsrj   rm   	mlp_ratiodown_opsr   rb   r   r0   r   r   r$   Y  sN   
zLevitStage.__init__c                 C   s   | j S r2   )r   )r)   r   r   r   get_resolution  s   zLevitStage.get_resolutionc                 C   s   | j D ]}||}q|S r2   )r   )r)   rX   layerr   r   r   r4     r5   zLevitStage.forward)r   r   r   r   r$   r   r4   r7   r   r   r0   r   r   T  s
    7r   c                       s*   e Zd ZdZ fddZdddZ  ZS )	LevitEncoderzC
    LeViT Encoder consisting of multiple `LevitStage` stages.
    c                    s   t    || _| jj| jj }g | _| jjdg tt	|j
D ].}t|||j| |j| |j
| |j| |j| |j| |j| |
}| }| j| q"t| j| _d S )N )r#   r$   rH   
image_size
patch_sizestagesr   rz   rv   rw   r   r   r>   rl   rj   rm   r   r   r   r   )r)   rH   r\   	stage_idxstager0   r   r   r$     s*   
zLevitEncoder.__init__FTc                 C   sb   |rdnd }| j D ]}|r||f }||}q	|r||f }|s+tdd ||fD S t||dS )Nr   c                 s   s    | ]	}|d ur|V  qd S r2   r   ).0vr   r   r   	<genexpr>  s    z'LevitEncoder.forward.<locals>.<genexpr>)last_hidden_stater   )r   r   r   )r)   rX   output_hidden_statesreturn_dictall_hidden_statesr   r   r   r   r4     s   



zLevitEncoder.forward)FTr6   r   r   r0   r   r     s    r   c                       r8   )LevitClassificationLayerz$
    LeViT Classification Layer
    c                    s(   t    t|| _t||| _d S r2   )r#   r$   r   rS   r(   rQ   rR   )r)   rT   rU   r0   r   r   r$     s   
z!LevitClassificationLayer.__init__c                 C   s   |  |}| |}|S r2   )r(   rR   )r)   rX   r   r   r   r   r4     r5   z LevitClassificationLayer.forwardr6   r   r   r0   r   r     r   r   c                       s8   e Zd ZU eed< dZdZdZdgZ fddZ	  Z
S )LevitPreTrainedModelrH   levitrM   )imager   c                    sp   t  | t|trt|jt|j	
|j|j d S t|tr6t|jt|j	
|j|j d S d S r2   )r#   _init_weights
isinstancere   initcopy_rg   r   r   r{   r_   rx   r   r   )r)   r   r0   r   r   r     s   

z"LevitPreTrainedModel._init_weights)r   r   r   r   r   base_model_prefixmain_input_nameinput_modalities_no_split_modulesr   r7   r   r   r0   r   r     s   
 r   c                       T   e Zd Z fddZe			d
dejdB dedB dedB dee	B fdd	Z
  ZS )
LevitModelc                    s2   t  | || _t|| _t|| _|   d S r2   )r#   r$   rH   r9   patch_embeddingsr   encoder	post_initrG   r0   r   r   r$     s
   

zLevitModel.__init__NrM   r   r   returnc           	      K   s   |d ur|n| j j}|d ur|n| j j}|d u rtd| |}| j|||d}|d }|jdd}|s?||f|dd   S t|||jdS )Nz You have to specify pixel_valuesr   r   r   r   r   )r   pooler_outputr   )	rH   r   use_return_dictrJ   r   r   meanr   r   )	r)   rM   r   r   kwargsr3   encoder_outputsr   pooled_outputr   r   r   r4     s(   	
zLevitModel.forwardNNN)r   r   r   r$   r   r   r   boolr   r   r4   r7   r   r   r0   r   r     s    r   z
    Levit Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    c                       s`   e Zd Z fddZe				ddejdB dejdB dedB dedB de	e
B f
d	d
Z  ZS )LevitForImageClassificationc                    sX   t  | || _|j| _t|| _|jdkr t|jd |jntj	
 | _|   d S Nr   r^   )r#   r$   rH   
num_labelsr   r   r   r>   r   r   Identity
classifierr   rG   r0   r   r   r$   #  s   

z$LevitForImageClassification.__init__NrM   labelsr   r   r   c                 K   s   |dur|n| j j}| j|||d}|d }|d}| |}d}	|dur.| ||| j }	|sD|f|dd  }
|	durB|	f|
 S |
S t|	||jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   r   r<   )lossr   r   )rH   r   r   r   r   loss_functionr   r   )r)   rM   r   r   r   r   outputssequence_outputr   r   outputr   r   r   r4   3  s    

z#LevitForImageClassification.forward)NNNN)r   r   r   r$   r   r   r   r   r   r   r   r4   r7   r   r   r0   r   r     s$    r   ap  
    LeViT Model transformer with image classification heads on top (a linear layer on top of the final hidden state and
    a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet. .. warning::
           This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
           supported.
    c                       r   )&LevitForImageClassificationWithTeacherc                    s   t  | || _|j| _t|| _|jdkr t|jd |jntj	
 | _|jdkr4t|jd |jntj	
 | _|   d S r   )r#   r$   rH   r   r   r   r   r>   r   r   r   r   classifier_distillr   rG   r0   r   r   r$   b  s   


z/LevitForImageClassificationWithTeacher.__init__NrM   r   r   r   c                 K   s   |d ur|n| j j}| j|||d}|d }|d}| || |}}|| d }	|s;|	||f|dd   }
|
S t|	|||jdS )Nr   r   r   r<   )r   r   r   r   )rH   r   r   r   r   r   r   r   )r)   rM   r   r   r   r   r   r   distill_logitsr   r   r   r   r   r4   w  s   
z.LevitForImageClassificationWithTeacher.forwardr   )r   r   r   r$   r   r   r   r   r   r   r4   r7   r   r   r0   r   r   Y  s    	r   )r   r   r   r   )*r   rt   dataclassesr   r   r   r   r   r   modeling_outputsr   r   r   r	   modeling_utilsr
   utilsr   r   configuration_levitr   
get_loggerr   loggerr   Moduler   r9   rN   rZ   re   r   r   r   r   r   r   r   r   r   r   __all__r   r   r   r   <module>   sP   
,@VE./73