o
    wi`                  	   @   sV  d Z ddlZddlmZmZ ddlZddlZddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZmZ ddlmZmZmZ ddlmZ e e!Z"d@de#de#dee# de#fddZ$G dd dej%Z&G dd dej%Z'G dd dej%Z(G dd dej%Z)G dd dej%Z*G d d! d!ej%Z+G d"d# d#ej%Z,G d$d% d%ej%Z-G d&d' d'ej%Z.G d(d) d)ej%Z/G d*d+ d+eZ0G d,d- d-ej%Z1eG d.d/ d/eZ2eG d0d1 d1e2Z3ed2d3G d4d5 d5e2Z4G d6d7 d7ej%Z5G d8d9 d9ej%Z6G d:d; d;ej%Z7ed<d3G d=d> d>e2Z8g d?Z9dS )AzPyTorch MobileViT model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging	torch_int   )MobileViTConfig   valuedivisor	min_valuereturnc                 C   sF   |du r|}t |t| |d  | | }|d|  k r||7 }t|S )a  
    Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
    original TensorFlow repo. It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    N   g?)maxint)r   r   r   	new_value r    m/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/mobilevit/modeling_mobilevit.pymake_divisible,   s   r"   c                       sv   e Zd Z						ddededededed	ed
edededeeef ddf fddZde	j
de	j
fddZ  ZS )MobileViTConvLayerr   FTconfigin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr   Nc                    s   t    t|d d | }|| dkr td| d| d|| dkr1td| d| dtj||||||||dd		| _|	rNtj|d
dddd| _nd | _|
rst	|
t
r_t|
 | _d S t	|jt
rmt|j | _d S |j| _d S d | _d S )Nr   r   r   zInput channels (z) are not divisible by z groups.zOutput channels (zeros)	r%   r&   r'   r(   paddingr+   r)   r*   padding_modegh㈵>g?T)num_featuresepsmomentumaffinetrack_running_stats)super__init__r   
ValueErrorr   Conv2dconvolutionBatchNorm2dnormalization
isinstancestrr	   
activation
hidden_act)selfr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r/   	__class__r    r!   r7   <   sB   



zMobileViTConvLayer.__init__featuresc                 C   s6   |  |}| jd ur| |}| jd ur| |}|S N)r:   r<   r?   )rA   rD   r    r    r!   forwardr   s   




zMobileViTConvLayer.forward)r   r   Fr   TT)__name__
__module____qualname__r   r   boolr   r>   r7   torchTensorrF   __classcell__r    r    rB   r!   r#   ;   s>    	

6r#   c                       sT   e Zd ZdZ	ddedededededd	f fd
dZdejdejfddZ	  Z
S )MobileViTInvertedResidualzY
    Inverted residual block (MobileNetv2): https://huggingface.co/papers/1801.04381
    r   r$   r%   r&   r(   r+   r   Nc              	      s   t    ttt||j d}|dvrtd| d|dko$||k| _t|||dd| _	t|||d|||d| _
t|||dd	d
| _d S )Nr   )r   r   zInvalid stride .r   r%   r&   r'   r   )r%   r&   r'   r(   r)   r+   Fr%   r&   r'   r-   )r6   r7   r"   r   roundexpand_ratior8   use_residualr#   
expand_1x1conv_3x3
reduce_1x1)rA   r$   r%   r&   r(   r+   expanded_channelsrB   r    r!   r7      s0   

z"MobileViTInvertedResidual.__init__rD   c                 C   s4   |}|  |}| |}| |}| jr|| S |S rE   )rU   rV   rW   rT   )rA   rD   residualr    r    r!   rF      s
   


z!MobileViTInvertedResidual.forwardr   )rG   rH   rI   __doc__r   r   r7   rK   rL   rF   rM   r    r    rB   r!   rN   {   s"    !rN   c                       sP   e Zd Z	ddedededededdf fd	d
ZdejdejfddZ  Z	S )MobileViTMobileNetLayerr   r$   r%   r&   r(   
num_stagesr   Nc                    sR   t    t | _t|D ]}t||||dkr|ndd}| j| |}qd S )Nr   r   )r%   r&   r(   )r6   r7   r   
ModuleListlayerrangerN   append)rA   r$   r%   r&   r(   r]   ir_   rB   r    r!   r7      s   

z MobileViTMobileNetLayer.__init__rD   c                 C      | j D ]}||}q|S rE   r_   )rA   rD   layer_moduler    r    r!   rF         

zMobileViTMobileNetLayer.forward)r   r   
rG   rH   rI   r   r   r7   rK   rL   rF   rM   r    r    rB   r!   r\      s     r\   c                       sV   e Zd Zdededdf fddZdejdejfdd	Zd
ejdejfddZ	  Z
S )MobileViTSelfAttentionr$   hidden_sizer   Nc                    s   t    ||j dkrtd| d|j d|j| _t||j | _| j| j | _tj|| j|j	d| _
tj|| j|j	d| _tj|| j|j	d| _t|j| _d S )Nr   zThe hidden size z4 is not a multiple of the number of attention heads rO   )r*   )r6   r7   num_attention_headsr8   r   attention_head_sizeall_head_sizer   Linearqkv_biasquerykeyr   Dropoutattention_probs_dropout_probdropoutrA   r$   ri   rB   r    r!   r7      s   
zMobileViTSelfAttention.__init__xc                 C   s6   |  d d | j| jf }|j| }|ddddS )Nr   r   r   r   )sizerj   rk   viewpermute)rA   ru   new_x_shaper    r    r!   transpose_for_scores   s   
z+MobileViTSelfAttention.transpose_for_scoreshidden_statesc           
      C   s   |  |}| | |}| | |}| |}t||dd}|t| j	 }t
jj|dd}| |}t||}|dddd }| d d | jf }	|j|	 }|S )Nrv   dimr   r   r   r   )ro   r{   rp   r   rK   matmul	transposemathsqrtrk   r   
functionalsoftmaxrs   ry   
contiguousrw   rl   rx   )
rA   r|   mixed_query_layer	key_layervalue_layerquery_layerattention_scoresattention_probscontext_layernew_context_layer_shaper    r    r!   rF      s   



zMobileViTSelfAttention.forward)rG   rH   rI   r   r   r7   rK   rL   r{   rF   rM   r    r    rB   r!   rh      s    rh   c                       s@   e Zd Zdededdf fddZdejdejfdd	Z  Z	S )
MobileViTSelfOutputr$   ri   r   Nc                    s*   t    t||| _t|j| _d S rE   r6   r7   r   rm   denserq   hidden_dropout_probrs   rt   rB   r    r!   r7         
zMobileViTSelfOutput.__init__r|   c                 C      |  |}| |}|S rE   r   rs   rA   r|   r    r    r!   rF      rf   zMobileViTSelfOutput.forwardrg   r    r    rB   r!   r      s    r   c                       sV   e Zd Zdededdf fddZdee ddfdd	Zd
ej	dej	fddZ
  ZS )MobileViTAttentionr$   ri   r   Nc                    s.   t    t||| _t||| _t | _d S rE   )r6   r7   rh   	attentionr   outputsetpruned_headsrt   rB   r    r!   r7     s   
zMobileViTAttention.__init__headsc                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r~   )lenr   r   rj   rk   r   r   ro   rp   r   r   r   rl   union)rA   r   indexr    r    r!   prune_heads  s   zMobileViTAttention.prune_headsr|   c                 C   s   |  |}| |}|S rE   )r   r   )rA   r|   self_outputsattention_outputr    r    r!   rF     rf   zMobileViTAttention.forward)rG   rH   rI   r   r   r7   r   r   rK   rL   rF   rM   r    r    rB   r!   r     s    r   c                       D   e Zd Zdedededdf fddZdejdejfd	d
Z  Z	S )MobileViTIntermediater$   ri   intermediate_sizer   Nc                    s@   t    t||| _t|jtrt|j | _	d S |j| _	d S rE   )
r6   r7   r   rm   r   r=   r@   r>   r	   intermediate_act_fnrA   r$   ri   r   rB   r    r!   r7   !  s
   
zMobileViTIntermediate.__init__r|   c                 C   r   rE   )r   r   r   r    r    r!   rF   )  rf   zMobileViTIntermediate.forwardrg   r    r    rB   r!   r          r   c                       sJ   e Zd Zdedededdf fddZdejd	ejdejfd
dZ  Z	S )MobileViTOutputr$   ri   r   r   Nc                    s*   t    t||| _t|j| _d S rE   r   r   rB   r    r!   r7   0  r   zMobileViTOutput.__init__r|   input_tensorc                 C   s    |  |}| |}|| }|S rE   r   )rA   r|   r   r    r    r!   rF   5  s   

zMobileViTOutput.forwardrg   r    r    rB   r!   r   /  s    $r   c                       r   )MobileViTTransformerLayerr$   ri   r   r   Nc                    sZ   t    t||| _t|||| _t|||| _tj	||j
d| _tj	||j
d| _d S )Nr2   )r6   r7   r   r   r   intermediater   r   r   	LayerNormlayer_norm_epslayernorm_beforelayernorm_afterr   rB   r    r!   r7   =  s   
z"MobileViTTransformerLayer.__init__r|   c                 C   s<   |  | |}|| }| |}| |}| ||}|S rE   )r   r   r   r   r   )rA   r|   r   layer_outputr    r    r!   rF   E  s   

z!MobileViTTransformerLayer.forwardrg   r    r    rB   r!   r   <  r   r   c                       r   )MobileViTTransformerr$   ri   r]   r   Nc                    sJ   t    t | _t|D ]}t||t||j d}| j	| qd S )N)ri   r   )
r6   r7   r   r^   r_   r`   r   r   	mlp_ratiora   )rA   r$   ri   r]   _transformer_layerrB   r    r!   r7   P  s   

zMobileViTTransformer.__init__r|   c                 C   rc   rE   rd   )rA   r|   re   r    r    r!   rF   \  rf   zMobileViTTransformer.forwardrg   r    r    rB   r!   r   O  s    r   c                       s   e Zd ZdZ	ddedededededed	ed
df fddZdejd
e	eje
f fddZdejde
d
ejfddZdejd
ejfddZ  ZS )MobileViTLayerzC
    MobileViT block: https://huggingface.co/papers/2110.02178
    r   r$   r%   r&   r(   ri   r]   r+   r   Nc                    s   t    |j| _|j| _|dkr,t||||dkr|nd|dkr$|d ndd| _|}nd | _t||||jd| _	t|||dddd| _
t|||d| _tj||jd| _t|||dd| _t|d| ||jd| _d S )	Nr   r   )r%   r&   r(   r+   rP   F)r%   r&   r'   r,   r-   )ri   r]   r   )r6   r7   
patch_sizepatch_widthpatch_heightrN   downsampling_layerr#   conv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projectionfusion)rA   r$   r%   r&   r(   ri   r]   r+   rB   r    r!   r7   g  sN   

	zMobileViTLayer.__init__rD   c                 C   sN  | j | j}}t|| }|j\}}}}tj r$tt|| | n
tt	|| | }	tj r?tt|| | n
tt	|| | }
d}|
|ksT|	|krbt
jj||	|
fddd}d}|
| }|	| }|| }||| | |||}|dd}|||||}|dd}||| |d}||f||||||d	}||fS )
NFbilinearrw   modealign_cornersTr   r   r   rv   )	orig_size
batch_sizechannelsinterpolatenum_patchesnum_patches_widthnum_patches_height)r   r   r   shaperK   jit
is_tracingr   ceilr   r   r   r   reshaper   )rA   rD   r   r   
patch_arear   r   orig_height
orig_width
new_height	new_widthr   num_patch_widthnum_patch_heightr   patches	info_dictr    r    r!   	unfolding  sH   	zMobileViTLayer.unfoldingr   r   c                 C   s   | j | j}}t|| }|d }|d }|d }|d }	|d }
| |||d}|dd}||| |	 |
||}|dd	}||||	| |
| }|d
 r_tjj	||d ddd}|S )Nr   r   r   r   r   rv   r   r   r   r   r   r   Fr   )
r   r   r   r   rx   r   r   r   r   r   )rA   r   r   r   r   r   r   r   r   r   r   rD   r    r    r!   folding  s*   zMobileViTLayer.foldingc                 C   s|   | j r|  |}|}| |}| |}| |\}}| |}| |}| ||}| |}| t	j
||fdd}|S Nr   r~   )r   r   r   r   r   r   r   r   r   rK   cat)rA   rD   rY   r   r   r    r    r!   rF     s   





zMobileViTLayer.forwardrZ   )rG   rH   rI   r[   r   r   r7   rK   rL   tupledictr   r   rF   rM   r    r    rB   r!   r   b  s.    	:3r   c                       sP   e Zd Zdeddf fddZ		ddejd	ed
edee	e
f fddZ  ZS )MobileViTEncoderr$   r   Nc           
   	      sX  t    || _t | _d| _d }}|jdkrd}d}n|jdkr%d}d}t||j	d |j	d ddd}| j
| t||j	d |j	d dd	d}| j
| t||j	d |j	d	 d|jd dd
}| j
| |rp|d9 }t||j	d	 |j	d d|jd d|d}| j
| |r|d9 }t||j	d |j	d d|jd d	|d}	| j
|	 d S )NFr   T   r   r   )r%   r&   r(   r]   r   r   )r%   r&   r(   ri   r]      )r%   r&   r(   ri   r]   r+      )r6   r7   r$   r   r^   r_   gradient_checkpointingoutput_strider\   neck_hidden_sizesra   r   hidden_sizes)
rA   r$   dilate_layer_4dilate_layer_5r+   layer_1layer_2layer_3layer_4layer_5rB   r    r!   r7     sx   



		zMobileViTEncoder.__init__FTr|   output_hidden_statesreturn_dictc                 C   s\   |rdnd }t | jD ]\}}||}|r||f }q|s(tdd ||fD S t||dS )Nr    c                 s   s    | ]	}|d ur|V  qd S rE   r    ).0vr    r    r!   	<genexpr>e  s    z+MobileViTEncoder.forward.<locals>.<genexpr>)last_hidden_stater|   )	enumerater_   r   r   )rA   r|   r   r   all_hidden_statesrb   re   r    r    r!   rF   V  s   
zMobileViTEncoder.forward)FT)rG   rH   rI   r   r7   rK   rL   rJ   r   r   r   rF   rM   r    r    rB   r!   r     s    M
r   c                   @   sD   e Zd ZeZdZdZdZdgZde	e
je
je
jf ddfdd	ZdS )
MobileViTPreTrainedModel	mobilevitpixel_valuesTr   moduler   Nc                 C   st   t |tjtjfr#|jjjd| jjd |j	dur!|j	j
  dS dS t |tjr8|j	j
  |jjd dS dS )zInitialize the weightsg        )meanstdNg      ?)r=   r   rm   r9   weightdatanormal_r$   initializer_ranger*   zero_r   fill_)rA   r   r    r    r!   _init_weightsr  s   
z&MobileViTPreTrainedModel._init_weights)rG   rH   rI   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr   r   rm   r9   r   r  r    r    r    r!   r   j  s    &r   c                       sl   e Zd Zddedef fddZdd Ze			dd	ee	j
 d
ee dee deeef fddZ  ZS )MobileViTModelTr$   expand_outputc                    sn   t  | || _|| _t||j|jd ddd| _t|| _	| jr1t||jd |jd dd| _
|   d	S )
aE  
        expand_output (`bool`, *optional*, defaults to `True`):
            Whether to expand the output of the model using a 1x1 convolution. If `True`, the model will apply an additional
            1x1 convolution to expand the output channels from `config.neck_hidden_sizes[5]` to `config.neck_hidden_sizes[6]`.
        r   r   r   )r%   r&   r'   r(   r      r   rP   N)r6   r7   r$   r  r#   num_channelsr   	conv_stemr   encoderconv_1x1_exp	post_init)rA   r$   r  rB   r    r!   r7     s&   
zMobileViTModel.__init__c                 C   sF   |  D ]\}}| jj| }t|tr |jjD ]}|j| qqdS )zPrunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        N)itemsr  r_   r=   r   r   r   r   )rA   heads_to_prunelayer_indexr   mobilevit_layerr   r    r    r!   _prune_heads  s   
zMobileViTModel._prune_headsNr   r   r   r   c           	      C   s   |d ur|n| j j}|d ur|n| j j}|d u rtd| |}| j|||d}| jr>| |d }tj	|ddgdd}n|d }d }|sY|d urN||fn|f}||dd   S t
|||jd	S )
Nz You have to specify pixel_valuesr   r   r   r}   rv   F)r   keepdimr   )r   pooler_outputr|   )r$   r   use_return_dictr8   r  r  r  r  rK   r   r   r|   )	rA   r   r   r   embedding_outputencoder_outputsr   pooled_outputr   r    r    r!   rF     s0   
zMobileViTModel.forward)T)NNN)rG   rH   rI   r   rJ   r7   r  r   r   rK   rL   r   r   r   rF   rM   r    r    rB   r!   r    s     

r  z
    MobileViT model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc                       sn   e Zd Zdeddf fddZe				ddeej dee	 deej d	ee	 de
eef f
d
dZ  ZS )MobileViTForImageClassificationr$   r   Nc                    sd   t  | |j| _t|| _tj|jdd| _|jdkr't	|j
d |jnt | _|   d S )NT)inplacer   rv   )r6   r7   
num_labelsr  r   r   rq   classifier_dropout_probrs   rm   r   Identity
classifierr  rA   r$   rB   r    r!   r7     s   
$z(MobileViTForImageClassification.__init__r   r   labelsr   c                 C   sh  |dur|n| j j}| j|||d}|r|jn|d }| | |}d}|dur| j jdu rS| jdkr9d| j _n| jdkrO|jt	j
ksJ|jt	jkrOd| j _nd| j _| j jdkrqt }	| jdkrk|	| | }n+|	||}n%| j jdkrt }	|	|d| j|d}n| j jdkrt }	|	||}|s|f|dd  }
|dur|f|
 S |
S t|||jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   
regressionsingle_label_classificationmulti_label_classificationrv   r   )losslogitsr|   )r$   r  r   r  r%  rs   problem_typer"  dtyperK   longr   r   squeezer   rx   r   r   r|   )rA   r   r   r'  r   outputsr  r,  r+  loss_fctr   r    r    r!   rF     s>   

"


z'MobileViTForImageClassification.forwardNNNN)rG   rH   rI   r   r7   r   r   rK   rL   rJ   r   r   r   rF   rM   r    r    rB   r!   r     s$    
r   c                       r   )MobileViTASPPPoolingr$   r%   r&   r   Nc              	      s4   t    tjdd| _t|||ddddd| _d S )Nr   )output_sizeTrelu)r%   r&   r'   r(   r,   r-   )r6   r7   r   AdaptiveAvgPool2dglobal_poolr#   r   )rA   r$   r%   r&   rB   r    r!   r7   $  s   
zMobileViTASPPPooling.__init__rD   c                 C   s:   |j dd  }| |}| |}tjj||ddd}|S )Nr}   r   Fr   )r   r8  r   r   r   r   )rA   rD   spatial_sizer    r    r!   rF   3  s
   

zMobileViTASPPPooling.forwardrg   r    r    rB   r!   r4  #  s    r4  c                       @   e Zd ZdZdeddf fddZdejdejfdd	Z  Z	S )
MobileViTASPPz
    ASPP module defined in DeepLab papers: https://huggingface.co/papers/1606.00915, https://huggingface.co/papers/1706.05587
    r$   r   Nc                    s   t     jd  jt jdkrtdt | _	t
 ddd}| j	| | j	 fdd jD  t }| j	| t
 d	 ddd| _tj jd
| _d S )Nr}   r   z"Expected 3 values for atrous_ratesr   r6  rQ   c              
      s    g | ]}t  d |ddqS )r   r6  )r%   r&   r'   r+   r-   )r#   )r   rater$   r%   r&   r    r!   
<listcomp>U  s    	z*MobileViTASPP.__init__.<locals>.<listcomp>r   )p)r6   r7   r   aspp_out_channelsr   atrous_ratesr8   r   r^   convsr#   ra   extendr4  projectrq   aspp_dropout_probrs   )rA   r$   in_projection
pool_layerrB   r=  r!   r7   @  s2   


	zMobileViTASPP.__init__rD   c                 C   sD   g }| j D ]	}||| qtj|dd}| |}| |}|S r   )rB  ra   rK   r   rD  rs   )rA   rD   pyramidconvpooled_featuresr    r    r!   rF   k  s   


zMobileViTASPP.forward
rG   rH   rI   r[   r   r7   rK   rL   rF   rM   r    r    rB   r!   r;  ;  s    +r;  c                       r:  )
MobileViTDeepLabV3zJ
    DeepLabv3 architecture: https://huggingface.co/papers/1706.05587
    r$   r   Nc              	      sB   t    t|| _t|j| _t||j	|j
ddddd| _d S )Nr   FT)r%   r&   r'   r,   r-   r*   )r6   r7   r;  asppr   	Dropout2dr#  rs   r#   r@  r"  r%  r&  rB   r    r!   r7   {  s   

zMobileViTDeepLabV3.__init__r|   c                 C   s&   |  |d }| |}| |}|S )Nrv   )rM  rs   r%  )rA   r|   rD   r    r    r!   rF     s   

zMobileViTDeepLabV3.forwardrK  r    r    rB   r!   rL  v  s    rL  zX
    MobileViT model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                       sn   e Zd Zdeddf fddZe				ddeej deej dee	 d	ee	 de
eef f
d
dZ  ZS ) MobileViTForSemanticSegmentationr$   r   Nc                    s8   t  | |j| _t|dd| _t|| _|   d S )NF)r  )r6   r7   r"  r  r   rL  segmentation_headr  r&  rB   r    r!   r7     s
   
z)MobileViTForSemanticSegmentation.__init__r   r'  r   r   c                 C   s  |dur|n| j j}|dur|n| j j}|dur"| j jdkr"td| j|d|d}|r/|jn|d }| |}d}|durYtj	j
||jdd ddd	}	t| j jd
}
|
|	|}|s{|rg|f|dd  }n	|f|dd  }|dury|f| S |S t|||r|jddS dddS )a{  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import requests
        >>> import torch
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, MobileViTForSemanticSegmentation

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-small")
        >>> model = MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTr  r}   r   Fr   )ignore_indexr   )r+  r,  r|   
attentions)r$   r   r  r"  r8   r   r|   rP  r   r   r   r   r   semantic_loss_ignore_indexr   )rA   r   r'  r   r   r1  encoder_hidden_statesr,  r+  upsampled_logitsr2  r   r    r    r!   rF     sB   $

z(MobileViTForSemanticSegmentation.forwardr3  )rG   rH   rI   r   r7   r   r   rK   rL   rJ   r   r   r   rF   rM   r    r    rB   r!   rO    s$    

rO  )r   rO  r  r   )r   N):r[   r   typingr   r   rK   torch.utils.checkpointr   torch.nnr   r   r   activationsr	   modeling_layersr
   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   configuration_mobilevitr   
get_loggerrG   loggerr   r"   Moduler#   rN   r\   rh   r   r   r   r   r   r   r   r   r   r  r   r4  r;  rL  rO  __all__r    r    r    r!   <module>   sZ   
 @03 *_UH;X