o
    ei                  	   @   s  d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZmZmZmZ dd
lmZ ddlmZmZ ddlmZ eeZd?dedededB defddZededfdedededefddZG dd dejZ G dd dejZ!G dd  d ejZ"G d!d" d"ejZ#G d#d$ d$ejZ$G d%d& d&ejZ%G d'd( d(ejZ&G d)d* d*eZ'G d+d, d,ejZ(eG d-d. d.eZ)eG d/d0 d0e)Z*ed1d2G d3d4 d4e)Z+G d5d6 d6ejZ,G d7d8 d8ejZ-G d9d: d:ejZ.ed;d2G d<d= d=e)Z/g d>Z0dS )@zPyTorch MobileViTV2 model.    N)nn)CrossEntropyLoss   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel)auto_docstringlogging   )MobileViTV2Config   valuedivisor	min_valuereturnc                 C   sF   |du r|}t |t| |d  | | }|d|  k r||7 }t|S )zU
    Ensure that all layers have a channel count that is divisible by `divisor`.
    N   g?)maxint)r   r   r   	new_value r   r/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/mobilevitv2/modeling_mobilevitv2.pymake_divisible(   s   r   z-infinfmin_valmax_valc                 C   s   t |t|| S N)r   minr   r   r   r   r   r   clip5   s   r#   c                       sr   e Zd Z						ddededededed	ed
edededeeB ddf fddZdej	dej	fddZ
  ZS )MobileViTV2ConvLayerr   FTconfigin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr   Nc                    s   t    t|d d | }|| dkr td| d| d|| dkr1td| d| dtj||||||||dd		| _|	rNtj|d
dddd| _nd | _|
rst	|
t
r_t|
 | _d S t	|jt
rmt|j | _d S |j| _d S d | _d S )Nr   r   r   zInput channels (z) are not divisible by z groups.zOutput channels (zeros)	r&   r'   r(   r)   paddingr,   r*   r+   padding_modegh㈵>g?T)num_featuresepsmomentumaffinetrack_running_stats)super__init__r   
ValueErrorr   Conv2dconvolutionBatchNorm2dnormalization
isinstancestrr   
activation
hidden_act)selfr%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r0   	__class__r   r   r8   ;   sB   



zMobileViTV2ConvLayer.__init__featuresc                 C   s6   |  |}| jd ur| |}| jd ur| |}|S r    )r;   r=   r@   )rB   rE   r   r   r   forwardq   s   




zMobileViTV2ConvLayer.forward)r   r   Fr   TT)__name__
__module____qualname__r   r   boolr?   r8   torchTensorrF   __classcell__r   r   rC   r   r$   :   s>    	
6r$   c                       sT   e Zd ZdZ	ddedededededd	f fd
dZdejdejfddZ	  Z
S )MobileViTV2InvertedResidualzY
    Inverted residual block (MobileNetv2): https://huggingface.co/papers/1801.04381
    r   r%   r&   r'   r)   r,   r   Nc              	      s   t    ttt||j d}|dvrtd| d|dko$||k| _t|||dd| _	t|||d|||d| _
t|||dd	d
| _d S )Nr   )r   r   zInvalid stride .r   )r&   r'   r(   r   )r&   r'   r(   r)   r*   r,   Fr&   r'   r(   r.   )r7   r8   r   r   roundexpand_ratior9   use_residualr$   
expand_1x1conv_3x3
reduce_1x1)rB   r%   r&   r'   r)   r,   expanded_channelsrC   r   r   r8      s0   

z$MobileViTV2InvertedResidual.__init__rE   c                 C   s4   |}|  |}| |}| |}| jr|| S |S r    )rT   rU   rV   rS   )rB   rE   residualr   r   r   rF      s
   


z#MobileViTV2InvertedResidual.forward)r   rG   rH   rI   __doc__r   r   r8   rK   rL   rF   rM   r   r   rC   r   rN   {   s"    !rN   c                       sP   e Zd Z	ddedededededdf fd	d
ZdejdejfddZ  Z	S )MobileViTV2MobileNetLayerr   r%   r&   r'   r)   
num_stagesr   Nc                    sR   t    t | _t|D ]}t||||dkr|ndd}| j| |}qd S )Nr   r   )r&   r'   r)   )r7   r8   r   
ModuleListlayerrangerN   append)rB   r%   r&   r'   r)   r\   ir^   rC   r   r   r8      s   

z"MobileViTV2MobileNetLayer.__init__rE   c                 C      | j D ]}||}q|S r    r^   )rB   rE   layer_moduler   r   r   rF         

z!MobileViTV2MobileNetLayer.forward)r   r   
rG   rH   rI   r   r   r8   rK   rL   rF   rM   r   r   rC   r   r[      s     r[   c                       sD   e Zd ZdZdededdf fddZdejdejfd	d
Z	  Z
S )MobileViTV2LinearSelfAttentionay  
    This layer applies a self-attention with linear complexity, as described in MobileViTV2 paper:
    https://huggingface.co/papers/2206.02680

    Args:
        config (`MobileVitv2Config`):
             Model configuration object
        embed_dim (`int`):
            `input_channels` from an expected input of size :math:`(batch_size, input_channels, height, width)`
    r%   	embed_dimr   Nc              	      s\   t    t||dd|  ddddd| _tj|jd| _t|||ddddd| _|| _d S )Nr   r   TF)r%   r&   r'   r+   r(   r-   r.   p)	r7   r8   r$   qkv_projr   Dropoutattn_dropoutout_projrh   )rB   r%   rh   rC   r   r   r8      s*   



	z'MobileViTV2LinearSelfAttention.__init__hidden_statesc           	      C   s   |  |}tj|d| j| jgdd\}}}tjjj|dd}| |}|| }tj|ddd}tjj	||
| }| |}|S )Nr   )split_size_or_sectionsdimrq   Trq   keepdim)rk   rK   splitrh   r   
functionalsoftmaxrm   sumrelu	expand_asrn   )	rB   ro   qkvquerykeyr   context_scorescontext_vectoroutr   r   r   rF      s   
 

z&MobileViTV2LinearSelfAttention.forwardrY   r   r   rC   r   rg      s    rg   c                       L   e Zd Z	ddededededdf
 fdd	Zd
ejdejfddZ	  Z
S )MobileViTV2FFN        r%   rh   ffn_latent_dimffn_dropoutr   Nc              
      sZ   t    t|||dddddd| _t|| _t|||dddddd| _t|| _d S )Nr   TF)r%   r&   r'   r(   r)   r+   r-   r.   )	r7   r8   r$   conv1r   rl   dropout1conv2dropout2)rB   r%   rh   r   r   rC   r   r   r8     s.   


zMobileViTV2FFN.__init__ro   c                 C   s,   |  |}| |}| |}| |}|S r    )r   r   r   r   )rB   ro   r   r   r   rF   #  s
   



zMobileViTV2FFN.forwardr   rG   rH   rI   r   r   floatr8   rK   rL   rF   rM   r   r   rC   r   r     s     r   c                       r   )MobileViTV2TransformerLayerr   r%   rh   r   dropoutr   Nc                    sb   t    tjd||jd| _t||| _tj|d| _	tjd||jd| _
t||||j| _d S )Nr   
num_groupsnum_channelsr3   ri   )r7   r8   r   	GroupNormlayer_norm_epslayernorm_beforerg   	attentionrl   r   layernorm_afterr   r   ffn)rB   r%   rh   r   r   rC   r   r   r8   ,  s   
z$MobileViTV2TransformerLayer.__init__ro   c                 C   s<   |  |}| |}|| }| |}| |}|| }|S r    )r   r   r   r   )rB   ro   layernorm_1_outattention_outputlayer_outputr   r   r   rF   :  s   



z#MobileViTV2TransformerLayer.forwardr   r   r   r   rC   r   r   +  s    r   c                       D   e Zd Zdedededdf fddZdejdejfd	d
Z  Z	S )MobileViTV2Transformerr%   n_layersd_modelr   Nc                    sf   t    |j}|| g| }dd |D }t | _t|D ]}t|||| d}| j| qd S )Nc                 S   s   g | ]
}t |d  d  qS )   )r   ).0dr   r   r   
<listcomp>O  s    z3MobileViTV2Transformer.__init__.<locals>.<listcomp>)rh   r   )	r7   r8   ffn_multiplierr   r]   r^   r_   r   r`   )rB   r%   r   r   r   ffn_dims	block_idxtransformer_layerrC   r   r   r8   G  s   


zMobileViTV2Transformer.__init__ro   c                 C   rb   r    rc   )rB   ro   rd   r   r   r   rF   X  re   zMobileViTV2Transformer.forwardrf   r   r   rC   r   r   F  s    r   c                       s   e Zd ZdZ			ddededededed	ed
eddf fddZdejde	eje	eef f fddZ
dejde	eef dejfddZdejdejfddZ  ZS )MobileViTV2LayerzE
    MobileViTV2 layer: https://huggingface.co/papers/2206.02680
    r   r   r%   r&   r'   attn_unit_dimn_attn_blocksr,   r)   r   Nc           	         s   t    |j| _|j| _|}|dkr.t||||dkr|nd|dkr&|d ndd| _|}nd | _t||||j|d| _	t|||dddd| _
t|||d| _tjd||jd| _t|||dd	dd| _d S )
Nr   r   )r&   r'   r)   r,   )r&   r'   r(   r*   F)r&   r'   r(   r-   r.   )r   r   r   T)r7   r8   
patch_sizepatch_widthpatch_heightrN   downsampling_layerr$   conv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projection)	rB   r%   r&   r'   r   r   r,   r)   cnn_out_dimrC   r   r   r8   c  sN   


zMobileViTV2Layer.__init__feature_mapc                 C   sT   |j \}}}}tjj|| j| jf| j| jfd}|||| j| j d}|||ffS )N)r(   r)   rr   )shaper   rw   unfoldr   r   reshape)rB   r   
batch_sizer&   
img_height	img_widthpatchesr   r   r   	unfolding  s   

zMobileViTV2Layer.unfoldingr   output_sizec                 C   sH   |j \}}}}|||| |}tjj||| j| jf| j| jfd}|S )N)r   r(   r)   )r   r   r   rw   foldr   r   )rB   r   r   r   in_dimr   	n_patchesr   r   r   r   folding  s   

zMobileViTV2Layer.foldingrE   c                 C   s`   | j r|  |}| |}| |}| |\}}| |}| |}| ||}| |}|S r    )r   r   r   r   r   r   r   r   )rB   rE   r   r   r   r   r   rF     s   





zMobileViTV2Layer.forward)r   r   r   )rG   rH   rI   rZ   r   r   r8   rK   rL   tupler   r   rF   rM   r   r   rC   r   r   ^  s2    
	&="r   c                
       sL   e Zd Zdeddf fddZ		ddejd	ed
edee	B fddZ
  ZS )MobileViTV2Encoderr%   r   Nc                    s  t    || _t | _d| _d }}|jdkrd}d}n|jdkr%d}d}tt	d|j
 dddddd	}td|j
 dd
}td|j
 dd
}td|j
 dd
}td|j
 dd
}	td|j
 dd
}
t|||ddd}| j| t|||ddd}| j| t|||t|jd |j
 dd
|jd d}| j| |r|d9 }t|||	t|jd |j
 dd
|jd |d}| j| |r|d9 }t||	|
t|jd |j
 dd
|jd |d}| j| d S )NFr   Tr   r       @   r"   r   r   r         i     )r&   r'   r)   r\   r   r   )r&   r'   r   r   )r&   r'   r   r   r,   )r7   r8   r%   r   r]   r^   gradient_checkpointingoutput_strider   r#   width_multiplierr[   r`   r   base_attn_unit_dimsr   )rB   r%   dilate_layer_4dilate_layer_5r,   layer_0_dimlayer_1_dimlayer_2_dimlayer_3_dimlayer_4_dimlayer_5_dimlayer_1layer_2layer_3layer_4layer_5rC   r   r   r8     s   



zMobileViTV2Encoder.__init__FTro   output_hidden_statesreturn_dictc                 C   s\   |rdnd }t | jD ]\}}||}|r||f }q|s(tdd ||fD S t||dS )Nr   c                 s   s    | ]	}|d ur|V  qd S r    r   )r   vr   r   r   	<genexpr>1  s    z-MobileViTV2Encoder.forward.<locals>.<genexpr>)last_hidden_statero   )	enumerater^   r   r   )rB   ro   r   r   all_hidden_statesra   rd   r   r   r   rF   "  s   
zMobileViTV2Encoder.forward)FT)rG   rH   rI   r   r8   rK   rL   rJ   r   r   rF   rM   r   r   rC   r   r     s    Tr   c                   @   sH   e Zd ZU eed< dZdZdZdZdgZ	e
 dejdd	fd
dZd	S )MobileViTV2PreTrainedModelr%   mobilevitv2pixel_values)imageTr   moduler   Nc                 C   s   t |tjtjtjfr?tj|jd| jj	d |j
dur!t|j
 t|dddur=t|j t|j t|j dS dS t |tjrSt|j
 t|j dS dS )zInitialize the weightsr   )meanstdNrunning_mean)r>   r   Linearr:   r<   initnormal_weightr%   initializer_ranger+   zeros_getattrr   ones_running_varnum_batches_trackedr   )rB   r   r   r   r   _init_weights?  s   
z(MobileViTV2PreTrainedModel._init_weights)rG   rH   rI   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modulesrK   no_gradr   Moduler   r   r   r   r   r   6  s   
 r   c                       s`   e Zd Zddedef fddZe			ddejdB dedB d	edB d
e	e
B fddZ  ZS )MobileViTV2ModelTr%   expand_outputc              	      sf   t  | || _|| _ttd|j dddddd}t||j|ddd	d	d
| _	t
|| _|   dS )a  
        expand_output (`bool`, *optional*, defaults to `True`):
            Whether to expand the output of the model. If `True`, the model will output pooled features in addition to
            hidden states. If `False`, only the hidden states will be returned.
        r   r   r   r"   r   r   r   r   Tr&   r'   r(   r)   r-   r.   N)r7   r8   r%   r  r   r#   r   r$   r   	conv_stemr   encoder	post_init)rB   r%   r  r   rC   r   r   r8   Q  s"   
	zMobileViTV2Model.__init__Nr   r   r   r   c           
      K   s   |d ur|n| j j}|d ur|n| j j}|d u rtd| |}| j|||d}| jr;|d }tj|ddgdd}n|d }d }|sV|d urK||fn|f}	|	|dd   S t	|||j
d	S )
Nz You have to specify pixel_valuesr   r   r   rr   Frt   r   )r   pooler_outputro   )r%   r   use_return_dictr9   r  r  r  rK   r   r	   ro   )
rB   r   r   r   kwargsembedding_outputencoder_outputsr   pooled_outputoutputr   r   r   rF   m  s0   	
zMobileViTV2Model.forward)T)NNN)rG   rH   rI   r   rJ   r8   r   rK   rL   r   r	   rF   rM   r   r   rC   r   r   O  s    r   z
    MobileViTV2 model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc                       sj   e Zd Zdeddf fddZe				ddejdB dedB dejdB d	edB de	e
B f
d
dZ  ZS )!MobileViTV2ForImageClassificationr%   r   Nc                    s`   t  | |j| _t|| _td|j dd}|jdkr%tj||jdnt	 | _
|   d S )Nr   r   r   r   )in_featuresout_features)r7   r8   
num_labelsr   r   r   r   r   r   Identity
classifierr  )rB   r%   r'   rC   r   r   r8     s   

z*MobileViTV2ForImageClassification.__init__r   r   labelsr   c                 K   s   |dur|n| j j}| j|||d}|r|jn|d }| |}d}	|dur.| ||| j }	|sD|f|dd  }
|	durB|	f|
 S |
S t|	||jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   r   )losslogitsro   )r%   r	  r   r  r  loss_functionr
   ro   )rB   r   r   r  r   r
  outputsr  r  r  r  r   r   r   rF     s   
z)MobileViTV2ForImageClassification.forwardNNNN)rG   rH   rI   r   r8   r   rK   rL   rJ   r   r
   rF   rM   r   r   rC   r   r    s$    r  c                       r   )MobileViTV2ASPPPoolingr%   r&   r'   r   Nc              	      s4   t    tjdd| _t|||ddddd| _d S )Nr   )r   Trz   r  )r7   r8   r   AdaptiveAvgPool2dglobal_poolr$   r   )rB   r%   r&   r'   rC   r   r   r8     s   
zMobileViTV2ASPPPooling.__init__rE   c                 C   s:   |j dd  }| |}| |}tjj||ddd}|S )Nr  bilinearFsizemodealign_corners)r   r  r   r   rw   interpolate)rB   rE   spatial_sizer   r   r   rF     s
   

zMobileViTV2ASPPPooling.forwardrf   r   r   rC   r   r    s    r  c                       @   e Zd ZdZdeddf fddZdejdejfdd	Z  Z	S )
MobileViTV2ASPPz
    ASPP module defined in DeepLab papers: https://huggingface.co/papers/1606.00915, https://huggingface.co/papers/1706.05587
    r%   r   Nc                    s   t    td j dd}| jt jdkrtdt	 | _
t ddd}| j
| | j
 fd	d
 jD  t }| j
| t d ddd| _tj jd| _d S )Nr   r   r   r   z"Expected 3 values for atrous_ratesr   rz   rP   c              
      s    g | ]}t  d |ddqS )r   rz   )r&   r'   r(   r,   r.   )r$   )r   rater%   r&   r'   r   r   r     s    	z,MobileViTV2ASPP.__init__.<locals>.<listcomp>   ri   )r7   r8   r   r   aspp_out_channelslenatrous_ratesr9   r   r]   convsr$   r`   extendr  projectrl   aspp_dropout_probr   )rB   r%   encoder_out_channelsin_projection
pool_layerrC   r)  r   r8     s4   

	zMobileViTV2ASPP.__init__rE   c                 C   sD   g }| j D ]	}||| qtj|dd}| |}| |}|S )Nr   rs   )r.  r`   rK   catr0  r   )rB   rE   pyramidconvpooled_featuresr   r   r   rF   !  s   


zMobileViTV2ASPP.forward
rG   rH   rI   rZ   r   r8   rK   rL   rF   rM   r   r   rC   r   r'    s    ,r'  c                       r&  )
MobileViTV2DeepLabV3zJ
    DeepLabv3 architecture: https://huggingface.co/papers/1706.05587
    r%   r   Nc              	      sB   t    t|| _t|j| _t||j	|j
ddddd| _d S )Nr   FT)r&   r'   r(   r-   r.   r+   )r7   r8   r'  asppr   	Dropout2dclassifier_dropout_probr   r$   r+  r  r  rB   r%   rC   r   r   r8   2  s   

zMobileViTV2DeepLabV3.__init__ro   c                 C   s&   |  |d }| |}| |}|S )Nrr   )r;  r   r  )rB   ro   rE   r   r   r   rF   B  s   

zMobileViTV2DeepLabV3.forwardr9  r   r   rC   r   r:  -  s    r:  zZ
    MobileViTV2 model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                       sj   e Zd Zdeddf fddZe				ddejdB dejdB dedB d	edB de	e
B f
d
dZ  ZS )"MobileViTV2ForSemanticSegmentationr%   r   Nc                    s8   t  | |j| _t|dd| _t|| _|   d S )NF)r  )r7   r8   r  r   r   r:  segmentation_headr  r>  rC   r   r   r8   O  s
   
z+MobileViTV2ForSemanticSegmentation.__init__r   r  r   r   c                 K   s  |dur|n| j j}|dur|n| j j}|dur"| j jdkr"td| j|d|d}|r/|jn|d }| |}d}	|durYtj	j
||jdd ddd	}
t| j jd
}||
|}	|s{|rg|f|dd  }n	|f|dd  }|	dury|	f| S |S t|	||r|jddS dddS )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> import torch
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, MobileViTV2ForSemanticSegmentation

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256")
        >>> model = MobileViTV2ForSemanticSegmentation.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTr  r  r  Fr   )ignore_indexr   )r  r  ro   
attentions)r%   r   r	  r  r9   r   ro   r@  r   rw   r$  r   r   semantic_loss_ignore_indexr   )rB   r   r  r   r   r
  r  encoder_hidden_statesr  r  upsampled_logitsloss_fctr  r   r   r   rF   Y  sB   '

z*MobileViTV2ForSemanticSegmentation.forwardr  )rG   rH   rI   r   r8   r   rK   rL   rJ   r   r   rF   rM   r   r   rC   r   r?  I  s$    
r?  )r  r?  r   r   )r   N)1rZ   rK   r   torch.nnr    r   r   activationsr   modeling_layersr   modeling_outputsr   r	   r
   r   modeling_utilsr   utilsr   r   configuration_mobilevitv2r   
get_loggerrG   loggerr   r   r   r#   r   r$   rN   r[   rg   r   r   r   r   r   r   r   r  r  r'  r:  r?  __all__r   r   r   r   <module>   sN   
 (A1?)rfI9=[