o
    eiJ                  	   @   s2  d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZmZmZ ddlmZ eeZd?dedededB defddZG dd dejZ G dd dejZ!G dd dejZ"G dd dejZ#G dd dejZ$G dd  d ejZ%G d!d" d"ejZ&G d#d$ d$ejZ'G d%d& d&ejZ(G d'd( d(ejZ)G d)d* d*eZ*G d+d, d,ejZ+eG d-d. d.eZ,eG d/d0 d0e,Z-ed1d2G d3d4 d4e,Z.G d5d6 d6ejZ/G d7d8 d8ejZ0G d9d: d:ejZ1ed;d2G d<d= d=e,Z2g d>Z3dS )@zPyTorch MobileViT model.    N)nn)CrossEntropyLoss   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel)auto_docstringlogging	torch_int   )MobileViTConfig   valuedivisor	min_valuereturnc                 C   sF   |du r|}t |t| |d  | | }|d|  k r||7 }t|S )zU
    Ensure that all layers have a channel count that is divisible by `divisor`.
    N   g?)maxint)r   r   r   	new_value r   n/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/mobilevit/modeling_mobilevit.pymake_divisible)   s   r   c                       sr   e Zd Z						ddededededed	ed
edededeeB ddf fddZdej	dej	fddZ
  ZS )MobileViTConvLayerr   FTconfigin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr   Nc                    s   t    t|d d | }|| dkr td| d| d|| dkr1td| d| dtj||||||||dd		| _|	rNtj|d
dddd| _nd | _|
rst	|
t
r_t|
 | _d S t	|jt
rmt|j | _d S |j| _d S d | _d S )Nr   r   r   zInput channels (z) are not divisible by z groups.zOutput channels (zeros)	r    r!   r"   r#   paddingr&   r$   r%   padding_modegh㈵>g?T)num_featuresepsmomentumaffinetrack_running_stats)super__init__r   
ValueErrorr   Conv2dconvolutionBatchNorm2dnormalization
isinstancestrr   
activation
hidden_act)selfr   r    r!   r"   r#   r$   r%   r&   r'   r(   r*   	__class__r   r   r2   7   sB   



zMobileViTConvLayer.__init__featuresc                 C   s6   |  |}| jd ur| |}| jd ur| |}|S N)r5   r7   r:   )r<   r?   r   r   r   forwardm   s   




zMobileViTConvLayer.forward)r   r   Fr   TT)__name__
__module____qualname__r   r   boolr9   r2   torchTensorrA   __classcell__r   r   r=   r   r   6   s>    	
6r   c                       sT   e Zd ZdZ	ddedededededd	f fd
dZdejdejfddZ	  Z
S )MobileViTInvertedResidualzY
    Inverted residual block (MobileNetv2): https://huggingface.co/papers/1801.04381
    r   r   r    r!   r#   r&   r   Nc              	      s   t    ttt||j d}|dvrtd| d|dko$||k| _t|||dd| _	t|||d|||d| _
t|||dd	d
| _d S )Nr   )r   r   zInvalid stride .r   r    r!   r"   r   )r    r!   r"   r#   r$   r&   Fr    r!   r"   r(   )r1   r2   r   r   roundexpand_ratior3   use_residualr   
expand_1x1conv_3x3
reduce_1x1)r<   r   r    r!   r#   r&   expanded_channelsr=   r   r   r2   {   s0   

z"MobileViTInvertedResidual.__init__r?   c                 C   s4   |}|  |}| |}| |}| jr|| S |S r@   )rP   rQ   rR   rO   )r<   r?   residualr   r   r   rA      s
   


z!MobileViTInvertedResidual.forwardr   )rB   rC   rD   __doc__r   r   r2   rF   rG   rA   rH   r   r   r=   r   rI   v   s"    !rI   c                       sP   e Zd Z	ddedededededdf fd	d
ZdejdejfddZ  Z	S )MobileViTMobileNetLayerr   r   r    r!   r#   
num_stagesr   Nc                    sR   t    t | _t|D ]}t||||dkr|ndd}| j| |}qd S )Nr   r   )r    r!   r#   )r1   r2   r   
ModuleListlayerrangerI   append)r<   r   r    r!   r#   rX   irZ   r=   r   r   r2      s   

z MobileViTMobileNetLayer.__init__r?   c                 C      | j D ]}||}q|S r@   rZ   )r<   r?   layer_moduler   r   r   rA         

zMobileViTMobileNetLayer.forward)r   r   
rB   rC   rD   r   r   r2   rF   rG   rA   rH   r   r   r=   r   rW      s     rW   c                       @   e Zd Zdededdf fddZdejdejfdd	Z  Z	S )
MobileViTSelfAttentionr   hidden_sizer   Nc                    s   t    ||j dkrtd| d|j d|j| _t||j | _| j| j | _tj|| j|j	d| _
tj|| j|j	d| _tj|| j|j	d| _t|j| _d S )Nr   zThe hidden size z4 is not a multiple of the number of attention heads rJ   )r%   )r1   r2   num_attention_headsr3   r   attention_head_sizeall_head_sizer   Linearqkv_biasquerykeyr   Dropoutattention_probs_dropout_probdropoutr<   r   re   r=   r   r   r2      s   
zMobileViTSelfAttention.__init__hidden_statesc                 C   s   |j \}}}| ||d| j| jdd}| ||d| j| jdd}| ||d| j| jdd}t	||dd}|t
| j }tjj|dd}	| |	}	t	|	|}
|
dddd }
|
 d d | jf }|
j| }
|
S )Nr   r   dimr   r   )shaperk   viewrf   rg   	transposerl   r   rF   matmulmathsqrtr   
functionalsoftmaxro   permute
contiguoussizerh   )r<   rq   
batch_size
seq_length_query_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shaper   r   r   rA      s,   

zMobileViTSelfAttention.forwardrb   r   r   r=   r   rd      s    rd   c                       rc   )
MobileViTSelfOutputr   re   r   Nc                    s*   t    t||| _t|j| _d S r@   r1   r2   r   ri   denserm   hidden_dropout_probro   rp   r=   r   r   r2         
zMobileViTSelfOutput.__init__rq   c                 C      |  |}| |}|S r@   r   ro   r<   rq   r   r   r   rA      ra   zMobileViTSelfOutput.forwardrb   r   r   r=   r   r          r   c                       rc   )
MobileViTAttentionr   re   r   Nc                    s&   t    t||| _t||| _d S r@   )r1   r2   rd   	attentionr   outputrp   r=   r   r   r2     s   
zMobileViTAttention.__init__rq   c                 C   s   |  |}| |}|S r@   )r   r   )r<   rq   self_outputsattention_outputr   r   r   rA     ra   zMobileViTAttention.forwardrb   r   r   r=   r   r     r   r   c                       D   e Zd Zdedededdf fddZdejdejfd	d
Z  Z	S )MobileViTIntermediater   re   intermediate_sizer   Nc                    s@   t    t||| _t|jtrt|j | _	d S |j| _	d S r@   )
r1   r2   r   ri   r   r8   r;   r9   r   intermediate_act_fnr<   r   re   r   r=   r   r   r2     s
   
zMobileViTIntermediate.__init__rq   c                 C   r   r@   )r   r   r   r   r   r   rA     ra   zMobileViTIntermediate.forwardrb   r   r   r=   r   r         r   c                       sJ   e Zd Zdedededdf fddZdejd	ejdejfd
dZ  Z	S )MobileViTOutputr   re   r   r   Nc                    s*   t    t||| _t|j| _d S r@   r   r   r=   r   r   r2     r   zMobileViTOutput.__init__rq   input_tensorc                 C   s    |  |}| |}|| }|S r@   r   )r<   rq   r   r   r   r   rA   #  s   

zMobileViTOutput.forwardrb   r   r   r=   r   r     s    $r   c                       r   )MobileViTTransformerLayerr   re   r   r   Nc                    sZ   t    t||| _t|||| _t|||| _tj	||j
d| _tj	||j
d| _d S )Nr-   )r1   r2   r   r   r   intermediater   r   r   	LayerNormlayer_norm_epslayernorm_beforelayernorm_afterr   r=   r   r   r2   +  s   
z"MobileViTTransformerLayer.__init__rq   c                 C   s<   |  | |}|| }| |}| |}| ||}|S r@   )r   r   r   r   r   )r<   rq   r   layer_outputr   r   r   rA   3  s   

z!MobileViTTransformerLayer.forwardrb   r   r   r=   r   r   *  r   r   c                       r   )MobileViTTransformerr   re   rX   r   Nc                    sJ   t    t | _t|D ]}t||t||j d}| j	| qd S )N)re   r   )
r1   r2   r   rY   rZ   r[   r   r   	mlp_ratior\   )r<   r   re   rX   r   transformer_layerr=   r   r   r2   >  s   

zMobileViTTransformer.__init__rq   c                 C   r^   r@   r_   )r<   rq   r`   r   r   r   rA   J  ra   zMobileViTTransformer.forwardrb   r   r   r=   r   r   =  s    r   c                       s   e Zd ZdZ	ddedededededed	ed
df fddZdejd
e	eje
f fddZdejde
d
ejfddZdejd
ejfddZ  ZS )MobileViTLayerzC
    MobileViT block: https://huggingface.co/papers/2110.02178
    r   r   r    r!   r#   re   rX   r&   r   Nc                    s   t    |j| _|j| _|dkr,t||||dkr|nd|dkr$|d ndd| _|}nd | _t||||jd| _	t|||dddd| _
t|||d| _tj||jd| _t|||dd| _t|d| ||jd| _d S )	Nr   r   )r    r!   r#   r&   rK   F)r    r!   r"   r'   r(   )re   rX   r   )r1   r2   
patch_sizepatch_widthpatch_heightrI   downsampling_layerr   conv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projectionfusion)r<   r   r    r!   r#   re   rX   r&   r=   r   r   r2   U  sN   

	zMobileViTLayer.__init__r?   c                 C   sN  | j | j}}t|| }|j\}}}}tj r$tt|| | n
tt	|| | }	tj r?tt|| | n
tt	|| | }
d}|
|ksT|	|krbt
jj||	|
fddd}d}|
| }|	| }|| }||| | |||}|dd}|||||}|dd}||| |d}||f||||||d	}||fS )
NFbilinearr   modealign_cornersTr   r   r   rr   )	orig_sizer   channelsinterpolatenum_patchesnum_patches_widthnum_patches_height)r   r   r   rv   rF   jit
is_tracingr   ceilrz   r   r|   r   reshaperx   )r<   r?   r   r   
patch_arear   r   orig_height
orig_width
new_height	new_widthr   num_patch_widthnum_patch_heightr   patches	info_dictr   r   r   	unfolding  sH   	zMobileViTLayer.unfoldingr   r   c                 C   s   | j | j}}t|| }|d }|d }|d }|d }	|d }
| |||d}|dd}||| |	 |
||}|dd	}||||	| |
| }|d
 r_tjj	||d ddd}|S )Nr   r   r   r   r   rr   r   r   r   r   r   r   Fr   )
r   r   r   r   rw   rx   r   r   r|   r   )r<   r   r   r   r   r   r   r   r   r   r   r?   r   r   r   folding  s*   zMobileViTLayer.foldingc                 C   s|   | j r|  |}|}| |}| |}| |\}}| |}| |}| ||}| |}| t	j
||fdd}|S Nr   rt   )r   r   r   r   r   r   r   r   r   rF   cat)r<   r?   rT   r   r   r   r   r   rA     s   





zMobileViTLayer.forwardrU   )rB   rC   rD   rV   r   r   r2   rF   rG   tupledictr   r   rA   rH   r   r   r=   r   r   P  s.    	:3r   c                
       sL   e Zd Zdeddf fddZ		ddejd	ed
edee	B fddZ
  ZS )MobileViTEncoderr   r   Nc           
   	      sX  t    || _t | _d| _d }}|jdkrd}d}n|jdkr%d}d}t||j	d |j	d ddd}| j
| t||j	d |j	d dd	d}| j
| t||j	d |j	d	 d|jd dd
}| j
| |rp|d9 }t||j	d	 |j	d d|jd d|d}| j
| |r|d9 }t||j	d |j	d d|jd d	|d}	| j
|	 d S )NFr   T   r   r   )r    r!   r#   rX   r   r   )r    r!   r#   re   rX      )r    r!   r#   re   rX   r&      )r1   r2   r   r   rY   rZ   gradient_checkpointingoutput_striderW   neck_hidden_sizesr\   r   hidden_sizes)
r<   r   dilate_layer_4dilate_layer_5r&   layer_1layer_2layer_3layer_4layer_5r=   r   r   r2     sx   



		zMobileViTEncoder.__init__FTrq   output_hidden_statesreturn_dictc                 C   s\   |rdnd }t | jD ]\}}||}|r||f }q|s(tdd ||fD S t||dS )Nr   c                 s   s    | ]	}|d ur|V  qd S r@   r   ).0vr   r   r   	<genexpr>S  s    z+MobileViTEncoder.forward.<locals>.<genexpr>)last_hidden_staterq   )	enumeraterZ   r   r   )r<   rq   r   r   all_hidden_statesr]   r`   r   r   r   rA   D  s   
zMobileViTEncoder.forward)FT)rB   rC   rD   r   r2   rF   rG   rE   r   r   rA   rH   r   r   r=   r   r     s    Mr   c                   @   sH   e Zd ZU eed< dZdZdZdZdgZ	e
 dejdd	fd
dZd	S )MobileViTPreTrainedModelr   	mobilevitpixel_values)imageTr   moduler   Nc                 C   s   t |tjtjtjfr?tj|jd| jj	d |j
dur!t|j
 t|dddur=t|j t|j t|j dS dS t |tjrSt|j
 t|j dS dS )zInitialize the weightsg        )meanstdNrunning_mean)r8   r   ri   r4   r6   initnormal_weightr   initializer_ranger%   zeros_getattrr   ones_running_varnum_batches_trackedr   )r<   r   r   r   r   _init_weightsa  s   
z&MobileViTPreTrainedModel._init_weights)rB   rC   rD   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modulesrF   no_gradr   Moduler   r   r   r   r   r   X  s   
 r   c                       s`   e Zd Zddedef fddZe			ddejdB dedB d	edB d
e	e
B fddZ  ZS )MobileViTModelTr   expand_outputc                    sn   t  | || _|| _t||j|jd ddd| _t|| _	| jr1t||jd |jd dd| _
|   d	S )
aE  
        expand_output (`bool`, *optional*, defaults to `True`):
            Whether to expand the output of the model using a 1x1 convolution. If `True`, the model will apply an additional
            1x1 convolution to expand the output channels from `config.neck_hidden_sizes[5]` to `config.neck_hidden_sizes[6]`.
        r   r   r   )r    r!   r"   r#   r      r   rK   N)r1   r2   r   r  r   num_channelsr   	conv_stemr   encoderconv_1x1_exp	post_init)r<   r   r  r=   r   r   r2   s  s&   
zMobileViTModel.__init__Nr   r   r   r   c           
      K   s   |d ur|n| j j}|d ur|n| j j}|d u rtd| |}| j|||d}| jr>| |d }tj	|ddgdd}n|d }d }|sY|d urN||fn|f}	|	|dd   S t
|||jd	S )
Nz You have to specify pixel_valuesr   r   r   rs   rr   F)ru   keepdimr   )r   pooler_outputrq   )r   r   use_return_dictr3   r
  r  r  r  rF   r   r	   rq   )
r<   r   r   r   kwargsembedding_outputencoder_outputsr   pooled_outputr   r   r   r   rA     s0   	
zMobileViTModel.forward)T)NNN)rB   rC   rD   r   rE   r2   r   rF   rG   r   r	   rA   rH   r   r   r=   r   r  q  s    r  z
    MobileViT model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc                       sj   e Zd Zdeddf fddZe				ddejdB dedB dejdB d	edB de	e
B f
d
dZ  ZS )MobileViTForImageClassificationr   r   Nc                    sd   t  | |j| _t|| _tj|jdd| _|jdkr't	|j
d |jnt | _|   d S )NT)inplacer   rr   )r1   r2   
num_labelsr  r   r   rm   classifier_dropout_probro   ri   r   Identity
classifierr  r<   r   r=   r   r   r2     s   
$z(MobileViTForImageClassification.__init__r   r   labelsr   c                 K   s   |dur|n| j j}| j|||d}|r|jn|d }| | |}d}	|dur1| ||| j }	|sG|f|dd  }
|	durE|	f|
 S |
S t|	||jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   r   )losslogitsrq   )	r   r  r   r  r  ro   loss_functionr
   rq   )r<   r   r   r  r   r  outputsr  r   r  r   r   r   r   rA     s   z'MobileViTForImageClassification.forwardNNNN)rB   rC   rD   r   r2   r   rF   rG   rE   r   r
   rA   rH   r   r   r=   r   r    s$    r  c                       r   )MobileViTASPPPoolingr   r    r!   r   Nc              	      s4   t    tjdd| _t|||ddddd| _d S )Nr   )output_sizeTrelu)r    r!   r"   r#   r'   r(   )r1   r2   r   AdaptiveAvgPool2dglobal_poolr   r   )r<   r   r    r!   r=   r   r   r2     s   
zMobileViTASPPPooling.__init__r?   c                 C   s:   |j dd  }| |}| |}tjj||ddd}|S )Nrs   r   Fr   )rv   r(  r   r   r|   r   )r<   r?   spatial_sizer   r   r   rA   
  s
   

zMobileViTASPPPooling.forwardrb   r   r   r=   r   r$    s    r$  c                       @   e Zd ZdZdeddf fddZdejdejfdd	Z  Z	S )
MobileViTASPPz
    ASPP module defined in DeepLab papers: https://huggingface.co/papers/1606.00915, https://huggingface.co/papers/1706.05587
    r   r   Nc                    s   t     jd  jt jdkrtdt | _	t
 ddd}| j	| | j	 fdd jD  t }| j	| t
 d	 ddd| _tj jd
| _d S )Nrs   r   z"Expected 3 values for atrous_ratesr   r&  rL   c              
      s    g | ]}t  d |ddqS )r   r&  )r    r!   r"   r&   r(   )r   )r   rater   r    r!   r   r   
<listcomp>,  s    	z*MobileViTASPP.__init__.<locals>.<listcomp>r   )p)r1   r2   r   aspp_out_channelslenatrous_ratesr3   r   rY   convsr   r\   extendr$  projectrm   aspp_dropout_probro   )r<   r   in_projection
pool_layerr=   r-  r   r2     s2   


	zMobileViTASPP.__init__r?   c                 C   sD   g }| j D ]	}||| qtj|dd}| |}| |}|S r   )r3  r\   rF   r   r5  ro   )r<   r?   pyramidconvpooled_featuresr   r   r   rA   B  s   


zMobileViTASPP.forward
rB   rC   rD   rV   r   r2   rF   rG   rA   rH   r   r   r=   r   r+    s    +r+  c                       r*  )
MobileViTDeepLabV3zJ
    DeepLabv3 architecture: https://huggingface.co/papers/1706.05587
    r   r   Nc              	      sB   t    t|| _t|j| _t||j	|j
ddddd| _d S )Nr   FT)r    r!   r"   r'   r(   r%   )r1   r2   r+  asppr   	Dropout2dr  ro   r   r0  r  r  r  r=   r   r   r2   R  s   

zMobileViTDeepLabV3.__init__rq   c                 C   s&   |  |d }| |}| |}|S )Nrr   )r>  ro   r  )r<   rq   r?   r   r   r   rA   b  s   

zMobileViTDeepLabV3.forwardr<  r   r   r=   r   r=  M  s    r=  zX
    MobileViT model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                       sj   e Zd Zdeddf fddZe				ddejdB dejdB dedB d	edB de	e
B f
d
dZ  ZS ) MobileViTForSemanticSegmentationr   r   Nc                    s8   t  | |j| _t|dd| _t|| _|   d S )NF)r  )r1   r2   r  r  r   r=  segmentation_headr  r  r=   r   r   r2   o  s
   
z)MobileViTForSemanticSegmentation.__init__r   r  r   r   c                 K   s  |dur|n| j j}|dur|n| j j}|dur"| j jdkr"td| j|d|d}|r/|jn|d }| |}d}	|durYtj	j
||jdd ddd	}
t| j jd
}||
|}	|s{|rg|f|dd  }n	|f|dd  }|	dury|	f| S |S t|	||r|jddS dddS )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> import torch
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, MobileViTForSemanticSegmentation

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-small")
        >>> model = MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTr  rs   r   Fr   )ignore_indexr   )r  r   rq   
attentions)r   r   r  r  r3   r   rq   rA  r   r|   r   rv   r   semantic_loss_ignore_indexr   )r<   r   r  r   r   r  r"  encoder_hidden_statesr   r  upsampled_logitsloss_fctr   r   r   r   rA   y  sB   '

z(MobileViTForSemanticSegmentation.forwardr#  )rB   rC   rD   r   r2   r   rF   rG   rE   r   r   rA   rH   r   r   r=   r   r@  i  s$    
r@  )r  r@  r  r   )r   N)4rV   rz   rF   r   torch.nnr    r   r   activationsr   modeling_layersr   modeling_outputsr   r	   r
   r   modeling_utilsr   utilsr   r   r   configuration_mobilevitr   
get_loggerrB   loggerr   r   r  r   rI   rW   rd   r   r   r   r   r   r   r   r   r   r  r  r$  r+  r=  r@  __all__r   r   r   r   <module>   sV   
 @09 *_L6;[