o
    i                  	   @   sT  d Z ddlZddlZddlmZ ddlmZmZ ddl	Z	ddl	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ eeZeeddG dd deZeeddG dd deZeeddG dd deZ eeddG dd deZ!G dd de
j"Z#G dd de
j"Z$dBd"e	j%d#e&d$e'd%e	j%fd&d'Z(G d(d) d)e
j"Z)G d*d+ d+e
j"Z*G d,d- d-e
j"Z+G d.d/ d/e
j"Z,G d0d1 d1eZ-G d2d3 d3e
j"Z.eG d4d5 d5eZ/eG d6d7 d7e/Z0ed8dG d9d: d:e/Z1ed;dG d<d= d=e/Z2ed>dG d?d@ d@e/eZ3g dAZ4dS )CzPyTorch FocalNet model.    N)	dataclass)OptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BackboneOutput)PreTrainedModel)ModelOutputauto_docstringlogging)BackboneMixin   )FocalNetConfigzC
    FocalNet encoder's outputs, with potential hidden states.
    )custom_introc                   @   sP   e Zd ZU dZdZeej ed< dZ	ee
ej  ed< dZee
ej  ed< dS )FocalNetEncoderOutputa  
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlast_hidden_statehidden_statesreshaped_hidden_states)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   tupler    r   r   b/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/focalnet/modeling_focalnet.pyr   %   s
   
 	r   zZ
    FocalNet model's outputs that also contains a pooling of the last hidden states.
    c                   @   b   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeej  ed< dZeeej  ed< dS )FocalNetModelOutputa  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
        Average pooling of the last layer hidden-state.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nr   pooler_outputr   r   )r   r   r   r   r   r   r   r   r   r"   r   r   r   r   r   r   r   r!   :   s   
 r!   z.
    FocalNet masked image model outputs.
    c                   @   r    )!FocalNetMaskedImageModelingOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
        Masked image modeling (MLM) loss.
    reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Reconstructed pixel values.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlossreconstructionr   r   )r   r   r   r   r$   r   r   r   r   r%   r   r   r   r   r   r   r   r#   R      
 r#   z4
    FocalNet outputs for image classification.
    c                   @   r    )FocalNetImageClassifierOutputa7  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nr$   logitsr   r   )r   r   r   r   r$   r   r   r   r   r(   r   r   r   r   r   r   r   r'   l   r&   r'   c                       sN   e Zd ZdZd fdd	Z	ddeej deej de	ej
 fd	d
Z  ZS )FocalNetEmbeddingszX
    Construct the patch embeddings and layernorm. Optionally, also the mask token.
    Fc              	      s|   t    t||j|j|j|j|jdd| _| jj	| _
|r(ttdd|jnd | _tj|j|jd| _t|j| _d S )NT)config
image_size
patch_sizenum_channels	embed_dimuse_conv_embedis_stemr   eps)super__init__FocalNetPatchEmbeddingsr+   r,   r-   r.   r/   patch_embeddings	grid_size
patch_gridr   	Parameterr   zeros
mask_token	LayerNormlayer_norm_epsnormDropouthidden_dropout_probdropout)selfr*   use_mask_token	__class__r   r   r4      s   

	 zFocalNetEmbeddings.__init__Npixel_valuesbool_masked_posreturnc           
      C   st   |  |\}}| |}| \}}}|d ur1| j||d}|d|}	|d|	  ||	  }| |}||fS )N      ?)r6   r>   sizer;   expand	unsqueezetype_asrA   )
rB   rF   rG   
embeddingsoutput_dimensions
batch_sizeseq_len_mask_tokensmaskr   r   r   forward   s   

zFocalNetEmbeddings.forward)FN)r   r   r   r   r4   r   r   r   
BoolTensorr   TensorrV   __classcell__r   r   rD   r   r)      s    r)   c                       sR   e Zd Z			d
 fdd	Zdd Zdeej deej	ee
 f fdd	Z  ZS )r5   Fc	                    s
  t    t|tjjr|n||f}t|tjjr|n||f}|d |d  |d |d   }	|| _|| _|| _|	| _	|d |d  |d |d  f| _
|ri|rWd}
d}d}nd}
d}d}tj|||
||d| _n
tj||||d| _|rtj||jd	| _d S d | _d S )
Nr   r            r   )kernel_sizestridepadding)r^   r_   r1   )r3   r4   
isinstancecollectionsabcIterabler+   r,   r-   num_patchesr7   r   Conv2d
projectionr<   r=   r>   )rB   r*   r+   r,   r-   r.   add_normr/   r0   re   r^   r`   r_   rD   r   r   r4      s0   
 "


z FocalNetPatchEmbeddings.__init__c                 C   s   || j d  dkrd| j d || j d   f}tj||}|| j d  dkr>ddd| j d || j d   f}tj||}|S )Nr   r   )r,   r   
functionalpad)rB   rF   heightwidth
pad_valuesr   r   r   	maybe_pad   s    z!FocalNetPatchEmbeddings.maybe_padrF   rH   c                 C   s|   |j \}}}}|| jkrtd| |||}| |}|j \}}}}||f}|ddd}| jd ur:| |}||fS )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r\   r   )shaper-   
ValueErrorrn   rg   flatten	transposer>   )rB   rF   rS   r-   rk   rl   rO   rP   r   r   r   rV      s   



zFocalNetPatchEmbeddings.forward)FFF)r   r   r   r4   rn   r   r   r   r   rY   intrV   rZ   r   r   rD   r   r5      s    *.	r5           Finput	drop_probtrainingrH   c                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    rt   r   r   )r   )dtypedevice)ro   ndimr   randrx   ry   floor_div)ru   rv   rw   	keep_probro   random_tensoroutputr   r   r   	drop_path   s   
r   c                       sT   e Zd ZdZddee ddf fddZdejdejfdd	Z	de
fd
dZ  ZS )FocalNetDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nrv   rH   c                    s   t    || _d S rW   )r3   r4   rv   )rB   rv   rD   r   r   r4     s   

zFocalNetDropPath.__init__r   c                 C   s   t || j| jS rW   )r   rv   rw   )rB   r   r   r   r   rV     s   zFocalNetDropPath.forwardc                 C   s   d| j  S )Nzp=)rv   rB   r   r   r   
extra_repr  s   zFocalNetDropPath.extra_reprrW   )r   r   r   r   r   floatr4   r   rY   rV   strr   rZ   r   r   rD   r   r     s
    r   c                       s&   e Zd Zd fdd	Zdd Z  ZS )	FocalNetModulationr\   Trt   c           	         s$  t    || _|j| | _|j| | _|| _|j| _|j	| _	t
j|d| | jd  |d| _t
j||dd|d| _t
 | _t
||| _t
|| _t
 | _g | _t| jD ](}| j| | j }| jt
t
j|||d||d ddt
  | j| qY| jrt
j||jd| _d S d S )Nr\   r   )bias)r^   r_   r   F)r^   r_   groupsr`   r   r1   )r3   r4   dimfocal_windowsfocal_windowfocal_levelsfocal_levelfocal_factor use_post_layernorm_in_modulationnormalize_modulatorr   Linearprojection_inrf   projection_contextGELU
activationprojection_outr?   projection_dropout
ModuleListfocal_layerskernel_sizesrangeappend
Sequentialr<   r=   	layernorm)	rB   r*   indexr   r   r   r   kr^   rD   r   r   r4     s8   
 

zFocalNetModulation.__init__c                 C   s$  |j d }| |dddd }t|||| jd fd\}}}d}t| jD ]}| j| |}|||dd||d f   }q)| 	|j
dddj
ddd}	||	|dd| jdf   }| jrk|| jd  }| |}
||
 }|dddd }| jr| |}| |}| |}|S )	z
        Args:
            hidden_state:
                Input features with shape of (batch_size, height, width, num_channels)
        rI   r   r   r   r\   NT)keepdim)ro   r   permute
contiguousr   splitr   r   r   r   meanr   r   r   r   r   r   )rB   hidden_stater-   xqctxgatesctx_alllevel
ctx_global	modulatorx_outr   r   r   rV   ;  s&   
 "



zFocalNetModulation.forward)r\   Trt   r   r   r   r4   rV   rZ   r   r   rD   r   r     s    !r   c                       s&   e Zd Zd fdd	Zdd Z  ZS )FocalNetMlpNrt   c                    sR   t    |p|}|p|}t||| _t|j | _t||| _t	|| _
d S rW   )r3   r4   r   r   fc1r   
hidden_actr   fc2r?   drop)rB   r*   in_featureshidden_featuresout_featuresr   rD   r   r   r4   a  s   
zFocalNetMlp.__init__c                 C   s6   |  |}| |}| |}| |}| |}|S rW   )r   r   r   r   )rB   r   r   r   r   rV   j  s   




zFocalNetMlp.forward)NNrt   r   r   r   rD   r   r   `  s    	r   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )FocalNetLayera  Focal Modulation Network layer (block).

    Args:
        config (`FocalNetConfig`):
            Model config.
        index (`int`):
            Layer index.
        dim (`int`):
            Number of input channels.
        input_resolution (`tuple[int]`):
            Input resolution.
        drop_path (`float`, *optional*, defaults to 0.0):
            Stochastic depth rate.
    rt   c                    s   t    || _|| _|| _|j| _|j| _tj	||j
d| _t|||| jd| _|dkr1t|nt | _tj	||j
d| _t||j }t|||| jd| _d| _d| _|jrwtj|jt| dd| _tj|jt| dd| _d S d S )Nr1   )r*   r   r   r   rt   )r*   r   r   r   rJ   T)requires_grad)r3   r4   r*   r   input_resolutionr@   r   use_post_layernormr   r<   r=   norm1r   
modulationr   Identityr   norm2rs   	mlp_ratior   mlpgamma_1gamma_2use_layerscaler9   layerscale_valuer   ones)rB   r*   r   r   r   r   mlp_hidden_dimrD   r   r   r4     s.   
 zFocalNetLayer.__init__c           	   	   C   s   |\}}|j \}}}|}| jr|n| |}|||||}| |||| |}| js/|n| |}|| | j|  }|| | j| jrN| | 	|n| 	| |  }|S rW   )
ro   r   r   viewr   r   r   r   r   r   )	rB   r   input_dimensionsrk   rl   rQ   rS   r-   shortcutr   r   r   rV     s   $zFocalNetLayer.forward)rt   )r   r   r   r   r4   rV   rZ   r   r   rD   r   r   s  s     r   c                       sB   e Zd Z fddZdejdeeef deej fddZ  Z	S )FocalNetStagec              
      s"  t     | _t j| _ fddt| jD }| | jd k r+|d  nd }| jd k r6tnd }dd tj	d j
t jddD }|t jd  t jd d   t fddt j D | _|d ur| d	|d
 jdd| _nd | _d| _d S )Nc                    s   g | ]	} j d |  qS )r\   )r.   .0i)r*   r   r   
<listcomp>  s    z*FocalNetStage.__init__.<locals>.<listcomp>r   c                 S   s   g | ]}|  qS r   )item)r   r   r   r   r   r     s    r   cpu)ry   c              
      s0   g | ]}t  ttr| nd qS ))r*   r   r   r   r   )r   ra   listr   r*   r   r   r   r   r   r   r     s    r\   TF)r*   r+   r,   r-   r.   rh   r/   r0   )r3   r4   r*   lendepths
num_stagesr   r5   r   linspacedrop_path_ratesumr   r   layersr/   
downsamplepointing)rB   r*   r   r   r.   out_dimr   dprrD   r   r   r4     s6   
$,

zFocalNetStage.__init__r   r   rH   c           	      C   s|   |\}}| j D ]}|||}q|}| jd ur1|\}}|dd|jd d||}| |\}}n||||f}|||f}|S )Nr   r\   r   rI   )r   r   rr   reshapero   )	rB   r   r   rk   rl   layer_module!hidden_states_before_downsamplingrP   stage_outputsr   r   r   rV     s   


zFocalNetStage.forward)
r   r   r   r4   r   rY   r   rs   rV   rZ   r   r   rD   r   r     s    .,r   c                       sd   e Zd Z fddZ			ddejdeeef dee	 dee	 d	ee	 d
e
eef fddZ  ZS )FocalNetEncoderc                    sH   t    t j| _ | _t fddt| jD | _	d| _
d S )Nc              	      s6   g | ]}t  |d  d|  d d|  fdqS )r   r\   r   )r*   r   r   )r   )r   i_layerr*   r7   r   r   r     s    z,FocalNetEncoder.__init__.<locals>.<listcomp>F)r3   r4   r   r   r   r*   r   r   r   stagesgradient_checkpointing)rB   r*   r7   rD   r   r   r4     s   

zFocalNetEncoder.__init__FTr   r   output_hidden_states(output_hidden_states_before_downsamplingreturn_dictrH   c                 C   sz  |rdnd }|r
dnd }|r1|j \}}	}
|j|g||
R  }|dddd}||f7 }||f7 }t| jD ]r\}}|||}|d }|d }|d }|d |d f}|r|r|j \}}	}
|j|g|d |d f|
R  }|dddd}||f7 }||f7 }q6|r|s|j \}}	}
|j|g||
R  }|dddd}||f7 }||f7 }q6|stdd	 ||fD S t|||d
S )Nr   r   r   r   r\   rI   c                 s   s    | ]	}|d ur|V  qd S rW   r   )r   vr   r   r   	<genexpr><  s    z*FocalNetEncoder.forward.<locals>.<genexpr>)r   r   r   )ro   r   r   	enumerater   r   r   )rB   r   r   r   r   r   all_hidden_statesall_reshaped_hidden_statesrQ   rS   hidden_sizereshaped_hidden_stater   stage_moduler   r   rP   r   r   r   rV     sP   





zFocalNetEncoder.forward)FFT)r   r   r   r4   r   rY   r   rs   r   boolr   r   rV   rZ   r   r   rD   r   r     s$    

r   c                   @   s0   e Zd ZU eed< dZdZdZdgZdd Z	dS )	FocalNetPreTrainedModelr*   focalnetrF   Tr   c                 C   s   t |tjtjfr#|jjjd| jjd |j	dur!|j	j
  dS dS t |tjr8|j	j
  |jjd dS t |trL|jdurJ|jj
  dS dS t |tri| jjrk|jj| jj |jj| jj dS dS dS )zInitialize the weightsrt   )r   stdNrJ   )ra   r   r   rf   weightdatanormal_r*   initializer_ranger   zero_r<   fill_r)   r;   r   r   r   r   r   )rB   moduler   r   r   _init_weightsM  s$   



z%FocalNetPreTrainedModel._init_weightsN)
r   r   r   r   r   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr  r   r   r   r   r   E  s   
 r   c                       sn   e Zd Zd fdd	Zdd Ze				ddeej d	eej	 d
ee
 dee
 deeef f
ddZ  ZS )FocalNetModelTFc                    s   t  | || _t|j| _t|jd| jd   | _t	||d| _
t|| j
j| _tj| j|jd| _|r<tdnd| _|   dS )z
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        r\   r   )rC   r1   N)r3   r4   r*   r   r   r   rs   r.   num_featuresr)   rO   r   r8   encoderr   r<   r=   r   AdaptiveAvgPool1dpooler	post_init)rB   r*   add_pooling_layerrC   rD   r   r   r4   c  s   zFocalNetModel.__init__c                 C   s   | j jS rW   )rO   r6   r   r   r   r   get_input_embeddingsx  s   z"FocalNetModel.get_input_embeddingsNrF   rG   r   r   rH   c                 C   s   |dur|n| j j}|dur|n| j j}|du rtd| j||d\}}| j||||d}|d }| |}d}	| jdurM| |dd}	t	
|	d}	|s[||	f|dd  }
|
S t||	|j|jdS )	z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)rG   r   r   r   r   r\   )r   r"   r   r   )r*   r   use_return_dictrp   rO   r  r   r
  rr   r   rq   r!   r   r   )rB   rF   rG   r   r   embedding_outputr   encoder_outputssequence_outputpooled_outputr   r   r   r   rV   {  s6   

zFocalNetModel.forward)TFNNNN)r   r   r   r4   r  r   r   r   r   rX   r   r   r   r!   rV   rZ   r   r   rD   r   r  a  s&    
r  a  
    FocalNet Model with a decoder on top for masked image modeling.

    This follows the same implementation as in [SimMIM](https://huggingface.co/papers/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    c                       d   e Zd Z fddZe				ddeej deej dee	 dee	 de
eef f
d	d
Z  ZS )FocalNetForMaskedImageModelingc                    sz   t  | t|ddd| _t|j| _t|jd| jd   }t	
t	j||jd |j ddt	|j| _|   d S )NFT)r  rC   r\   r   )in_channelsout_channelsr^   )r3   r4   r  r   r   r   r   rs   r.   r   r   rf   encoder_strider-   PixelShuffledecoderr  )rB   r*   r  rD   r   r   r4     s   
z'FocalNetForMaskedImageModeling.__init__NrF   rG   r   r   rH   c                 C   s4  |dur|n| j j}| j||||d}|d }|dd}|j\}}}	t|	d  }
}||||
|}| |}d}|durz| j j	| j j
 }|d||}|| j j
d| j j
dd }tjj||dd	}||  | d
  | j j }|s|f|dd  }|dur|f| S |S t|||j|jdS )a?  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, FocalNetConfig, FocalNetForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-base-simmim-window6-192")
        >>> config = FocalNetConfig()
        >>> model = FocalNetForMaskedImageModeling(config)

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.logits
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 192, 192]
        ```N)rG   r   r   r   r   r\   g      ?rI   none)	reductiongh㈵>)r$   r%   r   r   )r*   r  r   rr   ro   mathfloorr   r  r+   r,   repeat_interleaverM   r   r   ri   l1_lossr   r-   r#   r   r   )rB   rF   rG   r   r   outputsr  rQ   r-   sequence_lengthrk   rl   reconstructed_pixel_valuesmasked_im_lossrK   rU   reconstruction_lossr   r   r   r   rV     sB   $
 z&FocalNetForMaskedImageModeling.forwardr  )r   r   r   r4   r   r   r   r   rX   r   r   r   r#   rV   rZ   r   r   rD   r   r    s$    
r  z
    FocalNet Model with an image classification head on top (a linear layer on top of the pooled output) e.g. for
    ImageNet.
    c                       r  )FocalNetForImageClassificationc                    sP   t  | |j| _t|| _|jdkrt| jj|jnt | _	| 
  d S )Nr   )r3   r4   
num_labelsr  r   r   r   r  r   
classifierr  rB   r*   rD   r   r   r4   %  s   
"z'FocalNetForImageClassification.__init__NrF   labelsr   r   rH   c           
      C   s   |dur|n| j j}| j|||d}|d }| |}d}|dur)| ||| j }|s?|f|dd  }	|dur=|f|	 S |	S t|||j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   r\   )r$   r(   r   r   )r*   r  r   r)  loss_functionr'   r   r   )
rB   rF   r+  r   r   r"  r  r(   r$   r   r   r   r   rV   3  s(   
z&FocalNetForImageClassification.forwardr  )r   r   r   r4   r   r   r   r   
LongTensorr   r   r   r'   rV   rZ   r   r   rD   r   r'    s$    
r'  zG
    FocalNet backbone, to be used with frameworks like X-Decoder.
    c                
       sT   e Zd ZdZdef fddZe		ddejde	e
 de	e
 d	efd
dZ  ZS )FocalNetBackboneFr*   c                    s>   t  | t  | |jg|j | _t|| _|   d S rW   )	r3   r4   _init_backboner.   hidden_sizesr  r  r   r  r*  rD   r   r   r4   e  s
   
zFocalNetBackbone.__init__NrF   r   r   rH   c           
      C   s   |dur|n| j j}|dur|n| j j}| j|ddd}|j}d}t| jD ]\}}|| jv r6||| f7 }q&|sF|f}	|rD|	|jf7 }	|	S t	||rP|jddS dddS )aj  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-tiny-lrf")
        >>> model = AutoBackbone.from_pretrained("microsoft/focalnet-tiny-lrf")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```NTr  r   )feature_mapsr   
attentions)
r*   r  r   r   r   r   stage_namesr   r   r	   )
rB   rF   r   r   r"  r   r1  idxstager   r   r   r   rV   o  s.   
zFocalNetBackbone.forward)NN)r   r   r   has_attentionsr   r4   r   r   rY   r   r   r	   rV   rZ   r   r   rD   r   r.  ]  s    
r.  )r'  r  r.  r  r   )rt   F)5r   collections.abcrb   r  dataclassesr   typingr   r   r   r   activationsr   modeling_layersr   modeling_outputsr	   modeling_utilsr
   utilsr   r   r   utils.backbone_utilsr   configuration_focalnetr   
get_loggerr   loggerr   r!   r#   r'   Moduler)   r5   rY   r   r   r   r   r   r   r   r   r   r   r  r  r'  r.  __all__r   r   r   r   <module>   sz   
( HGEBKKb:A