o
    i%                     @   s  d Z ddlmZmZ ddlZddlmZ ddlZddlm	Z	m
Z
mZ ddlmZmZmZmZmZmZmZmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZm Z  	 ddl!m"Z" G dd deZ#G dd deZ$G dd de
Z%G dd de	Z&G dd deZ'G dd deZ(G dd deZ)G dd deZ*G dd  d eZ+eG d!d" d"eZ,ed#d$G d%d& d&eZ-G d'd( d(eZ.g d)Z/dS )*zPyTorch SAM 2 model.    )OptionalUnionN)
Sam2ConfigSam2MaskDecoderConfigSam2PromptEncoderConfig)Sam2AttentionSam2FeedForwardSam2LayerNorm	Sam2ModelSam2PreTrainedModelSam2TwoWayAttentionBlockSam2VisionEncoderOutputSam2VisionModel)TransformersKwargscheck_model_inputs   )PretrainedConfig)Unpack)auto_docstring   )CONFIG_MAPPING
AutoConfigT)TimmWrapperModelc                       sJ   e Zd ZdZdZdZdeiZ											
		d fdd	Z  Z	S )EdgeTamVisionConfiga  
    This is the configuration class to store the configuration of a [`EdgeTamVisionModel`]. It is used to instantiate a SAM
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    defaults will yield a similar configuration to that of SAM 2.1 Hiera-tiny
    [facebook/EdgeTAM](https://huggingface.co/facebook/EdgeTAM) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        backbone_config (`Union[dict, "PretrainedConfig"]`, *optional*):
            Configuration for the vision backbone. This is used to instantiate the backbone using
            `AutoModel.from_config`.
        backbone_channel_list (`List[int]`, *optional*, defaults to `[384, 192, 96, 48]`):
            The list of channel dimensions for the backbone.
        backbone_feature_sizes (`List[List[int]]`, *optional*, defaults to `[[256, 256], [128, 128], [64, 64]]`):
            The spatial sizes of the feature maps from the backbone.
        fpn_hidden_size (`int`, *optional*, defaults to 256):
            The hidden dimension of the FPN.
        fpn_kernel_size (`int`, *optional*, defaults to 1):
            The kernel size for the convolutions in the neck.
        fpn_stride (`int`, *optional*, defaults to 1):
            The stride for the convolutions in the neck.
        fpn_padding (`int`, *optional*, defaults to 0):
            The padding for the convolutions in the neck.
        fpn_top_down_levels (`List[int]`, *optional*, defaults to `[2, 3]`):
            The levels for the top-down FPN connections.
        num_feature_levels (`int`, *optional*, defaults to 3):
            The number of feature levels from the FPN to use.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the neck.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon for the layer normalization.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

    vision_configedgetam_vision_modelbackbone_configN      r   r   geluư>{Gz?c                    s  t  jdi | |d u rg dn|}|d u r"ddgddgddggn|}|d u r,ddgn|}t|trG|dd|d< t|d  di |}nt|trO|}n|d u r`tjd	dd
g ddd}|| _|| _	|| _
|| _|| _|| _|| _|| _|	| _|
| _|| _|| _d S )N)i     `   0   r      @   r   r   
model_typetimm_wrapperztimm/repvit_m1.dist_in1kT)r   r   r   r   )in_chansfeatures_onlyout_indices)
model_args )super__init__
isinstancedictgetr   r   from_pretrainedr   backbone_channel_listbackbone_feature_sizesfpn_hidden_sizefpn_kernel_size
fpn_stridefpn_paddingfpn_top_down_levelsnum_feature_levels
hidden_actlayer_norm_epsinitializer_range)selfr   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   kwargs	__class__r-   _/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/edgetam/modular_edgetam.pyr/   ^   s6    


zEdgeTamVisionConfig.__init__)NNNr   r   r   r   Nr   r   r    r!   )
__name__
__module____qualname____doc__base_config_keyr'   r   sub_configsr/   __classcell__r-   r-   rA   rC   r   1   s&    &r   c                   @      e Zd ZdS )EdgeTamPromptEncoderConfigNrD   rE   rF   r-   r-   r-   rC   rL          rL   c                   @   rK   )EdgeTamMaskDecoderConfigNrM   r-   r-   r-   rC   rO      rN   rO   c                   @   rK   )EdgeTamConfigNrM   r-   r-   r-   rC   rP      rN   rP   c                   @   rK   )EdgeTamLayerNormNrM   r-   r-   r-   rC   rQ      rN   rQ   c                   @   rK   )EdgeTamVisionEncoderOutputNrM   r-   r-   r-   rC   rR      rN   rR   c                   @   rK   )EdgeTamAttentionNrM   r-   r-   r-   rC   rS      rN   rS   c                   @   rK   )EdgeTamTwoWayAttentionBlockNrM   r-   r-   r-   rC   rT      rN   rT   c                   @   rK   )EdgeTamFeedForwardNrM   r-   r-   r-   rC   rU      rN   rU   c                   @   s   e Zd Zdd ZdS )EdgeTamPreTrainedModelc                 C   s   | j j}t|tjtjtjfr$|jjj	d|d |j
d ur#|j
j  n3t|tjrB|jjj	d|d |jd urA|jj|j   nt|tjtfrW|jjd |j
j  t|tri|jd urk|jj  d S d S d S )Ng        )meanstdg      ?)configr>   r0   nnLinearConv2dConvTranspose2dweightdatanormal_biaszero_	Embeddingpadding_idx	LayerNormrQ   fill_EdgeTamModelno_memory_embedding)r?   modulerX   r-   r-   rC   _init_weights   s&   



z$EdgeTamPreTrainedModel._init_weightsN)rD   rE   rF   rj   r-   r-   r-   rC   rV      s    rV   zN
    The vision model from EdgeTAM without any head or projection on top.
    )custom_introc                
   @   sV   e Zd ZeZdZeedZdd Ze		d
de
ej dee deeef fdd	ZdS )EdgeTamVisionModelpixel_values)hidden_states
attentionsc                 C      t dNz2Can't get input embeddings from timm wrapper modelNotImplementedErrorr?   r-   r-   rC   get_input_embeddings      z'EdgeTamVisionModel.get_input_embeddingsNr@   returnc                 K   s   |d u rt d| |}|j}dd |D }| |\}}|| j d  d d d }|| j d  d d d }t|d ||dS )Nz You have to specify pixel_valuesc                 S   s   g | ]
}| d dddqS )r   r   r   r   )permute).0hidden_stater-   r-   rC   
<listcomp>   s    z.EdgeTamVisionModel.forward.<locals>.<listcomp>)last_hidden_statefpn_hidden_statesfpn_position_encoding)
ValueErrorbackboner}   neckr;   rR   )r?   rm   r@   backbone_outputintermediate_hidden_statesr~   r   r-   r-   rC   forward   s   
zEdgeTamVisionModel.forward)N)rD   rE   rF   r   config_classmain_input_namer   _can_record_outputsru   r   r   torchFloatTensorr   r   r   tuplerR   r   r-   r-   r-   rC   rl      s    

rl   c                   @   s   e Zd Zg dZdd ZdS )rg   )z
^memory_.*z^mask_downsample.*zspatial_perceiver.*z^object_pointer_proj.*z0^temporal_positional_encoding_projection_layer.*no_memory_positional_encodingno_object_pointer%occlusion_spatial_embedding_parameterc                 C   rp   rq   rr   rt   r-   r-   rC   ru      rv   z!EdgeTamModel.get_input_embeddingsN)rD   rE   rF   "_keys_to_ignore_on_load_unexpectedru   r-   r-   r-   rC   rg      s    rg   )rg   rl   rV   rP   r   rL   rO   )0rG   typingr   r   r   torch.nnrZ   torch.utils.checkpoint+transformers.models.sam2.configuration_sam2r   r   r   &transformers.models.sam2.modeling_sam2r   r   r	   r
   r   r   r   r   transformers.utils.genericr   r   configuration_utilsr   processing_utilsr   utilsr   autor   r   6transformers.models.timm_wrapper.modeling_timm_wrapperr   r   rL   rO   rP   rQ   rR   rS   rT   rU   rV   rl   rg   __all__r-   r-   r-   rC   <module>   s>   (
a"