o
    eik/                     @   s  d Z ddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlmZmZmZ ddlmZmZmZmZmZmZm Z m!Z! G dd deZ"G dd deZ#G dd deZ$G dd deZ%G dd deZ&G dd de Z'G dd deZ(G dd deZ)G dd  d eZ*eG d!d" d"eZ+ed#d$G d%d& d&e!Z,G d'd( d(eZ-g d)Z.dS )*zPyTorch SAM 2 model.    N   )initialization)PreTrainedConfig)PreTrainedModel)Unpack)auto_docstring)TransformersKwargsmerge_with_config_defaults)capture_outputs   )CONFIG_MAPPING
AutoConfig)
Sam2ConfigSam2MaskDecoderConfigSam2PromptEncoderConfig)Sam2AttentionSam2FeedForwardSam2LayerNorm	Sam2ModelSam2PreTrainedModelSam2TwoWayAttentionBlockSam2VisionEncoderOutputSam2VisionModelc                       sJ   e Zd ZdZdZdZdeiZ											
		d fdd	Z  Z	S )EdgeTamVisionConfiga	  
    This is the configuration class to store the configuration of a [`EdgeTamVisionModel`]. It is used to instantiate a SAM
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    defaults will yield a similar configuration to that of SAM 2.1 Hiera-tiny
    [facebook/EdgeTAM](https://huggingface.co/facebook/EdgeTAM) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `timm/repvit_m1.dist_in1k`):
            Configuration for the vision backbone. This is used to instantiate the backbone using
            `AutoModel.from_config`.
        backbone_channel_list (`List[int]`, *optional*, defaults to `[384, 192, 96, 48]`):
            The list of channel dimensions for the backbone.
        backbone_feature_sizes (`List[List[int]]`, *optional*, defaults to `[[256, 256], [128, 128], [64, 64]]`):
            The spatial sizes of the feature maps from the backbone.
        fpn_hidden_size (`int`, *optional*, defaults to 256):
            The hidden dimension of the FPN.
        fpn_kernel_size (`int`, *optional*, defaults to 1):
            The kernel size for the convolutions in the neck.
        fpn_stride (`int`, *optional*, defaults to 1):
            The stride for the convolutions in the neck.
        fpn_padding (`int`, *optional*, defaults to 0):
            The padding for the convolutions in the neck.
        fpn_top_down_levels (`List[int]`, *optional*, defaults to `[2, 3]`):
            The levels for the top-down FPN connections.
        num_feature_levels (`int`, *optional*, defaults to 3):
            The number of feature levels from the FPN to use.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the neck.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon for the layer normalization.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

    vision_configedgetam_vision_modelbackbone_configN      r   r   geluư>{Gz?c                    s   |d u rg dn|}|d u rddgddgddggn|}|d u r#ddgn|}t |tr>|dd|d< t|d  di |}n|d u rOtjd	dd
g ddd}|| _|| _|| _|| _	|| _
|| _|| _|| _|	| _|
| _|| _|| _t jdi | d S )N)i     `   0   r      @   r   r   
model_typetimm_wrapperztimm/repvit_m1.dist_in1kT)r   r   r   r   )in_chansfeatures_onlyout_indices)
model_args )
isinstancedictgetr   r   from_pretrainedr   backbone_channel_listbackbone_feature_sizesfpn_hidden_sizefpn_kernel_size
fpn_stridefpn_paddingfpn_top_down_levelsnum_feature_levels
hidden_actlayer_norm_epsinitializer_rangesuper__init__)selfr   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   kwargs	__class__r-   i/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/edgetam/modular_edgetam.pyr>   V   s2    
zEdgeTamVisionConfig.__init__)NNNr   r   r   r   Nr   r   r    r!   )
__name__
__module____qualname____doc__base_config_keyr'   r   sub_configsr>   __classcell__r-   r-   rA   rC   r   )   s&    &r   c                   @      e Zd ZdS )EdgeTamPromptEncoderConfigNrD   rE   rF   r-   r-   r-   rC   rL          rL   c                   @   rK   )EdgeTamMaskDecoderConfigNrM   r-   r-   r-   rC   rO      rN   rO   c                   @   s   e Zd ZdZdS )EdgeTamConfiga
  
    [`EdgeTamConfig`] is the configuration class to store the configuration of a [`EdgeTamModel`]. It is used to instantiate a
    EDGETAM model according to the specified arguments, defining the memory attention, memory encoder, and image encoder
    configs. Instantiating a configuration defaults will yield a similar configuration to that of the SAM 2.1 Hiera-tiny
    [facebook/edgetam.1-hiera-tiny](https://huggingface.co/facebook/edgetam.1-hiera-tiny) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    <Tip>

    EdgeTAM checkpoints with `model_type="edgetam_video"` are compatible with `EdgeTamModel` since the video variant
    weights are a superset of the image-only model weights. You may see a warning about model type mismatch when
    loading such checkpoints, which can be safely ignored in this case.

    </Tip>

    Args:
        vision_config (Union[`dict`, `EdgeTamVisionConfig`], *optional*):
            Dictionary of configuration options used to initialize [`EdgeTamVisionConfig`].
        prompt_encoder_config (Union[`dict`, `EdgeTamPromptEncoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`EdgeTamPromptEncoderConfig`].
        mask_decoder_config (Union[`dict`, `EdgeTamMaskDecoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`EdgeTamMaskDecoderConfig`].
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation for parameter initialization.

    Example:

    ```python
    >>> from transformers import (
    ...     EdgeTamVisionConfig,
    ...     EdgeTamPromptEncoderConfig,
    ...     EdgeTamMaskDecoderConfig,
    ...     EdgeTamModel,
    ... )

    >>> # Initializing a EdgeTamConfig with `"facebook/edgetam.1_hiera_tiny"` style configuration
    >>> configuration = EdgeTamConfig()

    >>> # Initializing a EdgeTamModel (with random weights) from the `"facebook/edgetam.1_hiera_tiny"` style configuration
    >>> model = EdgeTamModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a EdgeTamConfig from a EdgeTamVisionConfig, EdgeTamPromptEncoderConfig, and EdgeTamMaskDecoderConfig
    >>> # Initializing EDGETAM vision encoder, memory attention, and memory encoder configurations
    >>> vision_config = EdgeTamVisionConfig()
    >>> prompt_encoder_config = EdgeTamPromptEncoderConfig()
    >>> mask_decoder_config = EdgeTamMaskDecoderConfig()

    >>> config = EdgeTamConfig(vision_config, prompt_encoder_config, mask_decoder_config)
    ```
    N)rD   rE   rF   rG   r-   r-   r-   rC   rP      s    8rP   c                   @   rK   )EdgeTamLayerNormNrM   r-   r-   r-   rC   rQ      rN   rQ   c                   @   rK   )EdgeTamVisionEncoderOutputNrM   r-   r-   r-   rC   rR      rN   rR   c                   @   rK   )EdgeTamAttentionNrM   r-   r-   r-   rC   rS      rN   rS   c                   @   rK   )EdgeTamTwoWayAttentionBlockNrM   r-   r-   r-   rC   rT      rN   rT   c                   @   rK   )EdgeTamFeedForwardNrM   r-   r-   r-   rC   rU      rN   rU   c                   @   s    e Zd ZdZe dd ZdS )EdgeTamPreTrainedModelNc                 C   sX   t | | t|tr|jd urt|j d S d S t|dr*tj|j	|j
d d S d S )Npositional_embedding)std)r   _init_weightsr.   EdgeTamModelno_memory_embeddinginitzeros_hasattrnormal_rW   scale)r?   moduler-   r-   rC   rY      s   


z$EdgeTamPreTrainedModel._init_weights)rD   rE   rF   "_keys_to_ignore_on_load_unexpectedtorchno_gradrY   r-   r-   r-   rC   rV      s    rV   zN
    The vision model from EdgeTAM without any head or projection on top.
    )custom_introc                
   @   sP   e Zd ZeZdZi Zdd Zee		d	de
jdB dee deeB fddZdS )
EdgeTamVisionModelpixel_valuesc                 C      t dNz2Can't get input embeddings from timm wrapper modelNotImplementedErrorr?   r-   r-   rC   get_input_embeddings      z'EdgeTamVisionModel.get_input_embeddingsNr@   returnc                 K   s   |d u rt d| j|fi |}|j}dd |D }| |\}}|| j d  d d d }|| j d  d d d }t|d |||jdS )Nz You have to specify pixel_valuesc                 S   s   g | ]
}| d dddqS )r   r   r   r   )permute).0hidden_stater-   r-   rC   
<listcomp>	  s    z.EdgeTamVisionModel.forward.<locals>.<listcomp>)last_hidden_statefpn_hidden_statesfpn_position_encodinghidden_states)
ValueErrorbackboneru   neckr9   rR   rx   )r?   rg   r@   backbone_outputintermediate_hidden_statesrv   rw   r-   r-   rC   forward   s   zEdgeTamVisionModel.forward)N)rD   rE   rF   r   config_classmain_input_name_can_record_outputsrm   r	   r
   rc   FloatTensorr   r   tuplerR   r~   r-   r-   r-   rC   rf      s    rf   c                   @   s   e Zd Zg dZdd ZdS )rZ   )z
^memory_.*z^mask_downsample.*zspatial_perceiver.*z^object_pointer_proj.*z0^temporal_positional_encoding_projection_layer.*no_memory_positional_encodingno_object_pointer%occlusion_spatial_embedding_parameterc                 C   rh   ri   rj   rl   r-   r-   rC   rm   $  rn   z!EdgeTamModel.get_input_embeddingsN)rD   rE   rF   rb   rm   r-   r-   r-   rC   rZ     s    rZ   )rZ   rf   rV   rP   r   rL   rO   )/rG   rc    r   r\   configuration_utilsr   modeling_utilsr   processing_utilsr   utilsr   utils.genericr   r	   utils.output_capturingr
   autor   r   sam2.configuration_sam2r   r   r   sam2.modeling_sam2r   r   r   r   r   r   r   r   r   rL   rO   rP   rQ   rR   rS   rT   rU   rV   rf   rZ   __all__r-   r-   r-   rC   <module>   s:   (^<&