o
    eiS                     @   s   d Z ddlmZ ddlmZ ddlmZmZ G dd deZG d	d
 d
eZ	G dd deZ
G dd deZG dd deZG dd deZG dd deZg dZdS )zSAM3 model configuration    )CLIPTextConfig   )PreTrainedConfig   )CONFIG_MAPPING
AutoConfigc                       sL   e Zd ZdZdZdZ								
										d fdd	Z  ZS )Sam3ViTConfiga  
    Configuration class for SAM3 Vision Encoder (ViT backbone).

    Instantiating a configuration defaults will yield a similar configuration to that of SAM 3
    [facebook/sam3](https://huggingface.co/facebook/sam3) architecture.

    Args:
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the encoder layers.
        intermediate_size (`int`, *optional*, defaults to 4736):
            Dimensionality of the feedforward (MLP) layers.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer.
        num_channels (`int`, *optional*, defaults to 3):
            Number of input image channels.
        image_size (`int`, *optional*, defaults to 1008):
            Expected input image size.
        patch_size (`int`, *optional*, defaults to 14):
            Size of image patches.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for attention probabilities.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            Base frequency for RoPE.
        window_size (`int`, *optional*, defaults to 24):
            Window size for windowed attention.
        global_attn_indexes (`list[int]`, *optional*, defaults to `[7, 15, 23, 31]`):
            Indexes of layers with global attention.
        layer_scale_init_value (`float`, *optional*):
            Initial value for layer scale. None means no layer scale.
        pretrain_image_size (`int`, *optional*, defaults to 336):
            Pretrained model image size for position embedding initialization.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for hidden states.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing weight matrices.
    backbone_configsam3_vit_model            r        geluư>             @   NP  {Gz?c                    s   t  jdi | |d u rg d}|| _|| _|| _|| _|| _|| _|| _|| _	|	| _
|
| _|| _|| _|| _|| _|| _|| _|| _d S )N)             )super__init__hidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_channels
image_size
patch_size
hidden_actlayer_norm_epsattention_dropout
rope_thetawindow_sizeglobal_attn_indexeslayer_scale_init_valuepretrain_image_sizehidden_dropoutinitializer_range)selfr   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   kwargs	__class__r   i/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/sam3/configuration_sam3.pyr   E   s(   
zSam3ViTConfig.__init__)r   r   r   r   r   r   r   r   r   r   r   r   NNr   r   r   )__name__
__module____qualname____doc__base_config_key
model_typer   __classcell__r   r   r2   r4   r      s,    +r   c                       sZ   e Zd ZdZdZdZdeiZ								d fd
d	Ze	dd Z
e
jdd Z
  ZS )Sam3VisionConfiga  
    This is the configuration class to store the configuration of a [`Sam3VisionModel`]. It is used to instantiate a SAM
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    defaults will yield a similar configuration to that of SAM 3
    [facebook/sam3](https://huggingface.co/facebook/sam3) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `Sam3ViTConfig()`):
            Configuration for the vision backbone. This is used to instantiate the backbone using
            `AutoModel.from_config`.
        fpn_hidden_size (`int`, *optional*, defaults to 256):
            The hidden dimension of the FPN.
        backbone_feature_sizes (`List[List[int]]`, *optional*, defaults to `[[288, 288], [144, 144], [72, 72]]`):
            The spatial sizes (height, width) of the feature maps from the backbone at different scales.
        scale_factors (`list[float]`, *optional*, defaults to `[4.0, 2.0, 1.0, 0.5]`):
            Scale factors for FPN multi-scale features. List of scaling factors for each FPN level.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the neck.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon for the layer normalization.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

    vision_configsam3_vision_modelr	   N   r   r   r   c           	         s   |d u rg dn|}|d u rddgddgddgg}t |tr2|dd|d< t|d  di |}n	|d u r;td  }|| _|| _|| _|| _|| _|| _	|| _
t jdi | d S )N)g      @g       @g      ?g      ?i      H   r:   r
   r   )
isinstancedictgetr   r	   fpn_hidden_sizescale_factorsbackbone_feature_sizesr&   r'   r/   r   r   )	r0   r	   rE   rG   rF   r&   r'   r/   r1   r2   r   r4   r      s    

zSam3VisionConfig.__init__c                 C      | j jS )z"Image size for the vision encoder.r	   r$   r0   r   r   r4   r$         zSam3VisionConfig.image_sizec                 C      || j _dS )z-Set the image size and propagate to backbone.NrI   r0   valuer   r   r4   r$         )Nr?   NNr   r   r   )r5   r6   r7   r8   r9   r:   r   sub_configsr   propertyr$   setterr;   r   r   r2   r4   r<   q   s$    !
r<   c                       :   e Zd ZdZdZ									
		d fdd	Z  ZS )Sam3GeometryEncoderConfiga  
    Configuration class for SAM3 Geometry Encoder.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the encoder layers.
        num_layers (`int`, *optional*, defaults to 3):
            Number of transformer encoder layers for processing geometry prompts.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads in the geometry encoder.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the feedforward layers.
        dropout (`float`, *optional*, defaults to 0.1):
            Dropout probability.
        hidden_act (`str`, *optional*, defaults to `"relu"`):
            Activation function in FFN.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for hidden states.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            Epsilon for layer normalization.
        roi_size (`int`, *optional*, defaults to 7):
            ROI size for box pooling operations.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing weight matrices.
    sam3_geometry_encoderr?   r         皙?relur   r   r   r   c                    R   t  jdi | || _|| _|| _|| _|| _|| _|| _|| _	|	| _
|
| _d S Nr   )r   r   r   
num_layersr"   r    dropoutr&   r.   r'   roi_sizer/   )r0   r   r\   r"   r    r]   r&   r.   r'   r^   r/   r1   r2   r   r4   r         
z"Sam3GeometryEncoderConfig.__init__)
r?   r   rV   rW   rX   rY   r   r   r   r   r5   r6   r7   r8   r:   r   r;   r   r   r2   r4   rT          rT   c                       s8   e Zd ZdZdZ									
	d fdd	Z  ZS )Sam3DETREncoderConfigaN  
    Configuration class for SAM3 DETR Encoder (vision-text fusion encoder).

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the encoder layers.
        num_layers (`int`, *optional*, defaults to 6):
            Number of encoder layers.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the feedforward layers.
        dropout (`float`, *optional*, defaults to 0.1):
            Dropout probability.
        hidden_act (`str`, *optional*, defaults to `"relu"`):
            Activation function in FFN.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for hidden states.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            Epsilon for layer normalization.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing weight matrices.
    sam3_detr_encoderr?      rV   rW   rX   rY   r   r   r   c
                    sL   t  jdi |
 || _|| _|| _|| _|| _|| _|| _|| _	|	| _
d S r[   )r   r   r   r\   r"   r    r]   r&   r.   r'   r/   )r0   r   r\   r"   r    r]   r&   r.   r'   r/   r1   r2   r   r4   r     s   
zSam3DETREncoderConfig.__init__)	r?   rd   rV   rW   rX   rY   r   r   r   r`   r   r   r2   r4   rb      s    rb   c                       rS   )Sam3DETRDecoderConfiga  
    Configuration class for SAM3 DETR Decoder (object query decoder).

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the decoder layers.
        num_layers (`int`, *optional*, defaults to 6):
            Number of decoder layers.
        num_queries (`int`, *optional*, defaults to 200):
            Number of object queries.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the feedforward layers.
        dropout (`float`, *optional*, defaults to 0.1):
            Dropout probability.
        hidden_act (`str`, *optional*, defaults to `"relu"`):
            Activation function in FFN.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for hidden states.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            Epsilon for layer normalization.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing weight matrices.
    sam3_detr_decoderr?   rd      rV   rW   rX   rY   r   r   r   c                    rZ   r[   )r   r   r   r\   num_queriesr"   r    r]   r&   r.   r'   r/   )r0   r   r\   rh   r"   r    r]   r&   r.   r'   r/   r1   r2   r   r4   r   I  r_   zSam3DETRDecoderConfig.__init__)
r?   rd   rg   rV   rW   rX   rY   r   r   r   r`   r   r   r2   r4   re   ,  ra   re   c                       s2   e Zd ZdZdZ						d fd	d
	Z  ZS )Sam3MaskDecoderConfiga]  
    Configuration class for SAM3 Mask Decoder (pixel-level mask prediction).

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the mask decoder.
        num_upsampling_stages (`int`, *optional*, defaults to 3):
            Number of upsampling stages in the pixel decoder (FPN).
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            Epsilon for layer normalization.
        dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for prompt cross-attention.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for prompt cross-attention.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing weight matrices.
    sam3_mask_decoderr?   r   r   r   rV   r   c                    s:   t  jdi | || _|| _|| _|| _|| _|| _d S r[   )r   r   r   num_upsampling_stagesr'   r]   r"   r/   )r0   r   rk   r'   r]   r"   r/   r1   r2   r   r4   r   y  s   

zSam3MaskDecoderConfig.__init__)r?   r   r   r   rV   r   r`   r   r   r2   r4   ri   d  s    ri   c                       sd   e Zd ZdZdZdZeeee	e
edZ							d fdd	Zed	d
 Zejdd
 Z  ZS )
Sam3Configa  
    Configuration class to store the configuration of a [`Sam3Model`].

    Instantiating a configuration defaults will yield a similar configuration to that of SAM 3
    [facebook/sam3](https://huggingface.co/facebook/sam3) architecture.

    This is the main configuration class that combines all sub-configurations for the SAM3 model.

    <Tip>

    SAM3 checkpoints with `model_type="sam3_video"` are compatible with `Sam3Model` since the video variant weights
    are a superset of the image-only model weights. You may see a warning about model type mismatch when loading
    such checkpoints, which can be safely ignored in this case.

    </Tip>

    Args:
        vision_config (`dict` or `Sam3VisionConfig`, *optional*):
            Configuration for the vision encoder.
        text_config (`dict` or `Sam3TextConfig`, *optional*):
            Configuration for the text encoder.
        geometry_encoder_config (`dict` or `Sam3GeometryEncoderConfig`, *optional*):
            Configuration for the geometry encoder.
        detr_encoder_config (`dict` or `Sam3DETREncoderConfig`, *optional*):
            Configuration for the DETR encoder.
        detr_decoder_config (`dict` or `Sam3DETRDecoderConfig`, *optional*):
            Configuration for the DETR decoder.
        mask_decoder_config (`dict` or `Sam3MaskDecoderConfig`, *optional*):
            Configuration for the mask decoder.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing weight matrices.

    Example:
    ```python
    >>> from transformers import Sam3Config, Sam3Model

    >>> # Initializing a SAM3 configuration
    >>> configuration = Sam3Config()

    >>> # Initializing a model from the configuration
    >>> model = Sam3Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    sam3T)r=   text_configgeometry_encoder_configdetr_encoder_configdetr_decoder_configmask_decoder_configNr   c           	   	      sB  |d u ri }t |trtd
i || _n|| _|d u r&ddddddddd	}t |tr4td
i || _n|| _|d u r=i }t |trKtd
i || _n|| _|d u rTi }t |trbtd
i || _	n|| _	|d u rki }t |tryt
d
i || _n|| _|d u ri }t |trtd
i || _n|| _|| _t jd
i | d S )Ni   r   i   i   r   r   r   r   )
vocab_sizer   r    projection_dimr!   r"   max_position_embeddingsr&   r   )rB   rC   r<   r=   r   rn   rT   ro   rb   rp   re   rq   ri   rr   r/   r   r   )	r0   r=   rn   ro   rp   rq   rr   r/   r1   r2   r   r4   r     sP   






zSam3Config.__init__c                 C   rH   )zImage size for the SAM3 model.r=   r$   rJ   r   r   r4   r$     rK   zSam3Config.image_sizec                 C   rL   )z2Set the image size and propagate to vision config.Nrv   rM   r   r   r4   r$     rO   )NNNNNNr   )r5   r6   r7   r8   r:   is_compositionr<   r   rT   rb   re   ri   rP   r   rQ   r$   rR   r;   r   r   r2   r4   rl     s.    /G
rl   )rl   r   r<   rT   rb   re   ri   N)r8   transformersr   configuration_utilsr   autor   r   r   r<   rT   rb   re   ri   rl   __all__r   r   r   r4   <module>   s   [O848( 