from typing import Callable, Optional, Union

import torch
import torch.nn as nn

from ...configuration_utils import PretrainedConfig
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, logging
from ..clip.modeling_clip import (
    CLIPMLP,
    CLIPAttention,
    CLIPEncoder,
    CLIPEncoderLayer,
    CLIPVisionEmbeddings,
    CLIPVisionModel,
    CLIPVisionTransformer,
)
from ..llama.modeling_llama import eager_attention_forward
from ..qwen2_vl.modeling_qwen2_vl import VisionRotaryEmbedding, apply_rotary_pos_emb_vision


logger = logging.get_logger(__name__)


class MLCDVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MLCDVisionModel`]. It is used to instantiate a MLCD
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the vision encoder of the MLCD
    [DeepGlint-AI/mlcd-vit-bigG-patch14-336](https://huggingface.co/DeepGlint-AI/mlcd-vit-bigG-patch14-336) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1664):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 8192):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 1024):
            Dimensionality of text and vision projection layers.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        image_size (`int`, *optional*, defaults to 336):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    Example:

    ```python
    >>> from transformers import MLCDVisionConfig, MLCDVisionModel

    >>> # Initializing a MLCDVisionConfig with DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
    >>> configuration = MLCDVisionConfig()

    >>> # Initializing a MLCDVisionModel (with random weights) from the DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
    >>> model = MLCDVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
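
    >>> # The defaults mirror the ViT-bigG/14 vision tower (quick sanity check)
    >>> configuration.hidden_size, configuration.num_hidden_layers
    (1664, 48)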
    ```"""

    model_type = "mlcd_vision_model"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_size=1664,
        intermediate_size=8192,
        num_hidden_layers=48,
        num_attention_heads=16,
        num_key_value_groups=1,
        num_channels=3,
        image_size=336,
        patch_size=14,
        hidden_act="gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_groups = num_key_value_groups
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act


class MLCDMLP(CLIPMLP):
    pass


class MLCDRotaryEmbedding(VisionRotaryEmbedding):
    def forward(self, num_patches_height: int, num_patches_width: int) -> torch.Tensor:
        """
        Calculate the Rotary Position Embedding (RoPE) for MLCDVisionModel based on the grid size.

        Args:
            num_patches_height (int): Number of patches in the height dimension.
            num_patches_width (int): Number of patches in the width dimension.

        Returns:
            torch.Tensor: Rotary positional embeddings for the given grid size.
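
        Note:
            The table has one row per patch (`num_patches_height * num_patches_width` rows in total),
            pairing the height and width frequencies of that patch. The class token is not part of this
            grid; `MLCDVisionTransformer` prepends a learnable `class_pos_emb` row before taking the
            cos/sin of the table.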
        """
        # Row (height) and column (width) index for every patch in the grid
        hpos_ids = (
            torch.arange(num_patches_height, device=self.inv_freq.device)
            .unsqueeze(1)
            .expand(-1, num_patches_width)
        )
        wpos_ids = (
            torch.arange(num_patches_width, device=self.inv_freq.device)
            .unsqueeze(0)
            .expand(num_patches_height, -1)
        )
        pos_ids = torch.stack([hpos_ids.flatten(), wpos_ids.flatten()], dim=-1)

        # Frequencies for every position up to the largest grid side, gathered per patch
        max_grid_size = max(num_patches_height, num_patches_width)
        seq = torch.arange(max_grid_size, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
        rotary_pos_emb_full = torch.outer(seq, self.inv_freq)
        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
        return rotary_pos_emb


class MLCDVisionEmbeddings(CLIPVisionEmbeddings):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        # Positions are encoded with rotary embeddings instead of a learned absolute table
        del self.position_embedding

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        batch_size = pixel_values.shape[0]
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        return embeddings


class MLCDAttention(CLIPAttention):
    """Multi-headed attention with RoPE. Refer to papers:
    - Attention is all you need:
        https://huggingface.co/papers/1706.03762
    - RoFormer: Enhanced Transformer with Rotary Position Embedding:
        https://huggingface.co/papers/2104.09864
    """

    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        self.num_key_value_groups = config.num_key_value_groups
        self.is_causal = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        batch_size, seq_length = hidden_states.shape[:-1]

        # Project to [batch_size, seq_length, num_heads, head_dim]
        query_states = self.q_proj(hidden_states).reshape(batch_size, seq_length, self.num_heads, self.head_dim)
        key_states = self.k_proj(hidden_states).reshape(batch_size, seq_length, self.num_heads, self.head_dim)
        value_states = self.v_proj(hidden_states).reshape(batch_size, seq_length, self.num_heads, self.head_dim)

        # Apply the 2D rotary position embeddings to queries and keys
        cos = position_embeddings[0]
        sin = position_embeddings[1]
        query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin)

        # [batch_size, num_heads, seq_length, head_dim] layout expected by the attention kernels
        query_states = query_states.permute(0, 2, 1, 3).contiguous()
        key_states = key_states.permute(0, 2, 1, 3).contiguous()
        value_states = value_states.permute(0, 2, 1, 3).contiguous()

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=self.scale,
            is_causal=self.is_causal,
            **kwargs,
        )

        # The interface returns [batch_size, seq_length, num_heads, head_dim]; project the output and
        # restore the [batch_size, seq_length, embed_dim] layout.
        attn_output = attn_output.permute(1, 0, 2, 3).contiguous()
        attn_output = attn_output.reshape(seq_length, batch_size, -1)
        attn_output = self.out_proj(attn_output)
        attn_output = attn_output.permute(1, 0, 2).contiguous()
        return attn_output, attn_weights


class MLCDEncoderLayer(CLIPEncoderLayer):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        self.self_attn = MLCDAttention(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
                Represents the hidden states from the previous layer or the input embeddings.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class MLCDEncoder(CLIPEncoder):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`MLCDEncoderLayer`].

    Args:
        config: MLCDVisionConfig
    """

    def __init__(self, config: MLCDVisionConfig):
        """Overwrite dummy `MLCDConfig` to `MLCDVisionConfig`."""
        super().__init__(config)

    def forward(
        self,
        inputs_embeds: torch.FloatTensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)

            layer_outputs = encoder_layer(
                hidden_states,
                position_embeddings,
                attention_mask,
                output_attentions=output_attentions,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class MLCDVisionTransformer(CLIPVisionTransformer):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        # Half of the per-head dimension is rotated over height, the other half over width
        self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2)
        self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2))

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Build the rotary tables for the patch grid of this image resolution
        num_patches_height = pixel_values.shape[-2] // self.config.patch_size
        num_patches_width = pixel_values.shape[-1] // self.config.patch_size
        rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width)
        rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device)
        rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0)
        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
        position_embeddings = (emb.cos(), emb.sin())

        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            position_embeddings=position_embeddings,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring
class MLCDPreTrainedModel(PreTrainedModel):
    config_class = MLCDVisionConfig
    base_model_prefix = "mlcd"
    supports_gradient_checkpointing = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, MLCDVisionEmbeddings):
            factor = self.config.initializer_factor
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=self.config.initializer_range * factor)
        elif isinstance(module, MLCDAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, MLCDMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, MLCDVisionTransformer):
            factor = self.config.initializer_factor
            pos_emb_std = (module.config.hidden_size // module.config.num_attention_heads // 2) ** -0.5 * factor
            nn.init.normal_(module.class_pos_emb, mean=0.0, std=pos_emb_std)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()


class MLCDVisionModel(CLIPVisionModel):
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        r"""
        Example:

        ```python
        >>> import requests
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, MLCDVisionModel
        >>> model = MLCDVisionModel.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")
        >>> processor = AutoProcessor.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs, output_attentions=True)

        >>> features = outputs.last_hidden_state
        >>> print(f"Extracted features shape: {features.shape}")
        >>> print(f"Number of attention layers: {len(outputs.attentions)}")
        >>> print(f"Attention shape: {outputs.attentions[0].shape}")
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


__all__ = ["MLCDVisionConfig", "MLCDVisionModel", "MLCDPreTrainedModel"]