o
    eiGO                     @   sr  d dl mZ d dlZd dlmZ ddlmZ ddlm	Z	 ddl
mZmZ ddlmZmZ ddlmZ dd	lmZmZmZ d
dlmZmZmZmZmZmZmZ d
dlmZ d
dl m!Z!m"Z" e#e$Z%G dd de	Z&G dd deZ'G dd de!Z(G dd deZ)G dd deZ*G dd deZ+G dd deZ,eG dd deZ-G dd deZ.G d d! d!eZ/g d"Z0dS )#    )CallableN   )initialization)PreTrainedConfig)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging   )CLIPMLPCLIPAttentionCLIPEncoderCLIPEncoderLayerCLIPVisionEmbeddingsCLIPVisionModelCLIPVisionTransformer)eager_attention_forward)VisionRotaryEmbeddingapply_rotary_pos_emb_visionc                       sD   e Zd ZdZdZdZ								
						d fdd	Z  ZS )MLCDVisionConfiga  
    This is the configuration class to store the configuration of a [`MLCDVisionModel`]. It is used to instantiate a MLCD
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the vision encoder of the MLCD
    [DeepGlint-AI/mlcd-vit-bigG-patch14-336](https://huggingface.co/DeepGlint-AI/mlcd-vit-bigG-patch14-336) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1664):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 8192):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 1024):
            Dimensionality of text and vision projection layers.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        image_size (`int`, *optional*, defaults to 336):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    Example:

    ```python
    >>> from transformers import MLCDVisionConfig, MLCDVisionModel

    >>> # Initializing a MLCDVisionConfig with DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
    >>> configuration = MLCDVisionConfig()

    >>> # Initializing a MLCDVisionModel (with random weights) from the DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
    >>> model = MLCDVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```mlcd_vision_modelvision_config      0         r   P     geluh㈵>        {Gz?      ?c                    sd   t  jdi | || _|| _|| _|| _|| _|| _|| _|| _	|| _
|| _|| _|
| _|	| _d S )N )super__init__hidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_groupsnum_channels
patch_size
image_sizeinitializer_rangeinitializer_factorattention_dropoutlayer_norm_eps
hidden_act)selfr+   r,   r-   r.   r/   r0   r2   r1   r7   r6   r5   r3   r4   kwargs	__class__r(   c/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/mlcd/modular_mlcd.pyr*   c   s   
zMLCDVisionConfig.__init__)r   r   r   r   r    r   r!   r"   r#   r$   r%   r&   r'   )__name__
__module____qualname____doc__
model_typebase_config_keyr*   __classcell__r(   r(   r:   r<   r   )   s$    6r   c                   @   s   e Zd ZdS )MLCDMLPN)r=   r>   r?   r(   r(   r(   r<   rD      s    rD   c                   @   s$   e Zd ZdededejfddZdS )MLCDRotaryEmbeddingnum_patches_heightnum_patches_widthreturnc           
      C   s   t j|| jjddd|}t j|| jjdd|d}t j| | gdd}t||}t j|| jj| jj	d}t 
|| j}|| d}	|	S )a}  
        Calculate the Rotary Position Embedding (RoPE) for MLCDVisionModel based on the grid size.

        Args:
            num_patches_height (int): Number of patches in the height dimension.
            num_patches_width (int): Number of patches in the width dimension.

        Returns:
            torch.Tensor: Rotary positional embeddings for the given grid size.
        )devicer    r   dim)rI   dtype)torcharangeinv_freqrI   	unsqueezeexpandstackflattenmaxrM   outer)
r8   rF   rG   hpos_idswpos_idspos_idsmax_grid_sizeseqrotary_pos_emb_fullrotary_pos_embr(   r(   r<   forward   s   
zMLCDRotaryEmbedding.forwardN)r=   r>   r?   intrN   Tensorr^   r(   r(   r(   r<   rE      s    rE   c                       s8   e Zd Zdef fddZdejdejfddZ  Z	S )MLCDVisionEmbeddingsconfigc                    s   t  | | `d S N)r)   r*   position_embeddingr8   rb   r:   r(   r<   r*      s   zMLCDVisionEmbeddings.__init__pixel_valuesrH   c                 C   s^   |j d }| jjj}| |j|d}|ddd}| j|dd}t	j
||gdd}|S )Nr   rM   r   r    rJ   rK   )shapepatch_embeddingweightrM   torT   	transposeclass_embeddingrR   rN   cat)r8   rf   
batch_sizetarget_dtypepatch_embedsclass_embeds
embeddingsr(   r(   r<   r^      s   

zMLCDVisionEmbeddings.forward)
r=   r>   r?   r   r*   rN   FloatTensorr`   r^   rC   r(   r(   r:   r<   ra      s    ra   c                       sp   e Zd ZdZdef fddZ	ddejdeejejf dejdB d	e	e
 d
eejejdB f f
ddZ  ZS )MLCDAttentionzMulti-headed attention with RoPE. Refer to papers:
    - Attention is all you need:
        https://huggingface.co/papers/1706.03762
    - RoFormer: Enhanced Transformer with Rotary Position Embedding:
        https://huggingface.co/papers/2104.09864
    rb   c                    s   t  | |j| _d| _d S )NF)r)   r*   r/   	is_causalre   r:   r(   r<   r*      s   
zMLCDAttention.__init__Nhidden_statesposition_embeddingsattention_maskr9   rH   c                 K   sb  |j d d \}}| |||| j| jf}| |||| j| jf}| |||| j| jf}	|d d }
|d d }t	|||
|\}}|
dddd }|
dddd }|	
dddd }	t| jjt}|| |||	|f| jsdn| j| j| jd|\}}|
dddd }|||d}| |}|
ddd }||fS )NrJ   r   r    r   r   r%   )dropoutscalingrv   )rh   q_projreshape	num_headshead_dimk_projv_projrQ   floatr   permute
contiguousr   get_interfacerb   _attn_implementationr   trainingrz   scalerv   viewout_proj)r8   rw   rx   ry   r9   ro   
seq_lengthquery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightsr(   r(   r<   r^      s>   	

zMLCDAttention.forwardrc   )r=   r>   r?   r@   r   r*   rN   r`   tupler
   r   r^   rC   r(   r(   r:   r<   ru      s    	ru   c                       sb   e Zd Zdef fddZ	ddejdeejejf dejdB dee	 d	eej
 f
d
dZ  ZS )MLCDEncoderLayerrb   c                    s   t  | t|| _d S rc   )r)   r*   ru   	self_attnre   r:   r(   r<   r*      s   zMLCDEncoderLayer.__init__Nrw   rx   ry   r9   rH   c                 K   sV   |}|  |}| jd|||d|\}}|| }|}| |}| |}|| }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
                Represents the hidden states from the previous layer or the input embeddings.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
        )rw   rx   ry   Nr(   )layer_norm1r   layer_norm2mlp)r8   rw   rx   ry   r9   residual_r(   r(   r<   r^      s   



zMLCDEncoderLayer.forwardrc   )r=   r>   r?   r   r*   rN   r`   r   r
   r   rt   r^   rC   r(   r(   r:   r<   r      s    r   c                       sd   e Zd ZdZdef fddZ	ddejdeej	ej	f dej	dB d	e
e d
eeB f
ddZ  ZS )MLCDEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`MLCDEncoderLayer`].

    Args:
        config: MLCDVisionConfig
    rb   c                    s   t  | dS )z3Overwrite dummy `MLCDConfig` to `MLCDVisionConfig`.N)r)   r*   re   r:   r(   r<   r*   ,  s   zMLCDEncoder.__init__Ninputs_embedsrx   ry   r9   rH   c                 K   s.   |}| j D ]}||||fi |}qt|dS )a=  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
        )last_hidden_state)layersr   )r8   r   rx   ry   r9   rw   encoder_layerr(   r(   r<   r^   0  s   
zMLCDEncoder.forwardrc   )r=   r>   r?   r@   r   r*   rN   rt   r   r`   r
   r   r   r^   rC   r(   r(   r:   r<   r   #  s    r   c                   @   sL   e Zd ZU eed< dZdZdZdZdZ	dZ
dZeedZe dd ZdS )	MLCDPreTrainedModelrb   mlcdTF)rw   
attentionsc                 C   sL  | j j}t|tr;| j j}tj|jd|jd | d tj|jj	|j j
| d t|jt|jjd d dS t|tr| j j}|jd d|j j d  | }|jd | }tj|jj	|d tj|jj	|d tj|jj	|d tj|jj	|d dS t|tr| j j}|j jd d|j j d  | }d|j j d | }tj|jj	|d tj|jj	|d dS t|tr| j j}|j j|j j d d | }tj|jd|d dS t|tjrt |j! t"|j	 dS t|tj#r|j!durt |j! dS t|t$r$d	|j%tjd
|j&dtj'd|j&   }t|j(| dS dS )zInitialize the weightsr%   g      )meanstd)r   rJ   )r    rJ   r   Nr'   r   rg   ))rb   r4   
isinstancera   initnormal_rm   	embed_dimri   rj   r3   copy_position_idsrN   rO   rh   rR   ru   r-   r|   r   r   r   rD   r+   fc1fc2MLCDVisionTransformerr.   class_pos_embnn	LayerNormzeros_biasones_LinearrE   thetarL   r   rP   )r8   modulefactorin_proj_stdout_proj_stdfc_stdpos_emb_stdrP   r(   r(   r<   _init_weightsc  sB   
&

 
&z!MLCDPreTrainedModel._init_weightsN)r=   r>   r?   r   __annotations__base_model_prefixsupports_gradient_checkpointingaccepts_loss_kwargs_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   ru   _can_record_outputsrN   no_gradr   r(   r(   r(   r<   r   T  s   
 r   c                       sJ   e Zd Zdef fddZ	d
dejdB dee de	e
B fdd	Z  ZS )r   rb   c                    sF   t  | t|j|j d | _tt	d|j|j d | _
d S )Nr   r    )r)   r*   rE   r+   r.   vision_rotary_embeddingr   	ParameterrN   randnr   re   r:   r(   r<   r*     s   $zMLCDVisionTransformer.__init__Nrf   r9   rH   c                 K   s   |d u rt d|jd | jj }|jd | jj }| ||}|| jj}tj	| j|gdd}tj	||fdd}|
 | f}| |}| |}| jd||d|}	|	d }
|
d d dd d f }| |}t|
|dS )	Nz You have to specify pixel_valuesrJ   r   rK   )r   rx   )r   pooler_outputr(   )
ValueErrorrh   rb   r1   r   rk   r   rI   rN   rn   r   r   rs   pre_layrnormencoderpost_layernormr   )r8   rf   r9   rF   rG   r]   embrx   rw   encoder_outputsr   pooled_outputr(   r(   r<   r^     s0   


zMLCDVisionTransformer.forwardrc   )r=   r>   r?   r   r*   rN   rt   r
   r   r   r   r^   rC   r(   r(   r:   r<   r     s    r   c                   @   s4   e Zd Z	ddejdB dee deeB fddZ	dS )MLCDVisionModelNrf   r9   rH   c                 K   s   | j dd|i|S )a  
        Example:

        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, MLCDVisionModel
        >>> model = MLCDVisionModel.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")
        >>> processor = AutoProcessor.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs, output_attentions=True)

        >>> features = outputs.last_hidden_state
        >>> print(f"Extracted features shape: {features.shape}")
        >>> print(f"Number of attention layers: {len(outputs.attentions)}")
        >>> print(f"Attention shape: {outputs.attentions[0].shape}")
        ```rf   Nr(   )vision_model)r8   rf   r9   r(   r(   r<   r^     s
   zMLCDVisionModel.forwardrc   )
r=   r>   r?   rN   rt   r
   r   r   r   r^   r(   r(   r(   r<   r     s    r   )r   r   r   )1collections.abcr   rN   torch.nnr    r   r   configuration_utilsr   modeling_outputsr   r   modeling_utilsr   r	   processing_utilsr
   utilsr   r   r   clip.modeling_clipr   r   r   r   r   r   r   llama.modeling_llamar   qwen2_vl.modeling_qwen2_vlr   r   
get_loggerr=   loggerr   rD   rE   ra   ru   r   r   r   r   r   __all__r(   r(   r(   r<   <module>   s2   $	
\"<*13)$