o
    ij                     @   s  d dl mZmZmZ d dlZd dlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZmZ ddlmZmZ dd	lmZ dd
lmZmZmZ ddlmZ G dd dejZG dd dejZG dd dejZ	d9dejdejdejdejdeej de de dee fddZ!dd Z"d ejd!e#d"ejfd#d$Z$d%ejd&ejd'ejd(ejd"e%ejejf f
d)d*Z&G d+d, d,ejZ'G d-d. d.eZ(G d/d0 d0ejZ)G d1d2 d2ejZ*eG d3d4 d4eZ+ed5d6G d7d8 d8e+Z,d4d8gZ-dS ):    )CallableOptionalUnionN   )ACT2FN)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstring	torch_int   )MLCDVisionConfigc                       s2   e Zd Z fddZdejdejfddZ  ZS )MLCDMLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S N)super__init__configr   
hidden_actactivation_fnnnLinearhidden_sizeintermediate_sizefc1fc2selfr   	__class__ Z/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/mlcd/modeling_mlcd.pyr   %   s
   
zMLCDMLP.__init__hidden_statesreturnc                 C   s"   |  |}| |}| |}|S r   )r   r   r   )r!   r&   r$   r$   r%   forward,   s   


zMLCDMLP.forward)__name__
__module____qualname__r   torchTensorr(   __classcell__r$   r$   r"   r%   r   $   s    r   c                       sP   e Zd ZU ejed< ddededdf fddZd	ed
edejfddZ	  Z
S )MLCDRotaryEmbeddinginv_freq     @dimthetar'   Nc                    s>   t    d|tjd|dtjd|   }| jd|dd d S )N      ?r      dtyper0   F
persistent)r   r   r,   arangefloatregister_buffer)r!   r2   r3   r0   r"   r$   r%   r   6   s   
 zMLCDRotaryEmbedding.__init__num_patches_heightnum_patches_widthc           
      C   s   t j|| jjddd|}t j|| jjdd|d}t j| | gdd}t||}t j|| jj| jj	d}t 
|| j}|| d}	|	S )a}  
        Calculate the Rotary Position Embedding (RoPE) for MLCDVisionModel based on the grid size.

        Args:
            num_patches_height (int): Number of patches in the height dimension.
            num_patches_width (int): Number of patches in the width dimension.

        Returns:
            torch.Tensor: Rotary positional embeddings for the given grid size.
        )devicer   r   r2   )r?   r7   )r,   r:   r0   r?   	unsqueezeexpandstackflattenmaxr7   outer)
r!   r=   r>   hpos_idswpos_idspos_idsmax_grid_sizeseqrotary_pos_emb_fullrotary_pos_embr$   r$   r%   r(   ;   s   
zMLCDRotaryEmbedding.forward)r1   )r)   r*   r+   r,   r-   __annotations__intr;   r   r(   r.   r$   r$   r"   r%   r/   3   s   
 
 r/   c                       sV   e Zd Zdef fddZdejdededejfdd	Zd
ej	dejfddZ
  ZS )MLCDVisionEmbeddingsr   c                    s   t    || _|j| _|j| _|j| _tt	
| j| _tj|j| j| j| jdd| _| j| j d | _| jd | _| jdt	| jddd d S )NF)in_channelsout_channelskernel_sizestridebiasr5   r   position_ids)r   r@   r8   )r   r   r   r   	embed_dim
image_size
patch_sizer   	Parameterr,   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positionsr<   r:   rC   r    r"   r$   r%   r   ]   s    
"zMLCDVisionEmbeddings.__init__
embeddingsheightwidthr'   c                 C   s  |j d d }| jjd}|j d d }tj s(||kr(||kr(| | jS |ddddf }|ddddf }|j d }	|| j }
|| j }t	|d }|
d|||	}|dddd}tjj||
|fdd	d
}|dddddd|	}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr@   g      ?r   r5   bicubicF)sizemodealign_cornersrA   )shapeposition_embeddingweightrB   r,   jit
is_tracingrW   rZ   r   reshapepermuter   
functionalinterpolateviewcat)r!   rc   rd   re   ra   rk   rb   class_pos_embedpatch_pos_embedr2   
new_height	new_widthsqrt_num_positionsr$   r$   r%   interpolate_pos_encodingr   s*   



z-MLCDVisionEmbeddings.interpolate_pos_encodingpixel_valuesc                 C   s^   |j d }| jjj}| |j|d}|ddd}| j|dd}t	j
||gdd}|S )Nr   r6   r5   r   r@   rA   )rj   r`   rl   r7   torE   	transposer]   rC   r,   rt   )r!   r{   
batch_sizetarget_dtypepatch_embedsclass_embedsrc   r$   r$   r%   r(      s   

zMLCDVisionEmbeddings.forward)r)   r*   r+   r   r   r,   r-   rP   rz   FloatTensorr(   r.   r$   r$   r"   r%   rQ   \   s    )rQ           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur3|d d d d d d d |jd f }|
| }
tjj|
dtj	d
|j}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr5   r   r@   )r2   r7   )ptrainingr   )	repeat_kvnum_key_value_groupsr,   matmulr}   rj   r   rq   softmaxfloat32r|   r7   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputr$   r$   r%   eager_attention_forward   s   
&r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr@   r5   rA   )rj   r,   rt   )xx1x2r$   r$   r%   rotate_half   s   r   r&   n_repr'   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rj   rC   ro   )r&   r   batchnum_key_value_headsslenhead_dimr$   r$   r%   r      s
   0r   qkcossinc                 C   s   | j }|j }|  | } }|d |d }}| | t| |  }|| t||  }||}||}||fS )Nr   )r7   r;   rB   r   r|   )r   r   r   r   orig_q_dtypeorig_k_dtypeq_embedk_embedr$   r$   r%   apply_rotary_pos_emb_vision   s   

r   c                       sp   e Zd ZdZdef fddZ	ddejdeejejf de	ej d	e
e d
eeje	ej f f
ddZ  ZS )MLCDAttentionzMulti-headed attention with RoPE. Refer to papers:
    - Attention is all you need:
        https://huggingface.co/papers/1706.03762
    - RoFormer: Enhanced Transformer with Rotary Position Embedding:
        https://huggingface.co/papers/2104.09864
    r   c                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _d| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _|j| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      F)r   r   r   r   rX   num_attention_heads	num_headsr   
ValueErrorscaleattention_dropoutr   	is_causalr   r   k_projv_projq_projout_projr   r    r"   r$   r%   r      s&   

zMLCDAttention.__init__Nr&   position_embeddingsr   r   r'   c                 K   sn  |j dd \}}| |||| j| jf}| |||| j| jf}| |||| j| jf}	|d d }
|d d }t	|||
|\}}|
dddd }|
dddd }|	
dddd }	t}| jjdkrzt| jj }|| |||	|f| jsdn| j| j| jd	|\}}|
dddd }|||d}| |}|
ddd }||fS )
z#Input shape: Batch x Time x ChannelNr@   r   r   r5   r   eagerr   )r   r   r   )rj   r   ro   r   r   r   r   rB   r;   r   rp   r   r   r   _attn_implementationr   r   r   r   r   rs   r   )r!   r&   r   r   r   r~   
seq_lengthquery_statesr   r   r   r   attention_interfacer   r   r$   r$   r%   r(      s>   	

zMLCDAttention.forwardr   )r)   r*   r+   __doc__r   r   r,   r-   tupler   r   r   r(   r.   r$   r$   r"   r%   r      s    r   c                       sd   e Zd Zdef fddZ		ddejdeejejf deej d	ee	 d
eej
 f
ddZ  ZS )MLCDEncoderLayerr   c                    sR   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S )Neps)r   r   r   rX   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r    r"   r$   r%   r   1  s   


zMLCDEncoderLayer.__init__NFr&   r   r   output_attentionsr'   c                 C   sd   |}|  |}| j||||d\}}|| }|}| |}| |}|| }|f}|r0||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
                Represents the hidden states from the previous layer or the input embeddings.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        r&   r   r   r   )r   r   r   r   )r!   r&   r   r   r   residualr   outputsr$   r$   r%   r(   9  s"   




zMLCDEncoderLayer.forward)NF)r)   r*   r+   r   r   r,   r-   r   r   boolr   r(   r.   r$   r$   r"   r%   r   0  s    r   c                       s~   e Zd ZdZdef fddZ				ddejdeej	ej	f de
ej	 d	e
e d
e
e de
e deeef fddZ  ZS )MLCDEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`MLCDEncoderLayer`].

    Args:
        config: MLCDVisionConfig
    r   c                    s:   t     | _t fddt jD | _d| _dS )z3Overwrite dummy `MLCDConfig` to `MLCDVisionConfig`.c                    s   g | ]}t  qS r$   )r   ).0_r   r$   r%   
<listcomp>s  s    z(MLCDEncoder.__init__.<locals>.<listcomp>FN)	r   r   r   r   
ModuleListrangenum_hidden_layerslayersgradient_checkpointingr    r"   r   r%   r   o  s   
 
zMLCDEncoder.__init__Ninputs_embedsr   r   r   output_hidden_statesreturn_dictr'   c                 C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|r"dnd}|r(dnd}|}	t| jD ] \}
}|r<||	f }||	|||d}|d }	|rQ||d f }q1|rY||	f }|sgtdd |	||fD S t|	||dS )	aj  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr$   r   r   r   c                 s   s    | ]	}|d ur|V  qd S r   r$   )r   vr$   r$   r%   	<genexpr>  s    z&MLCDEncoder.forward.<locals>.<genexpr>)last_hidden_stater&   
attentions)r   r   use_return_dictr   	enumerater   r   r	   )r!   r   r   r   r   r   r   encoder_statesall_attentionsr&   idxencoder_layerlayer_outputsr$   r$   r%   r(   v  s:   "

zMLCDEncoder.forwardNNNN)r)   r*   r+   r   r   r   r,   r   r   r-   r   r   r   r	   r(   r.   r$   r$   r"   r%   r   f  s,    
r   c                       sh   e Zd Zdef fddZe				ddeej dee	 dee	 dee	 d	e
eef f
d
dZ  ZS )MLCDVisionTransformerr   c                    s   t    || _|j}t|| _tj||jd| _	t
|| _tj||jd| _t|j|j d | _ttd|j|j d | _d S )Nr   r5   r   )r   r   r   r   rQ   rc   r   r   r   pre_layrnormr   encoderpost_layernormr/   r   vision_rotary_embeddingr[   r,   r\   class_pos_emb)r!   r   rX   r"   r$   r%   r     s   


$zMLCDVisionTransformer.__init__Nr{   r   r   r   r'   c                 C   s<  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r&td|jd | j j }|jd | j j }| ||}|| j	j
}tj| j	|gdd}tj||fdd}| | f}	| |}
| |
}
| j|
|	|||d}|d }|d d dd d f }| |}|s||f|dd   S t|||j|jdS )	Nz You have to specify pixel_valuesr   r@   r   rA   )r   r   r   r   r   r   )r   pooler_outputr&   r   )r   r   r   r   r   rj   rZ   r   r|   r   r?   r,   rt   r   r   rc   r   r   r   r
   r&   r   )r!   r{   r   r   r   r=   r>   rN   embr   r&   encoder_outputsr   pooled_outputr$   r$   r%   r(     sB   	


zMLCDVisionTransformer.forwardr   )r)   r*   r+   r   r   r   r   r,   r   r   r   r   r
   r(   r.   r$   r$   r"   r%   r     s$    
r   c                   @   s.   e Zd ZU eed< dZdZdZdZdd Z	dS )MLCDPreTrainedModelr   mlcdTc                 C   s  | j j}t|tr,| j j}tjj|jd|jd | d tjj|j	j
|j j| d dS t|tru| j j}|jd d|j j d  | }|jd | }tjj|jj
|d tjj|jj
|d tjj|jj
|d tjj|jj
|d dS t|tr| j j}|j jd d|j j d  | }d|j j d | }tjj|jj
|d tjj|jj
|d dS t|tr| j j}|j j|j j d d | }tjj|jd|d dS t|tjr|jj  |j
jd dS t|tjr|jdur|jj  dS dS dS )zInitialize the weightsr   r   )meanstd)r   r5   r4   N)r   initializer_factor
isinstancerQ   r   initnormal_r]   rX   r`   rl   initializer_ranger   r   r   r   r   r   r   r   r   r   r   r   r   r   rV   datazero_fill_r   )r!   r   factorin_proj_stdout_proj_stdfc_stdpos_emb_stdr$   r$   r%   _init_weights  s:   
 

 
z!MLCDPreTrainedModel._init_weightsN)
r)   r*   r+   r   rO   base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpar  r$   r$   r$   r%   r     s   
 r   zN
    The vision model from M_L_C_D without any head or projection on top.
    )custom_introc                       s   e Zd ZU eed< dZdgZdef fddZdej	fddZ
e								ddeej d
ee dee dee deeef f
ddZ  ZS )MLCDVisionModelr   r{   r   c                    s"   t  | t|| _|   d S r   )r   r   r   vision_model	post_initr    r"   r$   r%   r   .  s   
zMLCDVisionModel.__init__r'   c                 C   s
   | j jjS r   )r
  rc   r`   )r!   r$   r$   r%   get_input_embeddings4  s   
z$MLCDVisionModel.get_input_embeddingsNr   r   r   c                 C   sN   |dur|n| j j}|dur|n| j j}|dur|n| j j}| j||||dS )a  
        Example:

        ```python
        >>> import requests
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, MLCDVisionModel
        >>> model = MLCDVisionModel.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")
        >>> processor = AutoProcessor.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs, output_attentions=True)

        >>> features = outputs.last_hidden_state
        >>> print(f"Extracted features shape: {features.shape}")
        >>> print(f"Number of attention layers: {len(outputs.attentions)}")
        >>> print(f"Attention shape: {outputs.attentions[0].shape}")
        ```N)r{   r   r   r   )r   r   r   r   r
  )r!   r{   r   r   r   r$   r$   r%   r(   7  s   zMLCDVisionModel.forwardr   )r)   r*   r+   r   rO   main_input_name_no_split_modulesr   r   Moduler  r   r   r,   r   r   r   r   r
   r(   r.   r$   r$   r"   r%   r	  $  s,   
 
r	  )r   ).typingr   r   r   r,   torch.nnr   activationsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr	   r
   modeling_utilsr   r   processing_utilsr   utilsr   r   r   configuration_mlcdr   r  r   r/   rQ   r-   r;   r   r   rP   r   r   r   r   r   r   r   r   r	  __all__r$   r$   r$   r%   <module>   sp   )S

M6V@':