o
    eii`                     @   s  d dl mZ d dlZd dlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZmZ ddlmZmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZ ddlmZ G dd dejZG dd dejZ G dd dejZ!	d;dejdej"dej"dej"dej"dB de#de#dee fddZ$d d! Z%d"ej"d#e&d$ej"fd%d&Z'd'ej"d(ej"d)ej"d*ej"d$e(ej"ej"f f
d+d,Z)G d-d. d.ejZ*G d/d0 d0eZ+G d1d2 d2ejZ,eG d3d4 d4eZ-G d5d6 d6e-Z.ed7d8G d9d: d:e-Z/d4d:gZ0dS )<    )CallableN   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstring	torch_int)merge_with_config_defaults)capture_outputs   )MLCDVisionConfigc                       s2   e Zd Z fddZdejdejfddZ  ZS )MLCDMLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S N)super__init__configr   
hidden_actactivation_fnnnLinearhidden_sizeintermediate_sizefc1fc2selfr   	__class__ d/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/mlcd/modeling_mlcd.pyr   &   s
   
zMLCDMLP.__init__hidden_statesreturnc                 C   s"   |  |}| |}| |}|S r   )r   r   r   )r!   r&   r$   r$   r%   forward-   s   


zMLCDMLP.forward)__name__
__module____qualname__r   torchTensorr(   __classcell__r$   r$   r"   r%   r   %   s    r   c                       sP   e Zd ZU ejed< ddededdf fddZd	ed
edejfddZ	  Z
S )MLCDRotaryEmbeddinginv_freq     @dimthetar'   Nc                    sJ   t    || _|| _d|tjd|dtjd|   }| jd|dd d S )N      ?r      dtyper0   F
persistent)r   r   r2   r3   r,   arangefloatregister_buffer)r!   r2   r3   r0   r"   r$   r%   r   7   s
   
 zMLCDRotaryEmbedding.__init__num_patches_heightnum_patches_widthc           
      C   s   t j|| jjddd|}t j|| jjdd|d}t j| | gdd}t||}t j|| jj| jj	d}t 
|| j}|| d}	|	S )a}  
        Calculate the Rotary Position Embedding (RoPE) for MLCDVisionModel based on the grid size.

        Args:
            num_patches_height (int): Number of patches in the height dimension.
            num_patches_width (int): Number of patches in the width dimension.

        Returns:
            torch.Tensor: Rotary positional embeddings for the given grid size.
        )devicer   r   r2   )r?   r7   )r,   r:   r0   r?   	unsqueezeexpandstackflattenmaxr7   outer)
r!   r=   r>   hpos_idswpos_idspos_idsmax_grid_sizeseqrotary_pos_emb_fullrotary_pos_embr$   r$   r%   r(   >   s   
zMLCDRotaryEmbedding.forward)r1   )r)   r*   r+   r,   r-   __annotations__intr;   r   r(   r.   r$   r$   r"   r%   r/   4   s   
 
 r/   c                       sV   e Zd Zdef fddZdejdededejfdd	Zd
ej	dejfddZ
  ZS )MLCDVisionEmbeddingsr   c                    s   t    || _|j| _|j| _|j| _tt	
| j| _tj|j| j| j| jdd| _| j| j d | _| jd | _| jdt	| jddd d S )NF)in_channelsout_channelskernel_sizestridebiasr5   r   position_idsr   r@   r8   )r   r   r   r   	embed_dim
image_size
patch_sizer   	Parameterr,   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positionsr<   r:   rC   r    r"   r$   r%   r   `   s    
"zMLCDVisionEmbeddings.__init__
embeddingsheightwidthr'   c                 C   s  |j d d }| jjd}|j d d }tj s(||kr(||kr(| | jS |ddddf }|ddddf }|j d }	|| j }
|| j }t	|d }|
d|||	}|dddd}tjj||
|fdd	d
}|dddddd|	}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr@   g      ?r   r5   bicubicF)sizemodealign_cornersrA   )shapeposition_embeddingweightrB   r,   jit
is_tracingrW   r[   r   reshapepermuter   
functionalinterpolateviewcat)r!   rd   re   rf   rb   rl   rc   class_pos_embedpatch_pos_embedr2   
new_height	new_widthsqrt_num_positionsr$   r$   r%   interpolate_pos_encodingu   s*   



z-MLCDVisionEmbeddings.interpolate_pos_encodingpixel_valuesc                 C   s^   |j d }| jjj}| |j|d}|ddd}| j|dd}t	j
||gdd}|S )Nr   r6   r5   r   r@   rA   )rk   ra   rm   r7   torE   	transposer^   rC   r,   ru   )r!   r|   
batch_sizetarget_dtypepatch_embedsclass_embedsrd   r$   r$   r%   r(      s   

zMLCDVisionEmbeddings.forward)r)   r*   r+   r   r   r,   r-   rP   r{   FloatTensorr(   r.   r$   r$   r"   r%   rQ   _   s    )rQ           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur |
| }
tjj|
dtjd	|j
}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr5   r   r@   )r2   r7   )ptrainingr   )	repeat_kvnum_key_value_groupsr,   matmulr~   r   rr   softmaxfloat32r}   r7   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsattn_outputr$   r$   r%   eager_attention_forward   s   
r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr@   r5   rA   )rk   r,   ru   )xx1x2r$   r$   r%   rotate_half   s   r   r&   n_repr'   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rk   rC   rp   )r&   r   batchnum_key_value_headsslenhead_dimr$   r$   r%   r      s
   0r   qkcossinc                 C   s   | j }|j }|  | } }|d |d }}| | t| |  }|| t||  }||}||}||fS )N)r7   r;   rB   r   r}   )r   r   r   r   orig_q_dtypeorig_k_dtypeq_embedk_embedr$   r$   r%   apply_rotary_pos_emb_vision   s   

r   c                       sp   e Zd ZdZdef fddZ	ddejdeejejf dejdB d	e	e
 d
eejejdB f f
ddZ  ZS )MLCDAttentionzMulti-headed attention with RoPE. Refer to papers:
    - Attention is all you need:
        https://huggingface.co/papers/1706.03762
    - RoFormer: Enhanced Transformer with Rotary Position Embedding:
        https://huggingface.co/papers/2104.09864
    r   c                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _d| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _|j| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      F)r   r   r   r   rY   num_attention_heads	num_headsr   
ValueErrorscaleattention_dropoutr   	is_causalr   r   k_projv_projq_projout_projr   r    r"   r$   r%   r      s&   

zMLCDAttention.__init__Nr&   position_embeddingsr   r   r'   c                 K   sb  |j dd \}}| |||| j| jf}| |||| j| jf}| |||| j| jf}	|d d }
|d d }t	|||
|\}}|
dddd }|
dddd }|	
dddd }	t| jjt}|| |||	|f| jsdn| j| j| jd|\}}|
dddd }|||d}| |}|
ddd }||fS )	z#Input shape: Batch x Time x ChannelNr@   r   r   r5   r   r   )r   r   r   )rk   r   rp   r   r   r   r   rB   r;   r   rq   r   r	   get_interfacer   _attn_implementationr   r   r   r   r   rt   r   )r!   r&   r   r   r   r   
seq_lengthquery_statesr   r   r   r   attention_interfacer   r   r$   r$   r%   r(     s>   	

zMLCDAttention.forwardr   )r)   r*   r+   __doc__r   r   r,   r-   tupler   r   r(   r.   r$   r$   r"   r%   r      s    r   c                       sb   e Zd Zdef fddZ	ddejdeejejf dejdB dee	 d	eej
 f
d
dZ  ZS )MLCDEncoderLayerr   c                    sR   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S )Neps)r   r   r   rY   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r    r"   r$   r%   r   3  s   


zMLCDEncoderLayer.__init__Nr&   r   r   r   r'   c                 K   sV   |}|  |}| jd|||d|\}}|| }|}| |}| |}|| }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
                Represents the hidden states from the previous layer or the input embeddings.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
        )r&   r   r   Nr$   )r   r   r   r   )r!   r&   r   r   r   residual_r$   r$   r%   r(   ;  s   



zMLCDEncoderLayer.forwardr   )r)   r*   r+   r   r   r,   r-   r   r   r   r   r(   r.   r$   r$   r"   r%   r   2  s    r   c                       sd   e Zd ZdZdef fddZ	ddejdeej	ej	f dej	dB d	e
e d
eeB f
ddZ  ZS )MLCDEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`MLCDEncoderLayer`].

    Args:
        config: MLCDVisionConfig
    r   c                    s:   t     | _t fddt jD | _d| _dS )z3Overwrite dummy `MLCDConfig` to `MLCDVisionConfig`.c                    s   g | ]}t  qS r$   )r   ).0r   r   r$   r%   
<listcomp>m  s    z(MLCDEncoder.__init__.<locals>.<listcomp>FN)	r   r   r   r   
ModuleListrangenum_hidden_layerslayersgradient_checkpointingr    r"   r   r%   r   i  s   
 
zMLCDEncoder.__init__Ninputs_embedsr   r   r   r'   c                 K   s.   |}| j D ]}||||fi |}qt|dS )a=  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
        )last_hidden_state)r   r   )r!   r   r   r   r   r&   encoder_layerr$   r$   r%   r(   p  s   
zMLCDEncoder.forwardr   )r)   r*   r+   r   r   r   r,   r   r   r-   r   r   r   r(   r.   r$   r$   r"   r%   r   `  s    r   c                   @   sL   e Zd ZU eed< dZdZdZdZdZ	dZ
dZeedZe dd ZdS )	MLCDPreTrainedModelr   mlcdTF)r&   
attentionsc                 C   sL  | j j}t|tr;| j j}tj|jd|jd | d tj|jj	|j j
| d t|jt|jjd d dS t|tr| j j}|jd d|j j d  | }|jd | }tj|jj	|d tj|jj	|d tj|jj	|d tj|jj	|d dS t|tr| j j}|j jd d|j j d  | }d|j j d | }tj|jj	|d tj|jj	|d dS t|tr| j j}|j j|j j d d | }tj|jd|d dS t|tjrt |j! t"|j	 dS t|tj#r|j!durt |j! dS t|t$r$d	|j%tjd
|j&dtj'd|j&   }t|j(| dS dS )zInitialize the weightsr   r   )meanstd)r   r@   rX   r5   Nr4   r   r6   ))r   initializer_factor
isinstancerQ   initnormal_r^   rY   ra   rm   initializer_rangecopy_rW   r,   r:   rk   rC   r   r   r   r   r   r   r   r   r   r   MLCDVisionTransformerr   class_pos_embr   r   zeros_rV   ones_r   r/   r3   r2   r;   r0   )r!   r   factorin_proj_stdout_proj_stdfc_stdpos_emb_stdr0   r$   r$   r%   _init_weights  sB   
&

 
&z!MLCDPreTrainedModel._init_weightsN)r)   r*   r+   r   rO   base_model_prefixsupports_gradient_checkpointingaccepts_loss_kwargs_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr,   no_gradr   r$   r$   r$   r%   r     s   
 r   c                       st   e Zd ZU eed< dZdZdgZdef fddZe	e
dde		ddejd	B d
ee deeB fddZ  ZS )r   r   r|   imager   c                    s   t  | || _|j}t|| _tj||jd| _	t
|| _tj||jd| _t|j|j d | _ttd|j|j d | _|   d S )Nr   r5   r   )r   r   r   r   rQ   rd   r   r   r   pre_layrnormr   encoderpost_layernormr/   r   vision_rotary_embeddingr\   r,   r]   r   	post_init)r!   r   rY   r"   r$   r%   r     s   

 zMLCDVisionTransformer.__init__F)tie_last_hidden_statesNr   r'   c                 K   s   |d u rt d|jd | jj }|jd | jj }| ||}|| jj}tj	| j|gdd}tj	||fdd}|
 | f}| |}| |}| jd||d|}	|	d }
|
d d dd d f }| |}t|
|dS )	Nz You have to specify pixel_valuesr   r@   r   rA   )r   r   )r   pooler_outputr$   )r   rk   r   r[   r   r}   r   r?   r,   ru   r   r   rd   r   r   r   r   )r!   r|   r   r=   r>   rN   embr   r&   encoder_outputsr   pooled_outputr$   r$   r%   r(     s0   


zMLCDVisionTransformer.forwardr   )r)   r*   r+   r   rO   main_input_nameinput_modalities_no_split_modulesr   r   r   r   r,   r   r   r   r   r   r(   r.   r$   r$   r"   r%   r     s"   
 r   zN
    The vision model from M_L_C_D without any head or projection on top.
    )custom_introc                	       sv   e Zd ZU eed< dZdZdgZdef fddZde	j
fdd	Ze	
ddejd
B dee deeB fddZ  ZS )MLCDVisionModelr   r|   r   r   c                    s"   t  | t|| _|   d S r   )r   r   r   vision_modelr   r    r"   r$   r%   r     s   
zMLCDVisionModel.__init__r'   c                 C   s
   | j jjS r   )r  rd   ra   )r!   r$   r$   r%   get_input_embeddings  s   
z$MLCDVisionModel.get_input_embeddingsNr   c                 K   s   | j dd|i|S )a  
        Example:

        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, MLCDVisionModel
        >>> model = MLCDVisionModel.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")
        >>> processor = AutoProcessor.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs, output_attentions=True)

        >>> features = outputs.last_hidden_state
        >>> print(f"Extracted features shape: {features.shape}")
        >>> print(f"Number of attention layers: {len(outputs.attentions)}")
        >>> print(f"Attention shape: {outputs.attentions[0].shape}")
        ```r|   Nr$   )r  )r!   r|   r   r$   r$   r%   r(     s
   zMLCDVisionModel.forwardr   )r)   r*   r+   r   rO   r  r  r  r   r   Moduler  r   r,   r   r   r   r   r   r(   r.   r$   r$   r"   r%   r    s    
 r  )r   )1collections.abcr   r,   torch.nnr    r   r   activationsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr	   r
   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   configuration_mlcdr   r	  r   r/   rQ   r-   r;   r   r   rP   r   r   r   r   r   r   r   r   r  __all__r$   r$   r$   r%   <module>   st   +S

M.4393