o
    iQ                     @   s  d Z ddlmZ ddlmZmZ ddlZddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlmZ eeZdd Z G dd dej!Z"dd Z#d3ddZ$	d4dej!dej%dej%dej%deej% de&d e&fd!d"Z'G d#d$ d$ej!Z(G d%d& d&ej!Z)G d'd( d(ej!Z*G d)d* d*eZ+G d+d, d,ej!Z,eG d-d. d.eZ-d/d0 Z.eG d1d2 d2e-Z/d2d.gZ0dS )5zPyTorch Pixtral model.    )Callable)OptionalUnionN)nn   )ACT2FN)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)dynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tuplelogging   )PixtralVisionConfigc           
      C   s   g }| D ];}|j dd  \}}tjt|t|dd}tj|dddddd\}}|| | }	||	d d df  qt|S )Nij)indexingdim   r   )	shapetorchmeshgridarangestackreshapechunkappendcat)
patch_embeds_list	max_width	positionspatchheightwidthmeshh_gridv_gridids r.   `/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/pixtral/modeling_pixtral.pyposition_ids_in_meshgrid%   s   "
r0   c                       sB   e Zd ZU dZejed< d fdd	Ze e	dd Z
  ZS )	PixtralRotaryEmbeddinga  
    The key with pixtral embedding is just that you have a frequency for each pixel positions.
    If you have height x width pixels (or embedding pixels), then the frequency used for ROPE
    is given by indexing the pre_computed frequency on the width and height.

    What you output is of dimension (batch, height * width, dim) with dim the embed dim.

    This simply means that for each image hidden state, you are going to add
    a corresponding positional embedding, based on its index in the grid.
    inv_freqNc           
         s  t    d| _|j| _|j| _|j|j }d| jt	
d| jd | j   }t	j
||jd}t	j
||jd}t	||d d d  }t	||dd d  }t	j|d d d d d f d|d|d d d d d f |ddgddd| jd }	| jd	t	j|	|	fddd
d d S )Ndefault      ?r   r   )devicer   r   r   r2   F)
persistent)super__init__	rope_typehead_dimr   
rope_thetabase
image_size
patch_sizer   r   floatr5   outerr#   repeatr    register_buffer)
selfconfigr5   max_patches_per_sidefreqshwfreqs_hfreqs_wr2   	__class__r.   r/   r8   >   s&   
$"
zPixtralRotaryEmbedding.__init__c                 C   s   | j | }t|jjtr|jjdkr|jjnd}tj|dd |}| }| }W d    n1 s4w   Y  |j	|j
d|j	|j
dfS )NmpscpuF)device_typeenabled)dtype)r2   
isinstancer5   typestrr   autocastcossintorQ   )rC   xposition_idsrF   rO   embrV   rW   r.   r.   r/   forwardW   s   
&
zPixtralRotaryEmbedding.forwardN)__name__
__module____qualname____doc__r   Tensor__annotations__r8   no_gradr   r\   __classcell__r.   r.   rK   r/   r1   0   s   
 
r1   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr   r   r   )r   r   r#   )rY   x1x2r.   r.   r/   rotate_halff   s   rh   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezerh   )qkrV   rW   rZ   unsqueeze_dimq_embedk_embedr.   r.   r/   apply_rotary_pos_embm   s
   

ro           modulequerykeyvalueattention_maskscalingdropoutc           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )Nr   r   )r   rQ   )ptrainingr   r   )r   matmul	transposer   
functionalsoftmaxfloat32rX   rQ   rw   ry   
contiguous)
rq   rr   rs   rt   ru   rv   rw   kwargsattn_weightsattn_outputr.   r.   r/   eager_attention_forward   s   
r   c                       sz   e Zd ZdZ fddZ			ddejdeej deeejejf  d	ee	 d
e
e deejeej f fddZ  ZS )PixtralAttentionzI
    Multi-headed attention compatible with ALL_ATTENTION_FUNCTIONS.
    c                    s   t    || _|j| _|j| _| j| j | _d| _| jd | _	d| _|j
| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _d S )NFg      ࿩bias)r7   r8   rD   hidden_size	embed_dimnum_attention_heads	num_headsr:   	is_causalrv   attention_dropoutrw   r   Lineark_projv_projq_projo_projrC   rD   rK   r.   r/   r8      s   
zPixtralAttention.__init__NFhidden_statesru   position_embeddingsoutput_attentionsr   returnc                 K   s:  |  \}}}| |}	| |}
| |}|	||| j| jdd}	|
||| j| jdd}
|||| j| jdd}|\}}t|	|
||dd\}	}
t	}| j
jdkr]t| j
j }| j
jdkro|d j|jdd	|d< || |	|
||f| js{d
n| j| jd|\}}|||d }| |}|sd}||fS )z#Input shape: Batch x Time x Channelr   r   r   )rl   eagerflash_attention_2rZ   T)non_blockingrp   )rw   rv   r   N)sizer   r   r   viewr   r:   r{   ro   r   rD   _attn_implementationr   rX   r5   ry   rw   rv   r    r   r   )rC   r   ru   r   r   r   
batch_sizepatches_query_states
key_statesvalue_statesrV   rW   attention_interfacer   r   r.   r.   r/   r\      s>   





zPixtralAttention.forward)NNF)r^   r_   r`   ra   r8   r   rb   r   tupleboolr   r   r\   re   r.   r.   rK   r/   r      s&    r   c                       s$   e Zd Z fddZdd Z  ZS )
PixtralMLPc                    sr   t    || _|j| _|j| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _d S )NFr   )r7   r8   rD   r   intermediate_sizer   r   	gate_projup_proj	down_projr   
hidden_actact_fnr   rK   r.   r/   r8      s   
zPixtralMLP.__init__c                 C   s$   |  | | || | }|S r]   )r   r   r   r   )rC   rY   r   r.   r.   r/   r\      s    zPixtralMLP.forward)r^   r_   r`   r8   r\   re   r.   r.   rK   r/   r      s    
r   c                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	PixtralRMSNormư>c                    s&   t    tt|| _|| _dS )z=
        PixtralRMSNorm is equivalent to T5LayerNorm
        N)r7   r8   r   	Parameterr   onesweightvariance_epsilon)rC   r   epsrK   r.   r/   r8      s   

zPixtralRMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )Nr   r   T)keepdim)	rQ   rX   r   r~   powmeanrsqrtr   r   )rC   r   input_dtypevariancer.   r.   r/   r\     s
   zPixtralRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)r   r   r   r   rC   r.   r.   r/   
extra_repr  s   zPixtralRMSNorm.extra_repr)r   )r^   r_   r`   r8   r\   r   re   r.   r.   rK   r/   r      s    r   c                       sf   e Zd Z fddZ		ddejdejdeeejejf  dee de	e
 d	eej fd
dZ  ZS )PixtralAttentionLayerc                    sB   t    t|jdd| _t|| _t|| _t|jdd| _	d S )Nh㈵>r   )
r7   r8   r   r   attention_normr   feed_forwardr   	attentionffn_normr   rK   r.   r/   r8     s
   


zPixtralAttentionLayer.__init__Nr   ru   r   r   r   r   c           	      K   sl   |}|  |}| jd||||d|\}}|| }|}| |}| |}|| }|f}|r4||f7 }|S )a=  
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   ru   r   r   Nr.   )r   r   r   r   )	rC   r   ru   r   r   r   residualr   outputsr.   r.   r/   r\     s&   




zPixtralAttentionLayer.forward)NN)r^   r_   r`   r8   r   rb   r   r   r   r   r   FloatTensorr\   re   r.   r.   rK   r/   r     s"    r   c                       s|   e Zd Z fddZ					ddeej deeejejf  dee dee dee d	e	e
 d
eeef fddZ  ZS )PixtralTransformerc                    sF   t    || _tj | _t|jD ]
}| j	t
| qd| _d S )NF)r7   r8   rD   r   r   
ModuleListlayersrangenum_hidden_layersr"   r   gradient_checkpointing)rC   rD   r   rK   r.   r/   r8   B  s   

zPixtralTransformer.__init__Nru   r   r   output_hidden_statesreturn_dictr   r   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|r"dnd}|r(dnd}	|}
| jD ]"}|r8||
f }||
|f||d|}|d }
|rQ|	|d f }	q/|rY||
f }|sgtdd |
||	fD S t|
||	dS )	av  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embeddings which serve as input to the Transformer.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr.   )r   r   r   r   c                 s   s    | ]	}|d ur|V  qd S r]   r.   ).0vr.   r.   r/   	<genexpr>  s    z-PixtralTransformer.forward.<locals>.<genexpr>)last_hidden_stater   
attentions)rD   r   r   use_return_dictr   r   r
   )rC   inputs_embedsru   r   r   r   r   r   encoder_statesall_attentionsr   encoder_layerlayer_outputsr.   r.   r/   r\   J  s<   


zPixtralTransformer.forward)NNNNN)r^   r_   r`   r8   r   r   rb   r   r   r   r   r   r
   r\   re   r.   r.   rK   r/   r   A  s,    
	r   c                   @   sP   e Zd ZU eed< dZdZdZdZdZ	dZ
dZdgZdZ	dZ
dZdZdd ZdS )	PixtralPreTrainedModelrD   modelpixel_valuesTr   c                 C   sj   | j j}t|tjtjfr%|jjjd|d |j	d ur#|j	j
  d S d S t|tr3|jjd d S d S )Nrp   )r   stdr4   )rD   initializer_rangerR   r   r   Conv2dr   datanormal_r   zero_r   fill_)rC   rq   r   r.   r.   r/   _init_weights  s   

z$PixtralPreTrainedModel._init_weightsN)r^   r_   r`   r   rc   base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_no_split_modulesr   r.   r.   r.   r/   r     s   
 r   c                 C   s   |j }|j}|jd }t|j}tj||f|||d}t| d}tdg| d d  d}t	||D ]\}	}
d||	|
|	|
f< q8|d d d d d d f 
|jd ddd}|S )Nr   )
fill_valuerQ   r5   r   r   )rQ   r5   r   r   finfominfulltensorcumsumzipexpand)r$   r   rQ   r5   seq_lend_mincausal_maskblock_end_idxblock_start_idxstartendr.   r.   r/   generate_block_attention_mask  s   
*r   c                       s   e Zd ZdZ fddZdd Zee				ddej	de
ej	 d	e
e d
e
e de
e dee deeef fddZ  ZS )PixtralVisionModelvision_encoderc                    sh   t  | || _tj|j|j|j|jdd| _|j| _t	|jdd| _
t|| _t|| _|   d S )NF)in_channelsout_channelskernel_sizestrider   r   r   )r7   r8   rD   r   r   num_channelsr   r>   
patch_convr   ln_prer   transformerr1   patch_positional_embedding	post_initr   rK   r.   r/   r8     s   

zPixtralVisionModel.__init__c                 C   s   | j S r]   )r  r   r.   r.   r/   get_input_embeddings  s   z'PixtralVisionModel.get_input_embeddingsNr   image_sizesr   r   r   r   r   c                    s   |d u r|j \}}	}
}|
|fg| } |} fddt||D }tjdd |D ddd} |}t| jj	 jj
 d}||d<  ||} jjdkrVd }n
td	d |D |} j|f||||d
d|S )Nc                    s:   g | ]\}}|d d|d  j  d|d  j  f qS ).Nr   r   )r>   )r   embedr   r   r.   r/   
<listcomp>  s    (z.PixtralVisionModel.forward.<locals>.<listcomp>c                 S   s   g | ]}| d jqS )r   )flattenTr   rx   r.   r.   r/   r    s    r   r   )r%   rZ   r   c                 S   s    g | ]}|j d  |j d  qS )r   r   )r   r  r.   r.   r/   r    s     T)ru   r   r   r   r   )r   r  r   r   r#   ri   r  r0   rD   r=   r>   r  r   r   r  )rC   r   r	  r   r   r   argsr   r   r   r(   r)   patch_embedsr$   rZ   r   ru   r.   r   r/   r\     s<   


zPixtralVisionModel.forward)NNNN)r^   r_   r`   r   r8   r  r   r   r   rb   r   r   r   r   r   r   r
   r\   re   r.   r.   rK   r/   r     s2    
	r   )Nr   )rp   )1ra   collections.abcr   typingr   r   r   r   activationsr   modeling_flash_attention_utilsr   modeling_layersr	   modeling_outputsr
   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   configuration_pixtralr   
get_loggerr^   loggerr0   Moduler1   rh   ro   rb   r?   r   r   r   r   r   r   r   r   r   __all__r.   r.   r.   r/   <module>   s\   
6
#
J2KM