o
    wiS                     @   s  d Z ddlmZ ddlmZmZ ddlZddlZddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlmZ eeZ dd Z!G dd dej"Z#dd Z$d3ddZ%	d4dej"dej&dej&dej&deej& de'd e'fd!d"Z(G d#d$ d$ej"Z)G d%d& d&ej"Z*G d'd( d(ej"Z+G d)d* d*eZ,G d+d, d,ej"Z-eG d-d. d.eZ.d/d0 Z/eG d1d2 d2e.Z0d2d.gZ1dS )5zPyTorch Pixtral model.    )Callable)OptionalUnionN)nn   )ACT2FN)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)dynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tuplelogging   )PixtralVisionConfigc           
      C   s   g }| D ];}|j dd  \}}tjt|t|dd}tj|dddddd\}}|| | }	||	d d df  qt|S )Nij)indexingdim   r   )	shapetorchmeshgridarangestackreshapechunkappendcat)
patch_embeds_list	max_width	positionspatchheightwidthmeshh_gridv_gridids r.   i/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/pixtral/modeling_pixtral.pyposition_ids_in_meshgrid&   s   "
r0   c                       s6   e Zd ZdZd fdd	Ze edd Z  Z	S )PixtralRotaryEmbeddinga  
    The key with pixtral embedding is just that you have a frequency for each pixel positions.
    If you have height x width pixels (or embedding pixels), then the frequency used for ROPE
    is given by indexing the pre_computed frequency on the width and height.

    What you output is of dimension (batch, height * width, dim) with dim the embed dim.

    This simply means that for each image hidden state, you are going to add
    a corresponding positional embedding, based on its index in the grid.
    Nc           
         s  t    d| _|j| _|j| _|j|j }d| jt	
d| jd | j   }t	j
||jd}t	j
||jd}t	||d d d  }t	||dd d  }t	j|d d d d d f d|d|d d d d d f |ddgddd| jd }	| jd	t	j|	|	fddd
d d S )Ndefault      ?r   r   )devicer   r   r   inv_freqF)
persistent)super__init__	rope_typehead_dimr   
rope_thetabase
image_size
patch_sizer   r   floatr4   outerr#   repeatr    register_buffer)
selfconfigr4   max_patches_per_sidefreqshwfreqs_hfreqs_wr5   	__class__r.   r/   r8   =   s&   
$"
zPixtralRotaryEmbedding.__init__c                 C   s   | j | }t|jjtr|jjdkr|jjnd}tj|dd |}| }| }W d    n1 s4w   Y  |j	|j
d|j	|j
dfS )NmpscpuF)device_typeenabled)dtype)r5   
isinstancer4   typestrr   autocastcossintorQ   )rC   xposition_idsrF   rO   embrV   rW   r.   r.   r/   forwardV   s   
&
zPixtralRotaryEmbedding.forwardN)
__name__
__module____qualname____doc__r8   r   no_gradr   r\   __classcell__r.   r.   rK   r/   r1   1   s    r1   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr   r   r   )r   r   r#   )rY   x1x2r.   r.   r/   rotate_halfe   s   rf   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezerf   )qkrV   rW   rZ   unsqueeze_dimq_embedk_embedr.   r.   r/   apply_rotary_pos_embl   s
   

rm           modulequerykeyvalueattention_maskscalingdropoutc           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )Nr   r   )r   rQ   )ptrainingr   r   )r   matmul	transposer   
functionalsoftmaxfloat32rX   rQ   ru   rw   
contiguous)
ro   rp   rq   rr   rs   rt   ru   kwargsattn_weightsattn_outputr.   r.   r/   eager_attention_forward   s   
r   c                       sz   e Zd ZdZ fddZ			ddejdeej deeejejf  d	ee	 d
e
e deejeej f fddZ  ZS )PixtralAttentionzI
    Multi-headed attention compatible with ALL_ATTENTION_FUNCTIONS.
    c                    s   t    || _|j| _|j| _| j| j | _d| _| jd | _	d| _|j
| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _d S )NFg      ࿩bias)r7   r8   rD   hidden_size	embed_dimnum_attention_heads	num_headsr:   	is_causalrt   attention_dropoutru   r   Lineark_projv_projq_projo_projrC   rD   rK   r.   r/   r8      s   
zPixtralAttention.__init__NFhidden_statesrs   position_embeddingsoutput_attentionsr~   returnc                 K   sV  |  \}}}| |}	| |}
| |}|	||| j| jdd}	|
||| j| jdd}
|||| j| jdd}|\}}t|	|
||dd\}	}
t	}| j
jdkrk| j
jdkre|retd nt| j
j }| j
jdkr}|d	 j|jd
d|d	< || |	|
||f| jsdn| j| jd|\}}|||d }| |}|sd}||fS )z#Input shape: Batch x Time x Channelr   r   r   )rj   eagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.flash_attention_2rZ   T)non_blockingrn   )ru   rt   r   N)sizer   r   r   viewr   r:   ry   rm   r   rD   _attn_implementationloggerwarning_oncer   rX   r4   rw   ru   rt   r    r}   r   )rC   r   rs   r   r   r~   
batch_sizepatches_query_states
key_statesvalue_statesrV   rW   attention_interfacer   r   r.   r.   r/   r\      sF   





zPixtralAttention.forward)NNF)r^   r_   r`   ra   r8   r   Tensorr   tupleboolr   r   r\   rc   r.   r.   rK   r/   r      s&    r   c                       s$   e Zd Z fddZdd Z  ZS )
PixtralMLPc                    sr   t    || _|j| _|j| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _d S )NFr   )r7   r8   rD   r   intermediate_sizer   r   	gate_projup_proj	down_projr   
hidden_actact_fnr   rK   r.   r/   r8      s   
zPixtralMLP.__init__c                 C   s$   |  | | || | }|S r]   )r   r   r   r   )rC   rY   r   r.   r.   r/   r\      s    zPixtralMLP.forward)r^   r_   r`   r8   r\   rc   r.   r.   rK   r/   r      s    
r   c                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	PixtralRMSNormư>c                    s&   t    tt|| _|| _dS )z=
        PixtralRMSNorm is equivalent to T5LayerNorm
        N)r7   r8   r   	Parameterr   onesweightvariance_epsilon)rC   r   epsrK   r.   r/   r8     s   

zPixtralRMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )Nr   r   T)keepdim)	rQ   rX   r   r|   powmeanrsqrtr   r   )rC   r   input_dtypevariancer.   r.   r/   r\   	  s
   zPixtralRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)r   r   r   r   rC   r.   r.   r/   
extra_repr  s   zPixtralRMSNorm.extra_repr)r   )r^   r_   r`   r8   r\   r   rc   r.   r.   rK   r/   r      s    r   c                       sf   e Zd Z fddZ		ddejdejdeeejejf  dee de	e
 d	eej fd
dZ  ZS )PixtralAttentionLayerc                    sB   t    t|jdd| _t|| _t|| _t|jdd| _	d S )Nh㈵>r   )
r7   r8   r   r   attention_normr   feed_forwardr   	attentionffn_normr   rK   r.   r/   r8     s
   


zPixtralAttentionLayer.__init__Nr   rs   r   r   r~   r   c           	      K   sl   |}|  |}| jd||||d|\}}|| }|}| |}| |}|| }|f}|r4||f7 }|S )a=  
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   rs   r   r   Nr.   )r   r   r   r   )	rC   r   rs   r   r   r~   residualr   outputsr.   r.   r/   r\     s&   




zPixtralAttentionLayer.forward)NN)r^   r_   r`   r8   r   r   r   r   r   r   r   FloatTensorr\   rc   r.   r.   rK   r/   r     s"    r   c                       s|   e Zd Z fddZ					ddeej deeejejf  dee dee dee d	e	e
 d
eeef fddZ  ZS )PixtralTransformerc                    sF   t    || _tj | _t|jD ]
}| j	t
| qd| _d S )NF)r7   r8   rD   r   r   
ModuleListlayersrangenum_hidden_layersr"   r   gradient_checkpointing)rC   rD   r   rK   r.   r/   r8   G  s   

zPixtralTransformer.__init__Nrs   r   r   output_hidden_statesreturn_dictr~   r   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|r"dnd}|r(dnd}	|}
| jD ]"}|r8||
f }||
|f||d|}|d }
|rQ|	|d f }	q/|rY||
f }|sgtdd |
||	fD S t|
||	dS )	av  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embeddings which serve as input to the Transformer.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr.   )r   r   r   r   c                 s   s    | ]	}|d ur|V  qd S r]   r.   ).0vr.   r.   r/   	<genexpr>  s    z-PixtralTransformer.forward.<locals>.<genexpr>)last_hidden_stater   
attentions)rD   r   r   use_return_dictr   r   r
   )rC   inputs_embedsrs   r   r   r   r   r~   encoder_statesall_attentionsr   encoder_layerlayer_outputsr.   r.   r/   r\   O  s<   


zPixtralTransformer.forward)NNNNN)r^   r_   r`   r8   r   r   r   r   r   r   r   r   r
   r\   rc   r.   r.   rK   r/   r   F  s,    
	r   c                   @   sJ   e Zd ZeZdZdZdZdZdZ	dZ
dZdgZdZ	dZ
dZdZdd ZdS )PixtralPreTrainedModelmodelpixel_valuesTr   c                 C   sj   | j j}t|tjtjfr%|jjjd|d |j	d ur#|j	j
  d S d S t|tr3|jjd d S d S )Nrn   )r   stdr3   )rD   initializer_rangerR   r   r   Conv2dr   datanormal_r   zero_r   fill_)rC   ro   r   r.   r.   r/   _init_weights  s   

z$PixtralPreTrainedModel._init_weightsN)r^   r_   r`   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_2_supports_sdpa_supports_flex_attn_no_split_modulesr   r.   r.   r.   r/   r     s    r   c                 C   s   |j }|j}|jd }t|j}tj||f|||d}t| d}tdg| d d  d}t	||D ]\}	}
d||	|
|	|
f< q8|d d d d d d f 
|jd ddd}|S )Nr   )
fill_valuerQ   r4   r   r   )rQ   r4   r   r   finfominfulltensorcumsumzipexpand)r$   r   rQ   r4   seq_lend_mincausal_maskblock_end_idxblock_start_idxstartendr.   r.   r/   generate_block_attention_mask  s   
*r   c                       s   e Zd ZdZ fddZdd Zee				ddej	de
ej	 d	e
e d
e
e de
e dee deeef fddZ  ZS )PixtralVisionModelvision_encoderc                    sh   t  | || _tj|j|j|j|jdd| _|j| _t	|jdd| _
t|| _t|| _|   d S )NF)in_channelsout_channelskernel_sizestrider   r   r   )r7   r8   rD   r   r   num_channelsr   r>   
patch_convr   ln_prer   transformerr1   patch_positional_embedding	post_initr   rK   r.   r/   r8     s   

zPixtralVisionModel.__init__c                 C   s   | j S r]   )r  r   r.   r.   r/   get_input_embeddings  s   z'PixtralVisionModel.get_input_embeddingsNr   image_sizesr   r   r   r~   r   c                    s   |d u r|j \}}	}
}|
|fg| } |} fddt||D }tjdd |D ddd} |}t| jj	 jj
 d}||d<  ||} jjdkrVd }n
td	d |D |} j|f||||d
d|S )Nc                    s:   g | ]\}}|d d|d  j  d|d  j  f qS ).Nr   r   )r>   )r   embedr   r   r.   r/   
<listcomp>  s    (z.PixtralVisionModel.forward.<locals>.<listcomp>c                 S   s   g | ]}| d jqS )r   )flattenTr   rv   r.   r.   r/   r    s    r   r   )r%   rZ   r   c                 S   s    g | ]}|j d  |j d  qS )r   r   )r   r  r.   r.   r/   r    s     T)rs   r   r   r   r   )r   r  r   r   r#   rg   r  r0   rD   r=   r>   r	  r   r   r  )rC   r   r  r   r   r   argsr~   r   r   r(   r)   patch_embedsr$   rZ   r   rs   r.   r   r/   r\     s<   


zPixtralVisionModel.forward)NNNN)r^   r_   r`   r   r8   r  r   r   r   r   r   r   r   r   r   r   r
   r\   rc   r.   r.   rK   r/   r     s2    
	r   )Nr   )rn   )2ra   collections.abcr   typingr   r   r   torch.utils.checkpointr   activationsr   modeling_flash_attention_utilsr   modeling_layersr	   modeling_outputsr
   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   configuration_pixtralr   
get_loggerr^   r   r0   Moduler1   rf   rm   r   r?   r   r   r   r   r   r   r   r   r   __all__r.   r.   r.   r/   <module>   s^   
4
#
P2KM