o
    eiU                     @   s  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlmZmZ ddlmZ ee Z!dd Z"G dd dej#Z$dd Z%d4ddZ&	d5dej#dej'dej'dej'dej'dB d e(d!e(fd"d#Z)G d$d% d%ej#Z*G d&d' d'ej#Z+G d(d) d)ej#Z,G d*d+ d+eZ-G d,d- d-ej#Z.eG d.d/ d/eZ/d0d1 Z0eG d2d3 d3e/Z1d3d/gZ2dS )6zPyTorch Pixtral model.    )Callable)OptionalN)nn   )ACT2FN)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)dynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tuplelogging)is_flash_attention_requestedmaybe_autocast   )PixtralVisionConfigc           
      C   s   g }| D ];}|j dd  \}}tjt|t|dd}tj|dddddd\}}|| | }	||	d d df  qt|S )Nij)indexingdim   r   )	shapetorchmeshgridarangestackreshapechunkappendcat)
patch_embeds_list	max_width	positionspatchheightwidthmeshh_gridv_gridids r/   j/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/pixtral/modeling_pixtral.pyposition_ids_in_meshgrid%   s   "
r1   c                       s   e Zd ZU dZejed< ddef fddZe				ddedB de
d d	edB d
edef fddZe edd Z  ZS )PixtralRotaryEmbeddinga  
    The key with pixtral embedding is just that you have a frequency for each pixel positions.
    If you have height x width pixels (or embedding pixels), then the frequency used for ROPE
    is given by indexing the pre_computed frequency on the width and height.

    What you output is of dimension (batch, height * width, dim) with dim the embed dim.

    This simply means that for each image hidden state, you are going to add
    a corresponding positional embedding, based on its index in the grid.
    inv_freqNconfigc                    s   t    || _| jjd | _| j}| jdkr$t| jj d| j d|| j|\}}| j	d|dd | j	d|
 dd d S )	N	rope_typedefaultz7 does not support non-default RoPE, but got `rope_type=`r3   F)
persistentoriginal_inv_freq)super__init__r4   rope_parametersr5   compute_default_rope_parameters
ValueError	__class____name__register_bufferclone)selfr4   device
layer_typerope_init_fnr3   attention_scalingr?   r/   r0   r;   >   s   

zPixtralRotaryEmbedding.__init__rD   ztorch.deviceseq_lenreturnztorch.Tensorc                 C   s  | j d }t| ddp| j| j }d}| j| j }t|}t|}d|td|d |   }	t	||	ddd  }
t	||	ddd  }tj
|
dddddf d|d|dddddf |ddgdd	d|d }tj
||fdd	}||fS )
a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNg      ?r   r   r   r   r   )r<   getattrhidden_sizenum_attention_heads
image_size
patch_sizer   r   floatouterr$   repeatr!   )r4   rD   rI   baser   attention_factormax_patches_per_sidehwfreqsfreqs_hfreqs_wr3   r/   r/   r0   r=   N   s&   



z6PixtralRotaryEmbedding.compute_default_rope_parametersc                 C   s   | j | }t|jjtr|jjdkr|jjnd}t|dd |}| }| }W d    n1 s3w   Y  |j|j	d|j|j	dfS )NmpscpuF)device_typeenableddtype)
r3   
isinstancerD   typestrr   cossintorb   )rC   xposition_idsrZ   r_   embrf   rg   r/   r/   r0   forward{   s   
&
zPixtralRotaryEmbedding.forwardNN)NNN)r@   
__module____qualname____doc__r   Tensor__annotations__r   r;   staticmethodr   inttuplerR   r=   no_gradr
   rl   __classcell__r/   r/   rH   r0   r2   0   s(   
 

,r2   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr   r   r   )r   r   r$   )ri   x1x2r/   r/   r0   rotate_half   s   rz   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezerz   )qkrf   rg   unsqueeze_dimq_embedk_embedr/   r/   r0   apply_rotary_pos_emb   s
   

r           modulequerykeyvalueattention_maskscalingdropoutc           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )Nr   r   )r   rb   )ptrainingr   r   )r   matmul	transposer   
functionalsoftmaxfloat32rh   rb   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputr/   r/   r0   eager_attention_forward   s   
r   c                       sz   e Zd ZdZ fddZ			ddejdejdB deejejf dB d	edB d
e	e
 deejejdB f fddZ  ZS )PixtralAttentionzI
    Multi-headed attention compatible with ALL_ATTENTION_FUNCTIONS.
    c                    s   t    || _|j| _|j| _| j| j | _d| _| jd | _	d| _|j
| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _d S )NFg      ࿩bias)r:   r;   r4   rN   	embed_dimrO   	num_headsrL   	is_causalr   attention_dropoutr   r   Lineark_projv_projq_projo_projrC   r4   rH   r/   r0   r;      s   
zPixtralAttention.__init__NFhidden_statesr   position_embeddingsoutput_attentionsr   rJ   c                 K   s
  |  \}}}| |}	| |}
| |}|	||| j| jdd}	|
||| j| jdd}
|||| j| jdd}|\}}t|	|
||dd\}	}
t	
| jjt}|| |	|
||f| jscdn| j| jd|\}}|||d }| |}|sd}||fS )	z#Input shape: Batch x Time x Channelr   r   r   )r~   r   )r   r   r   N)sizer   r   r   viewr   rL   r   r   r   get_interfacer4   _attn_implementationr   r   r   r   r!   r   r   )rC   r   r   r   r   r   
batch_sizepatches_query_states
key_statesvalue_statesrf   rg   attention_interfacer   r   r/   r/   r0   rl      s:   





zPixtralAttention.forward)NNF)r@   rn   ro   rp   r;   r   rq   ru   boolr   r   rl   rw   r/   r/   rH   r0   r      s&    r   c                       s$   e Zd Z fddZdd Z  ZS )
PixtralMLPc                    sr   t    || _|j| _|j| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _d S )NFr   )r:   r;   r4   rN   intermediate_sizer   r   	gate_projup_proj	down_projr   
hidden_actact_fnr   rH   r/   r0   r;     s   
zPixtralMLP.__init__c                 C   s$   |  | | || | }|S N)r   r   r   r   )rC   ri   r   r/   r/   r0   rl     s    zPixtralMLP.forward)r@   rn   ro   r;   rl   rw   r/   r/   rH   r0   r     s    
r   c                       sF   e Zd Zddeddf fddZdejdejfdd	Zd
d Z  Z	S )PixtralRMSNormư>epsrJ   Nc                    s&   t    tt|| _|| _dS )z=
        PixtralRMSNorm is equivalent to T5LayerNorm
        N)r:   r;   r   	Parameterr   onesweightvariance_epsilon)rC   rN   r   rH   r/   r0   r;     s   

zPixtralRMSNorm.__init__r   c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )Nr   r   T)keepdim)	rb   rh   r   r   powmeanrsqrtr   r   )rC   r   input_dtypevariancer/   r/   r0   rl   !  s
   zPixtralRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)ru   r   r   r   rC   r/   r/   r0   
extra_repr(  s   zPixtralRMSNorm.extra_repr)r   )
r@   rn   ro   rR   r;   r   rq   rl   r   rw   r/   r/   rH   r0   r     s    r   c                       sf   e Zd Z fddZ		ddejdejdeejejf dB dedB dee	 d	eej
 fd
dZ  ZS )PixtralAttentionLayerc                    sB   t    t|jdd| _t|| _t|| _t|jdd| _	d S )Nh㈵>r   )
r:   r;   r   rN   attention_normr   feed_forwardr   	attentionffn_normr   rH   r/   r0   r;   -  s
   


zPixtralAttentionLayer.__init__Nr   r   r   r   r   rJ   c           	      K   sl   |}|  |}| jd||||d|\}}|| }|}| |}| |}|| }|f}|r4||f7 }|S )a=  
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   r   r   r   Nr/   )r   r   r   r   )	rC   r   r   r   r   r   residualr   outputsr/   r/   r0   rl   4  s&   




zPixtralAttentionLayer.forwardrm   )r@   rn   ro   r;   r   rq   ru   r   r   r   FloatTensorrl   rw   r/   r/   rH   r0   r   ,  s"    r   c                       sx   e Zd Z fddZ					ddejdB deejejf dB dedB dedB dedB d	ee	 d
ee
B fddZ  ZS )PixtralTransformerc                    sF   t    || _tj | _t|jD ]
}| j	t
| qd| _d S )NF)r:   r;   r4   r   r   
ModuleListlayersrangenum_hidden_layersr#   r   gradient_checkpointing)rC   r4   r   rH   r/   r0   r;   _  s   

zPixtralTransformer.__init__Nr   r   r   output_hidden_statesreturn_dictr   rJ   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|r"dnd}|r(dnd}	|}
| jD ]"}|r8||
f }||
|f||d|}|d }
|rQ|	|d f }	q/|rY||
f }|sgtdd |
||	fD S t|
||	dS )	av  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embeddings which serve as input to the Transformer.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr/   )r   r   r   r   c                 s   s    | ]	}|d ur|V  qd S r   r/   ).0vr/   r/   r0   	<genexpr>  s    z-PixtralTransformer.forward.<locals>.<genexpr>)last_hidden_stater   
attentions)r4   r   r   use_return_dictr   ru   r	   )rC   inputs_embedsr   r   r   r   r   r   encoder_statesall_attentionsr   encoder_layerlayer_outputsr/   r/   r0   rl   g  s<   


zPixtralTransformer.forward)NNNNN)r@   rn   ro   r;   r   rq   ru   r   r   r   r	   rl   rw   r/   r/   rH   r0   r   ^  s,    	r   c                   @   s<   e Zd ZU eed< dZdZdZdZdZ	dZ
dZdZdgZdS )PixtralPreTrainedModelr4   modelpixel_values)imageTr   N)r@   rn   ro   r   rr   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_no_split_modulesr/   r/   r/   r0   r     s   
 
r   c                 C   s   |j }|j}|jd }t|j}tj||f|||d}t| d}tdg| d d  d}t	||D ]\}	}
d||	|
|	|
f< q8|d d d d d d f 
|jd ddd}|S )Nr   )
fill_valuerb   rD   r   r   )rb   rD   r   r   finfominfulltensorcumsumzipexpand)r%   r   rb   rD   rI   d_mincausal_maskblock_end_idxblock_start_idxstartendr/   r/   r0   generate_block_attention_mask  s   
*r  c                       s|   e Zd ZdZ fddZdd Zee				ddej	dej	dB d	e
dB d
e
dB de
dB dee deeB fddZ  ZS )PixtralVisionModelvision_encoderc                    sh   t  | || _tj|j|j|j|jdd| _|j| _t	|jdd| _
t|| _t|| _|   d S )NF)in_channelsout_channelskernel_sizestrider   r   r   )r:   r;   r4   r   Conv2dnum_channelsrN   rQ   
patch_convr   ln_prer   transformerr2   patch_positional_embedding	post_initr   rH   r/   r0   r;     s   

zPixtralVisionModel.__init__c                 C   s   | j S r   )r
  r   r/   r/   r0   get_input_embeddings  s   z'PixtralVisionModel.get_input_embeddingsNr   image_sizesr   r   r   r   rJ   c                    s   |d u r|j \}}	}
}|
|fg| } jjj} |j|d} fddt||D }tjdd |D ddd} 	|}t
| jj jj d}|dj|jdd	|d
<  ||}t jrgd }n
tdd |D |} j|f||||dd|S )Nra   c                    s:   g | ]\}}|d d|d  j  d|d  j  f qS ).Nr   r   )rQ   )r   embedr   r   r/   r0   
<listcomp>  s    (z.PixtralVisionModel.forward.<locals>.<listcomp>c                 S   s   g | ]}| d jqS r   )flattenTr   r   r/   r/   r0   r    s    r   r   )r&   T)non_blockingrj   c                 S   s    g | ]}|j d  |j d  qS )r   r   )r   r  r/   r/   r0   r    s     )r   r   r   r   r   )r   r
  r   rb   rh   r   r   r$   r{   r  r1   r4   rP   rQ   rD   r  r   r  r  )rC   r   r  r   r   r   argsr   r   r   r)   r*   target_dtypepatch_embedsr%   rj   r   r   r/   r   r0   rl     s>   



zPixtralVisionModel.forward)NNNN)r@   rn   ro   r   r;   r  r   r   r   rq   r   r   r   ru   r	   rl   rw   r/   r/   rH   r0   r    s2    	r  r  )r   )3rp   collections.abcr   typingr   r   r   activationsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr	   modeling_rope_utilsr
   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r   configuration_pixtralr   
get_loggerr@   loggerr1   Moduler2   rz   r   rq   rR   r   r   r   r   r   r   r   r  r  __all__r/   r/   r/   r0   <module>   s^   
Y
!
F2KN