o
    پi/                     @   s   d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZmZ d dlmZ d d	lmZmZ G d
d dejZG dd dejZG dd dejZG dd dejZG dd dejZdS )    )OptionalN)nn)PretrainedConfig)
get_act_fn)VisionAttention)ColumnParallelLinearRowParallelLinear)QuantizationConfig)
add_prefixis_npuc                	       sN   e Zd Z		ddedee deddf fddZd	ej	dej	fd
dZ
  ZS )Idefics2VisionMLPN configquant_configprefixreturnc                    s\   t    || _t|j| _t|j|jd|t	d|d| _
t|j|jd|t	d|d| _d S )NTfc1)biasr   r   fc2)super__init__r   r   
hidden_actactivation_fnr   hidden_sizeintermediate_sizer
   r   r   r   selfr   r   r   	__class__ N/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/idefics2.pyr   $   s"   
zIdefics2VisionMLP.__init__hidden_statesc                 C   s*   |  |\}}| |}| |\}}|S N)r   r   r   )r   r!   _r   r   r    forward<   s   
zIdefics2VisionMLP.forwardNr   __name__
__module____qualname__r   r   r	   strr   torchTensorr$   __classcell__r   r   r   r    r   "   s    r   c                	       sT   e Zd Z		ddedee deddf fddZd	ej	d
ej	dej	fddZ
  ZS )Idefics2EncoderLayerNr   r   r   r   r   c                    s   t    |j| _|j| _t|j| j|jd||jddt	d|d	| _
tj| j|jd| _t||t	d|d| _tj| j|jd| _d S )NTF	self_attn)		embed_dim	num_headsprojection_sizeuse_qkv_parallelr   dropoutsoftmax_in_single_precisionflatten_batchr   epsmlpr   r   )r   r   r   r0   num_attention_headsr1   r   r   attention_dropoutr
   r/   r   	LayerNormlayer_norm_epslayer_norm1r   r9   layer_norm2r   r   r   r    r   E   s*   
zIdefics2EncoderLayer.__init__r!   
cu_seqlensc                 C   sH   |}|  |}| j||d}|| }|}| |}| |}|| }|S )z
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.

        rA   )r?   r/   r@   r9   )r   r!   rA   residualr   r   r    r$   a   s   


zIdefics2EncoderLayer.forwardr%   r&   r   r   r   r    r.   C   s&    r.   c                	       sX   e Zd ZdZ		ddedee deddf fdd	Zd
e	j
de	j
de	j
fddZ  ZS )Idefics2Encoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention
    layers. Each layer is a
    [`Idefics2EncoderLayer`].

    Args:
        config: Idefics2Config
    Nr   r   r   r   r   c                    s8   t     | _t fddt jD | _d S )Nc                    s&   g | ]}t  td | dqS )zlayers.r:   )r.   r
   ).0ir   r   r   r   r    
<listcomp>   s    z,Idefics2Encoder.__init__.<locals>.<listcomp>)r   r   r   r   
ModuleListrangenum_hidden_layerslayersr   r   rG   r    r      s   

zIdefics2Encoder.__init__inputs_embedsrA   c                 C   s4   t  r|d}|}| jD ]
}|||d}|}q|S )a  
        Args:
            inputs_embeds (torch.Tensor):
                Optionally, instead of passing `input_ids` you can choose to
                directly pass an embedded representation.
                This is useful if you want more control over how to convert
                `input_ids` indices into associated vectorsthan the model's
                internal embedding lookup matrix.
        cpurB   )r   torL   )r   rM   rA   r!   encoder_layerlayer_outputsr   r   r    r$      s   

zIdefics2Encoder.forwardr%   )r'   r(   r)   __doc__r   r   r	   r*   r   r+   r,   r$   r-   r   r   r   r    rD   x   s(    rD   c                	       st   e Zd ZdZdef fddZ	ddejdejde	ej
 fd	d
Z	ddejdejde	ej
 dejfddZ  ZS )Idefics2VisionEmbeddingsa\  
    This is a modified version of `siglip.modelign_siglip.SiglipVisionEmbeddings
    ` to enable images of variable
    resolution.

    The modifications are adapted from [Patch n' Pack: NaViT, a Vision
    Transformer for any Aspect Ratio and Resolution](https://arxiv.org/abs/2307.06304)
    which allows treating images in their native aspect ratio and without the
    need to resize them to the same fixed size. In particular, we start from the
    original pre-trained SigLIP model(which uses images of fixed-size square
    images) and adapt it by training on images of variable resolutions.
    r   c                    sx   t    |j| _|j| _|j| _tj|j| j| j| jdd| _	| j| j | _
| j
d | _| j| _t| j| j| _d S )Nvalid)in_channelsout_channelskernel_sizestridepadding   )r   r   r   r0   
image_size
patch_sizer   Conv2dnum_channelspatch_embeddingnum_patches_per_sidenum_patchesnum_positions	Embeddingposition_embedding)r   r   r   r   r    r      s   
z!Idefics2VisionEmbeddings.__init__Npixel_valuespatch_attention_mask	tgt_sizesc                 C   s4  |j \}}}}|| j || j }}	td| j dd| j }
tj|||	 fdd}t|D ]a\}}|d urC|| d }|| d }n|d d df  }|d  }tddd| }tddd| }tj||
dd}tj||
dd}|d d d f | j | 	 }||| |
d < q.|| jjj}|S )	N   g      ?r   )size
fill_valueg!?T)right)shaper\   r+   aranger`   full	enumeratesum	bucketizeflattenviewrN   rO   rd   weightdevice)r   re   rf   rg   
batch_sizer#   max_im_hmax_im_wmax_nb_patches_hmax_nb_patches_w
boundariesposition_ids	batch_idxp_attn_masknb_patches_hnb_patches_wfractional_coords_hfractional_coords_wbucket_coords_hbucket_coords_wpos_idsr   r   r    get_position_ids   s<   z)Idefics2VisionEmbeddings.get_position_idsr   c                 C   sZ   | j jj}|j| j jj|d}|  |}|ddd}| |||}|| | }|S )Nrv   dtyperZ   rh   )	r_   ru   r   rO   rv   rs   	transposer   rd   )r   re   rf   rg   target_dtypepatch_embeds
embeddingsr}   r   r   r    r$      s   


z Idefics2VisionEmbeddings.forwardr"   )r'   r(   r)   rR   r   r   r+   FloatTensor
BoolTensorr   	IntTensorr   r,   r$   r-   r   r   r   r    rS      s*    
.rS   c                       s   e Zd Z			ddedee dededdf
 fd	d
Zde	j
fddZ		ddeej deej dejfddZ		ddeej deej dejfddZ  ZS )Idefics2VisionTransformerNTr   r   r   require_post_normr   r   c                    s^   t    |j}|| _t|| _t||td|d| _|r(t	j
||jd| _d S t	 | _d S )Nencoder)r   r   r   r7   )r   r   r   r   rS   r   rD   r
   r   r   r=   r>   Identitypost_layernorm)r   r   r   r   r   r0   r   r   r    r     s   

z"Idefics2VisionTransformer.__init__c                 C   s   | j S r"   )r   )r   r   r   r    get_input_embeddings'  s   z.Idefics2VisionTransformer.get_input_embeddingsrg   input_embedsc                 C   s   |d ur|d d df |d d df  }n|d ur,t j|jd f|jd t j|jd}ntdt jt jdg|jt jdt j|dt jdgdd	|j}|S )Nr   rh   )ri   rj   r   rv   zLEither `tgt_sizes` or `input_embeds` must be provided to compute cu_seqlens.r   )dimr   )r   )
r+   ro   rm   int32rv   
ValueErrorcattensorcumsumrO   )r   rg   r   seqlenrA   r   r   r    compute_cu_seqlens*  s*   "
z,Idefics2VisionTransformer.compute_cu_seqlensrf   c                 C   s8   | j |||d}| ||}| j||d}| |}|S )N)re   rf   rg   rB   )r   r   r   r   )r   re   rf   rg   r!   rA   encoder_outputslast_hidden_stater   r   r    r$   G  s   
z!Idefics2VisionTransformer.forward)NTr   )NN)r'   r(   r)   r   r   r	   boolr*   r   r   rc   r   r+   r,   r   r   r   r$   r-   r   r   r   r    r     sD    
 r   )typingr   r+   r   transformersr   sglang.srt.layers.activationr   "sglang.srt.layers.attention.visionr   sglang.srt.layers.linearr   r   *sglang.srt.layers.quantization.base_configr	   sglang.srt.utilsr
   r   Moduler   r.   rD   rS   r   r   r   r   r    <module>   s   !59]