o
    پiL$                     @   s   d dl mZ d dlmZmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ G dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZdS )    )partial)OptionalTypeUnionN)SiglipVisionConfig)	QuickGELU)VisionAttention)ColumnParallelLinearRowParallelLinear)QuantizationConfig)VocabParallelEmbedding)
add_prefixc                       s8   e Zd Zdef fddZdejdejfddZ  ZS )SiglipVisionEmbeddingsconfigc                    s   t    || _|j| _|j| _|j| _tj|j	| j| j| jdd| _
| j| j d | _| j| _t| j| j| _| jdt| jddd d S )Nvalid)in_channelsout_channelskernel_sizestridepadding   position_ids)   F)
persistent)super__init__r   hidden_size	embed_dim
image_size
patch_sizennConv2dnum_channelspatch_embeddingnum_patchesnum_positionsr   position_embeddingregister_buffertorcharangeexpand)selfr   	__class__ L/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/siglip.pyr      s,   

zSiglipVisionEmbeddings.__init__pixel_valuesreturnc                 C   sB   | j jj}|  |j|d}|ddd}|| | j }|S )N)dtyper   r   )r$   weightr3   toflatten	transposer'   r   )r,   r1   target_dtypepatch_embeds
embeddingsr/   r/   r0   forward0   s   

zSiglipVisionEmbeddings.forward)	__name__
__module____qualname__r   r   r)   Tensorr;   __classcell__r/   r/   r-   r0   r      s    r   c                       sR   e Zd Zeddfdeej dee de	f fddZ
dejd	ejfd
dZ  ZS )	SiglipMLPN 	act_layerquant_configprefixc                    sN   t    t|j|j|td|d| _| | _t|j|j|td|d| _	d S )Nfc1)rD   rE   fc2)
r   r   r	   r   intermediate_sizer   rF   actr
   rG   )r,   r   rC   rD   rE   r-   r/   r0   r   ?   s   
zSiglipMLP.__init__xr2   c                 C   s*   |  |\}}| |}| |\}}|S N)rF   rI   rG   )r,   rJ   
x_parallel_r/   r/   r0   r;   U   s   
zSiglipMLP.forward)r<   r=   r>   r   r   r!   Moduler   r   strr   r)   r?   r;   r@   r/   r/   r-   r0   rA   =   s    rA   c                       sr   e Zd Zedddfdedeej deej dee	 de
ddf fd	d
ZdejdejdejdejfddZ  ZS )SiglipEncoderLayerNrB   r   rC   
norm_layerrD   rE   r2   c              
      sz   t    |d u rttj|jd}||j| _||j| _t	|j|j
|jdd|td|d| _t|||td|d| _d S )NepsT	self_attn)r   	num_headsprojection_sizeuse_qkv_parallelflatten_batchrD   rE   mlp)rC   rD   rE   )r   r   r   r!   	LayerNormlayer_norm_epsr   layer_norm1layer_norm2r   num_attention_headsr   rT   rA   rY   )r,   r   rC   rQ   rD   rE   r-   r/   r0   r   _   s(   
	zSiglipEncoderLayer.__init__hidden_statesattention_maskcausal_attention_maskc                 C   st   |}|  |}|d ur|d ur|| }n	|d ur|}n|}| j||d}|| }|}| |}| |}|| }|S )N)r`   )r\   rT   r]   rY   )r,   r_   r`   ra   residual	attn_maskr/   r/   r0   r;   |   s"   



zSiglipEncoderLayer.forward)r<   r=   r>   r   r   r   r!   rN   r   r   rO   r   r)   r?   r;   r@   r/   r/   r-   r0   rP   ]   s6    rP   c                       sx   e Zd ZdZ		ddedee deddf fdd	Z			
dde	j
de	j
de	j
dedee	j
ee	j
 f f
ddZ  ZS )SiglipEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self
    attention layers. Each layer is a [`SiglipEncoderLayer`].

    Args:
        config: SiglipConfig
    NrB   r   rD   rE   r2   c                    sN   t     | _ j}ttj jdt fddt	|D | _
d S )NrR   c              	      s(   g | ]}t  td | dqS )zlayers.)r   rQ   rD   rE   )rP   r   ).0	layer_idxr   rQ   rE   rD   r/   r0   
<listcomp>   s    z*SiglipEncoder.__init__.<locals>.<listcomp>)r   r   r   num_hidden_layersr   r!   rZ   r[   
ModuleListrangelayers)r,   r   rD   rE   ri   r-   rg   r0   r      s   

zSiglipEncoder.__init__Finputs_embedsr`   ra   return_all_hidden_statesc                 C   s<   |g}|}| j D ]}||||}|r|| q|r|S |S rK   )rl   append)r,   rm   r`   ra   rn   hidden_states_poolr_   encoder_layerr/   r/   r0   r;      s   

zSiglipEncoder.forwardNrB   )NNF)r<   r=   r>   __doc__r   r   r   rO   r   r)   r?   boolr   listr;   r@   r/   r/   r-   r0   rd      s6    rd   c                	       sb   e Zd Z		ddedee deddf fddZede	j
fd	d
Z
de	jde	jfddZ  ZS )SiglipVisionTransformerNrB   r   rD   rE   r2   c                    s   t    || _|j}t|| _t||td|d| _|j	}t
| jj|j	kr6td| dt
| jj dtj||jd| _d S )Nencoder)r   rD   rE   zThe original encoder only has z layers, but you requested z layers.rR   )r   r   r   r   r   r:   rd   r   rw   ri   lenrl   
ValueErrorr!   rZ   r[   post_layernorm)r,   r   rD   rE   r   ri   r-   r/   r0   r      s"   


z SiglipVisionTransformer.__init__c                 C   s   | j jd jjjS )Nr   )rw   rl   r\   r4   devicer,   r/   r/   r0   r{      s   zSiglipVisionTransformer.devicer1   c                 C   s2   |  || j}d}| j||d}| |}|S )NF)rm   rn   )r:   r5   r{   rw   rz   )r,   r1   r_   rn   last_hidden_stater/   r/   r0   r;      s   
zSiglipVisionTransformer.forwardrr   r<   r=   r>   r   r   r   rO   r   propertyr)   r{   r?   r;   r@   r/   r/   r-   r0   rv      s&    rv   c                       sX   e Zd Z		ddedee def fddZede	j
fd	d
Z
de	jfddZ  ZS )SiglipVisionModelNrB   r   rD   rE   c                    s$   t    t||td|d| _d S )Nvision_model)rE   )r   r   rv   r   r   )r,   r   rD   rE   r-   r/   r0   r   	  s   
zSiglipVisionModel.__init__r2   c                 C   s   | j jS rK   )r   r{   r|   r/   r/   r0   r{     s   zSiglipVisionModel.devicer1   c                 C   s
   |  |S rK   )r   )r,   r1   r/   r/   r0   r;     s   
zSiglipVisionModel.forwardrr   r~   r/   r/   r-   r0   r     s    r   )	functoolsr   typingr   r   r   r)   torch.nnr!   transformersr   sglang.srt.layers.activationr   "sglang.srt.layers.attention.visionr   sglang.srt.layers.linearr	   r
   *sglang.srt.layers.quantization.base_configr   *sglang.srt.layers.vocab_parallel_embeddingr   sglang.srt.utilsr   rN   r   rA   rP   rd   rv   r   r/   r/   r/   r0   <module>   s"   ) >76