o
    پi:K                     @   s  d dl mZ d dlmZmZmZmZmZmZ d dl	Z	d dl
mZ d dlmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZmZmZ d d
lmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' G dd dej(Z)G dd dej(Z*G dd dej(Z+G dd dej(Z,G dd dej(Z-G dd dej(Z.G dd dej(Z/G dd dej(Z0G dd  d ej(Z1G d!d" d"ej(Z2d#d$ Z3e2Z4dS )%    )partial)IterableListOptionalTupleTypeUnionN)
CLIPConfigCLIPTextConfigCLIPVisionConfig) _create_4d_causal_attention_mask)	QuickGELU)VisionAttention)ColumnParallelLinearRowParallelLinear)EmbeddingPoolerOutputPoolerPoolingType)QuantizationConfig)MultimodalInputs)ForwardBatch)default_weight_loader)
add_prefixflatten_nested_listc                       s8   e Zd Zdef fddZdejdejfddZ  ZS )CLIPVisionEmbeddingsconfigc                    s   t    || _|j| _|j| _|j| _| j| j dksJ tt	
| j| _tj|j| j| j| jdd| _| j| j d | _| jd | _t| j| j| _| jdt	| jddd d S )	Nr   F)in_channelsout_channelskernel_sizestridebias      position_idsr"   
persistent)super__init__r   hidden_size	embed_dim
image_size
patch_sizenn	Parametertorchrandnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferarangeexpand)selfr   	__class__ J/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/clip.pyr)      s,   

zCLIPVisionEmbeddings.__init__pixel_valuesreturnc                 C   sn   |j d }| jjj}| |j|d}|ddd}| j|dd}t	j
||gdd}|| | j }|S )Nr   )dtyper!   r"   r%   )dim)shaper5   weightrD   toflatten	transposer2   r<   r0   catr9   r#   )r=   rB   
batch_sizetarget_dtypepatch_embedsclass_embeds
embeddingsr@   r@   rA   forward4   s   


zCLIPVisionEmbeddings.forward)	__name__
__module____qualname__r   r)   r0   TensorrQ   __classcell__r@   r@   r>   rA   r      s    r   c                	       sX   e Zd Zdef fddZ			ddeej deej deej dej	fd	d
Z
  ZS )CLIPTextEmbeddingsr   c                    sR   t    |j}t|j|| _t|j|| _| j	dt
|jddd d S )Nr#   r$   Fr&   )r(   r)   r*   r.   r8   
vocab_sizetoken_embeddingmax_position_embeddingsr9   r:   r0   r;   r<   )r=   r   r+   r>   r@   rA   r)   D   s   

zCLIPTextEmbeddings.__init__N	input_idsr#   inputs_embedsrC   c                 C   sb   |d ur	|j d n|j d }|d u r| jd d d |f }|d u r&| |}| |}|| }|S )Nr%   )rF   r#   rY   r9   )r=   r[   r#   r\   
seq_lengthposition_embeddingsrP   r@   r@   rA   rQ   T   s   

zCLIPTextEmbeddings.forward)NNN)rR   rS   rT   r
   r)   r   r0   
LongTensorFloatTensorrU   rQ   rV   r@   r@   r>   rA   rW   C   s    rW   c                       sR   e Zd Zeddfdeej dee de	f fddZ
dejd	ejfd
dZ  ZS )CLIPMLPN 	act_layerquant_configprefixc                    sN   t    t|j|j|td|d| _| | _t|j|j|td|d| _	d S )Nfc1)re   rf   fc2)
r(   r)   r   r*   intermediate_sizer   rg   actr   rh   )r=   r   rd   re   rf   r>   r@   rA   r)   l   s   
zCLIPMLP.__init__xrC   c                 C   s*   |  |\}}| |}| |\}}|S N)rg   rj   rh   )r=   rk   
x_parallel_r@   r@   rA   rQ      s   
zCLIPMLP.forward)rR   rS   rT   r   r   r.   Moduler   r   strr)   r0   rU   rQ   rV   r@   r@   r>   rA   rb   j   s    rb   c                       sr   e Zd Zedddfdedeej deej dee	 de
ddf fd	d
ZdejdejdejdejfddZ  ZS )CLIPEncoderLayerNrc   r   rd   
norm_layerre   rf   rC   c              
      sz   t    |d u rttj|jd}||j| _||j| _t	|j|j
|jdd|td|d| _t|||td|d| _d S )NepsT	self_attn)r+   	num_headsprojection_sizeuse_qkv_parallelflatten_batchre   rf   mlp)rd   re   rf   )r(   r)   r   r.   	LayerNormlayer_norm_epsr*   layer_norm1layer_norm2r   num_attention_headsr   ru   rb   rz   )r=   r   rd   rr   re   rf   r>   r@   rA   r)      s(   
	zCLIPEncoderLayer.__init__hidden_statesattention_maskcausal_attention_maskc                 C   st   |}|  |}|d ur|d ur|| }n	|d ur|}n|}| j||d}|| }|}| |}| |}|| }|S )N)r   )r}   ru   r~   rz   )r=   r   r   r   residual	attn_maskr@   r@   rA   rQ      s"   



zCLIPEncoderLayer.forward)rR   rS   rT   r   r   r   r.   ro   r   r   rp   r)   r0   rU   rQ   rV   r@   r@   r>   rA   rq      s6    rq   c                       sx   e Zd ZdZ		ddedee deddf fdd	Z			
dde	j
de	j
de	j
dedee	j
ee	j
 f f
ddZ  ZS )CLIPEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self
    attention layers. Each layer is a [`CLIPEncoderLayer`].

    Args:
        config: CLIPConfig
    Nrc   r   re   rf   rC   c                    sN   t     | _ j}ttj jdt fddt	|D | _
d S )Nrs   c              	      s(   g | ]}t  td | dqS )zlayers.)r   rr   re   rf   )rq   r   ).0	layer_idxr   rr   rf   re   r@   rA   
<listcomp>   s    z(CLIPEncoder.__init__.<locals>.<listcomp>)r(   r)   r   num_hidden_layersr   r.   r{   r|   
ModuleListrangelayers)r=   r   re   rf   r   r>   r   rA   r)      s   

zCLIPEncoder.__init__Fr\   r   r   return_all_hidden_statesc                 C   s<   |g}|}| j D ]}||||}|r|| q|r|S |S rl   )r   append)r=   r\   r   r   r   hidden_states_poolr   encoder_layerr@   r@   rA   rQ      s   

zCLIPEncoder.forwardNrc   )NNF)rR   rS   rT   __doc__r   r   r   rp   r)   r0   rU   boolr   listrQ   rV   r@   r@   r>   rA   r      s6    r   c                	       sv   e Zd Z		ddedee deddf fddZede	j
fd	d
Z
		dde	jdee	j dee	j fddZ  ZS )CLIPTextTransformerNrc   r   re   rf   rC   c                    sL   t    || _|j}t|| _t||td|d| _t	j
||jd| _d S )Nencoderr   re   rf   rs   )r(   r)   r   r*   rW   rP   r   r   r   r.   r{   r|   final_layer_norm)r=   r   re   rf   r+   r>   r@   rA   r)      s   

zCLIPTextTransformer.__init__c                 C      | j jd jjjS Nr   r   r   r}   rG   devicer=   r@   r@   rA   r        zCLIPTextTransformer.devicer[   r   r#   c           	      C   sT   |  }|d|d }| ||}t|j|j|jd}| |||}| |}|S )Nr%   )r   )	sizeviewrP   r   rF   rD   r   r   r   )	r=   r[   r   r#   input_shaper   r   encoder_outputslast_hidden_stater@   r@   rA   rQ     s   
zCLIPTextTransformer.forwardr   )NN)rR   rS   rT   r
   r   r   rp   r)   propertyr0   r   rU   rQ   rV   r@   r@   r>   rA   r      s.    r   c                	       sN   e Zd Z		ddedee deddf fddZd	ej	d
ej	fddZ
  ZS )CLIPTextModelNrc   r   re   rf   rC   c                    s*   t    || _t||td|d| _d S )N
text_modelr   )r(   r)   r   r   r   r   r=   r   re   rf   r>   r@   rA   r)   &  s   
zCLIPTextModel.__init__r[   r#   c                 C   s   |  ||S rl   )r   )r=   r[   r#   r@   r@   rA   rQ   4  s   zCLIPTextModel.forwardr   )rR   rS   rT   r
   r   r   rp   r)   r0   rU   rQ   rV   r@   r@   r>   rA   r   %  s"    r   c                	       sb   e Zd Z		ddedee deddf fddZede	j
fd	d
Z
de	jde	jfddZ  ZS )CLIPVisionTransformerNrc   r   re   rf   rC   c                    s   t    || _|j}t|| _tj||jd| _	t
||td|d| _|j}t| jj|jkr?td| dt| jj dtj||jd| _d S )Nrs   r   r   zThe original encoder only has z layers, but you requested z layers.)r(   r)   r   r*   r   rP   r.   r{   r|   pre_layrnormr   r   r   r   lenr   
ValueErrorpost_layernorm)r=   r   re   rf   r+   r   r>   r@   rA   r)   >  s$   


zCLIPVisionTransformer.__init__c                 C   r   r   r   r   r@   r@   rA   r   ^  r   zCLIPVisionTransformer.devicerB   c                 C   s<   |  || j}| |}d}| j||d}| |}|S )NF)r\   r   )rP   rH   r   r   r   r   )r=   rB   r   r   r   r@   r@   rA   rQ   b  s   

zCLIPVisionTransformer.forwardr   rR   rS   rT   r   r   r   rp   r)   r   r0   r   rU   rQ   rV   r@   r@   r>   rA   r   <  s&     r   c                       sX   e Zd Z		ddedee def fddZede	j
fd	d
Z
de	jfddZ  ZS )CLIPVisionModelNrc   r   re   rf   c                    s$   t    t||td|d| _d S )Nvision_modelrf   )r(   r)   r   r   r   r   r>   r@   rA   r)   v  s   
zCLIPVisionModel.__init__rC   c                 C   s   | j jS rl   )r   r   r   r@   r@   rA   r     s   zCLIPVisionModel.devicerB   c                 C   s
   |  |S rl   )r   )r=   rB   r@   r@   rA   rQ     s   
zCLIPVisionModel.forwardr   r   r@   r@   r>   rA   r   u  s    r   c                	       s   e Zd Z		ddedee deddf fddZ		dd
ej	dej	de
defddZd
ee defddZdeeeej	f  fddZ  ZS )	CLIPModelNrc   r   re   rf   rC   c                    s  t    || _t|jtstdt|j dt|jt	s*tdt|j d|j}|j}|j
| _
|j| _|j| _tj| j| j
dd| _tj| j| j
dd| _tt| jj| _t||td|d}t||td|d}|j| _|j| _ttjd	d
| _t   d S )NzKconfig.text_config is expected to be of type CLIPTextConfig but is of type .zOconfig.vision_config is expected to be of type CLIPVisionConfig but is of type F)r    r   r   r   T)pooling_type	normalize)!r(   r)   r   
isinstancetext_configr
   	TypeErrortypevision_configr   projection_dimr*   text_embed_dimvision_embed_dimr.   Linearvisual_projectiontext_projectionr/   r0   tensorlogit_scale_init_valuelogit_scaler   r   r   r   r   r   r   LASTpoolermonkey_patch_weight_loader)r=   r   re   rf   r   r   r   r   r>   r@   rA   r)     sL   



zCLIPModel.__init__Tr[   	positionsforward_batchget_embeddingc                 C   s   |sJ dg }|j d ur|j }dd tdd |D D }t|dkrLt|}| |}|d d dd d f }	| |	}
tjj	|
ddd}
t
|
d	S | j||d
}| |d |}	t
| |	jd	S )Nz-CLIPEmbeddingModel is only used for embeddingc                 S   s   g | ]}|j qS r@   )feature)r   itemr@   r@   rA   r     s    z%CLIPModel.forward.<locals>.<listcomp>c                 S   s   g | ]	}|d ur|j qS rl   )mm_items)r   mm_inputr@   r@   rA   r     s    r   r!   r"   )prE   )rP   )r#   )	mm_inputsr   r   r0   concatr   r   r.   
functionalr   r   r   r   r   rP   )r=   r[   r   r   r   r   pixel_values_listrB   vision_outputspooled_outputimage_embedstext_outputsr@   r@   rA   rQ     s*   





zCLIPModel.forwardimage_inputsc                 C   s   |S rl   r@   )r=   r[   r   r@   r@   rA   pad_input_ids  s   zCLIPModel.pad_input_idsweightsc                 C   s   g d}t |  }|D ]C\}}d|v rqd|v r|dd}|D ]\}}}||vr+q!|||}|| }	|	j}
|
|	||  n|| }	t|	dt}
|
|	| qd S )N))qkv_projq_projq)r   k_projk)r   v_projvr#   out_projprojweight_loader)dictnamed_parametersreplacer   getattrr   )r=   r   stacked_params_mappingparams_dictnameloaded_weight
param_name
shard_nameshard_idparamr   r@   r@   rA   load_weights  s(   
zCLIPModel.load_weightsr   )T)rR   rS   rT   r	   r   r   rp   r)   r0   rU   r   r   rQ   r   intr   r   r   r   r   rV   r@   r@   r>   rA   r     s0    4
 $r   c                     sr   dd l dd lddlm}  ddlm m dtdtt dt	dt
ttt t	f f fdd	}t| d
| d S )Nr   )DefaultModelLoader)download_weights_from_hf%filter_files_not_needed_for_inferencemodel_name_or_pathrevisionfall_back_to_ptrC   c           
         s   |  ||p|}j|}d}dg}|s# || jj||| jjd}n|}g }|D ]}	|j||	7 }q)|}dd |D }t|dkrQt	d| d|||fS )	NFz*.bin)ignore_patternsc                 S   s   g | ]}d |vr|qS )	open_clipr@   )r   filer@   r@   rA   r   !  s    zGmonkey_patch_weight_loader.<locals>.prepare_weights.<locals>.<listcomp>r   z$Cannot find any model weights with ``)
_maybe_download_from_modelscopepathisdirload_configdownload_dirr   globjoinr   RuntimeError)
r=   r   r   r   is_localuse_safetensorsallow_patterns	hf_folderhf_weights_filespatternr   r   r  osr@   rA   prepare_weights  s6   

z3monkey_patch_weight_loader.<locals>.prepare_weights_prepare_weights)r  r  sglang.srt.model_loader.loaderr   $sglang.srt.model_loader.weight_utilsr   r   rp   r   r   r   r   setattr)r   r  r@   r
  rA   r     s   )r   )5	functoolsr   typingr   r   r   r   r   r   r0   torch.nnr.   transformersr	   r
   r   %transformers.modeling_attn_mask_utilsr   sglang.srt.layers.activationr   "sglang.srt.layers.attention.visionr   sglang.srt.layers.linearr   r   sglang.srt.layers.poolerr   r   r   *sglang.srt.layers.quantization.base_configr   "sglang.srt.managers.schedule_batchr   &sglang.srt.model_executor.model_runnerr   r  r   sglang.srt.utilsr   r   ro   r   rW   rb   rq   r   r   r   r   r   r   r   
EntryClassr@   r@   r@   rA   <module>   s6    ,'=6)9p6