o
    پik                     @   s  d Z ddlmZ ddlmZ ddlZddlmZ ddlm	Z	m
Z
mZ ddlmZmZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$ e$e%Z&G dd dej'Z(G dd dej'Z)G dd dej'Z*G dd dej'Z+G dd dej'Z,G dd dej'Z-G dd dej'Z.G dd deZ/G d d! d!ej'Z0G d"d# d#eZ1G d$d% d%e/Z2e/e1gZ3dS )&zbMinimal implementation of CLIPVisionModel intended to be only used
within a vision language model.    )Iterable)OptionalN)BaseEncoderOutputCLIPTextConfigCLIPVisionConfig)divideget_tp_world_size)
get_act_fn)LocalAttention)ColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)default_weight_loader)ImageEncoderTextEncoder)resolve_visual_encoder_outputs)AttentionBackendEnumcurrent_platform)init_loggerc                       s8   e Zd Zdef fddZdejdejfddZ  ZS )CLIPVisionEmbeddingsconfigc                    s   t    || _|j| _|j| _|j| _| j| j dksJ tt	
| j| _tj|j| j| j| jdd| _| j| j d | _| jd | _t| j| j| _| jdt	| jddd d S )	Nr   F)in_channelsout_channelskernel_sizestridebias      position_idsr   
persistent)super__init__r   hidden_size	embed_dim
image_size
patch_sizenn	Parametertorchrandnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferarangeexpandselfr   	__class__ f/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/runtime/models/encoders/clip.pyr%   1   s,   

zCLIPVisionEmbeddings.__init__pixel_valuesreturnc                 C   sn   |j d }| jjj}| |j|d}|ddd}| j|dd}t	j
||gdd}|| | j }|S )Nr   dtyper   r   r!   dim)shaper1   weightrB   toflatten	transposer.   r8   r,   catr5   r   )r:   r?   
batch_sizetarget_dtypepatch_embedsclass_embeds
embeddingsr=   r=   r>   forwardL   s   


zCLIPVisionEmbeddings.forward)	__name__
__module____qualname__r   r%   r,   TensorrP   __classcell__r=   r=   r;   r>   r   /   s    r   c                	       sX   e Zd Zdef fddZ			ddejdB dejdB dejdB dejfd	d
Z	  Z
S )CLIPTextEmbeddingsr   c                    sX   t    || _|j}t|j|| _t|j|| _	| j
dt|jddd d S )Nr   r    Fr"   )r$   r%   r   r&   r*   r4   
vocab_sizetoken_embeddingmax_position_embeddingsr5   r6   r,   r7   r8   )r:   r   r'   r;   r=   r>   r%   ]   s   

zCLIPTextEmbeddings.__init__N	input_idsr   inputs_embedsr@   c                 C   s   |d ur
|j d }n|d ur|j d }ntd| jjj d }||kr-td| d| |d u r<| jd d d |f }|d u rE| |}| |}|| }|S )Nr!   z3Either input_ids or inputs_embeds must be provided.r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )rE   
ValueErrorr5   rF   r   rX   )r:   rZ   r   r[   
seq_lengthmax_position_embeddingposition_embeddingsrO   r=   r=   r>   rP   n   s*   

zCLIPTextEmbeddings.forward)NNN)rQ   rR   rS   r   r%   r,   
LongTensorFloatTensorrT   rP   rU   r=   r=   r;   r>   rV   [   s    rV   c                       sr   e Zd ZdZ		ddeeB dedB def fddZd	e	j
d
edefddZ	dde	j
de	j
dB fddZ  ZS )CLIPAttentionz=Multi-headed attention from 'Attention Is All You Need' paperN r   quant_configprefixc                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _t| j| j| j|| dd| _t| j| j|| dd| _t | _t| j| j| _t| j| j| j| j	d	|jd
| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      z	.qkv_proj)r&   	head_sizetotal_num_headsre   rf   z	.out_proj)
input_sizeoutput_sizere   rf   T)softmax_scalecausalsupported_attention_backends)r$   r%   r   r&   r'   num_attention_heads	num_headshead_dimr]   scaleattention_dropoutdropoutr   qkv_projr   out_projr   tp_sizer   num_heads_per_partitionr
   _supported_attention_backendsattnr:   r   re   rf   r;   r=   r>   r%      sL   
zCLIPAttention.__init__tensorseq_lenbszc                 C   s    | ||| j| jdd S )Nr   r   )viewro   rp   rI   
contiguous)r:   r{   r|   r}   r=   r=   r>   _shape   s   zCLIPAttention._shapehidden_statesattention_maskc           
      C   s  |  |\}}|jddd\}}}||jd |jd | j| j}||jd |jd | j| j}||jd |jd | j| j}| jjtj	kr|
dd}|
dd}|
dd}t sbt rqtjjj|||dd| jd	}n<|dur| dkr|ddddddf j|jd
}	d|	 t|jj }	n|}	nd}	tjjj||||	|du | jd	}|
dd}n| |||}||jd |jd | j| j }| |\}}|dfS )z#Input shape: Batch x Time x Channel   r!   rC   r   r   r   NT)	attn_mask	is_causalrq   rA   g      ?)rt   chunkreshaperE   rw   rp   ry   backendr   
TORCH_SDPArI   r   is_rocmis_musar,   r*   
functionalscaled_dot_product_attentionrq   rD   rG   rB   finfominru   )
r:   r   r   
qkv_states_query_states
key_statesvalue_statesattn_outputr   r=   r=   r>   rP      s|   	

zCLIPAttention.forwardNrd   N)rQ   rR   rS   __doc__r   r   r   strr%   r,   rT   intr   rP   rU   r=   r=   r;   r>   rc      s$    /
rc   c                	       sR   e Zd Z		ddeeB dedB deddf fddZd	ej	dej	fd
dZ
  ZS )CLIPMLPNrd   r   re   rf   r@   c                    s\   t    || _t|j| _t|j|jd|| dd| _	t
|j|jd|| dd| _d S )NTz.fc1)r   re   rf   z.fc2)r$   r%   r   r	   
hidden_actactivation_fnr   r&   intermediate_sizefc1r   fc2rz   r;   r=   r>   r%      s"   
zCLIPMLP.__init__r   c                 C   s*   |  |\}}| |}| |\}}|S r   )r   r   r   )r:   r   r   r=   r=   r>   rP   8  s   
zCLIPMLP.forwardr   )rQ   rR   rS   r   r   r   r   r%   r,   rT   rP   rU   r=   r=   r;   r>   r     s    r   c                	       s`   e Zd Z		ddeeB dedB deddf fddZ	dd	ej	d
ej	dB dej	fddZ
  ZS )CLIPEncoderLayerNrd   r   re   rf   r@   c                    sb   t    t||| dd| _tj|j|jd| _t	||| dd| _
tj|j|jd| _d S )Nz
.self_attn)re   rf   epsz.mlp)r$   r%   rc   	self_attnr*   	LayerNormr&   layer_norm_epslayer_norm1r   mlplayer_norm2rz   r;   r=   r>   r%   B  s   
zCLIPEncoderLayer.__init__r   r   c                 C   sL   |}|  |}| j||d\}}|| }|}| |}| |}|| }|S )N)r   r   )r   r   r   r   )r:   r   r   residualr   r=   r=   r>   rP   R  s   



zCLIPEncoderLayer.forwardr   r   )rQ   rR   rS   r   r   r   r   r%   r,   rT   rP   rU   r=   r=   r;   r>   r   @  s(    r   c                       s|   e Zd ZdZ			ddeeB dedB dedB deddf
 fd	d
Z		dde
jdede
jdB de
jee
j B fddZ  ZS )CLIPEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self
    attention layers. Each layer is a [`CLIPEncoderLayer`].

    Args:
        config: CLIPConfig
    Nrd   r   re   num_hidden_layers_overriderf   r@   c                    sJ   t     | _|d u r j}n|}t fddt|D | _d S )Nc                    s$   g | ]}t   d | dqS )z.layers.r   re   rf   )r   ).0	layer_idxr   rf   re   r=   r>   
<listcomp>  s    z(CLIPEncoder.__init__.<locals>.<listcomp>)r$   r%   r   num_hidden_layersr*   
ModuleListrangelayers)r:   r   re   r   rf   r   r;   r   r>   r%   q  s   

zCLIPEncoder.__init__r[   return_all_hidden_statesr   c                 C   sF   |g}|}t | jD ]\}}|||d}|r|| q
|r |S |gS )N)r   )	enumerater   append)r:   r[   r   r   hidden_states_poolr   idxencoder_layerr=   r=   r>   rP     s   
zCLIPEncoder.forwardNNrd   r   )rQ   rR   rS   r   r   r   r   r   r   r%   r,   rT   boollistrP   rU   r=   r=   r;   r>   r   h  s4    r   c                       s   e Zd Z			ddededB dedB def fddZ				dd	ej	dB d
ej	dB dej	dB dej	dB de
dB defddZ  ZS )CLIPTextTransformerNrd   r   re   r   rf   c                    sP   t    || _|j}t|| _t||||d| _tj	||j
d| _|j| _d S )N)re   r   rf   r   )r$   r%   r   r&   rV   rO   r   encoderr*   r   r   final_layer_normeos_token_id)r:   r   re   r   rf   r'   r;   r=   r>   r%     s   

zCLIPTextTransformer.__init__rZ   r   r   r[   output_hidden_statesr@   c                 C   s   |d ur|n| j j}|d u rtd| }|d|d }| j||d}| j|||d}|d }	| |	}	| jdkrV|	t	j
|	jd |	jd|jt	j|	jdjdd	f }
n|	t	j
|	jd |	jd|jt	j|	jd| jk jdd	f }
t|	|
|d
S )NzYou have to specify input_idsr!   )rZ   r   )r[   r   r   r   r   )device)rB   r   rC   )last_hidden_statepooler_outputr   )r   r   r]   sizer~   rO   r   r   r   r,   r7   rE   r   rG   r   argmaxr   )r:   rZ   r   r   r[   r   input_shaper   encoder_outputsr   pooled_outputr=   r=   r>   rP     sV   



zCLIPTextTransformer.forwardr   NNNN)rQ   rR   rS   r   r   r   r   r%   r,   rT   r   r   rP   rU   r=   r=   r;   r>   r     s<    r   c                       s   e Zd Zdeddf fddZ				ddejdB dejdB dejdB d	ejdB d
edB defddZ	de
eeejf  dee fddZ  ZS )CLIPTextModelr   r@   Nc                    s$   t  | t||j|jd| _d S )Nr   )r$   r%   r   re   rf   
text_modelr9   r;   r=   r>   r%     s   
zCLIPTextModel.__init__rZ   r   r   r[   r   c                 K   s   | j ||||d}|S )N)rZ   r   r   r   )r   )r:   rZ   r   r   r[   r   kwargsoutputsr=   r=   r>   rP     s   
zCLIPTextModel.forwardweightsc                 C   s   g d}t |  }t }|D ]F\}}|D ]'\}}}	||v r<|||}
|
|v r:||
 }|j}||||	 ||
  nq||v rU|| }t|dt}||| || q|S )N))rt   q_projq)rt   k_projk)rt   v_projvweight_loader)dictnamed_parameterssetreplacer   addgetattrr   )r:   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idmodel_param_nameparamr   r=   r=   r>   load_weights,  s.   


zCLIPTextModel.load_weightsr   )rQ   rR   rS   r   r%   r,   rT   r   r   rP   r   tupler   r   r   rU   r=   r=   r;   r>   r     s0    
,r   c                       sz   e Zd Z				ddededB dedB dedB deddf fd	d
Z		dde	j
dee dee dB defddZ  ZS )CLIPVisionTransformerNrd   r   re   r   require_post_normrf   r@   c                    s   t    || _|j}t|| _tj||jd| _	t
|||| dd| _|j}t| jj|jkr@td| dt| jj d|d u rLt| jj|k}|rYtj||jd| _d S d | _d S )Nr   z.encoder)r   re   r   rf   zThe original encoder only has z layers, but you requested z layers.)r$   r%   r   r&   r   rO   r*   r   r   pre_layrnormr   r   r   lenr   r]   post_layernorm)r:   r   re   r   r   rf   r'   r   r;   r=   r>   r%   S  s.   



zCLIPVisionTransformer.__init__r?   r   feature_sample_layersc                 C   sf   |  |}| |}|p|d u}| j||d}|s'|d }t||| j| jj}|r.t|dS t|dS )N)r[   r   r   )r   )r   )rO   r   r   r   r   r   r   r   )r:   r?   r   r   r   r   r   r=   r=   r>   rP   }  s(   



zCLIPVisionTransformer.forward)NNNrd   NN)rQ   rR   rS   r   r   r   r   r   r%   r,   rT   r   r   r   rP   rU   r=   r=   r;   r>   r   Q  s:    -
r   c                	       s   e Zd ZeZdZdg diZdeddf fddZ		ddej	d	e
e dB d
ee defddZedd Zdeeeej	f  dee fddZ  ZS )CLIPVisionModelr?   rt   )r   r   r   r   r@   Nc                    s2   t  | t||j|j|j|j dd| _d S )Nz.vision_model)r   re   r   r   rf   )r$   r%   r   re   r   r   rf   vision_modelr9   r;   r=   r>   r%     s   
zCLIPVisionModel.__init__r   r   c                 K   s   | j |||d}|S )N)r   r   )r   )r:   r?   r   r   r   base_encoder_outputr=   r=   r>   rP     s   zCLIPVisionModel.forwardc                 C   s   t |  jS r   )next
parametersr   )r:   r=   r=   r>   r     s   zCLIPVisionModel.devicer   c                 C   s   t |  }t }t| jjj}|D ]a\}}|drq|dr(| jjd u r(q|dr;t	|
dd }||kr;q| jjjD ]\}}	}
|	|vrJq@||	|}|| }|j}||||
  n|| }t|dt}||| || q|S )Nvisual_projectionzvision_model.post_layernormzvision_model.encoder.layers.r   r   )r   r   r   r   r   r   r   
startswithr   r   splitr   arch_configr   r   r   r   r   r   )r:   r   r   r   layer_countr   r   r   r   r   r   r   r   r=   r=   r>   r     s>   


zCLIPVisionModel.load_weightsr   )rQ   rR   rS   r   config_classmain_input_namepacked_modules_mappingr%   r,   rT   r   r   r   r   r   rP   propertyr   r   r   r   r   r   rU   r=   r=   r;   r>   r     s&    


,r   c                   @   s   e Zd ZdS )	BertModelN)rQ   rR   rS   r=   r=   r=   r>   r    s    r  )4r   collections.abcr   typingr   r,   torch.nnr*   -sglang.multimodal_gen.configs.models.encodersr   r   r   )sglang.multimodal_gen.runtime.distributedr   r   /sglang.multimodal_gen.runtime.layers.activationr	   .sglang.multimodal_gen.runtime.layers.attentionr
   +sglang.multimodal_gen.runtime.layers.linearr   r   r   1sglang.multimodal_gen.runtime.layers.quantizationr   1sglang.multimodal_gen.runtime.loader.weight_utilsr   2sglang.multimodal_gen.runtime.models.encoders.baser   r   4sglang.multimodal_gen.runtime.models.encoders.visionr   'sglang.multimodal_gen.runtime.platformsr   r   1sglang.multimodal_gen.runtime.utils.logging_utilsr   rQ   loggerModuler   rV   rc   r   r   r   r   r   r   r   r  
EntryClassr=   r=   r=   r>   <module>   s<   ,4 "(:mBRO