o
    پi?c                     @   s  d dl mZ d dlmZmZmZmZmZ d dlZ	d dl
Z
d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZmZ d dlm Z m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z* G dd dej+Z,G dd dej+Z-G dd dej+Z.G dd dej+Z/G dd dej+Z0G dd dej+Z1G dd dej+Z2G dd  d ej+Z3G d!d" d"e'Z4d#ej5fd$d%Z6e4gZ7dS )&    )Iterable)ListOptionalSetTupleUnionN)	rearrange)GELUActivation)	torch_int)
get_act_fn)VisionAttention)ColumnParallelLinearRowParallelLinear)QuantizationConfig)/MultiModalityDataPaddingPatternMultimodalTokensgeneral_mm_embed_routine)MultimodalDataItemMultimodalInputs)ForwardBatch)default_weight_loader)Ernie4_5_ForCausalLM)
add_prefixis_npuc                       sN   e Zd Z	d
def fddZdejdeee	e	e	f  dejfdd	Z
  ZS )	Projector prefixc                    s   t    || _|| _d| _| jj| jd  | jd  | _tjj| jjdd| _	tj
| j| jdd| _t | _tj
| j| jjdd| _d S )N)   r   r      gh㈵>epsT)bias)super__init__text_configvision_configmerge_kernel_sizehidden_sizetorchnn	LayerNormpre_normLinearlinear_1r	   actlinear_2)selfr#   r$   r   	__class__ R/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/paddleocr_vl.pyr"   ,   s    
zProjector.__init__image_featuresimage_grid_thwreturnc              
   C   s   | j \}}t|ttfrHt }t||D ]1\}}| |}|\}}	}
t|d||	| ||
| |d}| |}| |}| 	|}|
| q|S |jd d }|jd }|t||}| |d| j}| |}| |}| 	|}|jg |dR  S )Nz$(t h p1 w p2) d -> (t h w) (p1 p2 d))thp1wp2)r%   
isinstancelisttuplezipr*   r   r,   r-   r.   appendshapeviewnpprodr&   )r/   r4   r5   m1m2processed_featuresimage_feature
image_gridr7   r8   r:   hidden_statesdimsdimr2   r2   r3   forwardD   s8   



	





zProjector.forward)r   )__name__
__module____qualname__strr"   r'   Tensorr   r   intrN   __classcell__r2   r2   r0   r3   r   *   s    r   c                       s   e Zd Z fddZ	ddejdedededejf
d	d
ZddefddZ				ddej
deej deeeeeeef eeeeef  f   dejfddZ  ZS )SiglipVisionEmbeddingsc                    s   t    || _|j| _|j| _|j| _tj|j	| j| j| jdd| _
| j| j d | _| j| _t | _t | _t| j| j| _td| j| _| jdt| jddd d S )	Nvalid)in_channelsout_channelskernel_sizestridepaddingr   i   position_ids)r   r<   F
persistent)r!   r"   configr&   	embed_dim
image_size
patch_sizer(   Conv2dnum_channelspatch_embeddingnum_patchesnum_positionsdictcache_position_embeddingcache_position_count	Embeddingposition_embeddingpacking_position_embeddingregister_bufferr'   arangeexpand)r/   r`   r0   r2   r3   r"   m   s.   

zSiglipVisionEmbeddings.__init__F
embeddingsheightwidthis_after_patchifyr6   c                 C   s   | j jjd }| j jd}|jd }|r|}|}	n
|| j }|| j }	t|d }
|d|
|
|}|dddd}tj	j
|||	fddd	}|dddddd|}|S )
Nr   r<   g      ?r      r   bilinearF)sizemodealign_corners)rm   weightrB   	unsqueezerc   r
   reshapepermuter(   
functionalinterpolaterC   )r/   rr   rs   rt   ru   rh   patch_pos_embedrM   
new_height	new_widthsqrt_num_positionsr2   r2   r3   interpolate_pos_encoding   s*   


z/SiglipVisionEmbeddings.interpolate_pos_encoding   	max_cachec                 C   s   ||f}|| j v r| j|  d7  < | j | S t| j |kr3t| j| jjd}| j| | j | | |||d}d| j|< || j |< |S )Nr   )keyT)rj   rk   lenmingetpopr   )r/   rr   r8   r:   r   gridmin_hit_gridrm   r2   r2   r3   "fetch_position_embedding_lfu_cache   s   



z9SiglipVisionEmbeddings.fetch_position_embedding_lfu_cacheNpixel_valuesr]   r5   c                 C   s8  |  dkr|d}|  dkr|d u rtd|j\}}}}}	| jjj}
t|d}| |j|
d}|	d
d}|r|d urd}t }|D ]3}|\}}}||| |  }|||d d f }| |||d	
d|d
}|| }|| |}qItj|ddd}|S || | }|S td|   d)N   r      z9position_ids cannot be None when pixel_values.dim() is 5.zb l c h w -> (b l) c h wdtyper<   Tr   rM   z$Unsupported pixel_values dimension: z. Expected 4 or 5.)rM   r|   
ValueErrorrB   rf   r{   r   r   toflattensqueezer>   r   repeatrA   r'   concatrn   )r/   r   r]   r5   r   
batch_sizesquence_lenchannelrs   rt   target_dtypepatch_embedsrr   starttmp_embeddingsrJ   r7   r8   r:   endimage_embeddingsrm   r2   r2   r3   rN      sR   
	



zSiglipVisionEmbeddings.forwardF)r   )NNF)rO   rP   rQ   r"   r'   rS   rT   boolr   r   FloatTensorr   r   r   r   rN   rU   r2   r2   r0   r3   rV   k   sF    !
%rV   c                       sH   e Zd Zddededdf fddZdd	 Zd
edejfddZ	  Z
S )SigLIPRotaryEmbedding     @rM   thetar6   Nc                    s"   t    || _|| _|   d S N)r!   r"   rM   r   	rope_init)r/   rM   r   r0   r2   r3   r"      s   
zSigLIPRotaryEmbedding.__init__c                 C   s:   d| j tjd| jdtjd| j   }| jd|dd d S )Ng      ?r   r   r   inv_freqFr^   )r   r'   rp   rM   floatro   )r/   r   r2   r2   r3   r     s    zSigLIPRotaryEmbedding.rope_initseqlenc                 C   s*   t j|| jj| jjd}t || j}|S )N)devicer   )r'   rp   r   r   r   outer)r/   r   seqfreqsr2   r2   r3   rN   
  s   zSigLIPRotaryEmbedding.forward)r   )rO   rP   rQ   rT   r   r"   r   r'   rS   rN   rU   r2   r2   r0   r3   r      s    r   c                       sJ   e Zd Z		ddee deddf fddZdejdejfd	d
Z	  Z
S )	SiglipMLPNr   quant_configr   r6   c                    s   t    || _t|j| _|r| dv rd}n|jd dko&|jd dk}t	|j|j|r0|nd t
d|d| _t|j|j|rB|nd t
d|d| _d S )N)bitsandbytestorchaoT@   r   fc1r   r   fc2)r!   r"   r`   r   
hidden_actactivation_fnget_namer&   intermediate_sizer   r   r   r   r   )r/   r`   r   r   quantizabler0   r2   r3   r"     s&   


zSiglipMLP.__init__rK   c                 C   s*   |  |\}}| |}| |\}}|S r   )r   r   r   )r/   rK   _r2   r2   r3   rN   3  s   
zSiglipMLP.forwardNr   )rO   rP   rQ   r   r   rR   r"   r'   rS   rN   rU   r2   r2   r0   r3   r     s    r   c                
       sr   e Zd Z		ddee def fddZ		ddejdee	ej  d	ee
ejejf  d
e
ej fddZ  ZS )SiglipEncoderLayerNr   r   r   c                    sz   t    |j| _tj| j|jd| _t| j|j	| jddd|t
d|d| _tj| j|jd| _t||t
d|d| _d S )Nr   T	self_attn)ra   	num_headsprojection_sizeuse_qkv_parallelqkv_biasflatten_batchr   r   mlpr   )r!   r"   r&   ra   r(   r)   layer_norm_epslayer_norm1r   num_attention_headsr   r   layer_norm2r   r   r/   r`   r   r   r0   r2   r3   r"   <  s"   
zSiglipEncoderLayer.__init__rK   
cu_seqlensrope_embr6   c                 C   sJ   |}|  |}| j|||d}|| }|}| |}| |}|| }|S )N)r   position_embeddings)r   r   r   r   )r/   rK   r   r   residualr2   r2   r3   rN   V  s   


zSiglipEncoderLayer.forwardr   )NN)rO   rP   rQ   r   r   rR   r"   r'   rS   r   r   r   rN   rU   r2   r2   r0   r3   r   :  s&    r   c                       s   e Zd Z		ddee def fddZedd Z				dd	ee	e
j  d
ee	eeeeef e	eeeef  f   dee
j dee
j de
jf
ddZ  ZS )SiglipEncoderNr   r   r   c                    sZ   t     | _ j} j}|| }t fddt jD | _	t
|d | _d S )Nc                    s&   g | ]}t  td | dqS )zlayers.r   )r   r   ).0	layer_idxr`   r   r   r2   r3   
<listcomp>  s    z*SiglipEncoder.__init__.<locals>.<listcomp>r   )r!   r"   r`   r&   r   r(   
ModuleListrangenum_hidden_layerslayersr   rotary_pos_emb)r/   r`   r   r   ra   r   head_dimr0   r   r3   r"   t  s   

zSiglipEncoder.__init__c                 C   s4   t  }| D ]}t|t r|| q|| q|S r   )r>   r=   extendrA   )r5   tmp_image_grid_thwrJ   r2   r2   r3   flatten_list  s   
zSiglipEncoder.flatten_listr   r5   height_position_idswidth_position_idsr6   c                 C   s2  |j }|}| |}|d u s|d u rOt }	t }
|D ]&\}}}tj|| | |d||  }|| }|| }|	| |
| qtj|
dd}tj|	dd}tj||gdd}| d }| 	|}|| 
d}|dd}| | f}t rt|tjr|d}|}|}| jD ]	}||||d}q|S )	N)r   r   r   r<   r   r   cpu)r   r   )r   r   r>   r'   rp   rA   r   stackmaxr   r   r   cossinr   r=   rS   r   r   )r/   inputs_embedsr   r5   r   r   r   rK   flatten_image_grid_thw
split_hids
split_widsr7   r8   r:   
image_pidssample_hidssample_widspidsmax_grid_sizerope_emb_max_gridr   attn_cu_seqlensencoder_layerr2   r2   r3   rN     sD   




zSiglipEncoder.forwardr   )NNNN)rO   rP   rQ   r   r   rR   r"   staticmethodr   r   r'   rS   r   r   rT   rN   rU   r2   r2   r0   r3   r   r  sB    
r   c                       s   e Zd Z		ddee def fddZ						ddee d	eej	 d
eej	 deej	 dee
ej	  dee
eeeeef e
eeeef  f   deej	 fddZ  ZS )SiglipVisionTransformerNr   r   r   c                    sL   t    || _|j}t|| _t||td|d| _t	j
||jd| _d S )Nencoderr   r   )r!   r"   r`   r&   rV   rr   r   r   r   r(   r)   r   post_layernorm)r/   r`   r   r   ra   r0   r2   r3   r"     s   

z SiglipVisionTransformer.__init__Fr   r]   r   r   r   r5   r6   c                 C   s   | j ||||d}| j|||||d}	| |	}	t }
|d u r#tdt|jd d D ]!}|| }||d  }|	d d ||d d f d}|
| q,|
S )N)r   r]   r5   )r   r   r5   r   r   zHcu_seqlens cannot be None for SiglipVisionTransformer output processing.r   r   )	rr   r   r   r>   r   r   rB   r   rA   )r/   r   r   r]   r   r   r   r5   rK   last_hidden_statesample_hidden_stateir   r   tensorr2   r2   r3   rN     s2   
 zSiglipVisionTransformer.forwardr   )FNNNNN)rO   rP   rQ   r   r   rR   r"   r   r'   rS   r   r   r   rT   r>   rN   rU   r2   r2   r0   r3   r     sJ    r   c                       s   e Zd ZdZdZ		ddee def fddZe	d	e
jfd
dZe	d	e
jfddZd	ejfddZ				ddedee
j deeeeeeef eeeeef  f   deee
j  d	ee
j f
ddZ  ZS )SiglipVisionModelPaddleOCRVisionConfigr   Nr   r   r   c                    s*   t    t||td|d| _|| _d S )Nvision_modelr   )r!   r"   r   r   r   r   r   r0   r2   r3   r"     s   

zSiglipVisionModel.__init__r6   c                 C      | j jjjjS r   )r   rr   rf   r{   r   r/   r2   r2   r3   r   %     zSiglipVisionModel.dtypec                 C   r  r   )r   rr   rf   r{   r   r  r2   r2   r3   r   )  r  zSiglipVisionModel.devicec                 C   s
   | j jjS r   )r   rr   rf   r  r2   r2   r3   get_input_embeddings-  s   
z&SiglipVisionModel.get_input_embeddingsFr   r]   r5   r   c                 C   s   | j |||||dS )N)r   r   r]   r5   r   )r   )r/   r   r   r]   r5   r   r2   r2   r3   rN   0  s   zSiglipVisionModel.forwardr   )FNNN)rO   rP   rQ   config_classmain_input_namer   r   rR   r"   propertyr'   r   r   r(   Moduler  r   rS   r   r   r   rT   r>   rN   rU   r2   r2   r0   r3   r     sL    r   c                	       s   e Zd Zddddef fddZdee defd	d
Zdd Z	dd Z
dee dejfddZ	ddejdejdedefddZdeeeejf  dee fddZ  ZS )#PaddleOCRVLForConditionalGenerationNr   r   r   c                   s|   t  j||d | j}t||jtd|d| _t|jtd|d| _t	| j
ds5dd l}|t| j
| j
_d| jjv | _d S )N)r`   r   mlp_AR)r   visualr  r   mrope_section)r!   r"   r`   r   r$   r   r
  r   r  hasattrmodeltypes
MethodTyper  rope_scalingis_mrope_enabled)r/   r`   r   r   r  r0   r2   r3   r"   K  s   z,PaddleOCRVLForConditionalGeneration.__init__	input_ids	mm_inputsc                 C   s   t  }|||S r   )r   pad_input_tokens)r/   r  r  patternr2   r2   r3   pad_input_ids]  s   z1PaddleOCRVLForConditionalGeneration.pad_input_idsc                 C   s   | j jS r   )r  embed_tokensr  r2   r2   r3   r  a  s   z8PaddleOCRVLForConditionalGeneration.get_input_embeddingsc                 C   s   | | jj}t }t }dg}t|D ]6\}}t|   	 }t
|}	|| t|	t
|dd   }
||
 ||d |	  qtj|dd|j}tj|tjd|j}| j|||d|d}| ||}tj|dd}|S )Nr   r   r<   r   r   T)r   r5   r]   r   r   )typer  r   r>   	enumerater?   detachr   numpytolistrD   rE   rA   r'   rp   r   r   r   r   int32r
  cat)r/   r   r5   siglip_position_idsimage_grid_hwsr   idxgrid_thw	thw_tuplenumelimage_position_idsvision_outputsimage_embedsr2   r2   r3   encode_imaged  s2   


z0PaddleOCRVLForConditionalGeneration.encode_imageitemsr6   c                 C   sJ   t jdd |D dd| jj}t jdd |D dd}| ||}|S )Nc                 S      g | ]}|j qS r2   )featurer   itemr2   r2   r3   r         zIPaddleOCRVLForConditionalGeneration.get_image_feature.<locals>.<listcomp>r   r   c                 S   r+  r2   )r5   r-  r2   r2   r3   r     r/  )r'   r  r  r  r   r   r)  )r/   r*  r   r5   r(  r2   r2   r3   get_image_feature  s   z5PaddleOCRVLForConditionalGeneration.get_image_featureF	positionsforward_batchget_embeddingc                 C   st   | j r|j}|j s'| r'| j r'|jdkr|ddks'J d|  t||| j| |d}| 	||| j
|S )Nr   r   rv   zMmultimodal section rotary embedding requires (3, seq_len) positions, but got )r  r2  language_modelmultimodal_modelr1  )r  mrope_positionsforward_mode	is_decodecontains_image_inputsndimrx   r   r  logits_processorlm_head)r/   r  r1  r2  r3  rK   r2   r2   r3   rN     s,   
z+PaddleOCRVLForConditionalGeneration.forwardweightsc                 C   s   g d}t |  }|D ]h\}}d|v rqd|v sd|v rqd|v s&d|v r'q|D ]\}}}||vr3q)|||}|| }	|	j}
|
|	||  n,d|v rVd|v rV|d	d
}|| v rl|| }	t|	dt}
|
|	| qtd| dqd S )N))	.qkv_projz.q_projq)r>  z.k_projk)r>  z.v_projv).gate_up_projz
.gate_projr   )rB  z.up_projr   zrotary_emb.inv_freqzhead.attentionzhead.layernormzhead.mlpz
head.prober   out_projz.self_attn.out_projz.self_attn.projweight_loaderzParameter 'z' not found in model.)ri   named_parametersreplacerD  keysgetattrr   KeyError)r/   r=  stacked_params_mappingparams_dictnameloaded_weight
param_nameweight_nameshard_idparamrD  r2   r2   r3   load_weights  s8   z0PaddleOCRVLForConditionalGeneration.load_weightsr   )rO   rP   rQ   rR   r"   r   rT   r   r  r  r)  r   r'   rS   r0  r   r   rN   r   r   r   rR  rU   r2   r2   r0   r3   r	  I  s"     
,r	  r6   c                 C   s   | j S r   )r  r  r2   r2   r3   r    s   r  )8collections.abcr   typingr   r   r   r   r   r  rD   r'   torch.nnr(   einopsr   transformers.activationsr	   transformers.utilsr
   sglang.srt.layers.activationr   "sglang.srt.layers.attention.visionr   sglang.srt.layers.linearr   r   *sglang.srt.layers.quantization.base_configr   sglang.srt.managers.mm_utilsr   r   "sglang.srt.managers.schedule_batchr   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.models.ernie4r   sglang.srt.utilsr   r   r  r   rV   r   r   r   r   r   r   r	  rl   r  
EntryClassr2   r2   r2   r3   <module>   s>   A &8ZF7 
