o
    
۾iX                     @   s$  d dl Z d dlmZmZmZ d dlmZ d dlmZm	Z	 d dl
Zd dlZd dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlm Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z.m/Z/ d dl0m1Z1 d dl2m3Z3m4Z4m5Z5m6Z6 d dl7m8Z8m9Z9m:Z: d dl;m<Z<m=Z=m>Z>m?Z?m@Z@ d dlAmBZB d dlCmDZDmEZE d dlFmGZG ddlHmIZI ddlJmKZKmLZLmMZM ddlNmOZO ddl mPZPmQZQmRZRmSZSmTZT dd lUmVZV 	!	"	#dKd$eWd%eWd&eWd'eWd(eWf
d)d*ZXG d+d, d,e>ZYG d-d. d.e<eY ZZG d/d0 d0e=eY Z[G d1d2 d2ej\Z]G d3d4 d4eDZ^G d5d6 d6ej\Z_d7ej`d8eWd9eWfd:d;ZaG d<d= d=ej\ZbG d>d? d?ej\ZcG d@dA dAej\ZdG dBdC dCej\ZeG dDdE dEej\ZfG dFdG dGej\Zge1jhe[eYeZdHG dIdJ dJej\eMeLZidS )L    N)IterableMappingSequence)partial)	AnnotatedLiteral	rearrange)BatchFeaturePretrainedConfig)GELUActivation)BaseModelOutputWithPooling)	torch_int)
VllmConfig)BaseDummyOptions)parallel_state)utils)MMEncoderAttention)Conv2dLayer)QKVParallelLinearRowParallelLinear)QuantizationConfig)ApplyRotaryEmb)default_weight_loadermaybe_remap_kv_scale_name)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFeatureSpecMultiModalFieldConfigMultiModalKwargsItems)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)TensorSchemaTensorShape)AttentionBackendEnum   )Ernie4_5ForCausalLM)MultiModalEmbeddingsSupportsMRoPESupportsMultiModal)	SiglipMLP)AutoWeightsLoaderPPMissingLayerWeightsMapperis_pp_missing_parametermaybe_prefix)get_vit_attn_backend      P heightwidthfactor
min_pixels
max_pixelsc                 C   s2  | |k rt || |  }|} ||k rt | | | } |}t| |t| | dkr7tdt| |t| |  t | | | }t || | }|| |krpt| | | }t| | | | }t|| | | }||fS || |k rt|| |  }t| | | | }t|| | | }||fS )a)  Rescales the image so that the following conditions are met:

    1. Both dimensions (height and width) are divisible by 'factor'.

    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].

    3. The aspect ratio of the image is maintained as closely as possible.

       z4absolute aspect ratio must be smaller than 200, got )roundmaxmin
ValueErrormathsqrtfloorceil)r;   r<   r=   r>   r?   h_barw_barbeta rL   [/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/paddleocr_vl.pysmart_resizeY   s0   rN   c                   @   s\   e Zd Zdd ZdefddZdefddZdd	 Zd
ededefddZ	de
fddZdS )PaddleOCRVLProcessingInfoc                 C   s
   | j  S N)ctxget_hf_configselfrL   rL   rM   rR         
z'PaddleOCRVLProcessingInfo.get_hf_configkwargsc                 K   s   | j jdi |S NrL   )rQ   get_hf_processorrT   rV   rL   rL   rM   rX         z*PaddleOCRVLProcessingInfo.get_hf_processorc                 K   s   | j di |jS rW   )rX   image_processorrY   rL   rL   rM   get_image_processor   rZ   z-PaddleOCRVLProcessingInfo.get_image_processorc                 C   s   dd iS )NimagerL   rS   rL   rL   rM   get_supported_mm_limits   s   z1PaddleOCRVLProcessingInfo.get_supported_mm_limitsimage_widthimage_heightreturnc                C   s   |d u r|   }|  }|j}|j}|j}t|||| |j|jd\}}	t|	|d}
d}|
j	| }|
j
| }|| | }||d  }|S )N)r;   r<   r=   r>   r?   )r<   r;   r,      )r\   rR   vision_config
patch_sizespatial_merge_sizerN   r>   r?   r!   r;   r<   )rT   r_   r`   r[   	hf_configrc   rd   
merge_sizeresized_heightresized_widthpreprocessed_sizegrid_tgrid_hgrid_wnum_patchesnum_image_tokensrL   rL   rM   get_num_image_tokens   s(   


z.PaddleOCRVLProcessingInfo.get_num_image_tokensc                 C   sh   |   }|jj}|jj}|| }|  j|d  }tt|}||| 8 }|| }t	|| || dS )Nrb   )r;   r<   )
rR   rc   re   rd   r\   r?   intrE   rF   r!   )rT   rf   rg   rd   r=   max_num_tokens	h_patches	w_patchesrL   rL   rM   !get_image_size_with_most_features   s   z;PaddleOCRVLProcessingInfo.get_image_size_with_most_featuresN)__name__
__module____qualname__rR   objectrX   r\   r^   rq   rp   r!   ru   rL   rL   rL   rM   rO      s    
 rO   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )PaddleOCRVLDummyInputsBuilder	mm_countsra   c                 C   s$   | dd}| j }|j}|| S )Nr]   r   )getinforX   image_token)rT   r{   
num_images	processorr~   rL   rL   rM   get_dummy_text   s   
z,PaddleOCRVLDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sB   | dd}| j }|r| dnd }d| j|j|j||diS )Nr]   r   )r<   r;   r   	overrides)r|   r}   ru   _get_dummy_imagesr<   r;   )rT   r   r{   r   r   max_image_sizeimage_overridesrL   rL   rM   get_dummy_mm_data   s   
z/PaddleOCRVLDummyInputsBuilder.get_dummy_mm_datarP   )
rv   rw   rx   r   strrq   r   r   r   r   rL   rL   rL   rM   rz      s    
rz   c                
   @   s   e Zd Zdedeeef deeef deeef def
ddZded	eeef deeef fd
dZ	de
d	eeef dedee fddZdS )PaddleOCRVLMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsra   c                 C   s   |r4| j j| j jdi |tdd|i|tdi ||}|d d}|d | |d< |S | j  }||ddd}|S )	Ntextimage_grid_thwpixel_valuesTpt)add_special_tokensreturn_tensorsrL   )	r}   rQ   call_hf_processorrX   dictprodsplittolistget_tokenizer)rT   r   r   r   r   processed_outputsnum_patches_per_image	tokenizerrL   rL   rM   _call_hf_processor   s    
z1PaddleOCRVLMultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   s   t tdtddS )Nr]   )r   r   )r   r   batched)rT   r   r   rL   rL   rM   _get_mm_fields_config   s   z4PaddleOCRVLMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    sR   j jdi |}j  }|j dtf fdd}td gt||ddgS )Nitem_idxc                    s6    dt}|| }jj|j|j|d} g| S )Nr]   )r_   r`   r[   )	get_itemsr    get_image_sizer}   rp   r<   r;   )r   r[   images
image_sizero   image_token_idr   rT   rL   rM   get_replacement  s   

zKPaddleOCRVLMultiModalProcessor._get_prompt_updates.<locals>.get_replacementr]   )r[   )modalitytargetreplacementrL   )r}   r\   rR   r   rq   r&   r   )rT   r   r   r   r[   rf   r   rL   r   rM   _get_prompt_updates  s   

z2PaddleOCRVLMultiModalProcessor._get_prompt_updatesN)rv   rw   rx   r   r   ry   r
   r   r   r   r"   r   r   r'   r   rL   rL   rL   rM   r      s8    








r   c                       sJ   e Zd Z	ddededef fddZdejdejd	ejfd
dZ  Z	S )	Projector text_configrc   prefixc                    s   t    || _|| _d| _| jj| jd  | jd  | _tjj| jjdd| _	tj
| j| jdd| _t | _tj
| j| jjdd| _d S )N)rb   rb   r   r,   gh㈵>epsT)bias)super__init__r   rc   merge_kernel_sizehidden_sizetorchnn	LayerNormpre_normLinearlinear_1r   actlinear_2)rT   r   rc   r   	__class__rL   rM   r   $  s    
zProjector.__init__image_featuresr   ra   c              
   C   s   | j \}}t|ttfrHt }t||D ]1\}}| |}|\}}	}
t|d||	| ||
| |d}| |}| |}| 	|}|
| q|S |jd d }|jd }|t||}| |d| j}| |}| |}| 	|}|jg |dR  S )Nz$(t h p1 w p2) d -> (t h w) (p1 p2 d))thp1wp2r   )r   
isinstancelisttuplezipr   r	   r   r   r   appendshapeviewnpr   r   )rT   r   r   m1m2processed_featuresimage_feature
image_gridr   r   r   hidden_statesdimsdimrL   rL   rM   forward<  s8   



	





zProjector.forward)r   )
rv   rw   rx   r   r   r   r   Tensorr   __classcell__rL   rL   r   rM   r   #  s     r   c                
   @   sV   e Zd ZU ed ed< eejeddddddhdf ed< eejeddf ed< d	S )
PaddleOCRImagePixelInputsr   typebnp   rd   )dynamic_dimsr   N)	rv   rw   rx   r   __annotations__r   r   r   r*   rL   rL   rL   rM   r   c  s   
 r   c                       s   e Zd Zdef fddZ	ddejdededed	ejf
d
dZ		ddejdededefddZ
			ddejdejdB deeeeef eeeeef  B  dB d	ejfddZ  ZS )SiglipVisionEmbeddingsconfigc                    s   t    || _|j| _|j| _|j| _t|j| j| j| jdd| _	| j| j d | _
| j
| _t | _t | _t| j| j| _td| j| _| jdt| jddd d S )	Nvalid)in_channelsout_channelskernel_sizestridepaddingrb   i   position_ids)r,   r   F
persistent)r   r   r   r   	embed_dimr   rd   r   num_channelspatch_embeddingrn   num_positionsr   cache_position_embeddingcache_position_countr   	Embeddingposition_embeddingpacking_position_embeddingregister_bufferr   arangeexpand)rT   r   r   rL   rM   r   p  s.   

zSiglipVisionEmbeddings.__init__F
embeddingsr;   r<   is_after_patchifyra   c                 C   s   | j jjd }| j jd}|jd }|r|}|}	n
|| j }|| j }	t|d }
|d|
|
|}|dddd}tj	j
|||	fddd	}|dddddd|}|S )
Nr   r   g      ?r,   r   rb   bilinearF)sizemodealign_corners)r   weightr   	unsqueezerd   r   reshapepermuter   
functionalinterpolater   )rT   r   r;   r<   r   r   patch_pos_embedr   
new_height	new_widthsqrt_num_positionsrL   rL   rM   interpolate_pos_encoding  s*   


z/SiglipVisionEmbeddings.interpolate_pos_encoding   r   r   	max_cachec                 C   s   ||f}|| j v r| j|  d7  < | j | S t| j |kr3t| j| jjd}| j| | j | | |||d}d| j|< || j |< |S )Nr,   )keyT)r   r   lenrC   r|   popr  )rT   r   r   r   r  gridmin_hit_gridr   rL   rL   rM   "fetch_position_embedding_lfu_cache  s   



z9SiglipVisionEmbeddings.fetch_position_embedding_lfu_cacheNr   r   r   c                 C   s8  |  dkr|d}|  dkr|d u rtd|j\}}}}}	| jjj}
t|d}| |j|
d}|	d
d}|r|d urd}t }|D ]3}|\}}}||| |  }|||d d f }| |||d	
d|d
}|| }|| |}qItj|ddd}|S || | }|S td|   d)N   r      z9position_ids cannot be None when pixel_values.dim() is 5.zb l c h w -> (b l) c h wdtyper   Tr,   r   z$Unsupported pixel_values dimension: z. Expected 4 or 5.)r   r  rD   r   r   r  r  r	   toflattensqueezer   r  repeatr   r   concatr   )rT   r   r   r   r  
batch_sizesquence_lenchannelr;   r<   target_dtypepatch_embedsr   starttmp_embeddingsr   r   r   r   endimage_embeddingsr   rL   rL   rM   r     sR   
	



zSiglipVisionEmbeddings.forward)F)r  )NNF)rv   rw   rx   r   r   r   r   rq   boolr  r  FloatTensorr   r   r   r   rL   rL   r   rM   r   o  sL    !
%
"r   local_tensorr   tp_sizec                    sp   ddl m} fddtD }|j|t jd  fdd|D }dd t| D }tj	|dd	}|S )
zEAll-gather the input tensor interleavely across model parallel group.r   Nc                    s   g | ]}t  qS rL   )r   
zeros_like).0_)r-  rL   rM   
<listcomp>  s    z)all_gather_interleave.<locals>.<listcomp>)groupc                    s   g | ]}t |  d qS )r   )r   r   )r0  tensor)r   r.  rL   rM   r2    s    c                 S   s   g | ]	}|D ]}|qqS rL   rL   )r0  pairr4  rL   rL   rM   r2    s
    r   r  )
torch.distributeddistributedrange
all_gatherr   get_tp_groupdevice_groupr   r   cat)r-  r   r.  distgathered_tensorsgathered_tensors_splitordered_tensorsresult_tensorrL   )r   r-  r.  rM   all_gather_interleave  s   rB  c                       s   e Zd ZdZddddededededB d	ed
df fddZdej	d
e
ej	df fddZdej	dej	dej	dB dej	dB d
ej	f
ddZ  ZS )SiglipAttentionz=SigLIP vision attention adapted from Qwen2.5-VisionAttention.Nr   quant_configr   r   	num_headsprojection_sizerE  r   ra   c             	      s   t    t | _t | _t||| _	t|| j| _
t|| j	||d|| dd| _t|||| dd| _t| j
| j	| j	d | dd| _tddd	| _d S )
NTz	.qkv_proj)r   	head_sizetotal_num_headstotal_num_kv_headsr   rE  r   z	.out_proj)
input_sizeoutput_sizerE  r   g      z.attn)rF  rH  scaler   )enforce_enableenable_fp32_compute)r   r   r   $get_tensor_model_parallel_world_sizer.  get_tensor_model_parallel_ranktp_rank
dist_utilsdividehidden_size_per_attention_head!num_attention_heads_per_partitionr   qkv_projr   out_projr   attnr   apply_rotary_emb)rT   r   rF  rG  rE  r   r   rL   rM   r     sD   
	

	zSiglipAttention.__init__qkv.c           	         s   |j \}}}| jdkrt|| jj| j}|jddd\}}}| jdkr@ttj| jd}||| j	 }||| j	 }||| j	 }||| j
| jf  fdd|||fD \}}}|||fS )Nr,   r   rb   r  )num_partitionsc                 3   s    | ]}|j   V  qd S rP   )r   )r0  x	new_shaperL   rM   	<genexpr>S      z,SiglipAttention.split_qkv.<locals>.<genexpr>)r   r.  rB  rW  r   chunkr   rS  split_tensor_along_last_dimrR  rV  rU  )	rT   r[  r   bsr1  qkvsplitterrL   r^  rM   	split_qkv>  s$   


zSiglipAttention.split_qkvr   
cu_seqlensrotary_pos_emb
max_seqlenc                C   s   |j \}}}t|d}| |\}}| |\}}	}
dd ||	|
fD \}}	}
|d urItj||	gdd}| || | }tj	|ddd\}}	| j
||	|
||d}t|d}| |\}}|S )	Nzb s d -> s b dc                 s   s    | ]}t |d V  qdS )zs b h d -> b s h dNr   )r0  r   rL   rL   rM   r`  c  ra  z*SiglipAttention.forward.<locals>.<genexpr>r   r  rb   )queryr  valuerj  rl  zb s h d -> b s (h d))r   r	   rW  ri  r   r<  rZ  cossinrb  rY  rX  )rT   r   rj  rk  rl  r"  r1  r]  re  rf  rg  	qk_concat
qk_rotatedcontext_layeroutputrL   rL   rM   r   V  s.   

zSiglipAttention.forward)rv   rw   rx   __doc__rq   r   r   r   r   r   r   ri  r   r   rL   rL   r   rM   rC    s:    .rC  c                       sH   e Zd Zddededdf fddZdd	 Zd
edejfddZ	  Z
S )SigLIPRotaryEmbedding     @r   thetara   Nc                    s"   t    || _|| _|   d S rP   )r   r   r   rx  	rope_init)rT   r   rx  r   rL   rM   r   |  s   
zSigLIPRotaryEmbedding.__init__c                 C   s:   d| j tjd| jdtjd| j   }| jd|dd d S )N      ?r   rb   r  inv_freqFr   )rx  r   r   r   floatr   )rT   r{  rL   rL   rM   ry    s    zSigLIPRotaryEmbedding.rope_initseqlenc                 C   s*   t j|| jj| jjd}t || j}|S )N)devicer  )r   r   r{  r~  r  outer)rT   r}  seqfreqsrL   rL   rM   r     s   zSigLIPRotaryEmbedding.forward)rw  )rv   rw   rx   rq   r|  r   ry  r   r   r   r   rL   rL   r   rM   rv  {  s    rv  c                
       sd   e Zd Z		ddededB def fddZdejd	ejd
ejdB dejdB dejf
ddZ	  Z
S )SiglipEncoderLayerNr   r   rE  r   c                    st   t    |j| _tj| j|jd| _t|j|j	|j|| dd| _
tj| j|jd| _t||| dd| _d S )Nr   z
.self_attn)r   rF  rG  rE  r   z.mlprD  )r   r   r   r   r   r   layer_norm_epslayer_norm1rC  num_attention_heads	self_attnlayer_norm2r1   mlprT   r   rE  r   r   rL   rM   r     s    
zSiglipEncoderLayer.__init__r   rj  rk  rl  ra   c                C   sL   |}|  |}| j||||d}|| }|}| |}| |}|| }|S )N)r   rj  rk  rl  )r  r  r  r  )rT   r   rj  rk  rl  residualrL   rL   rM   r     s   


zSiglipEncoderLayer.forwardNr   )rv   rw   rx   r   r   r   r   r   r   r   r   rL   rL   r   rM   r    s*    r  c                       s   e Zd Z		ddededB def fddZedd	 Z				dd
e	j
dB deeeeef eeeeef  B  dB de	j
dB de	j
dB de	j
f
ddZ  ZS )SiglipEncoderNr   r   rE  r   c                    s   t     | _ j} j}|| }t|t d| _| jt	j
t	jt	jhvr/td| j dt fddt jD | _t|d | _d S )N)rH  r  zPaddleOCR-VL does not support z backend now.c                    s$   g | ]}t   d | dqS )z.layers.rD  )r  )r0  	layer_idxr   r   rE  rL   rM   r2    s    z*SiglipEncoder.__init__.<locals>.<listcomp>rb   )r   r   r   r   r  r7   r   get_default_dtypeattn_backendr+   
FLASH_ATTN
TORCH_SDPAROCM_AITER_FARuntimeErrorr   
ModuleListr8  num_hidden_layerslayersrv  rk  )rT   r   rE  r   r   rF  head_dimr   r  rM   r     s.   

zSiglipEncoder.__init__c                 C   s4   t  }| D ]}t|t r|| q|| q|S rP   )r   r   extendr   )r   tmp_image_grid_thwr   rL   rL   rM   flatten_list  s   
zSiglipEncoder.flatten_listrj  r   height_position_idswidth_position_idsra   c                 C   sf  |j }|}| |}|d u s|d u rOt }	t }
|D ]&\}}}tj|| | |d||  }|| }|| }|	| |
| qtj|
dd}tj|	dd}tj||gdd}| d }| 	|}|| 
d}|d u rrtdt|tjstj|tj|d}n|j|d}d }| jtjtjhv r|dd  |d d   }|}| jD ]
}|||||d}q|S )	N)r~  r   r  r   r,   z,cu_seqlens cannot be None for SiglipEncoder.)r  r~  )rj  rk  rl  )r~  r  r   r   r   r   r!  stackrB   rk  r  rD   r   r   r4  int32r  r  r+   r  r  r  )rT   inputs_embedsrj  r   r  r  r~  r   flatten_image_grid_thw
split_hids
split_widsr   r   r   
image_pidssample_hidssample_widspidsmax_grid_sizerope_emb_max_gridrk  rl  encoder_layerrL   rL   rM   r     sR   	



zSiglipEncoder.forwardr  )NNNN)rv   rw   rx   r   r   r   r   staticmethodr  r   r   r   r   rq   r   r   rL   rL   r   rM   r    s:    $
"r  c                       s   e Zd Z		ddededB def fddZ						dd	ejd
e	dB dejdB dejdB dejdB dejdB dejdB dejfddZ
  ZS )SiglipVisionTransformerNr   r   rE  r   c                    sL   t    || _|j}t|| _t||| dd| _tj	||j
d| _d S )Nz.encoderrD  r   )r   r   r   r   r   r   r  encoderr   r   r  post_layernorm)rT   r   rE  r   r   r   rL   rM   r   2  s   

z SiglipVisionTransformer.__init__Fr   r  r   r  r  rj  r   ra   c           
      C   s4   | j ||||d}| j|||||d}	| |	}	|	S )N)r  r   r   )r  rj  r   r  r  )r   r  r  )
rT   r   r  r   r  r  rj  r   r   last_hidden_staterL   rL   rM   r   D  s   

zSiglipVisionTransformer.forwardr  )FNNNNN)rv   rw   rx   r   r   r   r   r   r   r+  r   r   rL   rL   r   rM   r  1  sB    	r  c                       s   e Zd Z		ddedB def fddZedejfdd	Zedej	fd
dZ	de
jfddZ				ddedejdB deeeeef eeeeef  B  dB dejdB def
ddZdeeeejf  dee fddZ  ZS )SiglipVisionModelNr   rE  r   c                    s*   t    t||| dd| _|| _d S )Nz.vision_modelrD  )r   r   r  vision_modelrE  r  r   rL   rM   r   b  s   

zSiglipVisionModel.__init__ra   c                 C      | j jjjjS rP   )r  r   r   r  r  rS   rL   rL   rM   r  q     zSiglipVisionModel.dtypec                 C   r  rP   )r  r   r   r  r~  rS   rL   rL   rM   r~  u  r  zSiglipVisionModel.devicec                 C   s
   | j jjS rP   )r  r   r   rS   rL   rL   rM   get_input_embeddingsy  rU   z&SiglipVisionModel.get_input_embeddingsFr  r   r   rj  c                 C   s   | j |||||dS )N)r   r  r   r   rj  )r  )rT   r   r  r   r   rj  rL   rL   rM   r   |  s   	zSiglipVisionModel.forwardweightsc                 C   sx  g d}t | jdd}t }|D ]\}}d|v rqd|v s"d|v r#qd|v s+d|v r,q| jd urZ| j| }rZ|| }t|d	t}	| d
krK|n|d
 }|	|| || q|D ].\}
}}||vrfq\|	||
}|
drv||vrvq\t|| r|q\|| }|j}	|	|||  n)|
dr||vrqt||}|d u rqt|| rq|| }t|d	t}	|	|| || q|S )N))rW  q_projre  )rW  k_projrf  )rW  v_projrg  F)remove_duplicatezrotary_emb.inv_freqzhead.attentionzhead.layernormzhead.mlpz
head.probeweight_loaderr   z.bias)r   named_parameterssetrE  get_cache_scalegetattrr   r   addreplaceendswithr5   r  r   )rT   r  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
scale_nameparamr  
param_nameweight_nameshard_idrL   rL   rM   load_weights  sp   






zSiglipVisionModel.load_weightsr  )FNNN)rv   rw   rx   r   r   r   propertyr   r  r~  r   Moduler  r+  r   r   r   rq   r   r   r   r  r  r   rL   rL   r   rM   r  a  s>    "
,r  )r}   dummy_inputsc                
       sD  e Zd ZeddddZededededB fd	d
Zddde	def fddZ
dejdejdB fddZdee dee deejef fddZdededB fddZ		d-dejdB dejdedB dejdB fdd Zd!ejd"ejdejfd#d$Zd%edefd&d'Zdefd(d)Zd*eeeejf  dee fd+d,Z  ZS ).#PaddleOCRVLForConditionalGenerationzlanguage_model.model.zlanguage_model.lm_head.)zmodel.zlm_head.)orig_to_new_prefixr   ira   Nc                 C   s   | drdS td)Nr]   z1<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>z Only image modality is supported)
startswithrD   )clsr   r  rL   rL   rM   get_placeholder_str  s   
z7PaddleOCRVLForConditionalGeneration.get_placeholder_strr   )r   vllm_configr   c          	         s  t    |jj}t|dr'|j }g d}|D ]}||d  q|| |j	}|| _
| |d t|j|t|dd| _t||j| _W d    n1 sQw   Y  | |$ t|t|dd| _| jjjD ]}t|tswd|jj_qkW d    n1 sw   Y  | jj| _d S )	Nr   )
model_typearchitecturestie_word_embeddingsr]   visual)r   rE  r   language_model)r  r   T)r   r   model_configrf   hasattrr   to_dictr  updaterE  r   _mark_tower_modelr  rc   r6   r  r   mlp_AR_mark_language_modelr-   r  modelr  r   r3   r  
rotary_embis_neox_stylemake_empty_intermediate_tensors)	rT   r  r   r   r   unsafe_keysr  rE  layerr   rL   rM   r     s>   





z,PaddleOCRVLForConditionalGeneration.__init__r   c                 C   s   | j |S rP   )r  compute_logits)rT   r   rL   rL   rM   r    s   z2PaddleOCRVLForConditionalGeneration.compute_logitsinput_tokensmm_featuresc           *   	   C   sj  t |h d}dd |dg D }dd |dg D }|dg }| j}|j}|j}	|j}
|jj}t	|jdd	}t
|}t
||
kd
}||d
  }||k }||	k }g }d}||}}d\}}t|| D ]}d}|dkrz|||}W n ty   t|d
 }Y nw t|d
 }|dkrz||	|}W n ty   t|d
 }Y nw t|d
 }||k r|| \}}}|d
7 }|d
8 }|}n|| \}}}d	}|r|| }|d
7 }|d
8 }|}||| || } }!}"|| }#t|dkr|d  d
 nd}$|t
|#d
ddd|$  t
| dd
d|!|" | |   }%t
|!d
dd
| d|" }&t
|"d
d
d| |!d }'|t
|%|&|'g|# |$  || |! |"  }qo|t|k rt|dkr|d  d
 nd}$t|| }#|t
|#d
ddd|$  t
j|d
ddd}(|( d
 t|  })|(|)fS )N>   r   video_grid_thwsecond_per_grid_tsc                 S      g | ]}|  qS rL   r   r0  itemrL   rL   rM   r2        zQPaddleOCRVLForConditionalGeneration.get_mrope_input_positions.<locals>.<listcomp>r   c                 S   r  rL   r  r  rL   rL   rM   r2    r  r  r  tokens_per_secondrz  r,   r   )r   r   g        r   r   r  )r   gather_kwargsr|   r   r   video_token_idvision_start_token_idrc   re   r  r   r4  argwherer  sumr8  indexrD   r  rB   r   r   r   r   longr  r  r<  r  r  )*rT   r  r  rV   r   r  r  rf   r   r  r  re   r  input_tokens_tensorvision_start_indicesvision_tokens
image_nums
video_numsllm_pos_ids_liststremain_imagesremain_videosimage_indexvideo_indexr1  video_second_per_grid_ted_imageed_videor   r   r   ed
llm_grid_t
llm_grid_h
llm_grid_wtext_lenst_idxt_indexh_indexw_indexllm_positionsmrope_position_deltarL   rL   rM   get_mrope_input_positions  s   


"



"z=PaddleOCRVLForConditionalGeneration.get_mrope_input_positionsrV   c                 K   s2   | dd }| dd }|d u rd S td||dS )Nr   r   )r   r   r   )r  r   )rT   rV   r   r   rL   rL   rM   _parse_and_validate_image_input  s   zCPaddleOCRVLForConditionalGeneration._parse_and_validate_image_input	input_ids	positionsintermediate_tensorsr  c                 K   s   |d urd }|  ||||S rP   )r  )rT   r  r  r  r  rV   rL   rL   rM   r     s
   z+PaddleOCRVLForConditionalGeneration.forwardr   r   c           
      C   s   | | jj}t }t }dg}t| }t|}|| t	
|t|dd   }|| ||d |  t	j|dd|j}t	j|t	jd|j}| j|||d|d}	|	S )Nr   r,   r   r  r  T)r   r   r   r  rj  )r   r  r  r   r   r   r   r   r   r   r   r!  r  r~  r4  r  )
rT   r   r   siglip_position_idsimage_grid_hwsrj  	thw_tuplenumelimage_position_idsvision_outputsrL   rL   rM   encode_image  s,   


z0PaddleOCRVLForConditionalGeneration.encode_imageimage_inputc                    s8   |j }|j}t fddt||D } ||}|S )Nc                 3   s&    | ]\}}  ||d V  qdS )r   N)r  r  )r0  pixelr  rS   rL   rM   r`    s
    
zKPaddleOCRVLForConditionalGeneration._process_image_input.<locals>.<genexpr>)r   r   r   r   r  )rT   r   r   r   r  image_embedsrL   rS   rM   _process_image_input  s   z8PaddleOCRVLForConditionalGeneration._process_image_inputc                 K   s:   | j di |}|d u rdS d}| |}|t|7 }|S rW   )r  r#  r   )rT   rV   r   multimodal_embeddingsr"  rL   rL   rM   embed_multimodal  s   
z4PaddleOCRVLForConditionalGeneration.embed_multimodalr  c                 C   s   t | }|j|| jd}|S )N)mapper)r2   r  hf_to_vllm_mapper)rT   r  loaderautoloaded_weightsrL   rL   rM   r    s   z0PaddleOCRVLForConditionalGeneration.load_weights)NN)rv   rw   rx   r4   r'  classmethodr   rq   r  r   r   r   r   r  r   r   r   r  ry   r   r  r(   r   r  r.   r#  r%  r   r  r  r   rL   rL   r   rM   r    sf    $

t



,r  )r8   r9   r:   )jrE   collections.abcr   r   r   	functoolsr   typingr   r   numpyr   r   torch.nnr   einopsr	   transformersr
   r   transformers.activationsr   transformers.modeling_outputsr   transformers.utilsr   vllm.configr   vllm.config.multimodalr   vllm.distributedr   r   rS  $vllm.model_executor.layers.attentionr   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   'vllm.model_executor.layers.quantizationr   2vllm.model_executor.layers.rotary_embedding.commonr   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.multimodalr   vllm.multimodal.inputsr   r   r   r   vllm.multimodal.parser    r!   r"   vllm.multimodal.processingr#   r$   r%   r&   r'   vllm.sequencer(   vllm.utils.tensor_schemar)   r*   #vllm.v1.attention.backends.registryr+   ernie45r-   
interfacesr.   r/   r0   siglipr1   r2   r3   r4   r5   r6   visionr7   rq   rN   rO   rz   r   r  r   r   r   r   rB  rC  rv  r  r  r  r  register_processorr  rL   rL   rL   rM   <module>   s   
+=
D@ n5j0m