o
    
۾i_i                     @   s  U d dl mZmZ d dlmZmZmZ d dlZd dlm	Z	 d dlm
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlm Z m!Z!m"Z"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+m,Z,m-Z-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5m6Z6m7Z7 d dl8m9Z9m:Z:m;Z;m<Z< d dl=m>Z> d dl?m@Z@ d dlAmBZB d dlCmDZD d dlEmFZFmGZG d dlHmIZImJZJ d dlKmLZL ddlMmNZNmOZO d ZPG d!d" d"eIZQG d#d$ d$eIZReQeRB ZSeeTd%< G d&d' d'e5ZUG d(d) d)e7ZVG d*d+ d+e	jWZXG d,d- d-e	jWZYG d.d/ d/e	jWZZG d0d1 d1e	jWZ[G d2d3 d3e	jWZ\G d4d5 d5e	jWZ]G d6d7 d7e	jWZ^G d8d9 d9e	jWZ_e@j`e6eVeUd:G d;d< d<e	jWe-e.e,ZadS )=    )IterableMapping)	AnnotatedLiteral	TypeAliasN)	LayerNorm)Qwen2VLProcessor)
VllmConfig)BaseDummyOptions)utils)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)
SiluAndMul)MMEncoderAttention)Conv2dLayer)RMSNorm)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)ApplyRotaryEmb)default_weight_loader)MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)MultiModelKeys)Qwen2ForCausalLM)Qwen2VisionAttentionQwen2VLDummyInputsBuilderQwen2VLMultiModalProcessorQwen2VLProcessingInfo)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix)get_vit_attn_backend)MULTIMODAL_REGISTRY)MultiModalDataDict)IntermediateTensors)DotsOCRConfigDotsVisionConfig)TensorSchemaTensorShape)AttentionBackendEnum   )is_vit_use_data_parallel!run_dp_sharded_mrope_vision_modelz
<|imgpad|>c                   @   N   e Zd ZU dZed ed< eeje	ddf ed< eeje	ddf ed< d	S )
DotsOCRImagePixelInputsz
    Dimensions:
        - np: The total number of patches over each image over each prompt in
              the batch
        - ni: Number of images
        - cps: Number of channels * patch_size * patch_size
    pixel_valuestypenpcpsni   image_grid_thwN
__name__
__module____qualname____doc__r   __annotations__r   torchTensorr.    rD   rD   W/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/dots_ocr.pyr4   D   s
   
 r4   c                   @   r3   )
DotsOCRImageEmbeddingInputszu
    Dimensions:
        - nf: Number of image features
        - hs: Hidden size
        - ni: Number of images
    image_embedsr6   nfhsr9   r:   r;   Nr<   rD   rD   rD   rE   rF   S   s
   
 rF   DotsOCRImageInputsc                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )DotsOCRDummyInputsBuilder	mm_countsreturnc                 C   s   | dd}t| S )Nimager   )getIMAGE_TOKEN)selfrL   
num_imagesrD   rD   rE   get_dummy_texte   s   z(DotsOCRDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sB   | dd}| j \}}|r| dnd }d| j||||diS )NrN   r   )widthheightrR   	overrides)rO   info!get_image_size_with_most_features_get_dummy_images)rQ   rT   rL   rU   rR   target_widthtarget_heightimage_overridesrD   rD   rE   get_dummy_mm_datai   s   z+DotsOCRDummyInputsBuilder.get_dummy_mm_dataN)
r=   r>   r?   r   strintrS   r
   r)   r_   rD   rD   rD   rE   rK   d   s    
rK   c                   @   sl   e Zd ZdefddZdeeedB f fddZdedeeef deeef fd	d
Z	de
defddZdS )DotsOCRProcessingInforM   c                 C   sT   | j  }|jjdkstdt| t|dr(t|jt	r(t
di |j|_|S )Nr+   zExpected DotsOCRConfig, got vision_configrD   )ctxget_hf_config	__class__r=   	TypeErrorr6   hasattr
isinstancerd   dictr,   rQ   configrD   rD   rE   rf      s   
z#DotsOCRProcessingInfo.get_hf_configNc                 C   s   dd iS NrN   rD   rQ   rD   rD   rE   get_supported_mm_limits   s   z-DotsOCRProcessingInfo.get_supported_mm_limitsrT   rL   c                 C   s   |   }d|iS rn   )get_max_image_tokens)rQ   rT   rL   max_image_tokensrD   rD   rE   get_mm_max_tokens_per_item   s   z0DotsOCRProcessingInfo.get_mm_max_tokens_per_itemkwargsc                 K   s.   t |  _| jjtfi |}t |_d|_|S )Nz<|video_pad|>)rP   get_tokenizerimage_tokenre   get_hf_processorr   video_token)rQ   rt   	processorrD   rD   rE   rw      s   
z&DotsOCRProcessingInfo.get_hf_processor)r=   r>   r?   r+   rf   r   ra   rb   rp   rs   objectr   rw   rD   rD   rD   rE   rc      s    



rc   c                       s@   e Zd Zddededdf fddZdedejfd	d
Z  Z	S )VisionRotaryEmbedding     @dimthetarM   Nc                    s>   t    d|tjd|dtjd|   }| jd|dd d S )Ng      ?r      dtypeinv_freqF)
persistent)super__init__rB   arangefloatregister_buffer)rQ   r}   r~   r   rg   rD   rE   r      s   
 zVisionRotaryEmbedding.__init__seqlenc                 C   s*   t j|| jj| jjd}t || j}|S )Ndevicer   )rB   r   r   r   r   outer)rQ   r   seqfreqsrD   rD   rE   forward   s
   zVisionRotaryEmbedding.forward)r|   )
r=   r>   r?   rb   r   r   rB   rC   r   __classcell__rD   rD   r   rE   r{      s    r{   c                       sP   e Zd Z			ddededededd	f
 fd
dZdejdejfddZ  Z	S )PatchMergerr   	layernorm r}   context_dimspatial_merge_sizeprefixrM   Nc                    s   t    t }||d  | _|| _| jdkrt|dd| _n| jdkr+t|dd| _t	t
| j| jdd| d|d	t t| j|dd| d
|d	| _d S )Nr   r   gư>epsrmsnormTFz.0)biasreturn_biasr   
disable_tpz.2)r   r   r1   hidden_sizepre_normr   ln_qr   nn
Sequentialr   GELUr   mlp)rQ   r}   r   r   r   r   use_data_parallelr   rD   rE   r      s6   



zPatchMerger.__init__xc                 C   s<   | j r| | |d| j}|S | |d| j}|S )N)r   r   r   viewr   )rQ   r   rD   rD   rE   r      s
   zPatchMerger.forward)r   r   r   )
r=   r>   r?   rb   ra   r   rB   rC   r   r   rD   rD   r   rE   r      s     %r   c                       s   e Zd Z		dddddededed	edB d
eddf fddZ	ddddej	dej	dej	dB dej	dB dej	f
ddZ
  ZS )DotsVisionAttention   TNr   quant_configr   r}   	num_headsr   r   r   rM   c             	      s   t    t }|| _|rdnt | _|rdnt | _t	||| _
t	|| j| _t|| j
|||| d|d| _t||||| d|d| _t| j| j
| j
d | dd	| _td
d
d| _d S )Nr0   r   z.qkv)r   	head_sizetotal_num_headsr   r   r   r   z.proj)
input_sizeoutput_sizer   r   r   r   g      .attn)r   r   scaler   T)enforce_enableenable_fp32_compute)r   r   r1   	embed_dimr   tp_sizer   tp_rank
dist_utilsdividehidden_size_per_attention_head!num_attention_heads_per_partitionr   qkvr   projr   attnr   apply_rotary_emb)rQ   rm   r}   r   r   r   r   r   r   rD   rE   r      sJ   

		zDotsVisionAttention.__init__
max_seqlenhidden_states
cu_seqlensrotary_pos_embr   c                C   s  | d}| |\}}t| |\}}}	|jd }
|dddd }|dddd }|	dddd }	|d urZtj||gdd}| 	||
 | }tj|ddd\}}| j|||	||d}|dddd }||jd |
d}| |\}}|dS )Nr0   r   r   r:   r}   )querykeyvaluer   r   r   )	unsqueezer   r   	split_qkvshapepermute
contiguousrB   catr   cossinchunkr   r   r   squeeze)rQ   r   r   r   r   r   _qkvbs	qk_concat
qk_rotatedcontext_layeroutrD   rD   rE   r     s4   
	
	
zDotsVisionAttention.forward)r   Tr`   )r=   r>   r?   rb   boolr   ra   r   rB   rC   r   r   rD   rD   r   rE   r      sB    	9r   c                       sl   e Zd ZddddedB def fddZdejd	ejfd
dZde	e
eejf  d	ee fddZ  ZS )DotsSwiGLUFFNNr   r   r   r   c                   sl   t    |j}|j}|j}t }t||gd ||| d|d| _t||||| d|d| _	t
 | _d S )Nr   z.fc13)r   r   r   r   z.fc2)r   r   intermediate_sizer   use_biasr1   r   fc13r   fc2r   act_fn)rQ   rm   r   r   hidden_featuresin_featuresr   r   r   rD   rE   r   A  s,   
zDotsSwiGLUFFN.__init__r   rM   c                 C   s*   |  |\}}| |}| |\}}|S r`   )r   r   r   )rQ   r   r   rD   rD   rE   r   a  s   
zDotsSwiGLUFFN.forwardweightsc                 C   s   ddg}t |  }t }|D ]M\}}|D ](\}}}	||vrq|||}|dr/||vr/q|| }
|
j}||
||	  n|drH||vrHq|| }
t|
dt}||
| || q|S )N)r   fc1r   )r   fc3r0   z.biasweight_loader)	rk   named_parameterssetreplaceendswithr   getattrr   add)rQ   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr   rD   rD   rE   load_weightsg  s.   
zDotsSwiGLUFFN.load_weights)r=   r>   r?   r   ra   r   rB   rC   r   r   tupler   r   r   rD   rD   r   rE   r   @  s     ,r   c                       4   e Zd Z fddZddejdejfddZ  ZS )	DotsPatchEmbedc                    sl   t    |j| _|j| _|j| _|j| _|| _t|j|j|j|jf|j|jfd| _t	|j|j
d| _d S )N)kernel_sizestrider   )r   r   num_channels
patch_sizetemporal_patch_sizer   rm   r   r   r   rms_norm_epsnormrl   r   rD   rE   r     s   


zDotsPatchEmbed.__init__Nr   rM   c                 C   sN   | d| j| j| j| jd d d d df }| | d| j}| |}|S )Nr   r   )r   r   r  r   r   r   r  )rQ   r   grid_thwrD   rD   rE   r     s   
zDotsPatchEmbed.forwardr`   r=   r>   r?   r   rB   rC   r   r   rD   rD   r   rE   r     s     r   c                       r   )	DotsViTPreprocessorc                    s6   t    |j| _|j| _|j| _|| _t|| _d S r`   )	r   r   r   patch_hpatch_wr   rm   r   
patchifierrl   r   rD   rE   r     s   
zDotsViTPreprocessor.__init__Nr   rM   c                 C   s   |  ||}|S r`   )r	  )rQ   r   r  tokensrD   rD   rE   r     s   zDotsViTPreprocessor.forwardr`   r  rD   rD   r   rE   r    s     r  c                       sb   e Zd ZddddedB def fddZddd	ejd
ejdejdedB dejf
ddZ	  Z
S )DotsVisionBlockNr   r   r   r   c                   sj   t    t||j|j|j|| dd| _t|j|jd| _	t
||| dd| _t|j|jd| _d S )Nr   )r   r   r   r   r   z.mlpr   )r   r   r   r   num_attention_headsr   r   r   r  norm1r   r   norm2)rQ   rm   r   r   r   rD   rE   r     s    
zDotsVisionBlock.__init__r   r   r   r   r   rM   c                C   s4   || j | ||||d }|| | | }|S )Nr   r   r   )r   r  r   r  )rQ   r   r   r   r   rD   rD   rE   r     s   zDotsVisionBlock.forward)r=   r>   r?   r   ra   r   rB   rC   rb   r   r   rD   rD   r   rE   r    s(    r  c                       s   e Zd Z	ddddddededB dedB dedB ded	df fd
dZe	d	e
jfddZe	d	e
jfddZdeee  d	ee
j fddZdeee  d	e
jfddZde
jd	edB fddZde
jdeee  d	e
jfddZ  ZS )DotsVisionTransformerNr   )num_hidden_layers_overriderequire_post_normr   rm   r   r  r  r   rM   c                   s   t     | _ j| _t | _ j j }t|d | _	t
|t d| _ j| _|d u r2 jn|}t fddt|D | _|d u rQt| j jk}|ra| jjrat j jd| _nd | _t j j jd| _d S )Nr   )r   r   c                    s$   g | ]}t   d | dqS )z.blocks.r   )r  ).0irm   r   r   rD   rE   
<listcomp>  s    z2DotsVisionTransformer.__init__.<locals>.<listcomp>r   )r}   r   r   )r   r   rm   r   r  patch_embedr   r  r{   r   r'   rB   get_default_dtypeattn_backendr   out_hidden_sizenum_hidden_layersr   
ModuleListrangeblockslen	post_normr   r  post_trunk_normr   merger)rQ   rm   r   r  r  r   head_dim
num_layersr   r  rE   r     s<   
	

zDotsVisionTransformer.__init__c                 C      | j jjjjS r`   )r  r	  r   weightr   ro   rD   rD   rE   r        zDotsVisionTransformer.dtypec                 C   r%  r`   )r  r	  r   r&  r   ro   rD   rD   rE   r     r'  zDotsVisionTransformer.devicer  c                 C   s   g }|D ]e\}}}t |dd|}||| j | j|| j | j}|dddd}| }t |d|d}||| j | j|| j | j}|dddd}| }|t j	||gdd
|d q|S )Nr0   r   r   r   r:   r   )rB   r   r   expandreshaper   r   flattenappendstackrepeat)rQ   r  pos_idsthwhpos_idswpos_idsrD   rD   rE   get_pos_ids_by_grid  s,   "z)DotsVisionTransformer.get_pos_ids_by_gridc                 C   sF   |  |}tj|dd}tdd |D }| |}|| d}|S )Nr   r   c                 s   s     | ]\}}}t ||V  qd S r`   )max)r  r   r0  r1  rD   rD   rE   	<genexpr>6  s    z4DotsVisionTransformer.rot_pos_emb.<locals>.<genexpr>r0   )r4  rB   r   r5  r   r*  )rQ   r  r.  max_grid_sizerotary_pos_emb_fullr   rD   rD   rE   rot_pos_emb3  s   

z!DotsVisionTransformer.rot_pos_embr   c                 C   s<   d }| j tjks| j tjkr|dd  |d d   }|S )Nr0   r   )r  r/   
FLASH_ATTNROCM_AITER_FAr5  )rQ   r   r   rD   rD   rE   compute_attn_mask_seqlen;  s
   z.DotsVisionTransformer.compute_attn_mask_seqlenr   c                 C   s   |  |}tj||jtjd}|| j}| ||}t|d d df |d d df  |d d df j	dtj
 r>|jntjd}t|d|g}| |}| jD ]
}|||||d}qU| jd urj| |}| |}|S )Nr   r0   r   r   )r}   r   r  )r9  rB   tensorr   longtor   r  repeat_interleavecumsumjit
is_tracingint32r   	new_zerosr<  r  r!  r"  )rQ   r   r  r   r   r   blkrD   rD   rE   r   D  s0   
,




zDotsVisionTransformer.forwardr`   )r=   r>   r?   r,   r   rb   r   ra   r   propertyrB   r   r   listrC   r4  r9  r<  r   r   rD   rD   r   rE   r    sB    3 	
r  )rY   dummy_inputsc                       sr  e Zd ZedddddddZg dd	d
gdgddgdZdZedede	dedB fddZ
dddedef fddZdededB fddZdedeejd f fd!d"Zd#e	de	fd$d%Zd&e	de	fd'd(Zdedefd)d*Z		d9d+ejdB d,ejd-edB d.ejdB dejeB f
d/d0Zd1ejdejdB fd2d3Zd4eeeejf  dee fd5d6Zdefd7d8Z  Z S ):DotsOCRForCausalLMz
.attn.qkv.z.attn.proj.)z.attn.qkv_proj.z.attn.out_proj.zlanguage_model.lm_head.zlanguage_model.model.)zlm_head.zmodel.)orig_to_new_substrorig_to_new_prefix)q_projk_projv_proj	gate_projup_proj	.attn.qkvr   r   )qkv_projgate_up_projrR  r   Tmodalityr  rM   Nc                 C   s   | drdS d S )NrN   z<|img|><|imgpad|><|endofimg|>)
startswith)clsrU  r  rD   rD   rE   get_placeholder_str  s   
z&DotsOCRForCausalLM.get_placeholder_strr   )r   vllm_configr   c                   s   t    |jj| _|j| _|jj}|jdk| _t	| jj
tr-tdi | jj
}|| j_
n| jj
}| |d t|| jt|dd| _W d    n1 sNw   Y  | | t|| jt|ddgd| _W d    n1 sqw   Y  | jj| _d S )	NdatarN   vision_towerr   language_modelr   )rY  	hf_configr   architecturesrD   )r   r   model_configr]  rm   r   multimodal_configmm_encoder_tp_moder   rj   rd   rk   r,   _mark_tower_modelr  r&   r[  _mark_language_modelr%   r\  make_empty_intermediate_tensors)rQ   rY  r   r`  rd   r   rD   rE   r     s4   




	zDotsOCRForCausalLM.__init__rt   c                 K   sh   | dd }| dd }| dd }|d u r|d u rd S |d ur'td||dS |d ur2td||dS d S )Nr5   rG   r;   )r6   r5   r;   )r6   rG   r;   )popr4   rF   )rQ   rt   r5   rG   r;   rD   rD   rE   _parse_and_validate_image_input  s$   z2DotsOCRForCausalLM._parse_and_validate_image_inputimage_input.c                 C   s   |d }|j dksJ | }|d dkr|d | jj}n%|d | jj}| jr4t| j||ddS | ||d d d | jjf }| jj	}t
j|t
jdd	||   }||S )
Nr;   r   r6   rG   r5   rope_3d)	rope_typer   r   )ndimtolistr6   r[  r   r   r2   rm   r   r   rB   r=  r>  prodsplit)rQ   rg  r  grid_thw_listrG   r5   
merge_sizesizesrD   rD   rE   _process_image_input  s.   

z'DotsOCRForCausalLM._process_image_inputnum_image_tokensc                 C   s   | j j}||d  S Nr   r[  r   )rQ   rr  ro  rD   rD   rE   get_num_mm_encoder_tokens     z,DotsOCRForCausalLM.get_num_mm_encoder_tokensnum_vision_tokensc                 C   s   | j j}||d  S rs  rt  )rQ   rw  ro  rD   rD   rE   get_num_mm_connector_tokens  rv  z.DotsOCRForCausalLM.get_num_mm_connector_tokensc                 K   s*   | j di |}|d u rg S | |}|S )NrD   )rf  rq  )rQ   rt   rg  vision_embeddingsrD   rD   rE   embed_multimodal  s
   
z#DotsOCRForCausalLM.embed_multimodal	input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s"   |d urd }| j ||||d}|S )N)r{  r|  r}  r~  )r\  )rQ   r{  r|  r}  r~  rt   r   rD   rD   rE   r     s   zDotsOCRForCausalLM.forwardr   c                 C   s   | j |S r`   )r\  compute_logits)rQ   r   rD   rD   rE   r    s   z!DotsOCRForCausalLM.compute_logitsr   c                 C   s   t | }|j|| jdS )N)mapper)r#   r   hf_to_vllm_mapper)rQ   r   loaderrD   rD   rE   r     s   zDotsOCRForCausalLM.load_weightsc                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        r\  zvision_tower.mergerzvision_tower.)r\  	connectortower_model)r   from_string_fieldro   rD   rD   rE   get_mm_mapping  s
   z!DotsOCRForCausalLM.get_mm_mapping)NN)!r=   r>   r?   r$   r  packed_modules_mappingsupports_encoder_tp_dataclassmethodra   rb   rX  r	   r   rz   rJ   rf  r   rB   rC   rq  ru  rx  r   rz  r*   r   r  r   r   r   r   r  r   rD   rD   r   rE   rJ  f  sj     

!

$rJ  )bcollections.abcr   r   typingr   r   r   rB   torch.nnr   r   transformers.models.qwen2_vlr   vllm.configr	   vllm.config.multimodalr
   vllm.distributedr   r   vllm.distributed.parallel_stater   r   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   vllm.model_executor.layers.convr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   r   'vllm.model_executor.layers.quantizationr   2vllm.model_executor.layers.rotary_embedding.commonr   -vllm.model_executor.model_loader.weight_utilsr   %vllm.model_executor.models.interfacesr   r   r   r   )vllm.model_executor.models.module_mappingr    vllm.model_executor.models.qwen2r   #vllm.model_executor.models.qwen2_vlr   r    r!   r"    vllm.model_executor.models.utilsr#   r$   r%   r&   !vllm.model_executor.models.visionr'   vllm.multimodalr(   vllm.multimodal.inputsr)   vllm.sequencer*   'vllm.transformers_utils.configs.dotsocrr+   r,   vllm.utils.tensor_schemar-   r.   #vllm.v1.attention.backends.registryr/   visionr1   r2   rP   r4   rF   rJ   rA   rK   rc   Moduler{   r   r   r   r   r  r  r  register_processorrJ  rD   rD   rD   rE   <module>   sh   $.`F, 
