o
    پi                     @   s  U d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
mZmZmZmZ ddlZddlm  mZ ddlZddlmZmZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z)m*Z* ddl+m,Z, ee-d e-d de.d f Z/ee0d< e-e eB e.edf B Z1ee0d< e2e3Z4de/dejfddZ5de/de6fddZ7dejde/dejdejfddZ8d ejd!e-e9 dejfd"d#Z:d$ejdejde/d%e9e-e9 B dejf
d&d'Z;G d(d) d)ej<Z=G d*d+ d+ej<Z>G d,d- d-ej<Z?d.ejd/ejd0ejd1ee9e9f d2ee9e9f dejfd3d4Z@G d5d6 d6ej<ZAd7ejd8e9deejee9e9f f fd9d:ZBd;ejd8e9d<ee9e9f d=ee9e9f dejf
d>d?ZCG d@dA dAej<ZDG dBdC dCej<ZEdDdE ZFG dFdG dGej<ZG		HddIe9fdJdKZHddIe9fdLdMZIdNdO ZJG dPdQ dQej<ZKG dRdS dSejj<ZLejMjNdTdU ZOG dVdW dWej<ZPG dXdY dYejjQZRG dZd[ d[ej<ZSG d\d] d]ej<ZTG d^d_ d_ej<ZUeVdi d`dadbdHdcdddedddfdgdhdidjdidkdldmdndodpdqdpdrdldsdtdudtdvdwdxdydzg ZWd{d| ZXG d}d~ d~ej<ZYG dd dej<ZZ	a		y	n			dde9de9dee9de9de9de9fddZ[G dd dej<Z\e\gZ]dS )zAInference-only Apertus model compatible with HuggingFace weights.    N)partial)IterableListOptionalSetTupleType	TypeAliasUnion)Tensornn)get_rel_pos)DeepseekVLV2Config)QuantizationConfig)/MultiModalityDataPaddingPatternMultimodalTokensgeneral_mm_embed_routine)MultimodalDataItemMultimodalInputs)ForwardBatch)default_weight_loader)DeepseekForCausalLM)DeepseekV2ForCausalLMDeepseekV3ForCausalLM)maybe_prefixNestedTensorstorch.Tensor)r   ..MultiModalEmbeddings
embeddingsreturnc                 C   s0   t | tjr| ddS ttdd | D S )z`
    Recursively flattens and concatenates NestedTensors on all but the last
    dimension.
    r   c                 s       | ]}t |V  qd S N)_flatten_embeddings).0t r%   R/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/deepseek_ocr.py	<genexpr>C       z&_flatten_embeddings.<locals>.<genexpr>)
isinstancetorchr   flattencattupler   r%   r%   r&   r"   9   s   r"   c                 C   s>   t | tjrddd | jdd D S ddd | D S )	ze
    Constructs a debugging representation of the number of embeddings in the
    NestedTensors.
    z x c                 S   s   g | ]}t |qS r%   )str)r#   dimr%   r%   r&   
<listcomp>M   s    z/_embedding_count_expression.<locals>.<listcomp>Nz + c                 s   r    r!   )_embedding_count_expression)r#   innerr%   r%   r&   r'   O   r(   z._embedding_count_expression.<locals>.<genexpr>)r)   r*   r   joinshaper.   r%   r%   r&   r3   F   s   r3   inputs_embedsmultimodal_embeddingsis_multimodalc           	      C   s   t |dkr| S t|}| j}z| |d|j|d W | S  tyQ } z&t |}|  }||krHt	|}t
d| d| d| d|t
d|d	}~ww )
z
    Merge `multimodal_embeddings` into `inputs_embeds` by overwriting the
    positions in `inputs_embeds` corresponding to placeholder tokens in
    `input_ids`.

    Note:
        This updates `inputs_embeds` in place.
    r   r2   dtypezAttempted to assign z = z multimodal tokens to z placeholdersz%Error during masked scatter operationN)lenr"   r;   masked_scatter_	unsqueezetoRuntimeErrorsumitemr3   
ValueError)	r7   r8   r9   mm_embeds_flatinput_dtypeenum_actual_tokensnum_expected_tokensexprr%   r%   r&   _merge_multimodal_embeddingsR   s0   
rJ   elementstest_elements_listc                 C   s&   t j|ddj| jdd}t | |S )NT)
pin_memory)devicenon_blocking)r*   tensorr?   rN   isin)rK   rL   test_elementsr%   r%   r&   	isin_list|   s   rS   	input_idsplaceholder_token_idc                 C   s,   t |trt| |}n| |k}t|||dS )a  
    Merge `multimodal_embeddings` into `inputs_embeds` by overwriting the
    positions in `inputs_embeds` corresponding to placeholder tokens in
    `input_ids`.

    `placeholder_token_id` can be a list of token ids (e.g, token ids
    of img_start, img_break, and img_end tokens) when needed: This means
    the order of these tokens in the `input_ids` MUST MATCH the order of
    their embeddings in `multimodal_embeddings` since we need to
    slice-merge instead of individually scattering.

    For example, if input_ids is "TTTTTSIIIBIIIBIIIETTT", where
    - T is text token
    - S is image start token
    - I is image embedding token
    - B is image break token
    - E is image end token.

    Then the image embeddings (that correspond to I's) from vision encoder
    must be padded with embeddings of S, B, and E in the same order of
    input_ids for a correct embedding merge.

    Note:
        This updates `inputs_embeds` in place.
    )r8   r9   )r)   listrS   rJ   )rT   r7   r8   rU   r9   r%   r%   r&   merge_multimodal_embeddings   s   
rW   c                       s,   e Zd Z			d fdd	Zdd Z  ZS )MlpProjector      c                    sx  || _ || _|| _|| _d| _d| _t   |dkr!t	 }n|dkr-t
||}n|dkrY|}t
||g}td|D ]}	|t  |t
|| q?tj| }n^|dkr|}|}t|| | t
|| | || g}td|d D ]}	|t  |t
|| ||  q}|t  |t
|| | tj| }n
|dkr|}|}t
|| | || g}td|d D ]}	|t  |t
|| ||  q|t  |t
|| | tj| }n|dkr2|}t
||d	 | _t
||d	 | _g }td|D ]}	|t  |t
|| qtj| }n|d
krx|}d}
t
|d t||
 | _t
|d |t||
  | _g }td|D ]}	|t  |t
|| q^tj| }n?|dkr|}g }td|D ]}	|t  |t
|d	 |d	  qtj| }tj| | _t|| _ntd| || _d S )NFidentitylinearmlp_gelurY   normlayer_downsample_mlp_geludownsample_mlp_gelulow_high_hybrid_split_mlp_gelu   hybrid_split_feature_mlp_gelu      ?r   low_high_split_mlp_geluzUnknown projector type: )projector_type	input_dimn_embeddepthtoken_poolingconv_fusion_high_low_featuressuper__init__r   IdentityLinearrangeappendGELU
Sequential	LayerNormhigh_up_projlow_up_projinthigh_layerscopydeepcopy
low_layersrC   layers)selfre   rf   rg   rh   	mlp_ratiodownsample_ratiomodules	mlp_depth_channel_div	__class__r%   r&   rl      s   	







zMlpProjector.__init__c              	   C   s  | j r]|j\}}}t|d  }}|||||}|dddd}|dddddd}| \}}}}	}
}
| ||||	 d}|dddd }||||	 |d }| |}| j	rs| 
|d d df |d d df  }| jdkr|d |d }}| |}| |}tj||gdd	}| jd
kr|dd | jd f }|d| jd d f }| |}| |}tj||gdd	}| jdkr|d |d }}| |}| |}tj||gdd	}|S | jdks| jdkrF|j\}}}t|d  }}	 || j r| j|| j  }nd}|||||}|dkr+t|ddd|d|fdd}	 |dddd}tj|| j| jdd}|ddd}| |S )Nrc   r      rY   ra   r2   rZ   r`   r0   rb   .rd   r_   r^   constantkernel_sizestridepadding)ri   r6   rv   viewpermuteunfoldsize
contiguoustoken_pooling_layerrj   fusion_layerre   rt   ru   r*   concatrf   rw   rz   r~   reshapeFpadr{   )r|   x
batch_sizewxhchannelswhpatches	h_patches	w_patchesr   high_xlow_xbshwrf   r   r%   r%   r&   forward  sj   
&











zMlpProjector.forward)rY   rY   rZ   __name__
__module____qualname__rl   r   __classcell__r%   r%   r   r&   rX      s    irX   c                       sB   e Zd Zddededdf fddZdejdejfd	d
Z  Z	S )LayerNorm2dư>num_channelsepsr   Nc                    s8   t    tt|| _tt|| _|| _	d S r!   )
rk   rl   r   	Parameterr*   onesweightzerosbiasr   )r|   r   r   r   r%   r&   rl   e  s   

zLayerNorm2d.__init__r   c                 C   sn   |j ddd}|| dj ddd}|| t|| j  }| jd d d d f | | jd d d d f  }|S )NrY   T)keepdimra   )meanpowr*   sqrtr   r   r   )r|   r   usr%   r%   r&   r   k  s
   ,zLayerNorm2d.forward)r   )
r   r   r   rv   floatrl   r*   r   r   r   r%   r%   r   r&   r   d  s    r   c                	       sP   e Zd Zejfdededeej ddf fddZde	j
de	j
fd	d
Z  ZS )MLPBlockembedding_dimmlp_dimactr   Nc                    s2   t    t||| _t||| _| | _d S r!   )rk   rl   r   rn   lin1lin2r   )r|   r   r   r   r   r%   r&   rl   t  s   
zMLPBlock.__init__r   c                 C   s   |  | | |S r!   )r   r   r   r|   r   r%   r%   r&   r     s   zMLPBlock.forward)r   r   r   r   rq   rv   r   Modulerl   r*   r   r   r   r%   r%   r   r&   r   s  s    r   q	rel_pos_h	rel_pos_wq_sizek_sizec                 C   s   |\}}|\}}t |||}	t |||}
| j\}}}| ||||}td||	}td||
}|d}|d}|||| |d}|||| d|}||fS )a  
    Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py   # noqa B950
    Args:
        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).
    Returns:
        attn (Tensor): attention map with added relative positional embeddings.
    zbhwc,hkc->bhwkzbhwc,wkc->bhwkr2   r   rY   )r   r6   r   r*   einsumr>   )r   r   r   r   r   q_hq_wk_hk_wRhRwBr   r0   r_qrel_hrel_wr%   r%   r&   add_decomposed_rel_pos  s   

r   c                       sl   e Zd ZdZ					ddededed	ed
edeeeef  ddf fddZde	j
de	j
fddZ  ZS )	Attentionz=Multi-head Attention block with relative position embeddings.   TFNr0   	num_headsqkv_biasuse_rel_posrel_pos_zero_init
input_sizer   c                    s   t    || _|| }|d | _tj||d |d| _t||| _|| _| jrS|dus1J dt	t
d|d  d || _t	t
d|d  d || _dS dS )	a  
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads.
            qkv_bias (bool):  If True, add a learnable bias to query, key, value.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            input_size (tuple(int, int) or None): Input resolution for calculating the relative
                positional parameter size.
        g      r   r   NzBInput size must be provided if using relative positional encoding.ra   r   rY   )rk   rl   r   scaler   rn   qkvprojr   r   r*   r   r   r   )r|   r0   r   r   r   r   r   head_dimr   r%   r&   rl     s   


 $zAttention.__init__r   c              	   C   s  |j \}}}}| |||| d| jdddddd}|d|| j || dd\}}}	d\}
}| jrGt|| j| j	||f||f\}
}|
|| j|| d}|
|| j|| d}|	
|| j|| d}	| jr|

|| j|
d|
d|
d}
|
|| j|d|d|d}|
| 
|| j|
d|
d|d }tjjj|||	|d}n	tjj|||	}|
|| j||dddddd|||d}| |}|S )	Nr   r2   ra   r   rY   rZ   )NN	attn_mask)r6   r   r   r   r   unbindr   r   r   r   r   r   r*   r   
functionalscaled_dot_product_attentionr   )r|   r   r   HWr   r   r   kvr   r   	attn_biasr%   r%   r&   r     s@   *& 
zAttention.forward)r   TFTN)r   r   r   __doc__rv   boolr   r   rl   r*   r   r   r   r%   r%   r   r&   r     s.    #r   r   window_sizec              	   C   s   | j \}}}}|||  | }|||  | }|dks|dkr+t| ddd|d|f} || || }}	| ||| ||	| ||} | dddddd d|||}
|
||	ffS )aT  
    Partition into non-overlapping windows with padding if needed.
    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.
    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition
    r   rY   r   ra   rZ      r2   )r6   r   r   r   r   r   )r   r   r   r   r   Cpad_hpad_wHpWpwindowsr%   r%   r&   window_partition  s   "r   r   pad_hwr   c           
      C   s   |\}}|\}}| j d || | |  }| ||| || ||d}	|	dddddd |||d}	||ks=||krO|	ddd|d|ddf  }	|	S )	a  
    Window unpartition into original sequences and removing padding.
    Args:
        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
        window_size (int): window size.
        pad_hw (Tuple): padded height and width (Hp, Wp).
        hw (Tuple): original height and width (H, W) before padding.
    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    r   r2   rY   r   ra   rZ   r   N)r6   r   r   r   )
r   r   r   r   r   r   r   r   r   r   r%   r%   r&   window_unpartition  s   $$r   c                       s   e Zd ZdZddejejddddfdeded	ed
e	de
ej de
ej de	de	dedeeeef  ddf fddZdejdejfddZ  ZS )BlockzSTransformer blocks with support of window attention and residual propagation blocks      @TFr   Nr0   r   r}   r   
norm_layer	act_layerr   r   r   r   r   c                    sf   t    ||| _t||||||	dkr|
n|	|	fd| _||| _t|t|| |d| _|	| _	dS )ai  
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks. If it equals 0, then
                use global attention.
            input_size (tuple(int, int) or None): Input resolution for calculating the relative
                positional parameter size.
        r   )r   r   r   r   r   )r   r   r   N)
rk   rl   norm1r   attnnorm2r   rv   mlpr   )r|   r0   r   r}   r   r   r   r   r   r   r   r   r%   r&   rl   7  s   


	
zBlock.__init__r   c                 C   s   |}|  |}| jdkr|jd |jd }}t|| j\}}| |}| jdkr3t|| j|||f}|| }|| | | }|S )Nr   rY   ra   )r   r   r6   r   r   r   r   r   )r|   r   shortcutr   r   r   r%   r%   r&   r   e  s   



zBlock.forward)r   r   r   r   r   rs   rq   rv   r   r   r   r   r   r   rl   r*   r   r   r   r%   r%   r   r&   r   4  sD    	
.r   c                       st   e Zd ZdZ					ddeeef deeef deeef d	ed
eddf fddZdejdejfddZ	  Z
S )
PatchEmbedz#
    Image to Patch Embedding.
       r  r   r   r      r   r   r   in_chans	embed_dimr   Nc                    s$   t    tj|||||d| _dS )aP  
        Args:
            kernel_size (Tuple): kernel size of the projection layer.
            stride (Tuple): stride of the projection layer.
            padding (Tuple): padding size of the projection layer.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
        r   N)rk   rl   r   Conv2dr   )r|   r   r   r   r  r  r   r%   r&   rl   }  s   

zPatchEmbed.__init__r   c                 C   s   |  |}|dddd}|S )Nr   ra   r   rY   )r   r   r   r%   r%   r&   r     s   
zPatchEmbed.forward)r   r   r  r   r  )r   r   r   r   r   rv   rl   r*   r   r   r   r%   r%   r   r&   r   x  s*    


r   c                 C   sj   | j }| d}||kr3| dddd}|tj}tj|||fdddd|}|dddd}|S | S )	NrY   r   r   ra   bicubicTFr   mode	antialiasalign_corners)r;   r   r   r?   r*   float32r   interpolate)abs_postgt_sizer;   src_sizeold_pos_embednew_pos_embedr%   r%   r&   get_abs_pos_sam  s"   
r  c                %       s   e Zd Zdddddddddejejdd	dd
ddfdededededededededede	ej
 de	ej
 dededededeedf deddf$ fd d!Zd"ejdejfd#d$Z  ZS )%ImageEncoderViT   r  r   r     r      TFr   r%   img_size
patch_sizer  r  rh   r   r}   	out_chansr   r   r   use_abs_posr   r   r   global_attn_indexes.net_3_out_channelsr   Nc                    s  t    || _t||f||f||d| _d| _|r*tt	d|| || || _t
 | _t|D ]"}t||||	|
|||||vrD|nd|| || fd
}| j| q3ttj||dddt|tj||dddd	t|| _tjd
dddddd| _tjd|ddddd| _dS )a  
        Args:
            img_size (int): Input image size.
            patch_size (int): Patch size.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
            depth (int): Depth of ViT.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_abs_pos (bool): If True, use absolute positional embeddings.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks.
            global_attn_indexes (list): Indexes for blocks using global attention.
        )r   r   r  r  NrY   r   )
r0   r   r}   r   r   r   r   r   r   r   F)r   r   r   )r   r   r   r  i   ra   )r   r   r   r   )rk   rl   r  r   patch_embed	pos_embedr   r   r*   r   
ModuleListblocksro   r   rp   rr   r  r   necknet_2net_3)r|   r  r  r  r  rh   r   r}   r  r   r   r   r  r   r   r   r  r  iblockr   r%   r&   rl     sf   
&
zImageEncoderViT.__init__r   c                 C   sp   |  |}| jd ur|t| j|d }| jD ]}||}q| |dddd}| |}| |	 }|S )NrY   r   r   ra   )
r  r  r  r   r!  r"  r   r#  r$  clone)r|   r   blkx2x3r%   r%   r&   r     s   




zImageEncoderViT.forward)r   r   r   r   rs   rq   rv   r   r   r   r   r   rl   r*   r   r   r   r%   r%   r   r&   r    sp    	

`r  r  r  c                 C   st   d}d}d}t || |dttjjdd||dd|d||d	}	|	  |d ur8t|}
|	jd
d |
 D dd |	S )Nr  r  r  rZ   r   r   T   )rh   r  r  r}   r   r   r  r   r   r  r   r  r  c                 S   s&   i | ]\}}d |v r|dd |qS )vision_tower_high   Nr%   )r#   r   r   r%   r%   r&   
<dictcomp>>  s   & z_build_sam.<locals>.<dictcomp>strict)	r  r   r*   r   rs   evalloadload_state_dictitems)encoder_embed_dimencoder_depthencoder_num_headsencoder_global_attn_indexes
checkpointr  prompt_embed_dim
image_sizevit_patch_sizeimage_encoder
state_dictr%   r%   r&   
_build_sam   s4   
r@  c                 C   s   t dddg d| |dS )Nr  r  )ra   r   r      )r6  r7  r8  r9  r:  r  )r@  )r:  r  r%   r%   r&   build_sam_vit_bD  s   rB  c           
      C   s   |  d}| d}|d d |dd  }}tt|jd d }tt|}| j}||kr||d|||dddd	 }|
tj}tj|||fdddd	
|}|dddd}||| |}tj||gdd
}	|	d|| d |}	|	S | S )Nr2   r   rY   r   ra   r  TFr  r   )r   squeezerv   mathr   r6   r;   r   r   r   r?   r*   r  r   r  r,   )
r  r  r0   abs_pos_new	cls_tokenr  r  r;   r  vision_pos_embedr%   r%   r&   get_abs_posO  s6   

rH  c                       s&   e Zd Zd	 fdd	Zdd Z  ZS )
CLIPVisionEmbeddingsr     r,  r   c                    s   t    || _|| _|| _tjt| j| _	tjj
|| j| j| jdd| _| j| j d | _| jd | _tj| j| j| _| dt| jd d S )NF)in_channelsout_channelsr   r   r   ra   rY   position_ids)rY   r2   )rk   rl   r  r<  r  r*   r   r   randnclass_embeddingr  patch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferarangeexpand)r|   hidden_sizer<  r  r   r   r%   r&   rl   s  s$   
zCLIPVisionEmbeddings.__init__c                 C   sv   |j d }|d ur|}n| |}|ddd}| j|dd}tj||gdd}|t| 	| j
|d }|S )Nr   ra   rY   r2   r   )r6   rP  r+   	transposerO  rW  r*   r,   rH  rT  rM  r   )r|   pixel_valuespatch_embedsr   class_embedsr   r%   r%   r&   r     s   

zCLIPVisionEmbeddings.forward)r  rJ  r,  r   r   r%   r%   r   r&   rI  r  s    rI  c                       s,   e Zd Z fddZdejfddZ  ZS )NoTPAttentionc                    s   t    |d | _|d | _|d |d  | _|d | _|d | _tjj	|d |d d dd| _
tjj	|d |d dd| _|d | _d S )	Nnum_attention_headsrX  
seq_lengthuse_flash_attnr   Tr   attention_dropout)rk   rl   r   n_local_headsr   max_seq_lenuse_flash_attentionr*   r   rn   qkv_projout_proj	attn_drop)r|   cfgr   r%   r&   rl     s   




zNoTPAttention.__init__r   c           
      C   sj  |j \}}}| |}|||d| j| j}| jrdtj|ddd\}}}|d}|d}|d}|	dddd}|	dddd}|	dddd}tj
jj|||d d}	|		dddd||d}	nJtj|ddd\}}}|d}|d}|d}|	dddd}|	dddd}|	dddd}tj
jj|||d d}	|		dddd||d}	| |	}	|	S )Nr   rY   ra   r   r   r   r2   )r6   re  r   r   r   rd  r*   splitrC  r   r   r   r   r   rf  )
r|   r   bszseqlenr   xqkvxqxkxvoutputr%   r%   r&   r     s8   







zNoTPAttention.forward)r   r   r   rl   r*   r   r   r   r%   r%   r   r&   r]    s
    r]  c                 C   s   | t d|   S )NgZd;?)r*   sigmoid)r   r%   r%   r&   
quick_gelu  s   rr  c                       s.   e Zd Zdedef fddZdd Z  ZS )NoTPFeedForwardr0   
hidden_dimc                    s6   t    tjj||dd| _tjj||dd| _d S )NTr   )rk   rl   r*   r   rn   fc1fc2)r|   rh  r0   rt  r   r%   r&   rl     s   
zNoTPFeedForward.__init__c                 C   s   |  t| |}|S r!   )rv  rr  ru  )r|   r   rp  r%   r%   r&   r     s   zNoTPFeedForward.forward)r   r   r   rv   rl   r   r   r%   r%   r   r&   rs    s    rs  c                       s(   e Zd ZdZdejf fddZ  ZS )LayerNormfp32z*Subclass torch's LayerNorm to handle fp16.r   c                    s$   |j }t |tj}||S r!   )r;   rk   r   typer*   r  )r|   r   	orig_typeretr   r%   r&   r     s   
zLayerNormfp32.forward)r   r   r   r   r*   r   r   r   r%   r%   r   r&   rw    s    rw  c                       s4   e Zd Zddef fddZdejfddZ  ZS )	NoTPTransformerBlockr  layer_idc                    s   t    |d | _|d | _|d |d  | _t|| _t||d |d d| _|| _	t
jj|d |d d| _t
jj|d |d d| _d S )Nr^  rX  ffn_hidden_size)r0   rt  layernorm_epsilonr+  )rk   rl   n_headsr0   r   r]  	self_attnrs  r   r|  r*   r   rs   layer_norm1layer_norm2)r|   rh  r|  multiple_ofr   r%   r&   rl     s   



zNoTPTransformerBlock.__init__r   c                 C   s4   | j | |}|| }|| j| | }|S r!   )r  r   r  r   r  )r|   r   residualr   outr%   r%   r&   r     s   zNoTPTransformerBlock.forward)r  )	r   r   r   rv   rl   r*   r   r   r   r%   r%   r   r&   r{    s    r{  c                       s$   e Zd Z fddZdd Z  ZS )NoTPTransformerc                    sP   t    || _|d | _tj | _t| jD ]}| j	t
||d  qd S )N
num_layersrY   )rk   rl   rh  r  r*   r   r   r{   ro   rp   r{  )r|   rh  r|  r   r%   r&   rl     s   

zNoTPTransformer.__init__c                 C   s   | j D ]}||}q|S r!   )r{   )r|   hidden_stateslayerr%   r%   r&   r   #  s   

zNoTPTransformer.forwardr   r%   r%   r   r&   r    s    r  c                       sJ   e Zd Zdd fddZedd Zdd	 Zdefd
dZdd Z	  Z
S )VitModelFr   Nc                    s   t    t|d |d |d d| _|r"| j D ]\}}d|_qt|d| _|ddrAt	
d t|d |d	d
d| _ntjj|d |d	d
d| _|r_| j D ]\}}d|_qW|  D ]}d|_qcd S )NrX  r<  r  )rX  r<  r  F)rh  fp32normzLoad fp32 layernorm for ViT.pre_layernorm_epsilonh㈵>r+  T)rk   rl   rI  r   named_parametersrequires_gradr  transformergetloggerinforw  pre_layrnormr*   r   rs   
parametersmicro_dp)r|   rh  freeze_embedfreeze_pre_normr   parampr   r%   r&   rl   /  s4   




zVitModel.__init__c                 C   s   t |  jS r!   )nextr  r;   r|   r%   r%   r&   r;   Q  s   zVitModel.dtypec                 C   s$   t |ts|g}| j|d  d S )Nr   )r)   rV   r  set_input_tensor)r|   input_tensorr%   r%   r&   r  U  s   
zVitModel.set_input_tensorc                 C   s   dS )N	open_clipr%   r  r%   r%   r&   __str__Z  s   zVitModel.__str__c                 C   s$   |  ||}| |}| |}|S r!   )r   r  r  )r|   r   r[  r  rp  r%   r%   r&   r   ]  s   

zVitModel.forward)FF)r   N)r   r   r   rl   propertyr;   r  r/   r  r   r   r%   r%   r   r&   r  .  s    "
r  r     rX  r   r  r^  r}  i   r_  r  max_position_embeddingsr`  Funderstand_projector_stridera   hidden_dropout        ra  no_persist_layer_normr~  r  r  r<  rJ  r  r,  recompute_listc                   C   s   t tdddS )NF)rh  r  r  )r  vit_model_cfgr%   r%   r%   r&   build_clip_l{  s
   r  c                       s   e Zd ZdZ										
				d#dededededededededededededef fddZdd Zd$d!d"Z	  Z
S )%CustomQwen2Decoderz@Qwen2 decoder with mixed causal masking for OCR2 vision encoder.r       r,  ra      Q sdpar       .Ar  silu{Gz?decoder_layerr  hidden_dimensionr^  num_key_value_headsintermediate_size
vocab_sizeattn_implementationrms_norm_eps
rope_thetara  
hidden_actinitializer_rangec                    sn   t    |dkrtdttjjjd}ttd}|||||||||	|
||||d}| ||| _	| j	`
d S )Nflash_attention_2zICustomQwen2Decoder does not support flash_attention_2; use sdpa or eager.
Qwen2ModelQwen2Config)rX  num_hidden_layersr^  r  r  r  r  r  r  ra  r  r  _attn_implementation)rk   rl   rC   getattrtransformersmodelsqwen2modeling_qwen2_create_custom_modelmodelembed_tokens)r|   r  r  r  r^  r  r  r  r  r  r  ra  r  r  r  r  configr   r%   r&   rl     s0   


zCustomQwen2Decoder.__init__c                 C   s   G dd d|}||S )Nc                       sH   e Zd Z											d fdd	Z fddZdd Z  ZS )	zFCustomQwen2Decoder._create_custom_model.<locals>.CustomQwen2ModelInnerNc                    s<   || _ d| |||||i}t j||||||||	|
|d
S )Nfull_attention)
rT   attention_maskrM  past_key_valuesr7   	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictcache_position)_current_token_type_ids_update_causal_maskrk   r   )r|   rT   r  rM  r  r7   token_type_idsr  r  r  r  r  causal_mask_mappingr   r%   r&   r     s*   	zNCustomQwen2Decoder._create_custom_model.<locals>.CustomQwen2ModelInner.forwardc                    s   |j |j}}t|j}|jd |jd }	}
t| dd }|d u r,t |||||S | j	|
|||	|d}|d urZ|
 dkrZ|d d d d d d f j|d}d| | }|| }|S )Nr   rY   r  )sequence_lengthr;   rN   r   r  ra   r:   g      ?)r;   rN   r*   finfominr6   r  rk   r  _create_custom_4d_maskr0   r?   )r|   r  r  r  r  r  r;   rN   	min_dtyper   r  r  causal_maskpadding_maskr   r%   r&   r    s4    zZCustomQwen2Decoder._create_custom_model.<locals>.CustomQwen2ModelInner._update_causal_maskc              	   S   s   t |j}g }t|D ]]}t j||f|||d}	|| }
|
dkjddd }|
dkjddd }t|dkrCd|	|d d d f |f< t|D ]\}}t|dkrWd|	||f< d|	||d |d  f< qG||	 qt j	|dd
d}	|	S )N)
fill_valuer;   rN   r   T)as_tuplerY   r  r   )r*   r  r  ro   fullnonzeror<   	enumeraterp   stackr>   )r|   r  r;   rN   r   r  r  masksbmasktype_idsimage_positionstext_positionsr%  text_posr%   r%   r&   r    s*   z]CustomQwen2Decoder._create_custom_model.<locals>.CustomQwen2ModelInner._create_custom_4d_mask)NNNNNNNNNNN)r   r   r   r   r  r  r   r%   r%   r   r&   CustomQwen2ModelInner  s    %(r  r%   )r|   r  r  r  r%   r%   r&   r    s   qz'CustomQwen2Decoder._create_custom_modelNc                 K   s   | j d|||d|S )N)r7   r  r  r%   r  )r|   r7   r  r  kwargsr%   r%   r&   r   '  s   zCustomQwen2Decoder.forward)r  r  r  r,  ra   r  r  r  r   r  r  r  r  r!   )r   r   r   r   rv   r/   r   rl   r  r   r   r%   r%   r   r&   r    sX    	
-tr  c                       sP   e Zd ZdZdedededededef fdd	Zd
ejdejfddZ  Z	S )Qwen2Decoder2Encoderz*Decoder-as-encoder for OCR2 vision tokens.r  r  r^  r  r  	max_queryc                    s@   t    t|||||dd| _td|| _td|| _d S )Nr  )r  r  r^  r  r  r     r  )rk   rl   r  r  r   rS  	query_768
query_1024)r|   r  r  r^  r  r  r  r   r%   r&   rl   3  s   
		zQwen2Decoder2Encoder.__init__r   r   c                 C   s
  | ddd}|j\}}}|dkr| jj}n(|dkr!| jj}n|| jjkr+| jjn| jj}tj|j	
d|ddddj	}|
d|d	d	}tj||gdd
}tjtj||tj|jdtj||tj|jdgdd
}	| ||	d }
|
d d |d d d f }
|
S )Nra   rY   r  r  r   r\   F)r   r	  r  r2   r   )r;   rN   )r+   rY  r6   r  r   r  num_embeddingsr   r  Tr>   rC  rW  r*   r,   r   longrN   r   r  )r|   r   r   n_queryr   	param_imgbasebatch_query_imgs
x_combinedr  yr%   r%   r&   r   I  s<   


zQwen2Decoder2Encoder.forward)
r   r   r   r   rv   rl   r*   r   r   r   r%   r%   r   r&   r  0  s     r  r  r    r  r  r  r  r  c           	      C   s8   t | |||||d}|d urt|}|j|dd |S )N)r  r  r^  r  r  r  Tr0  )r  r*   r3  r4  )	r  r  r^  r  r  r  r:  decoder_as_encoderr?  r%   r%   r&   build_qwen2_decoder_as_encoderp  s   	
r  c                       s  e Zd Zddddedee def fddZed	e	e
 d
edee	e  fddZdejdejfddZdejdejfddZdejdejfddZdejdejdejfddZdefddZ	d9dejdejdejdee	e  def
d d!Zd"e	e
 dejfd#d$Zdejjfd%d&Zdedee fd'd(Z	d9d)ejd*ee dejfd+d,Zd)e	e d-efd.d/Z d	e	e
 dejfd0d1Z!d)ejd2ejd3e"defd4d5Z#d6e$e%eejf  fd7d8Z&  Z'S ):DeepseekOCRForCausalLMN )quant_configprefixr  r  r  c                   s  t    || _|j| _|j| _|j| _tt| jdd dkp)t| jdd dk| _	t| jdd}|j
| _
|j| _dttj|tjd	 }| j
d
krftt|| | _| j	sett|| | _ntd| j
 | j	s| jjdkrt|j|t|dd| _n*| jjst|j|t|dd| _nt|j|t|dd| _nt|j|t|dd| _| j	st | _t | _ nt| jdd}t|d| _t!|d| _"t#| jj$| jj%|| jj&| jj'| jj(d| _)d S )N
model_namer  deepencoderv2rf   r  rg   i   rY   r:   2Dz.Only 2D tile_tag is supported currently, got: noaux_tclanguage)r  r  r  )r  )r  )re   rf   rg   rh   r}   r~   )*rk   rl   r  vision_configprojector_configtext_configr/   r  loweris_ocr2tile_tagglobal_view_posr*   r   rP   r  r   r   rN  view_seperatorimage_newlinerC   topk_methodr   r   r  use_mlar   r   rB  	sam_modelr  vision_modelr  qwen2_modelrX   re   rf   rh   r}   r~   	projector)r|   r  r  r  rg   	embed_stdprojector_input_dimr   r%   r&   rl     sz   






zDeepseekOCRForCausalLM.__init__r5  	flag_namer   c                 C   s:   g }| D ]}t ||d }|d u r d S |t| q|S r!   )r  rp   r   )r5  r  valuesrB   valuer%   r%   r&   _collect_mm_flag  s   z'DeepseekOCRForCausalLM._collect_mm_flagimagesc                 C   s0   |  |}| |}| |}|d|jd S )Nr2   )r  r  r  r   r6   )r|   r  featuresr%   r%   r&   _encode_ocr2_features  s   


z,DeepseekOCRForCausalLM._encode_ocr2_featuresc                 C   sR   |  |}| ||}tj|d d dd f |ddddfdd}| |S )NrY   ra   r   r2   r   )r  r  r*   r,   r+   r   r  )r|   r  
features_1
features_2r  r%   r%   r&   _encode_ocr1_features  s   

z,DeepseekOCRForCausalLM._encode_ocr1_featuresr  c                 C   sb   |j \}}}t|d  }}||||}tj|| jd d d d f |d|gdd}|d|S )Nrc   rY   r   r2   )r6   rv   r   r*   r,   r  rW  )r|   r  r   r   n_dimr   r   r%   r%   r&   _format_ocr1_global_features  s    z3DeepseekOCRForCausalLM._format_ocr1_global_features
crop_shapec           
      C   s   |j \}}}t|d  }}t|d t|d }}	||	||||ddddd|	| || |}tj|| jd d d d f |	| d|gdd}|d|S )	Nrc   r   rY   ra   r   rZ   r   r2   )	r6   rv   r   r   r   r*   r,   r  rW  )
r|   r  r&  r   hw2n_dim2h2w2width_crop_numheight_crop_numr%   r%   r&   _format_ocr1_local_features  s    
	z2DeepseekOCRForCausalLM._format_ocr1_local_featuresr  c                 K   s   | dd }| dd }| dd }| dd }|d u rd S |d ur'|s&d S nt| dkr2d S |d urnt|tjtfsGtdt| t|tjtfsXtdt| t|tjtfsitdt| |||gS t	d	)
NrZ  images_spatial_cropimages_crop
has_imagesr   z*Incorrect type of pixel values. Got type: z)Incorrect type of image sizes. Got type: z(Incorrect type of image crop. Got type: z This line should be unreachable.)
popr*   rA   rB   r)   r   rV   rC   rx  AssertionError)r|   r  rZ  r.  r/  r0  r%   r%   r&   _parse_and_validate_image_input  s:   
z6DeepseekOCRForCausalLM._parse_and_validate_image_inputrZ  r/  r.  has_local_cropsc              	   C   s  g }| j st v t|dD ]e}|| d tj}|| }|| d }	|d ur/|| nt| dk}
| 	|}| 
|}|
ra| 	|}| ||	}tj||| jd d d f gdd}ntj|| jd d d f gdd}|| qW d    |S 1 sw   Y  |S t e t|dD ]T}|| d tj}|| }|d ur|| nt| dk}
| |}|
r| |}tj||| jd d d f gdd}ntj|| jd d d f gdd}|| qW d    |S 1 sw   Y  |S )Nr   r   )r  r*   no_gradro   r   r?   bfloat16rA   rB   r#  r%  r-  r,   r  rp   r   )r|   rZ  r/  r.  r4  images_in_this_batchjdxr   	image_orir&  use_local_cropsglobal_featureslocal_featuresglobal_local_featuresr%   r%   r&   _pixel_values_to_embedding=  st   



	
""



z1DeepseekOCRForCausalLM._pixel_values_to_embeddingmm_itemsc           	      C   s   | j rt| j jn| jj}| |d}tjdd |D dd	|}tjdd |D dd	tj
j|jd}tjdd |D dd	tj
j|jd}| d	ksVJ | d
ks^J | j||||d}tj|dd	|}|S )Nr4  c                 S      g | ]}|j qS r%   )featurer#   rB   r%   r%   r&   r1         z?DeepseekOCRForCausalLM._process_image_input.<locals>.<listcomp>r   r   c                 S   r@  r%   )r/  rB  r%   r%   r&   r1     rC  )rN   c                 S   r@  r%   )r.  rB  r%   r%   r&   r1     rC     r   )rZ  r/  r.  r4  )r  r  r  r  r;   r  r  r*   r  rx  r  r?   rN   r,   r0   r>  )	r|   r?  target_dtyper4  rZ  r/  r.  vision_feature_listsvision_featuresr%   r%   r&   _process_image_input  s8   z+DeepseekOCRForCausalLM._process_image_inputc                 C   s   | j S r!   r  r  r%   r%   r&   get_language_model  s   z)DeepseekOCRForCausalLM.get_language_modelc                 K   s*   | j di |}|d u rd S | |}|S )Nr%   )r3  rH  )r|   r  image_inputvision_embeddingsr%   r%   r&   get_multimodal_embeddings  s
   
z0DeepseekOCRForCausalLM.get_multimodal_embeddingsrT   r8   c                 C   s(   | j |}|d urt|||| j}|S r!   )r  get_input_embeddingsrW   image_token_id)r|   rT   r8   r7   r%   r%   r&   rM    s   
z+DeepseekOCRForCausalLM.get_input_embeddings	mm_inputsc                 C   s   t  }|||S r!   )r   pad_input_tokens)r|   rT   rO  patternr%   r%   r&   pad_input_ids  s   z$DeepseekOCRForCausalLM.pad_input_idsc                 C   s   |  |}|S r!   )rH  )r|   r5  rK  r%   r%   r&   get_image_feature  s   
z(DeepseekOCRForCausalLM.get_image_feature	positionsforward_batchc                 K   s   t ||| j| |d}|S )N)rT   rU  language_modelmultimodal_modelrT  )r   r  )r|   rT   rT  rU  r  r  r%   r%   r&   r     s   zDeepseekOCRForCausalLM.forwardweightsc                 C   s8  g d}t |  }t }|D ]\}}d|v rqd|v }|dkr#d}n@|drcd|v s@d|v s@d	|v s@d
|v s@d|v s@d|v rI|tdd  }nd|v scd	|v scd
|v scd|v scd|v sc|dd}|r|}||vrd|v rv|dd}	n|ddd}	|	|v r|	}|dr||vrq||v r|| }
t|
dt}||
| |	| q|D ]5\}}}||vrq|||}|dr||vrqd|v sd|v r||vrq|| }
|
j
}||
||  n&|dr||vrqd|v sd|v r||vrq|| }
t|
dt}||
| |	| q| | }|rtd| d S )N))	.qkv_projz.q_projr   )rY  z.k_projr   )rY  z.v_projr   ).gate_up_projz
.gate_projr   )rZ  z.up_projrY   zrotary_emb.inv_freqzqwen2_model.zlm_head.weightzmodel.lm_head.weightzmodel.r  z
.projectorr  r  r  r  zmodel.model.z.model.model.z.model.rY   z.biasweight_loaderzmlp.experts.zmlp.shared_experts.z3Some weights are not initialized from checkpoints: )dictr  set
startswithr<   replaceendswithr  r   addr[  keysr@   )r|   rX  stacked_params_mappingparams_dictloaded_paramsnameloaded_weightis_qwen2_weighttarget_namealt_namer  r[  
param_nameweight_nameshard_idunloaded_paramsr%   r%   r&   load_weights  s   	



z#DeepseekOCRForCausalLM.load_weightsr!   )(r   r   r   r   r   r   r/   rl   staticmethodr   r   r   r  r*   r   r   r#  r%  r-  objectr3  r   r>  rH  r   r   rI  r   rL  rM  rv   r   rR  rS  r   r   r   r   ro  r   r%   r%   r   r&   r     s    S


)

N#


$r   )Nr  r%   )r  r  r,  ra   r  r  N)^r   rx   loggingrD  	functoolsr   typingr   r   r   r   r   r   r	   r
   r*   torch.nn.functionalr   r   r   r  r   *transformers.models.vitdet.modeling_vitdetr   sglang.srt.configs.deepseek_ocrr   sglang.srt.layers.quantizationr   sglang.srt.managers.mm_utilsr   r   "sglang.srt.managers.schedule_batchr   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.models.deepseekr   sglang.srt.models.deepseek_v2r   r   sglang.srt.models.transformersr   rV   r-   r   __annotations__r   	getLoggerr   r  r"   r/   r3   rJ   rv   rS   rW   r   rX   r   r   r   r   r   r   r   r   r  r  r@  rB  rH  rI  r]  jitscriptrr  rs  rs   rw  r{  r  r  r\  r  r  r  r  r  r   
EntryClassr%   r%   r%   r&   <module>   sd  ( 

*


+ 3


$U



D"u
$#+=
	8	
 .A
   
<