o
    پiP.                     @   s6  d dl Z d dlmZ d dlZd dlmZ d dlm  mZ d dl	Zd dlm
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ e eZG d
d dejZG dd dejZG dd dejZG dd dejZG dd dejZ G dd dejZ!G dd dejZ"G dd deZ#dS )    N)Optional)	LayerNorm)PreTrainedModel)DotsVisionConfig)parallel_state)VisionAttention)QuantizationConfig)
add_prefixis_npuc                       s@   e Zd Zddededdf fddZdedejfd	d
Z  Z	S )VisionRotaryEmbedding     @dimthetareturnNc                    s>   t    d|tjd|dtjd|   }| jd|dd d S )Ng      ?r      )dtypeinv_freqF)
persistent)super__init__torcharangefloatregister_buffer)selfr   r   r   	__class__ R/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/dots_vlm_vit.pyr      s   
 zVisionRotaryEmbedding.__init__seqlenc                 C   s*   t j|| jj| jjd}t || j}|S )N)devicer   )r   r   r   r    r   outer)r   r   seqfreqsr   r   r   forward   s
   zVisionRotaryEmbedding.forward)r   )
__name__
__module____qualname__intr   r   r   Tensorr$   __classcell__r   r   r   r   r      s    r   c                       sV   e Zd Z				ddedededee ddf
 fd	d
ZdejdejfddZ	  Z
S )PatchMergerr   	layernormNr   context_dimspatial_merge_sizequant_configr   c                    s   t    ||d  | _|| _| jdkrt|dd| _n| jdkr)t|dd| _n	td| j  t	
t	| j| jt	 t	| j|| _|d ur{t	jj| jd jd|d	 t	j| jd j t	jj| jd jd|d	 t	j| jd j d S d S )
Nr   r,   ư>epsrmsnormzno norm in patch merger: r           meanstd)r   r   hidden_sizepre_normr   ln_qRMSNormloggerwarningnn
SequentialLinearGELUmlpinitnormal_weightzeros_bias)r   r   r-   r.   r9   init_merger_stdr/   r   r   r   r   #   s&   
	

zPatchMerger.__init__xc                 C   s<   | j r| | |d| j}|S | |d| j}|S )N)r9   rB   r:   viewr8   r   rI   r   r   r   r$   B   s
   zPatchMerger.forward)r   r,   NN)r%   r&   r'   r(   r   r   r   r   r)   r$   r*   r   r   r   r   r+   "   s"    r+   c                       sb   e Zd Zddedef fddZdejdejfdd	Zde	fd
dZ
dejdejfddZ  ZS )r;   r0   r   r2   c                    s&   t    tt|| _|| _d S N)r   r   r>   	Parameterr   onesrE   r2   )r   r   r2   r   r   r   r   K   s   

zRMSNorm.__init__rI   r   c                 C   s   |  | |}|| j S rM   )_normr   type_asrE   )r   rI   outputr   r   r   r$   P   s   
zRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tuplerE   shaper2   r   r   r   r   
extra_reprT   s   zRMSNorm.extra_reprc                 C   s$   |t |djddd| j  S )Nr   rJ   T)keepdim)r   rsqrtpowr6   r2   rL   r   r   r   rP   W   s   $zRMSNorm._norm)r0   )r%   r&   r'   r(   r   r   r   r)   r$   strrV   rP   r*   r   r   r   r   r;   J   s
    r;   c                       s>   e Zd Zd	dee f fddZdejdejfddZ  Z	S )
DotsSwiGLUFFNNr/   c                    sV   t    |j}|j}|j}tj|||d| _tj|||d| _tj|||d| _	d S )N)rG   )
r   r   intermediate_size	embed_dimuse_biasr>   r@   fc1fc2fc3)r   configr/   hidden_featuresin_featuresrG   r   r   r   r   \   s   
zDotsSwiGLUFFN.__init__rI   r   c                 C   s(   t | || | }| |}|S rM   )Fsilur_   ra   r`   rL   r   r   r   r$   f   s   
zDotsSwiGLUFFN.forwardrM   
r%   r&   r'   r   r   r   r   r)   r$   r*   r   r   r   r   r[   [   s    
r[   c                       @   e Zd Zd	dee f fddZd	dejdejfddZ  Z	S )
DotsPatchEmbedNr/   c                    sn   t    |j| _|j| _|j| _|j| _|| _tj|j|j|j|jf|j|jfd| _	t
|j|jd| _d S )N)kernel_sizestrider1   )r   r   num_channels
patch_sizetemporal_patch_sizer]   rb   r>   Conv2dprojr;   rms_norm_epsnormr   rb   r/   r   r   r   r   m   s   


zDotsPatchEmbed.__init__rI   r   c                 C   sN   | d| j| j| j| jd d d d df }| | d| j}| |}|S )NrJ   r   )rK   rl   rn   rm   rp   r]   rr   )r   rI   grid_thwr   r   r   r$   |   s   
zDotsPatchEmbed.forwardrM   rg   r   r   r   r   ri   l   s     ri   c                       rh   )
DotsViTPreprocessorNr/   c                    s8   t    |j| _|j| _|j| _|| _t||| _d S rM   )	r   r   rm   patch_hpatch_wr]   rb   ri   
patchifierrs   r   r   r   r      s   
zDotsViTPreprocessor.__init__rI   r   c                 C   s   |  ||}|S rM   )rx   )r   rI   rt   tokensr   r   r   r$      s   zDotsViTPreprocessor.forwardrM   rg   r   r   r   r   ru      s     ru   c                       sD   e Zd Z		ddedee def fddZdej	fd	d
Z
  ZS )DotsVisionBlockN rb   r/   prefixc                    sn   t    t|j|j|jdd|td||j|j|jd
| _t	|j|j
d| _t||| _t	|j|j
d| _d S )NTattn)
r]   	num_headsprojection_sizeuse_qkv_parallelflatten_batchr/   r|   num_dummy_headsqkv_bias	proj_biasr1   )r   r   r   r]   num_attention_headsr	   r   r^   r}   r;   rq   norm1r[   rB   norm2)r   rb   r/   r|   r   r   r   r      s    
zDotsVisionBlock.__init__r   c                 C   s2   || j | |||d }|| | | }|S )N)
cu_seqlensposition_embeddings)r}   r   rB   r   )r   hidden_statesr   rotary_pos_embr   r   r   r$      s   zDotsVisionBlock.forward)Nr{   )r%   r&   r'   r   r   r   rZ   r   r   r)   r$   r*   r   r   r   r   rz      s    rz   c                       s   e Zd Z	ddedee ddf fddZdd Zd	d
 Ze	de
jfddZe	de
jfddZdd Zdd Zdd Z	dde
jde
jde
jfddZ  ZS )DotsVisionTransformerNrb   r/   r   c                    s   t     | _|    j| _t | _| | jjj	  j
 j }t|d | _ j}t fddt|D | _| jjrLt j
 jd| _t j j
 j| jjd| _d| _d S )Nr   c                    s   g | ]}t  d | qS )zblocks.)rz   ).0irb   r/   r   r   
<listcomp>   s    z2DotsVisionTransformer.__init__.<locals>.<listcomp>r1   )r   r-   r.   rH   r/   F)r   r   rb   _update_vision_configr.   ru   patch_embed_init_weightsrx   rp   r]   r   r   r   num_hidden_layersr>   
ModuleListrangeblocks	post_normr;   rq   post_trunk_normr+   r8   rH   mergergradient_checkpointing)r   rb   r/   head_dim_num_hidden_layersr   r   r   r      s0   
zDotsVisionTransformer.__init__c                 C   s`   t  }| jj}| jj| }d}|| dkr || | | | }t| jd| t| jd| dS )z"update vision config to support tpr   r   r   N)r   $get_tensor_model_parallel_world_sizerb   r   r]   setattr)r   
world_sizer~   r   r   r   r   r   r      s   
z+DotsVisionTransformer._update_vision_configc                 C   s   | j j}t|tjtjfr%|jjjd|d |j	d ur#|j	j
  d S d S t|tjrD|jjjd|d |jd urF|jj|j 
  d S d S d S )Nr4   r5   )rb   initializer_range
isinstancer>   r@   ro   rE   datarD   rG   zero_	Embeddingpadding_idx)r   moduler7   r   r   r   r      s   

z#DotsVisionTransformer._init_weightsc                 C      | j d jjjjS Nr   )r   rB   r`   rE   r   rU   r   r   r   r         zDotsVisionTransformer.dtypec                 C   r   r   )r   rB   r`   rE   r    rU   r   r   r   r       r   zDotsVisionTransformer.devicec                 C   s   g }|D ]e\}}}t |dd|}||| j | j|| j | j}|dddd}| }t |d|d}||| j | j|| j | j}|dddd}| }|t j	||gdd
|d q|S )N   rJ   r   r      r   )r   r   	unsqueezeexpandreshaper.   permuteflattenappendstackrepeat)r   rt   pos_idsthwhpos_idswpos_idsr   r   r   get_pos_ids_by_grid  s,   "z)DotsVisionTransformer.get_pos_ids_by_gridc                 C   sL   |  |}tj|dd}|d d dd f  }| |}|| d}|S )Nr   r   r   )r   r   catmaxr   r   )r   rt   r   max_grid_sizerotary_pos_emb_fullr   r   r   r   rot_pos_emb  s   

z!DotsVisionTransformer.rot_pos_embc                 C   sX   |  }| }|ddddd }|ddddd }||f}|S )Nr   r   r   )cossinr   r   r   )r   r   r   r   r   r   r   calc_cos_sin$  s   z"DotsVisionTransformer.calc_cos_sinTr   rt   c                 C   s   | | j}|r| }| ||}| |}| |}t|d d df |d d df  |d d df jdtj	
 r?|jntjd}t|d|g}t rV| d}| jD ]	}||||d}qY| jjrl| |}| |}|S )Nr   r   r   )r   r   cpu)r   r   )tor    bfloat16r   r   r   r   repeat_interleavecumsumjit
is_tracingr   int32r   	new_zerosr
   r   rb   r   r   r   )r   r   rt   bf16r   r   blkr   r   r   r$   ,  s0   

,



zDotsVisionTransformer.forwardrM   )T)r%   r&   r'   r   r   r   r   r   r   propertyr   r   r    r   r   r   r)   r$   r*   r   r   r   r   r      s4    &	r   )$loggingtypingr   r   torch.nnr>   torch.nn.functional
functionalre   torch.utils.checkpointr   transformers.modeling_utilsr   sglang.srt.configs.dots_vlmr   sglang.srt.distributedr   "sglang.srt.layers.attention.visionr   sglang.srt.layers.quantizationr   sglang.srt.utilsr	   r
   	getLoggerr%   r<   Moduler   r+   r;   r[   ri   ru   rz   r   r   r   r   r   <module>   s,    
("