o
    پiB                     @   st  U d dl Z d dlmZ d dlmZ d dlmZ d dlZd dlm	Z	 d dl
m	  mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZmZ d d
lmZ eeeef B Zeed< eeeef ej B Z!eed< dd Z"e"dZ#e"dZ$e"dZ%e"dZ&e"Z'G dd de	j(Z)G dd de	j(Z*G dd de	j(Z+G dd de	j,Z-G dd de	j(Z.G dd de	j(Z/dS )    N)Iterable)repeat)	TypeAlias)	rearrange)PretrainedConfig)BaseModelOutput)QuantizationConfig)default_weight_loaderreplace_prefixreplace_substrings)InternVisionEncoderinput_dim_tnorm_tc                    s    fdd}|S )Nc                    s*   t | trt | tst| S tt|  S N)
isinstancer   strtupler   )xn K/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/radio.pyparse)   s   z_ntuple.<locals>.parser   )r   r   r   r   r   _ntuple(   s   r               c                       sT   e Zd Z				ddededededB dedB f
 fd	d
ZdejfddZ  Z	S )ClsTokenr   TNndim
num_tokensenabledregister_multiplenum_registersc                    s   t    || _|| _d| _|| _|r6|r|| _n	|r"|||  | _|d }tt	|| j || | _
nd | _
| j| j | _d S )Nr         )super__init__r   r!   r#   r    nn	Parametertorchrandntokennum_patches)selfr   r    r!   r"   r#   scale	__class__r   r   r&   9   s$   
zClsToken.__init__r   c                 C   s@   | j d u r|S | j d|jd dd}tj||gdd}|S )Nr   r   dim)r+   	unsqueezeexpandshaper)   cat)r-   r   r+   r   r   r   forwardY   s   
zClsToken.forward)r   TNN)
__name__
__module____qualname__intboolr&   r)   Tensorr8   __classcell__r   r   r/   r   r   8   s"     r   c                       s  e Zd Z												d7dededed	ed
edededB dededededB dedB def fddZdej	dej	fddZ
edd Zedd Zedd Zedd  Zed!d" Zd#ej	d$ejfd%d&Zd'ej	d(ej	fd)d*Zdej	dej	fd+d,Z		d8d-ej	d.ej	dB d/eeef dB dej	fd0d1Z		d8d2ed.ej	dB d/eeef dB dej	fd3d4Zd2edeeef fd5d6Z  ZS )9ViTPatchGeneratorTFN        r   
patch_size	embed_dim
input_dimsabs_posnormalize_patches	cls_tokenmax_input_dimspos_dropoutreturn_pos_encnum_cls_tokensr"   r#   
patch_biasc                    sT  t    t|tr||f}|d u r|}t|tr||f}t fdd|D }||k| _|| _|	| _t||d} | _	|| _
|| _|d   | _|d   | _t fdd|D | _| j| j | _|| _t | _t |fd|i|| _|r|d }ttjd| j|fi || | _t||
|||d	| _|rt|| _d S t | _d S )
Nc                 3   s&    | ]}t t|    V  qd S r   )r<   mathceil.0drB   r   r   	<genexpr>   s    
z-ViTPatchGenerator.__init__.<locals>.<genexpr>)devicedtyper   r   c                 3   s    | ]}|  V  qd S r   r   rO   rR   r   r   rS      s    biasr$   )r    r!   r"   r#   )r%   r&   r   r<   r   cpe_moderI   rJ   dictrB   rE   rC   num_rowsnum_colsrD   r,   rH   
Im2Patchesim_to_patchesViTPatchLinearembedderr'   r(   r)   r*   	pos_embedr   rG   	LayerNormIdentitypatch_normalizer)r-   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   r"   r#   rL   rT   rU   factoryr.   r/   rR   r   r&   j   s\   




	zViTPatchGenerator.__init__r   returnc                 C   sL   |  |}| j||jdd  d\}}| |}| |}| jr$||fS |S )Nr   )
input_size)embed_patchesapply_pos_encr6   rG   rb   rJ   )r-   r   patchespos_encr   r   r   r8      s   


zViTPatchGenerator.forwardc                 C      | j jS r   )rG   r!   r-   r   r   r   apply_cls_token      z!ViTPatchGenerator.apply_cls_tokenc                 C   rj   r   )rG   r    rk   r   r   r   rK      rm   z ViTPatchGenerator.num_cls_tokensc                 C   rj   r   )rG   r,   rk   r   r   r   num_cls_patches   rm   z!ViTPatchGenerator.num_cls_patchesc                 C   rj   r   )rG   r#   rk   r   r   r   r#      rm   zViTPatchGenerator.num_registersc                 C   s   | j | j S r   )rK   r#   rk   r   r   r   num_skip   s   zViTPatchGenerator.num_skip	src_embed
targ_embedc                 C   s   |j |j kr8tt|j d }|d |j d ksJ dt|d||d}tj|| j| jfdddd	}t|d
}|j	
| d S )Nr   r   z*Unable to interpolate non-square embeddingzb (h w) c -> b c h w)hwbicubicTFsizemodealign_corners	antialiaszb c h w -> b (h w) c)r6   r<   rM   sqrtr   FinterpolaterY   rZ   datacopy_)r-   rp   rq   src_sizer   r   r   _load_embed   s"   

zViTPatchGenerator._load_embedsrc_proj_weighttarg_proj_weightc                 C   s   |j |j kr=tt|j d d }|d d |j d ks!J dt|dd||d}tj|| j| jfddd	d
}t|d}|j	| d S )Nr   r   r   z+Unable to interpolate non-square patch sizezb (c h w) -> b c h w)crr   rs   rt   TFru   zb c h w -> b (c h w))
r6   r<   rM   rz   r   r{   r|   rB   r}   r~   )r-   r   r   src_patch_sizer   r   r   _load_projection   s.   

z"ViTPatchGenerator._load_projectionc                 C   s   |  |}| |}|S r   )r\   r^   )r-   r   rh   r   r   r   rf      s   

zViTPatchGenerator.embed_patchesrh   
patch_idxsre   c                 C   sr   | j s|S | |jd ||}| jr1| jdkr1tj|jd dd|j|jd| jk}t	||d}n|}|| |fS )Nr   r   )rU   rT   )
rE   get_pos_encr6   trainingrI   r)   randrU   rT   where)r-   rh   r   re   ri   keepspos_enc_dropr   r   r   rg     s   zViTPatchGenerator.apply_pos_enc
batch_sizec                    s|   |d u r j }nt fdd|D } ||}|d u r|S |ddd|jd }tj||jd ddd|d}|S )Nc                 3   s    | ]}| j  V  qd S r   rR   rO   rk   r   r   rS   $  s    z0ViTPatchGenerator.get_pos_enc.<locals>.<genexpr>r1   r   r   )r3   index)rD   r   _get_pos_embeddingsr4   r5   r6   r)   gather)r-   r   r   re   rD   r_   exp_patch_idxsr   rk   r   r     s   zViTPatchGenerator.get_pos_encc                    s<  | j | jf kr| jS | jd| j | jddddd} fdd}| jr| jrtd}t	j
|dd|jd	d|  | }td
}| }t	t	j
|dd|jd	||  | }	||	 }
|d|	  }t	j|
|gdddd}t	j
|ddd|jd	d|  }t	jdd d |jdd | d d}t	jdd d |jdd d d d f |d d }t	j||gdd}|| | }|dd tj| |ddd|dddd|j}nt }tj| ||fddd|j}||}n||}|jdd   krtj|  ddd|j}|dddd}|S )Nr   r1   r   r   r   c                    s\    d | j d k r| dd  d d d f }  d | j d k r,| dd d d  d f } | S )Nr   .r   r1   )r6   )r_   rD   r   r   window_select:  s
   z<ViTPatchGenerator._get_pos_embeddings.<locals>.window_selectg?)rT   g      ?r2   )stepsrT   NNbilinearzerosT)gridrw   padding_moderx   )rv   rx   rw   r   )rY   rZ   r_   reshapepermuterW   r   rM   rz   r)   r   rT   logexpstackclamp_linspacer5   mul_sub_r{   grid_samplefloattorU   maxr|   r6   flatten)r-   r   rD   r_   r   	min_scaler.   
aspect_min
aspect_maxaspectscale_xscale_yscale_xypos_xylin_xlin_ylin_xygrid_xymax_dimr   r   r   r   2  s   


z%ViTPatchGenerator._get_pos_embeddings)TFFNrA   Fr   NNFNNr   )r9   r:   r;   r<   r   r=   r   r&   r)   r>   r8   propertyrl   rK   rn   r#   ro   r'   r(   r   r   rf   r   rg   r   r   r?   r   r   r/   r   r@   i   s    	
G	







"r@   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )r[   rB   c                    s   t    || _d S r   r%   r&   rB   )r-   rB   r/   r   r   r&     s   

zIm2Patches.__init__r   rd   c                 C   sb   | j dkr|d}|ddd}|S |jd | j  }|jd | j  }t|d|| j || j d}|S )Nr   r   r   r   r1   z*b c (py yy) (px xx) -> b (py px) (c yy xx))pyyypxxx)rB   r   r   r6   r   )r-   r   rh   r   r   r   r   r   r8     s   

zIm2Patches.forward)	r9   r:   r;   r<   r&   r)   r>   r8   r?   r   r   r/   r   r[     s    r[   c                       s,   e Zd Zddededef fddZ  ZS )r]   FrB   rC   rV   c                    s,   t  jd|d  |fd|i| || _d S )Nr   r   rV   r   )r-   rB   rC   rV   rc   r/   r   r   r&     s   "
zViTPatchLinear.__init__)F)r9   r:   r;   r<   r=   r&   r?   r   r   r/   r   r]     s    $r]   c                       sv   e Zd ZddgiZ		ddededB ddf fddZdeeeef B fd	d
Z	dd Z
dejdejfddZ  ZS )RadioInternVisionModelqkvNconfigquant_configrd   c                    sz   t    || _| t|j|j\| _| _| _	t
t|j|j |j }t|j|j| j|d|jd| _t||d| _d S )NT)rD   rH   rG   r"   r   r   )r%   r&   r   _init_img_size	to_2tuplerB   
image_sizeimg_size	grid_sizer,   r<   roundmax_img_sizer@   hidden_size
reg_tokenspatch_generatorr   encoder)r-   r   r   r   r/   r   r   r&     s"   
	zRadioInternVisionModel.__init__r   c                 C   sF   |d u rdS t |}tdd t||D }|d |d  }|||fS )N)NNNc                 S   s   g | ]\}}|| qS r   r   )rP   spr   r   r   
<listcomp>  s    z9RadioInternVisionModel._init_img_size.<locals>.<listcomp>r   r   )r   r   zip)r-   rB   r   r   r,   r   r   r   r     s   
z%RadioInternVisionModel._init_img_sizec                 C   s   | j S r   )
embeddingsrk   r   r   r   get_input_embeddings  s   z+RadioInternVisionModel.get_input_embeddingsr   c                 C   s:   | j d usJ |  |}| jj|d}t|tsJ |jS )N)inputs_embeds)r   r   r8   r   r   last_hidden_state)r-   r   hidden_statesencoder_outputsr   r   r   r8     s
   
zRadioInternVisionModel.forwardr   )r9   r:   r;   packed_modules_mappingr   r   r&   r<   r   r   r   r)   r>   FloatTensorr8   r?   r   r   r/   r   r     s    r   c                       s   e Zd ZddgiZ	ddededB ddf fddZ		ddejdB d	ejdB dej	fd
dZ
dee fddZdejfddZ  ZS )
RadioModelr   Nr   r   rd   c                    s"   t    || _t||d| _d S )Nr   )r%   r&   r   r   model)r-   r   r   r/   r   r   r&     s   
zRadioModel.__init__pixel_valuespixel_embedsc                 C   s   |  |}| |S r   )r   _extract_final)r-   r   r   yr   r   r   r8     s   

zRadioModel.forwardc                 C   s   dddd}ddi}t  }t|  }t|trt| }nt|}|D ].\}}|ds/q%t||}t||}|rS||v rS|| }	t	|	dt
}
|
|	| || q%|S )Nz	attn.attnqkv_projzencoder.layers)attnr   blockszradio_model. weight_loader)setrX   named_parametersr   listitems
startswithr   r
   getattrr	   add)r-   weightsremap_substringsremap_prefixesloaded_paramsparams_dictweights_listnameweightparamr   r   r   r   load_weights  s.   





zRadioModel.load_weightsr   c                 C   s0   t | jdd }|d ur|d d |jd f }|S )Nr   )r   r   ro   )r-   r   	patch_genall_featr   r   r   r     s   zRadioModel._extract_finalr   r   )r9   r:   r;   r   r   r   r&   r)   r>   r   r8   r   r   r   r   r?   r   r   r/   r   r     s,    
 r   )0rM   collections.abcr   	itertoolsr   typingr   r)   torch.nnr'   torch.nn.functional
functionalr{   einopsr   transformersr   transformers.modeling_outputsr   *sglang.srt.layers.quantization.base_configr   $sglang.srt.model_loader.weight_utilsr	   r
   r   sglang.srt.models.internvlr   r<   r   r   __annotations__r   r>   r   r   	to_1tupler   	to_3tuple	to_4tuple	to_ntupleModuler   r@   r[   Linearr]   r   r   r   r   r   r   <module>   s:   
	1  1