o
    -ic                     @   s  U d dl Z d dlmZ d dlmZ d dlmZ d dlZd dlm	Z	 d dl
m	  mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZmZ eeeef B Zeed
< eeeef ejB Zeed< dd Z e dZ!e dZ"e dZ#e dZ$e Z%deeef dedefddZ&de'eeef  dede'e fddZ(G dd de	j)Z*G dd de	j)Z+G dd de	j)Z,G d d! d!e	j-Z.G d"d# d#eZ/G d$d% d%eZ0G d&d' d'eZ1G d(d) d)e	j)Z2G d*d+ d+e	j)Z3dS ),    N)Iterable)repeat)	TypeAlias)	rearrange)PretrainedConfig)QuantizationConfig)default_weight_loader)InternParallelAttentionInternVisionEncoderInternVisionEncoderLayerinput_dim_tnorm_tc                    s    fdd}|S )Nc                    s*   t | trt | tst| S tt|  S N)
isinstancer   strtupler   )xn ]/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/radio.pyparse#   s   z_ntuple.<locals>.parser   )r   r   r   r   r   _ntuple"   s   r               size
patch_sizereturnc                 C   s   | \}}|| ||  S r   r   )r   r   hwr   r   r   calc_seq_len2   s   r"   sizesc                    s    fdd| D S )Nc                    s   g | ]}t | qS r   )r"   ).0r   r   r   r   
<listcomp>8   s    z!calc_seq_lens.<locals>.<listcomp>r   )r#   r   r   r%   r   calc_seq_lens7   s   r'   c                       sT   e Zd Z				ddededededB dedB f
 fd	d
ZdejfddZ  Z	S )ClsTokenr   TNndim
num_tokensenabledregister_multiplenum_registersc                    s   t    || _|| _d| _|| _|r6|r|| _n	|r"|||  | _|d }tt	|| j || | _
nd | _
| j| j | _d S )Nr         )super__init__r)   r+   r-   r*   nn	Parametertorchrandntokennum_patches)selfr)   r*   r+   r,   r-   scale	__class__r   r   r0   <   s$   
zClsToken.__init__r   c                 C   s@   | j d u r|S | j d|jd dd}tj||gdd}|S Nr   r   dim)r5   	unsqueezeexpandshaper3   cat)r7   r   r5   r   r   r   forward\   s   
zClsToken.forward)r   TNN)
__name__
__module____qualname__intboolr0   r3   TensorrC   __classcell__r   r   r9   r   r(   ;   s"     r(   c                       s  e Zd Z												d<dededed	ed
edededB dededededB dedB def fddZ	d=dej	de
eeef  dB dej	fddZdej	de
eeef  deej	ej	dB f fddZdej	de
eeef  dej	fddZedd  Zed!d" Zed#d$ Zed%d& Zed'd( Zd)ej	d*ejfd+d,Zd-ej	d.ej	fd/d0Zdej	dej	fd1d2Z		d>dej	d3ej	dB d4eeef dB dej	fd5d6Z		d>d7ed3ej	dB d4eeef dB dej	fd8d9Zd7edeeef fd:d;Z  ZS )?ViTPatchGeneratorTFN        r   r   	embed_dim
input_dimsabs_posnormalize_patches	cls_tokenmax_input_dimspos_dropoutreturn_pos_encnum_cls_tokensr,   r-   
patch_biasc                    sT  t    t|tr||f}|d u r|}t|tr||f}t fdd|D }||k| _|| _|	| _t||d} | _	|| _
|| _|d   | _|d   | _t fdd|D | _| j| j | _|| _t | _t |fd|i|| _|r|d }ttjd| j|fi || | _t||
|||d	| _|rt|| _d S t | _d S )
Nc                 3   s&    | ]}t t|    V  qd S r   )rG   mathceilr$   dr%   r   r   	<genexpr>   s    
z-ViTPatchGenerator.__init__.<locals>.<genexpr>)devicedtyper   r   c                 3   s    | ]}|  V  qd S r   r   rY   r%   r   r   r[          biasr.   )r*   r+   r,   r-   )r/   r0   r   rG   r   cpe_moderS   rT   dictr   rO   rM   num_rowsnum_colsrN   r6   rR   
Im2Patchesim_to_patchesViTPatchLinearembedderr1   r2   r3   r4   	pos_embedr(   rQ   	LayerNormIdentitypatch_normalizer)r7   r   rM   rN   rO   rP   rQ   rR   rS   rT   rU   r,   r-   rV   r\   r]   factoryr8   r9   r%   r   r0   m   s\   




	zViTPatchGenerator.__init__r   
imgs_sizesr   c                 C   s   |d ur|  |}| j||d\}}| j||d}n| |}| j||jdd  d\}}| |}| |}| jr>||fS |S )Nrm   r   
input_size)	rg   apply_pos_enc_dynamiccls_token_dynamicembed_patchesapply_pos_encrA   rQ   rk   rT   )r7   r   rm   patchespos_encr   r   r   rC      s   




zViTPatchGenerator.forwardru   c              	   C   s   | j s|d fS d}g }|D ]P}t|| j}|d d ||| d d f }| j|jd |d}|| }	tj|d d d |d d f |	|d d || d d d f gdd}|| ||7 }q|rgtj|ddnd }
||
fS )Nr   ro   r   r=   )rO   r"   r   get_pos_encrA   r3   rB   append)r7   ru   rm   current_lengthpos_enc_listr   
seq_lengthimg_patchesrv   img_patches_with_posfull_pos_encr   r   r   rq      s(   

z'ViTPatchGenerator.apply_pos_enc_dynamicc                 C   s   | j js|S g }d}t|| jD ],}| j jd|jd dd}|| ||d d ||| d d f  ||7 }qt	j
|ddS r;   )rQ   r+   r'   r   r5   r?   r@   rA   rx   r3   rB   )r7   ru   rm   outry   seq_lenclass_tokenr   r   r   rr      s   
$
z#ViTPatchGenerator.cls_token_dynamicc                 C      | j jS r   )rQ   r+   r7   r   r   r   apply_cls_token      z!ViTPatchGenerator.apply_cls_tokenc                 C   r   r   )rQ   r*   r   r   r   r   rU      r   z ViTPatchGenerator.num_cls_tokensc                 C   r   r   )rQ   r6   r   r   r   r   num_cls_patches   r   z!ViTPatchGenerator.num_cls_patchesc                 C   r   r   )rQ   r-   r   r   r   r   r-     r   zViTPatchGenerator.num_registersc                 C   s   | j | j S r   )rU   r-   r   r   r   r   num_skip  s   zViTPatchGenerator.num_skip	src_embed
targ_embedc                 C   s   |j |j kr8tt|j d }|d |j d ksJ dt|d||d}tj|| j| jfdddd	}t|d
}|j	
| d S )Nr   r   z*Unable to interpolate non-square embeddingzb (h w) c -> b c h w)r    r!   bicubicTFr   modealign_corners	antialiaszb c h w -> b (h w) c)rA   rG   rW   sqrtr   Finterpolaterb   rc   datacopy_)r7   r   r   src_sizer   r   r   _load_embed  s"   

zViTPatchGenerator._load_embedsrc_proj_weighttarg_proj_weightc                 C   s   |j |j kr=tt|j d d }|d d |j d ks!J dt|dd||d}tj|| j| jfddd	d
}t|d}|j	| d S )Nr   r   r   z+Unable to interpolate non-square patch sizezb (c h w) -> b c h w)cr    r!   r   TFr   zb c h w -> b (c h w))
rA   rG   rW   r   r   r   r   r   r   r   )r7   r   r   src_patch_sizer   r   r   _load_projection   s*   

z"ViTPatchGenerator._load_projectionc                 C   s   |  |}| |}|S r   )re   rg   )r7   r   ru   r   r   r   rs   ;  s   

zViTPatchGenerator.embed_patches
patch_idxsrp   c                 C   sr   | j s|S | |jd ||}| jr1| jdkr1tj|jd dd|j|jd| jk}t	||d}n|}|| |fS )Nr   r   r]   r\   )
rO   rw   rA   trainingrS   r3   randr]   r\   where)r7   ru   r   rp   rv   keepspos_enc_dropr   r   r   rt   @  s   zViTPatchGenerator.apply_pos_enc
batch_sizec                    s|   |d u r j }nt fdd|D } ||}|d u r|S |ddd|jd }tj||jd ddd|d}|S )Nc                 3   s    | ]}| j  V  qd S r   r%   rY   r   r   r   r[   a  s    z0ViTPatchGenerator.get_pos_enc.<locals>.<genexpr>r<   r   r   )r>   index)rN   r   _get_pos_embeddingsr?   r@   rA   r3   gather)r7   r   r   rp   rN   rh   exp_patch_idxsr   r   r   rw   X  s   zViTPatchGenerator.get_pos_encc                    s<  | j | jf kr| jS | jd| j | jddddd} fdd}| jr| jrtd}t	j
|dd|jd	d|  | }td
}| }t	t	j
|dd|jd	||  | }	||	 }
|d|	  }t	j|
|gdddd}t	j
|ddd|jd	d|  }t	jdd d |jdd | d d}t	jdd d |jdd d d d f |d d }t	j||gdd}|| | }|dd tj| |ddd|dddd|j}nt }tj| ||fddd|j}||}n||}|jdd   krtj|  ddd|j}|dddd}|S )Nr   r<   r   r   r   c                    s\    d | j d k r| dd  d d d f }  d | j d k r,| dd d d  d f } | S )Nr   .r   r<   )rA   )rh   rN   r   r   window_selectw  s
   z<ViTPatchGenerator._get_pos_embeddings.<locals>.window_selectg?r\   g      ?r=   )stepsr\   NNbilinearzerosT)gridr   padding_moder   )r   r   r   r   )rb   rc   rh   reshapepermuter`   r   rW   r   r3   r   r\   logexpstackclamp_linspacer@   mul_sub_r   grid_samplefloattor]   maxr   rA   flatten)r7   r   rN   rh   r   	min_scaler8   
aspect_min
aspect_maxaspectscale_xscale_yscale_xypos_xylin_xlin_ylin_xygrid_xymax_dimr   r   r   r   o  s   


z%ViTPatchGenerator._get_pos_embeddings)TFFNrL   Fr   NNFNNr   r   )rD   rE   rF   rG   r   rH   r   r0   r3   rI   listr   rC   rq   rr   propertyr   rU   r   r-   r   r1   r2   r   r   rs   rt   rw   r   rJ   r   r   r9   r   rK   l   s    	
H










"rK   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )rd   r   c                    s   t    || _d S r   r/   r0   r   )r7   r   r9   r   r   r0     s   

zIm2Patches.__init__r   r   c                 C   sb   | j dkr|d}|ddd}|S |jd | j  }|jd | j  }t|d|| j || j d}|S )Nr   r   r   r   r<   z*b c (py yy) (px xx) -> b (py px) (c yy xx))pyyypxxx)r   r   r   rA   r   )r7   r   ru   r   r   r   r   r   rC     s   

zIm2Patches.forward)	rD   rE   rF   rG   r0   r3   rI   rC   rJ   r   r   r9   r   rd     s    rd   c                       s,   e Zd Zddededef fddZ  ZS )rf   Fr   rM   r_   c                    s,   t  jd|d  |fd|i| || _d S )Nr   r   r_   r   )r7   r   rM   r_   rl   r9   r   r   r0     s   "
zViTPatchLinear.__init__)F)rD   rE   rF   rG   rH   r0   rJ   r   r   r9   r   rf     s    $rf   c                       s8   e Zd Z	ddejdejdB dejf fddZ  ZS )RadioParallelAttentionNr   	attn_maskr   c                    s   |d u r
t  |S |j\}}}| |\}}|jddd\}}}	| jr,| ||\}}|||| j| j	}|||| j| j	}|	||| j| j	}	dd |||	fD \}}}	t
j|||	|| jd}
|
dd||d}
| |
\}
}|
S )	Nr   r<   r=   c                 s   s    | ]	}| d dV  qdS )r   r   N)	transposer$   tr   r   r   r[     s    z1RadioParallelAttention.forward.<locals>.<genexpr>)r   r8   r   r   )r/   rC   rA   qkvchunkqk_normalization_apply_qk_normviewnum_heads_per_partitionhead_dimr   scaled_dot_product_attentionr8   r   r   proj)r7   r   r   BN_r   qkvr   r9   r   r   rC     s"   zRadioParallelAttention.forwardr   )rD   rE   rF   r3   rI   rC   rJ   r   r   r9   r   r     s    r   c                       <   e Zd Zd	 fddZ	d
dejdejdB fddZ  ZS )RadioVisionEncoderLayerr   Nc                       t  j|dti| d S )Nattn_cls)r/   r0   r   r7   argskwargsr9   r   r   r0        z RadioVisionEncoderLayer.__init__hidden_statesr   c                 C   s<   || j | ||d| j  }|| | || j  }|S N)r   )attnnorm1ls1mlpnorm2ls2)r7   r   r   r   r   r   rC     s   zRadioVisionEncoderLayer.forwardr   Nr   rD   rE   rF   r0   r3   rI   rC   rJ   r   r   r9   r   r         r   c                       r   )RadioVisionEncoderr   Nc                    r   )N	layer_cls)r/   r0   r   r   r9   r   r   r0     r   zRadioVisionEncoder.__init__inputs_embedsr   c                 C   s    |}| j D ]}|||d}q|S r   )layers)r7   r   r   r   encoder_layerr   r   r   rC     s   
zRadioVisionEncoder.forwardr   r   r   r   r   r9   r   r     r   r   c                       s   e Zd ZddgiZ		ddddddededB dedB d	ed
eddf fddZdee	eef B fddZ
dd Zdee	eef  dejdejfddZ	ddejdejdB dejfddZ  ZS )RadioInternVisionModelr   Nr    num_hidden_layers_overridenum_dummy_headsprefixconfigquant_configr  r  r  r   c             	      s   t    || _| t|j|j\| _| _| _	t
t|j|j |j }tdd |jD }t|j|j| j|d|jr?t|nd|jd| _t||||| dd| _d S )Nc                 s   s    | ]}|d  V  qdS )nameNr   r   r   r   r   r[   3  r^   z2RadioInternVisionModel.__init__.<locals>.<genexpr>Tr   )rN   rR   rQ   rU   r,   z.encoderr  r  r  r  r  )r/   r0   r  _init_img_size	to_2tupler   
image_sizeimg_size	grid_sizer6   rG   roundcpe_max_sizesetteachersrK   hidden_sizecls_token_per_teacherlenr,   patch_generatorr   encoder)r7   r  r  r  r  r  max_img_sizeunique_teachersr9   r   r   r0   !  s2   
	
zRadioInternVisionModel.__init__r  c                 C   sF   |d u rdS t |}tdd t||D }|d |d  }|||fS )N)NNNc                 S   s   g | ]\}}|| qS r   r   )r$   spr   r   r   r&   J  s    z9RadioInternVisionModel._init_img_size.<locals>.<listcomp>r   r   )r
  r   zip)r7   r   r  r  r6   r   r   r   r	  F  s   
z%RadioInternVisionModel._init_img_sizec                 C   s   | j S r   )
embeddingsr   r   r   r   get_input_embeddingsN  s   z+RadioInternVisionModel.get_input_embeddingsrm   r\   c                    sz   | j j}| j j t||} fdd|D }t|}tj||tj|d}d}|D ]}	||	 }
d|||
||
f< |
}q(|S )Nc                    s   g | ]}|  qS r   r   )r$   r   r   r   r   r&   X  s    zLRadioInternVisionModel.create_inter_image_attention_mask.<locals>.<listcomp>r   r   T)r  r   r   r'   sumr3   r   rH   )r7   rm   r\   r   seq_lenspatch_countstotal_patchesmask	start_idxpatch_countend_idxr   r  r   !create_inter_image_attention_maskQ  s   

z8RadioInternVisionModel.create_inter_image_attention_maskr   c                 C   sH   | j ||d}d }|d urt|dkr| j||jd}| j||d}|S )Nrn   r   r   )r   r   )r  r  r'  r\   r  )r7   r   rm   r   r   encoder_outputsr   r   r   rC   j  s   zRadioInternVisionModel.forwardr   r   )rD   rE   rF   packed_modules_mappingr   r   rG   r   r0   r   r	  r  r   r3   r\   rI   r'  FloatTensorrC   rJ   r   r   r9   r   r     sN    %
r   c                       s   e Zd ZddgiZ	ddddddededB dedB d	ed
eddf fddZ		dddde	j
dB de	j
dB de	j
dB dee	je	jf fddZdee fddZ	dde	j
deeeef  dB dee	je	jf fddZ  ZS )
RadioModelr   Nr   r   r  r  r  r  r  r  r   c                   sj   t    || _t|||||d| _d }|jr0tdd t|jD }|	 dkr0| 
d| || _d S )Nr  c                 S   s    g | ]\}}| d dr|qS )use_summaryT)get)r$   ir   r   r   r   r&     s     z'RadioModel.__init__.<locals>.<listcomp>r   summary_idxs)r/   r0   r  r   modelr  r3   tensor	enumeratenumelregister_bufferr/  )r7   r  r  r  r  r  r/  r9   r   r   r0     s"   
	
zRadioModel.__init__rn   pixel_valuespixel_embedsrm   c                C   s   | j ||d}| j||dS )Nrn   )r0  _extract_final)r7   r5  r6  rm   yr   r   r   rC     s   zRadioModel.forwardc                 C   sP  t  }t|  }t|trt| }nt|}|D ]\}}|ds%q|tdd  }|dv r2q|dr8qd }|drKd|ddd  }n@|dr\d|ddd  }n/|d	r|d}	t|	d
kr|	d }
d	|	dd  }|dv s|drqd|
 d| }|r||v r|| }t
|dt}||| || q|S )Nzradio_model.>   r/  zinput_conditioner.zmodel.patch_generator..r   r<   r   zmodel.blocks.r   r   >   r   r   )zls1.zls2.zmodel.encoder.layers.weight_loader)r  ra   named_parametersr   r   items
startswithr  splitjoingetattrr   add)r7   weightsloaded_paramsparams_dictweights_listr  weightsubvllm_keyparts	layer_idxsuffixparamr:  r   r   r   load_weights  sB   








zRadioModel.load_weightsr8  c                 C   s  | j jj}| j jj}| j jj}|d u r(|d d d |f }|d d |d f }nNg }g }	d}
t||D ]4}|d d |
| |
| | d d f }|| |d d |
|
| d d f }|	| |
|| 7 }
q3tj|	dd}tj|dd}| j	d ur|d d | j	f }n|}|
d|fS )Nr   r   r=   )r0  r  r   r   rU   r'   rx   r3   rB   r/  r   )r7   r8  rm   r   r   rU   all_summaryall_featall_patches	summariescurrent_posr6   ru   summary
bb_summaryr   r   r   r7    s.   


 


zRadioModel._extract_finalr   r   )rD   rE   rF   r)  r   r   rG   r   r0   r3   rI   r   r*  rC   r  rM  r   r7  rJ   r   r   r9   r   r+  z  sV    

2r+  )4rW   collections.abcr   	itertoolsr   typingr   r3   torch.nnr1   torch.nn.functional
functionalr   einopsr   transformersr   'vllm.model_executor.layers.quantizationr   -vllm.model_executor.model_loader.weight_utilsr   %vllm.model_executor.models.intern_vitr	   r
   r   rG   r   r   __annotations__r   rI   r   r   	to_1tupler
  	to_3tuple	to_4tuple	to_ntupler"   r   r'   Moduler(   rK   rd   Linearrf   r   r   r   r   r+  r   r   r   r   <module>   sB   

	&1  X^