o
    i&                     @   sL  d Z ddlZddlmZ ddlmZ ddlZddlZddl	m
  mZ ddlm
Z
 ddlmZ ddlmZ ee
jdd	Zd
ejdejeB dejfddZ	d!dedejdeeef dejfddZ	d!dedejdeeef dejfddZ		d"dedeeeef B dedeeef dejf
ddZG dd de
jZG dd  d eZdS )#z
Shared resampler perceiver network used in multimodal models and
related helpers for sincos positional embeddings.

Example models: Qwen (Qwen-VL), MiniCPM-V 2.0
    N)Callable)partial)nn)ReplicatedLinear)QuantizationConfiggư>)epsabs_postgt_sizereturnc                 C   s   t t| d}| j}t|t r||f}||d kr$||d kr$| S tj|  	d||d
dddd|d |d fddd
ddddddj|d	S )
Nr            bicubicF)sizemodealign_cornersdtype)intmathsqrtr   r   
isinstanceFinterpolatefloatreshapepermuteflattento)r   r	   src_sizer    r!   Z/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/layers/resampler.pyget_abs_pos3   s"   
r#   r   r   	embed_dimposversionc                 C   s   | d dksJ t j| d t jd}|| d  }dd|  }|dkrC|d}t d	||}t |}t |}t j||gd
d}|S t d||}t |}t |}t j||gdd}|S )z
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,) / (H, W)
    out: (M, D) / (H, W, D)
    r   r   r   g       @g      ?i'  r$   r   zm,d->mdr   axisz	hw,d->hwd)nparangefloat32r   einsumsincosconcatenate)r%   r&   r'   omegaoutemb_sinemb_cosembr!   r!   r"   !get_1d_sincos_pos_embed_from_gridL   s    




r6   gridc                 C   sl   | d dksJ t | d |d |}t | d |d |}|dkr+tj||gdd}|S tj||gdd}|S )Nr   r   r   r$   r(   r   )r6   r*   r0   )r%   r7   r'   emb_hemb_wr5   r!   r!   r"   !get_2d_sincos_pos_embed_from_gridg   s   r:   F	grid_size	cls_tokenc           
      C   s   t |tr||}}n	|d |d }}tj|tjd}tj|tjd}t||}tj|dd}t |tjr?|jd||fksAJ |dkrf|	dd||g}t
| ||}	|rdtjtd| g|	gdd}	|	S t
| ||}	|	S )z
    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or
                [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    r   r   r   r(   r   r$   )r   r   r*   r+   r,   meshgridstackndarrayshaper   r:   r0   zeros)
r%   r;   r<   r'   grid_h_sizegrid_w_sizegrid_hgrid_wr7   	pos_embedr!   r!   r"   get_2d_sincos_pos_embed{   s    
 rG   c                       st   e Zd ZdZdedddfdededededB d	eegejf d
e	de
dB deddf fddZdefddZ  ZS )BaseResamplerz
    A 2D perceiver-resampler network with one cross attention layers by
        (grid_size**2) learnable queries and 2d sincos pos_emb.
    Outputs:
        A tensor with the shape of (grid_size**2, embed_dim)
    NT num_queriesr%   	num_headskv_dim
norm_layerdo_post_projectionquant_configprefixr
   c	           
         s   t    || _|| _|| _tt| j|| _	|d ur/||kr/t
||d|| dd| _ndd | _t||| _||| _||| _|| _| jrc||| _|d t|| }	tj|	d| _d S d S )NFz.kv_proj)biasrO   rP   c                  _   s   t  | i |d fS )N)r   Identity)argskwargsr!   r!   r"   <lambda>   s   z(BaseResampler.__init__.<locals>.<lambda>g      )data)super__init__rJ   r%   rK   r   	Parametertorchemptyqueryr   kv_projMultiheadAttentionattnln_qln_kvrN   ln_postproj)
selfrJ   r%   rK   rL   rM   rN   rO   rP   rV   	__class__r!   r"   rX      s.   


	


zBaseResampler.__init__Nc                 C   s   | dd|dS )Nr   )	unsqueezerepeat)rd   r\   rg   r!   r!   r"   _repeat   s   zBaseResampler._repeat)__name__
__module____qualname____doc__
DEFAULT_LNr   r   r   	LayerNormboolr   strrX   rj   __classcell__r!   r!   re   r"   rH      s6    	
*rH   c                       s   e Zd ZdZdeddddfdededed	edB d
eegejf de	de	de
dB deddf fddZ		ddejdejdB dejdB dejfddZ  ZS )
Resampler2aX  Resampler-perceiver network to be used for a variety of model types,
    e.g., Qwen-vl / Minicpmv 2.0. The main difference is the addition of the
    do_post_projection arg, which indicates whether or not there should be
    a post layer normalization and projector after the attention. This is
    present in minicpmv2.0, but not qwen-vl.
    NFTrI   r;   r%   rK   rL   rM   adaptiverN   rO   rP   r
   c
              
      sP   t  j|d |||||||	d || _t||dd}
tt|
d| _	d S )Nr   )rN   rO   rP   r$   r'   F)
rW   rX   ru   rG   r   rY   rZ   
from_numpyrequires_grad_rF   )rd   r;   r%   rK   rL   rM   ru   rN   rO   rP   pos_embed_arrre   r!   r"   rX      s   
zResampler2.__init__x	tgt_sizes	attn_maskc           
      C   s   |d u rt t|d}| jr&t| j|dd}t|j	|j
|jd}nt| j|j	|j
|jd}| |\}}| |ddd}|jd }| | j}| j| ||| jd ||d ||dd }	|	ddd}| jr}| |}|| j }|S )Nr   r$   rv   )devicer   r   r   )r|   )r   r   r   r   ru   rG   r%   rZ   rw   r   r}   r   r#   rF   r]   ra   r   r@   r`   r\   r_   rj   rh   rN   rb   rc   )
rd   rz   r{   r|   ry   rF   _rg   qr2   r!   r!   r"   forward   s:   



zResampler2.forward)NN)rk   rl   rm   rn   ro   r   r   r   rp   rq   r   rr   rX   rZ   Tensorr   rs   r!   r!   re   r"   rt      sP    	
!rt   )r$   )Fr$   )rn   r   collections.abcr   	functoolsr   numpyr*   rZ   torch.nn.functionalr   
functionalr   !vllm.model_executor.layers.linearr   'vllm.model_executor.layers.quantizationr   rp   ro   r   r   r#   r?   tupler6   r:   rq   rG   ModulerH   rt   r!   r!   r!   r"   <module>   s^    





!6