o
    -i                  	   @  s  d dl mZ d dlZd dlmZmZmZmZ d dlm	Z	m
Z
 d dlZd dlZd dlZd dlmZ d dlm  mZ d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
l m!Z! d dl"m#Z# d dl"m$Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2m3Z3m4Z4m5Z5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z<m=Z=m>Z>m?Z? d dl@mAZA d dlBmCZCmDZDmEZEmFZF d dlGmHZHmIZI d dlJmKZKmLZLmMZMmNZNmOZOmPZP d dlQmRZR d dlSmTZT d dlUmVZV d dlWmXZX d dlYmZZZm[Z[ d dl\m]Z]m^Z^ dd l_m`Z` d{d'd(ZaG d)d* d*ejbZc		d|d}d/d0Zd	d~dd2d3Zed4Zfd5Zgd5Zhd6Zidd9d:Zjdd>d?Zk		@	dddKdLZlejmegejndModdddNZpejmehejndModdddNZqddSdTZreifddVdWZsddXdYZt		ddd\d]ZuG d^d_ d_ed`daZvG dbdc dcZwG ddde deZxG dfdg dgeMZyG dhdi dieKey ZzG djdk dke]Z{G dldm dmeLZ|G dndo doejbZ}G dpdq dqejbZ~G drds dsejbZG dtdu duejbZG dvdw dwejbZeAje|eyezdxG dydz dzejbe5e3e6e4ZdS )    )annotationsN)IterableIteratorMappingSequence)	AnnotatedAny	rearrange)BatchFeature)
TensorType)	TypedDictUnpack)
VllmConfig)ModelConfig)parallel_state)utils)MMEncoderAttention)ColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)QuantizationConfig)default_weight_loader)MultiModalEmbeddingsSupportsLoRASupportsMRoPESupportsMultiModal
SupportsPP)MultiModelKeys)	SiglipMLP)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFeatureSpecMultiModalFieldConfigMultiModalKwargsItems)	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)get_tokenizer)get_cached_tokenizer)patch_rope_parameters)IsaacConfigPixelShuffleSiglip2VisionConfig)TensorSchemaTensorShape   )is_vit_use_data_parallel	seq_sizestorch.Tensordevicetorch.devicereturn!tuple[torch.Tensor, torch.Tensor]c                 C  sZ   t jt| d t j|d}| d|dd< t| dkr |  nt jdt j|d}||fS )zACreate cumulative sequence lengths for variable-length attention.r:   )dtyper>   r   N)torchzeroslenint32cumsummaxtensor)r<   r>   
cu_seqlens
max_seqlen rL   ]/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/isaac.pycreate_cumulative_seq_lengthsN   s   rN   c                      s2   e Zd Zd fddZdd	d
ZdddZ  ZS )!Siglip2VariableSequenceEmbeddingsconfigr7   c                   sn   t    || _|j| _|j| _t|j| j | j | jdd| _|j	| _	t
| j	d | _t| j	| j| _d S )NF)
input_sizeoutput_sizereturn_bias      ?)super__init__rP   hidden_size	embed_dim
patch_sizer   num_channelspatch_embeddingnum_patchesintposition_embedding_sizenn	Embeddingposition_embedding)selfrP   	__class__rL   rM   rV   ]   s   
z*Siglip2VariableSequenceEmbeddings.__init__packed_seq_patches/tuple[torch.Tensor, torch.Tensor, torch.Tensor]r@   r=   c                 C  s   | j j| j| jddddd}|\}}}g }d}d}d}	|D ]K}
t|
d t|
d }}|dkrS|dkrStj|||f|||	d}|| j	|| 
dd}n|| j	| j| j 
ddd ||  }|| q"tj|dd	}|S )
N   r   r:   bilinearFT)sizemodealign_corners	antialiasdim)ra   weightreshaper^   permute	unsqueezer]   FinterpolaterX   	transposeappendrC   cat)rb   re   positional_embeddings_seq_patches
_seq_sizesspatial_shapespos_embeds_listrk   rl   rm   spatial_shapeheightwidthresized_pos_embed
pos_embedsrL   rL   rM   ry   m   sF   


	

z7Siglip2VariableSequenceEmbeddings.positional_embeddingsc           	      C  sb   |\}}}| j j}|j|j|jd}|  |}| |}| dkr+|d|d}|| }|S )Nr>   rB      rg   )	r[   rp   tor>   rB   ry   ro   viewrj   )	rb   re   seq_patchesr{   _spatial_shapestarget_weightpatch_embedsr   
embeddingsrL   rL   rM   forward   s   


z)Siglip2VariableSequenceEmbeddings.forward)rP   r7   )re   rf   r@   r=   )re   rf   )__name__
__module____qualname__rV   ry   r   __classcell__rL   rL   rc   rM   rO   \   s    
-rO   token_gridsscale_factorr]   torch.device | Nonec                 C  s<  |du r| j }t|}|dk rtdtj s@|dddf | dk r4|dddf | dk s@td| d|  g }d}t	|  | dd	D ]E\}\}}	tj
||tjd
| }
|
||	}
|
||	| |}
|
|| ||	| |}
|
dddd }
||
d||  ||7 }qOtj|dd}|S )u  
    Build a gather-index map that tells us, for every *output* token after
    pixel-shuffle, which `scale_factor**2` *input* tokens are being merged.

    Args
    ----
    seq_sizes     : (num_images,)  - #patches in each image (row-major order)
    token_grids   : (num_images,2) - (height, width) for every image
    scale_factor  : spatial down-scale factor (≥2)
    device        : (optional) overrides `seq_sizes.device`

    Returns
    -------
    gather_idx : (new_total_seq_len, scale_factor**2) int64 tensor.
                 gather_idx[i, j] is the *flat* index into the *original*
                 packed sequence for the j-th sub-patch that forms the
                 i-th output token.
    Nrh   u   `scale_factor` must be ≥ 2r   r:   z?Every (H,W) in `token_grids` must be divisible by scale_factor=z, got F)strictr   r   rg   rn   )r>   r]   
ValueErrorrC   compileris_compilingallAssertionErrortolistziparangeint64r   rr   
contiguousrw   rq   rx   )r<   r   r   r>   rgather_chunks
tok_offsetseq_lenhwgrid
gather_idxrL   rL   rM   create_pixel_shuffle_index_map   s:   
"
r   xc                 C  s   |   dk}|r| ddkrtd| d}n| }|d}t|}tj|dd}t||||jd}|| }	|		|	d|| | }
|rL|

d}
|
S )ao  Apply pixel shuffle to a packed vision sequence without unpacking per image.

    Args:
        x (`torch.Tensor`):
            Concatenated vision embeddings. Accepts `(seq_len, hidden_size)` or
            `(1, seq_len, hidden_size)` shapes produced by stacking image
            patches.
        token_grids (`torch.Tensor`):
            Integer tensor of shape `(num_images, 2)` whose rows give the
            `(height, width)` patch grid sizes corresponding to each image
            segment inside `x`.
        scale_factor (`int`, *optional*, defaults to 1):
            Spatial down-sampling factor specific to pixel shuffle. Values
            greater than one merge `scale_factor**2` neighboring patches into a
            single embedding channel-group.

    Returns:
        `torch.Tensor`: Pixel-shuffled embeddings with shape matching the input
        convention: `(seq_len, hidden_size * scale_factor**2)` when the input
        was 2D, or `(1, seq_len, hidden_size * scale_factor**2)` if the
        singleton batch dimension was present.

    Raises:
        ValueError: If more than one batch item is provided.
    r   r   r:   z3Packed sequence is expected to have batch_size == 1rg   rn   )r<   r   r   r>   )ro   rj   r   squeezer]   rC   prodr   r>   rq   rs   )r   r   r   keep_batch_dimx_rX   r   r<   r   gatheredoutrL   rL   rM   pixel_shuffle_varlen   s(   

r   i )rT   rT   rT   gp?arr
np.ndarrayc                 C  s:   | j jr| S z	| jdd | W S  ty   |   Y S w )a  Return *arr* itself if it is already writeable, otherwise try to flip the
    write flag in-place and finally fall back to `arr.copy()`.
    This guarantees the buffer handed to `torch.from_numpy()` is always
    writeable, silencing the PyTorch warning about undefined behaviour.
    T)write)flags	writeablesetflagsr   copy)r   rL   rL   rM   _make_writeable=  s   r   imagePIL.Image.Imagetorch.Tensor | Nonec                 C  sd   | j | j tkrtd| j  d| j dt d| jdkr| n| d}t|}t|}t	
|S )Nz	Image (w=z, h=z	) > MAX=``RGB)r   r   
MAX_PIXELSr   rk   convertnpasarrayr   rC   
from_numpy)r   imgr   rL   rL   rM   extract_image_pilP  s   

r   h㈵>image_heightimage_widthrY   max_num_patchesmin_num_patches
int | Noneepsfloatpixel_shuffle_scaletuple[int, int]c                 C  s  dd }|| }t | | | }	t||	}	t || | }
t||
}
|	| |
|  }|dur}||k r}d\}}|| |kri|| d }||| ||}|||||}|| ||  }||kra|}n|}|| |ks>|}||| ||}|||||}||fS ||kr|	|
fS |d d}}|| |kr|| d }||| ||}|||||}|| ||  }||kr|}n|}|| |ks|}||| ||}|||||}||fS )a  Compute a target resolution whose patch grid satisfies patching parametrization.

    Args:
        image_height (`int`):
            Height in pixels of the source image prior to any resizing.
        image_width (`int`):
            Width in pixels of the source image prior to any resizing.
        patch_size (`int`):
            Size of the square patch used by the vision encoder.
        max_num_patches (`int`):
            Upper bound on `(height / patch_size) * (width / patch_size)` after
            resizing.
        min_num_patches (`int`, *optional*):
            Lower bound on the number of patches. When provided the image will
            be scaled up if necessary.
        eps (`float`, *optional*, defaults to 1e-5):
            Convergence tolerance for the internal binary search to determine
            the target dimensions.
        pixel_shuffle_scale (`int`, *optional*, defaults to 1):
            Additional stride multiplier applied when pixel shuffle later
            reduces spatial resolution.

    Returns:
        `tuple[int, int]`: Height and width (in pixels) that are multiples of
        `patch_size * pixel_shuffle_scale` and respect both the maximum and
        optional minimum patch-count constraints.
    c                 S  s4   | | }|| }t || | }t||}t|S N)mathceilrH   r]   )scaleoriginal_sizerY   r   scaled_sizedivisorrL   rL   rM   get_scaled_image_size  s
   
zAget_image_size_for_max_num_patches.<locals>.get_scaled_image_sizeN)      ?g      Y@rh   
   r   )r   r   rH   )r   r   rY   r   r   r   r   r   r   adjusted_heightadjusted_widthr\   	scale_min	scale_maxr   target_heighttarget_widthrL   rL   rM   "get_image_size_for_max_num_patches[  sl   %

r   rB   rg   model_configr   vision_tokenstrc                 C  s>   | j p| j}tt|| j| j| jp| jd}|j|ddd S )N)tokenizer_modetrust_remote_coderevisionF)add_special_tokensr   )		tokenizermodelr4   r3   r   r   tokenizer_revisionr   encode)r   r   tokenizer_namer   rL   rL   rM   _resolve_vision_token_id  s   
r   r   c                 C  sB   t | s	|  } | | }t| j}t| j}|| | }|S )a  Standardize RGB images prior to patch extraction via rescaling and whitening.

    Args:
        image (`torch.Tensor`):
            Tensor with shape `(..., height, width, 3)` containing RGB values.
            The tensor is converted to floating point if needed.
        scale (`float`, *optional*, defaults to `VISION_SCALE`):
            Scalar multiplier applied before normalization.
    Returns:
        `torch.Tensor`: Normalized tensor with the same shape as the input and
        dtype `torch.float32`.
    )rC   is_floating_pointr   _MEAN_TENSORr   r>   _STD_TENSOR)r   r   rescaledmean_tensor
std_tensor
normalizedrL   rL   rM   prepare_image_tensor  s   
r   c                 C  s   | j \}}}}|| s|| rtd| j  d| d| ||| ||| ||}|dddddd	}|||| || || | }|S )
a  Convert normalized images into flattened ViT-style patches.

    Args:
        image (`torch.Tensor`):
            Tensor of shape `(num_images, height, width, channels)`.
        patch_size (`int`):
            Edge length of the square patches

    Returns:
        `torch.Tensor`:
            Patch tensor where each position stores the flattened pixels
            belonging to that patch.

    Raises:
        ValueError: If `height` or `width` is not divisible by `patch_size`.
    zDimensions of images z! are not divisible by patch_size=.r   r:   r   rh         )shaper   rq   rr   )r   rY   
num_imagesr   r   channelspatchesrL   rL   rM   patchify_vision  s2   
r   imagestuple[torch.Tensor, list[int]]c                 C  s   |   dkr| d} | dddd} | j\}}}}t||||||d\}}	tj| ||	fddd} | dddd} t| } t| |d	}
|
j\}}}}|dkrSd||gnd|| || g}|
|fS )
a  Resize, normalize, and patchify RGB images for the vision encoder.

    Args:
        images (`torch.Tensor`):
            Either `(height, width, channels)` for a single image or
            `(num_images, height, width, channels)` for a batch. Channels are
            expected to be RGB.
        patch_size (`int`):
            Edge length of square patches; implictly controls resize grid granularity.
        max_num_patches (`int`):
            Maximum number of patches allowed after resizing.
        min_num_patches (`int`, *optional*):
            Minimum number of patches. If provided, the routine upsamples images
            as needed to satisfy the lower bound.
        pixel_shuffle_scale (`int`, *optional*, defaults to 1):
            Pixel shuffle scale factor; influences the target grid that the
            function produces.

    Returns:
        `tuple[torch.Tensor, list[int]]`: A pair `(patches, dims_virtual)`
        where `patches` has shape `(num_images, target_h / patch_size, target_w
        / patch_size, channels * patch_size**2)` and `dims_virtual` encodes
        effective `(images, height, width)` dimensions after optional pixel
        shuffling.
    r   r   r:   rh   r   r   ri   F)rj   rk   rl   )rY   )	ro   rs   rr   r   r   rt   ru   r   r   )r   rY   r   r   r   _orig_height
orig_widthr   r   r   n_images	h_patches	w_patchesdims_virtualrL   rL   rM   process_vision_for_patches  s6   !



r  c                   @  s.   e Zd ZU ded< ded< ded< ded< dS )IsaacImageProcessorKwargsr]   rY   r   r   r   N)r   r   r   __annotations__rL   rL   rL   rM   r  h  s
   
 r  F)totalc                   @  s:   e Zd ZdZdZdZdZeZddgZ	dd Z
dddZdS )IsaacImageProcessor   i      rh   pixel_valuesimage_grid_thwc                 C  sB   | d| j| _| d| j| _| d| j| _| dd| _d S )NrY   vision_max_num_patchesvision_min_num_patchesr   rh   )poprY   r   r  r   r  r   rb   kwargsrL   rL   rM   rV   x  s   zIsaacImageProcessor.__init__r   list[torch.Tensor]return_tensorsstr | TensorType | Noner  !Unpack[IsaacImageProcessorKwargs]r@   r   c                 K  s   g }g }|D ]L}t |}t|| j| j| j| jd\}}	|d}|jd |jd |jd }
}}|
| }|||}d|
|g}t	
|d}|| || q|rdt	j|dd}t	j|dd}nt	dd}t	dd}t||d	|d
S )zEPreprocess images into format compatibile with vLLM input processing.)rY   r   r   r   r:   rg   r   rn   r   r  r  )datatensor_type)r   r  rY   r  r  r   rs   r   rq   rC   rI   rw   rx   emptyr   )rb   r   r  r  all_pixel_valuesall_image_gridsr   image_tensorr   r  hpwpro   current_num_patchesr  	dims_realr  final_pixel_valuesfinal_image_gridsrL   rL   rM   
preprocess  s<   

	"

zIsaacImageProcessor.preprocessN)r   r  r  r  r  r  r@   r   )r   r   r   rY   r   r   r   r  valid_kwargsmodel_input_namesrV   r(  rL   rL   rL   rM   r  o  s    
r  c                   @  s6   e Zd ZdZdddZddddZ				ddddZdS )IsaacProcessorz4Processor wrapper (tokenizer + IsaacImageProcessor).Nc                 K  s&   | dd| _|pt|| _|| _d S )Nimage_token<image>)r  r,  r  image_processorr   )rb   r.  r   r  rL   rL   rM   rV     s   
zIsaacProcessor.__init__r@   r   c                 K  s   i }|d url| j j|fi |}|d }|| |d urlt|ts%|g}| }| j jd }d}tt|D ]4}	| j	||	 v ra|| 
 | }
||	 | j	d|
 d||	< |d7 }| j	||	 v s@||	 dd||	< q7|d ur||| j|fi | t|S )Nr  rh   r   z<|placeholder|>r:   <|image_pad|>)r.  r(  update
isinstancelistr   r   rangerE   r,  r   replacer   r   )rb   textr   r  resultimage_inputsr  merge_lengthindexinum_image_tokensrL   rL   rM   __call__  s.   

zIsaacProcessor.__call__Fmessageslist[dict[str, Any]]tokenizebooladd_generation_promptr   c           
      K  s   g }|D ]K}d|v rJt |d trJg }|d D ] }|ddkr*||dd q|ddkr7|| j q|ddd|d}	||	 q|| q| jj|f||d	|S )
Ncontenttyper5   r   roleuser)rE  rB  )r?  rA  )r1  r2  getrw   r,  joinr   apply_chat_template)
rb   r=  r?  rA  r  processed_messagesmessage
text_partscontent_itemprocessed_messagerL   rL   rM   rI    s.   
z"IsaacProcessor.apply_chat_templateNN)r@   r   )FF)r=  r>  r?  r@  rA  r@  r@   r   )r   r   r   __doc__rV   r<  rI  rL   rL   rL   rM   r+    s    
 r+  c                   @  sP   e Zd ZdddZdddZdd	 ZdddZdddZdddZdddZ	dS ) IsaacProcessingInfor@   r6   c                 C  st   t | jdr7| j }tt|dd t|ddt|ddt|dd t|dd	t|d
dt|ddt|dd dS t S )Nget_hf_configvision_configvideo_patch_sizer  r  r  r  r   r:   max_sequence_lengthi @  r   r-  vision_attn_implementation)rS  vision_patch_sizer  r  r   rU  r   rV  )hasattrctxrR  r6   getattr)rb   original_configrL   rL   rM   rR    s*   




z!IsaacProcessingInfo.get_hf_configr+  c                 K  s0   |   }d|ji}|| | jjtfi |S )Nr,  )rR  r   r0  rY  get_hf_processorr+  )rb   r  	hf_configprocessor_kwargsrL   rL   rM   r\    s
   
z$IsaacProcessingInfo.get_hf_processorc                 C     | j jS r   )rY  r   rb   rL   rL   rM   r3   $     z!IsaacProcessingInfo.get_tokenizerr*   c                 C  s4   |   }tdd|j|j|j|jd\}}t||dS )Ni r   )r   r   )rR  r   rT  r  r  r   r*   )rb   r]  r   r   rL   rL   rM   !get_image_size_with_most_features'  s   
z5IsaacProcessingInfo.get_image_size_with_most_featuresr  c                 K  s   | j di |jS NrL   )r\  r.  r  rL   rL   rM   get_image_processor4  s   z'IsaacProcessingInfo.get_image_processorMapping[str, int | None]c                 C  s   dd iS )Nr   rL   r`  rL   rL   rM   get_supported_mm_limits7  ra  z+IsaacProcessingInfo.get_supported_mm_limitsr   r]   	mm_countsMapping[str, int]c                 C  s    |   }|j|jd  }d|iS )Nrh   r   )rR  r  r   )rb   r   rg  r]  num_vision_tokensrL   rL   rM   get_mm_max_tokens_per_item:  s
   z.IsaacProcessingInfo.get_mm_max_tokens_per_itemN)r@   r6   )r@   r+  )r@   r*   )r@   r  )r@   re  )r   r]   rg  rh  r@   rh  )
r   r   r   rR  r\  r3   rb  rd  rf  rj  rL   rL   rL   rM   rQ    s    




rQ  c                   @  s$   e Zd ZdddZ	ddddZdS )IsaacDummyInputsBuilderrg  rh  r@   r   c                 C  s$   | dd}| j }|j}|| S )Nr   r   )rG  infor\  r,  )rb   rg  r   hf_processorr,  rL   rL   rM   get_dummy_textG  s   
z&IsaacDummyInputsBuilder.get_dummy_textNr   r]   
mm_optionsMapping[str] | Noner&   c                 C  sB   | dd}| j \}}|r| dnd }d| j||||diS )Nr   r   )r   r   r   	overrides)rG  rl  rb  _get_dummy_images)rb   r   rg  ro  r   r   r   image_overridesrL   rL   rM   get_dummy_mm_dataO  s   z)IsaacDummyInputsBuilder.get_dummy_mm_data)rg  rh  r@   r   r   )r   r]   rg  rh  ro  rp  r@   r&   )r   r   r   rn  rt  rL   rL   rL   rM   rk  F  s    
rk  c                   @  s"   e Zd ZU dZded< ded< dS )IsaacImagePixelInputsaR  
    Schema for validating Isaac image inputs.

    Dimensions:
        - np: Number of patches
        - d: Patch dimension
        - ni: Number of images

    The schema enforces:
        - pixel_values must be 2D: (num_patches, patch_dim)
        - image_grid_thw must be 2D: (num_images, 3)
          where 3 represents [T, H, W]
    z/Annotated[torch.Tensor, TensorShape('np', 'd')]r  z-Annotated[torch.Tensor, TensorShape('ni', 3)]r  N)r   r   r   rP  r	  rL   rL   rL   rM   ru  d  s   
 ru  c                   @  s    e Zd ZdddZdddZdS )IsaacMultiModalProcessor	hf_inputsr   hf_processor_mm_kwargsMapping[str, object]r@   #Mapping[str, MultiModalFieldConfig]c                 C  s4   | dtd}|d}td|tddS )Nr  )r   r   rg   r   r  )rG  rC   r  r   r(   flat_from_sizesbatched)rb   rw  rx  r  image_grid_sizesrL   rL   rM   _get_mm_fields_config  s   
z.IsaacMultiModalProcessor._get_mm_fields_configmm_itemsr+   Mapping[str, Any]out_mm_kwargsr)   Sequence[PromptUpdate]c                   sF   | j jd
i |}t|dd}|d  d fdd}tdd|d	gS )Nr   rh   item_idxr]   c                   sJ   d |  }|d j }t|tjsJ t|   }d| }t|dS )Nr   r  r/  )r  r1  rC   Tensorr]   r   r1   select_text)r  out_itemgrid_thwfeature_size	repl_fullr8  r  rL   rM   get_replacement_isaac  s   
zKIsaacMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_isaacr   r-  )modalitytargetreplacementrL   )r  r]   )rl  rd  rZ  r/   )rb   r  rx  r  r.  r   r  rL   r  rM   _get_prompt_updates  s   
z,IsaacMultiModalProcessor._get_prompt_updatesN)rw  r   rx  ry  r@   rz  )r  r+   rx  r  r  r)   r@   r  )r   r   r   r~  r  rL   rL   rL   rM   rv  ~  s    
rv  c                      s<   e Zd Z	dddd fddZdddZdddZ  ZS )Siglip2VisionAttentionNrD  prefixrP   r7   quant_configQuantizationConfig | Noner  r   r@   Nonec             
     s   t    t }|rdnt | _t | _t	|j
|j| _t	|j| j| _t|j
| j|j|jd|| d|d| _t|j
|j
|| d|d| _t| j| j| jd | dd	| _d S )
Nr:   Tz	.qkv_proj)rW   	head_sizetotal_num_headstotal_num_kv_headsbiasr  r  
disable_tpz	.out_proj)rQ   rR   r  r  r  g      z.attn)	num_headsr  r   r  )rU   rV   r;   r   $get_tensor_model_parallel_world_sizetp_sizeget_tensor_model_parallel_ranktp_rank
dist_utilsdividerW   num_attention_headshidden_size_per_attention_head!num_attention_heads_per_partitionr   qkv_projr   out_projr   attn)rb   rP   r  r  use_data_parallelrc   rL   rM   rV     sH   


zSiglip2VisionAttention.__init__qkvr=   tuple[torch.Tensor, ...]c                   sX   |j \}}}|jddd\}}}||| j| jf  fdd|||fD \}}}|||fS )Nr   rh   rn   c                 3  s    | ]}|j   V  qd S r   )r   ).0r   	new_shaperL   rM   	<genexpr>      z3Siglip2VisionAttention.split_qkv.<locals>.<genexpr>)r   chunkr  r  )rb   r  r   bsr   qkvrL   r  rM   	split_qkv  s   
z Siglip2VisionAttention.split_qkvhidden_statesrJ   rK   r   c                C  s   |j \}}}|dkrtdt|d}| |\}}| |\}}}	dd |||	fD \}}}	| j|||	||d}
t|
d }
| |
\}}t|d}|S )	Nr:   z5packed variable-length attention expects batch_size=1zb s d -> s b dc                 s  s    | ]}t |d V  qdS )zs b h d -> b s h dNr	   )r  trL   rL   rM   r    r  z1Siglip2VisionAttention.forward.<locals>.<genexpr>)querykeyvaluerJ   rK   zb s h d -> s b (h d)zs b d -> b s d)r   r   r
   r  r  r  r   r  )rb   r  rJ   rK   
batch_sizer   r   r  r  r  context_layeroutputrL   rL   rM   r     s$   

zSiglip2VisionAttention.forwardr   rP   r7   r  r  r  r   r@   r  )r  r=   r@   r  r  r=   rJ   r=   rK   r   r@   r=   )r   r   r   rV   r  r   r   rL   rL   rc   rM   r    s    
0r  c                      s2   e Zd Z	dddd fddZdddZ  ZS )Siglip2EncoderLayerNrD  r  rP   r7   r  r  r  r   r@   r  c                  sj   t    |j| _tj| j|jd| _t||| dd| _	tj| j|jd| _
t||| dd| _d S )Nr   z
.self_attnr  r  z.mlp)rU   rV   rW   rX   r_   	LayerNormlayer_norm_epslayer_norm1r  	self_attnlayer_norm2r    mlprb   rP   r  r  rc   rL   rM   rV     s   
zSiglip2EncoderLayer.__init__r  r=   rJ   rK   r   c                C  sJ   |}|  |}| j|||d}|| }|}| |}| |}|| }|S )N)r  rJ   rK   )r  r  r  r  )rb   r  rJ   rK   residualrL   rL   rM   r     s   


zSiglip2EncoderLayer.forwardr   r  r  r   r   r   rV   r   r   rL   rL   rc   rM   r    s    r  c                      s:   e Zd Z	dddd fddZddddddZ  ZS )Siglip2EncoderNrD  r  rP   r7   r  r  r  r   r@   r  c                  s8   t     | _t fddt jD | _d S )Nc                   s$   g | ]}t   d | dqS )z.layers.r  )r  )r  	layer_idxrP   r  r  rL   rM   
<listcomp>A  s    z+Siglip2Encoder.__init__.<locals>.<listcomp>)rU   rV   rP   r_   
ModuleListr3  num_hidden_layerslayersr  rc   r  rM   rV   7  s   

zSiglip2Encoder.__init__rJ   rK   inputs_embedsr=   rJ   r   rK   c                C  s"   |}| j D ]	}||||d}q|S )Nr  )r  )rb   r  rJ   rK   r  encoder_layerrL   rL   rM   r   K  s   
zSiglip2Encoder.forwardr   r  )r  r=   rJ   r   rK   r   r@   r=   r  rL   rL   rc   rM   r  6  s    r  c                      s8   e Zd Z		dd fd	d
ZdddZdddZ  ZS )Siglip2VisionTransformerNrD  rP   r7   r  r  r  r   c                   sZ   t    || _|| _|j}t|| _|j| _t||| dd| _	t
j||jd| _d S )Nz.encoderr  r  )rU   rV   rP   r  rW   rO   r   pixel_shuffle_scale_factorr  encoderr_   r  r  post_layernorm)rb   rP   r  r  rX   rc   rL   rM   rV   ]  s   

z!Siglip2VisionTransformer.__init__re   rA   r@   r=   c                 C  s   |\}}t j|dd}| |||f}|d}t||j\}}| j|||d}| |}| jdkr:t	||| jd}|
d}|S )z
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width)
            of the input images.
        rg   rn   r   )r  rJ   rK   r:   )r   r   r   )rC   r   r   rs   rN   r>   r  r  r  r   r   )rb   re   r   r   r<   r  rJ   rK   rL   rL   rM   r   q  s*   




z Siglip2VisionTransformer.forwardweights"Iterable[tuple[str, torch.Tensor]]set[str]c                 C  s   g d}t |  }t }|D ]9\}}|D ]\}}}	||vrq|||}|| }
|
j}||
||	  n|| }
t|
dt}||
| || q|S )N))r  q_projr  )r  k_projr  )r  v_projr  weight_loader)dictnamed_parameterssetr4  r  rZ  r   add)rb   r  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr  rL   rL   rM   load_weights  s"   
z%Siglip2VisionTransformer.load_weightsNrD  )rP   r7   r  r  r  r   re   rA   r@   r=   r  r  r@   r  )r   r   r   rV   r   r  r   rL   rL   rc   rM   r  \  s    
*r  c                      s.   e Zd Z		dd fddZdddZ  ZS )IsaacVisionEmbeddingNrD  
vision_cfgr7   
hidden_dimr]   
output_dimr  r  r  r   c                   sn   t    t||t|dd| _t|d| d|t|ddd| _t | _	t
d| |d|t|ddd| _d S )N0r  r   F1)r  r  r  rS   3)rU   rV   r  r$   transformerr   
linear_fc1r_   SiLUactr   
linear_fc2)rb   r  r  r  r  r  rc   rL   rM   rV     s.   

zIsaacVisionEmbedding.__init__re   rA   r@   r=   c                 C  s,   |  |}| |}| |}| |}|S r   )r  r  r  r  )rb   re   r  rL   rL   rM   r     s
   



zIsaacVisionEmbedding.forwardr  )
r  r7   r  r]   r  r]   r  r  r  r   r  r  rL   rL   rc   rM   r    s
     r  )rl  dummy_inputsc                      s   e Zd Zg dddgdZdZeddddd	d
ddddd
dZedKddZdddL fddZ	dMd#d$Z
dNd&d'ZdOd+d,ZdPd0d1ZdQd3d4Z	5	5dRdSd>d?ZdTdAdBZdUdFdGZdVdIdJZ  ZS )WIsaacForConditionalGeneration)r  r  r  	gate_projup_proj)r  gate_up_projTzlanguage_model.lm_head.zlanguage_model.model.zvision_embedding.transformerzvision_embedding.linear_fc1zvision_embedding.actvision_embedding.linear_fc2zvision_embedding.)
zlm_head.zmodel.text_model.lm_head.zmodel.text_model.zmodel.vision_embedding.0zmodel.vision_embedding.1zmodel.vision_embedding.2zmodel.vision_embedding.3zmodel.vision_embedding.zmodel.lm_head.zmodel.)orig_to_new_prefixr  r   r:  r]   r@   
str | Nonec                 C  s   | drdS td)Nr   r-  z Only image modality is supported)
startswithr   )clsr  r:  rL   rL   rM   get_placeholder_str  s   
z1IsaacForConditionalGeneration.get_placeholder_strr   r  vllm_configr   r  c             	     s  t    |jj}|j}|| _|j}|d |d |d g}t|j|j| _	| j	|_
t|dd }|d ur:t|ts:|n|}t|dd }	|	d u rP||u rPt|dd }	t| |j}
||
d< |	d urkd|	v rk|
d|	d  |
|_| | t|dgt|d	d
| _W d    n1 sw   Y  | jj| _|j}|d u rtd|jd ur|jnt|dd }|d ur||_|j|jd  }| |d t|||j|t|dd| _W d    d S 1 sw   Y  d S )Nr      text_configrope_scaling_rope_scalingmrope_sectionmrope_interleavedQwen3ForCausalLMlanguage_model)r	  architecturesr  z,IsaacConfig should always have vision_config_attn_implementationrh   r   vision_embedding)r  r  r  r  r  )rU   rV   r   r]  r  rP   head_dimr   r   vision_token_idimage_token_idrZ  r1  r  r5   rope_parameters
setdefault_mark_language_modelr#   r$   r  make_empty_intermediate_tensorsrS  r   rV  r  rW   r  _mark_tower_modelr  r  )rb   r	  r  rP   r  r  calculated_mrope_sectiontext_cfg
target_cfgr  r  r  	attn_implr  rc   rL   rM   rV     sr   




"z&IsaacForConditionalGeneration.__init__input_tokens	list[int]mm_featureslist[MultiModalFeatureSpec]Iterator[tuple[int, int, int]]c           	      c  s    | j jj}t|dd dD ]3}|jj}|jdkr:|jd j \}}}|dks/J d| ||| || fV  qt	d|j d S )	Nc                 S  r_  r   )mm_positionoffset)frL   rL   rM   <lambda>Z  s    z?IsaacForConditionalGeneration.iter_mm_grid_hw.<locals>.<lambda>)r  r   r  r:   zImage must have 1 frame, got zUnsupported modality: )
rP   rS  r  sortedr&  r'  r  r  r   r   )	rb   r!  r#  spatial_merge_size
mm_featurer'  r  r   r   rL   rL   rM   iter_mm_grid_hwV  s   

z-IsaacForConditionalGeneration.iter_mm_grid_hwtuple[torch.Tensor, int]c                 C  sN  g }d}|  ||D ]R\}}}|| }t|dkr!|d  d nd}	|tt|d|f|	  td||fdd}
|
dd d f | |	 |
dd d f< ||
 |||  }q
|t|k rt|dkrq|d d d nd}	t|| }|tt|d|f|	  tj	|dddd}| d t| 
 }t||fS )Nr   rg   r:   r   )r   rg   )axis)r-  rE   rH   rw   r   broadcast_tor   indicesrq   concatenateitemrC   r   )rb   r!  r#  llm_pos_ids_liststr'  
llm_grid_h
llm_grid_wtext_lenst_idxgrid_indicesllm_positionsmrope_position_deltarL   rL   rM   get_mrope_input_positionsc  s.    $
 z7IsaacForConditionalGeneration.get_mrope_input_positionsr  objectIsaacImagePixelInputs | Nonec                 K  s4   | d}| d}|d u s|d u rd S t||dS )Nr  r  r  )rG  ru  )rb   r  r  r  rL   rL   rM   _parse_and_validate_image_input  s   

z=IsaacForConditionalGeneration._parse_and_validate_image_inputimage_inputru  r  c           
      C  s   |d }|d }|  dkrdS t| j j}| jjjj}|j	||d}|d d ddf j	|t
jd}| ||f}| jjj}|d	||  }	t||	 S )
Nr  r  r   rL   r   r:   r   r   rg   )numelnextr  
parametersr>   r  r  rp   rB   r   rC   rF   rP   rS  r  r   tuplesplitr   )
rb   rA  r  r  r>   rB   spatial_gridsvision_embeddings
merge_sizesizesrL   rL   rM   _process_image_input  s    
z2IsaacForConditionalGeneration._process_image_inputMultiModalEmbeddings | Nonec                 K  s&   | j di |}|d u rdS | |S rc  )r@  rK  )rb   r  rA  rL   rL   rM   embed_multimodal  s   
z.IsaacForConditionalGeneration.embed_multimodalN	input_idsr=   	positionsintermediate_tensorsIntermediateTensors | Noner  r   "torch.Tensor | IntermediateTensorsc                 K  s   | j d||||d|S )N)rN  rO  rP  r  rL   )r  )rb   rN  rO  rP  r  r  rL   rL   rM   r     s   z%IsaacForConditionalGeneration.forwardr  c                 C  s   | j |S r   )r  compute_logits)rb   r  rL   rL   rM   rS    s   z,IsaacForConditionalGeneration.compute_logitsr  r  r  c                 C  s   t | }|j|| jdS )N)mapper)r!   r  hf_to_vllm_mapper)rb   r  loaderrL   rL   rM   r    s   z*IsaacForConditionalGeneration.load_weightsr   c                 C  s   t jddddS )z<
        Get the module prefix in multimodal models
        r  r  r  )r  	connectortower_model)r   from_string_fieldr`  rL   rL   rM   get_mm_mapping  s
   z,IsaacForConditionalGeneration.get_mm_mapping)r  r   r:  r]   r@   r  )r	  r   r  r   )r!  r"  r#  r$  r@   r%  )r!  r"  r#  r$  r@   r.  )r  r>  r@   r?  )rA  ru  r@   r  )r  r>  r@   rL  rO  )rN  r=   rO  r=   rP  rQ  r  r   r  r>  r@   rR  )r  r=   r@   r   r  )r@   r   )r   r   r   packed_modules_mappingsupports_encoder_tp_datar"   rU  classmethodr  rV   r-  r=  r@  rK  rM  r   rS  r  rZ  r   rL   rL   rc   rM   r    sD    	
G

!




r  )r<   r=   r>   r?   r@   rA   )r:   N)
r<   r=   r   r=   r   r]   r>   r   r@   r=   )r:   )r   r=   r   r=   r   r]   r@   r=   )r   r   r@   r   )r   r   r@   r   )Nr   r:   )r   r]   r   r]   rY   r]   r   r]   r   r   r   r   r   r]   r@   r   )r   r   r   r   r@   r]   )r   r=   r   r   r@   r=   )r   r=   rY   r]   r@   r=   )Nr:   )r   r=   rY   r]   r   r]   r   r   r   r]   r@   r   )
__future__r   r   collections.abcr   r   r   r   typingr   r   numpyr   	PIL.ImagePILrC   torch.nnr_   torch.nn.functional
functionalrt   einopsr
   #transformers.image_processing_utilsr   transformers.tokenization_utilsr   typing_extensionsr   r   vllm.configr   vllm.config.modelr   vllm.distributedr   r   r  9vllm.model_executor.layers.attention.mm_encoder_attentionr   !vllm.model_executor.layers.linearr   r   r   r   'vllm.model_executor.layers.quantizationr   -vllm.model_executor.model_loader.weight_utilsr   %vllm.model_executor.models.interfacesr   r   r   r   r   )vllm.model_executor.models.module_mappingr   !vllm.model_executor.models.siglipr     vllm.model_executor.models.utilsr!   r"   r#   r$   vllm.multimodalr%   vllm.multimodal.inputsr&   r'   r(   r)   vllm.multimodal.parser*   r+   vllm.multimodal.processingr,   r-   r.   r/   r0   r1   vllm.sequencer2   vllm.tokenizersr3   vllm.tokenizers.hfr4   vllm.transformers_utils.configr5   vllm.transformers_utils.configsr6   r7   vllm.utils.tensor_schemar8   r9   visionr;   rN   ModulerO   r   r   r   VISION_MEAN
VISION_STDVISION_SCALEr   r   r   rI   float32r   r   r   r   r   r   r  r  r  r+  rQ  rk  ru  rv  r  r  r  r  r  register_processorr  rL   rL   rL   rM   <module>   s    
VED

i

-NHKD.Z0&[+

