o
    پir                  
   @   s   d dl mZmZ d dlmZmZ d dlZd dlmZ d dl	m
Z
 e
eZededZG dd	 d	eee Zd
ejeej B dee dB dejjdB dedejf
ddZdS )    )ABCabstractmethod)GenericTypeVarN)PretrainedConfig)init_logger_C)boundc                       s   e Zd Zdeddf fddZedededefdd	Zedefd
dZedefddZ	edefddZ
edefddZ  ZS )VisionEncoderInfovision_configreturnNc                    s   t    || _d S N)super__init__r   )selfr   	__class__ h/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/runtime/models/encoders/vision.pyr      s   

zVisionEncoderInfo.__init__image_widthimage_heightc                C      t r   NotImplementedError)r   r   r   r   r   r   get_num_image_tokens   s   z&VisionEncoderInfo.get_num_image_tokensc                 C   r   r   r   r   r   r   r   get_max_image_tokens#      z&VisionEncoderInfo.get_max_image_tokensc                 C   r   r   r   r   r   r   r   get_image_size'   r   z VisionEncoderInfo.get_image_sizec                 C   r   r   r   r   r   r   r   get_patch_size+   r   z VisionEncoderInfo.get_patch_sizec                 C   r   r   r   r   r   r   r   get_patch_grid_length/   r   z'VisionEncoderInfo.get_patch_grid_length)__name__
__module____qualname__r   r   r   intr   r   r   r   r    __classcell__r   r   r   r   r
      s$    r
   encoder_outputsfeature_sample_layerspost_layer_normmax_possible_layersr   c                    s   |du r|dur| S  S t  d }||  fdd|D }|d t |d dfv }|dur:|r:| |d< tj|ddS )a^  Given the outputs a visual encoder module that may correspond to the
    output of the last layer, or a list of hidden states to be stacked,
    handle post normalization and resolve it into a single output tensor.

    Args:
        encoder_outputs: Output of encoder's last layer or all hidden states.
        feature_sample_layers: Optional layer indices to grab from the encoder
            outputs; if provided, encoder outputs must be a list.
        post_layer_norm: Post norm to apply to the output of the encoder.
        max_possible_layers: Total layers in the fully loaded visual encoder.

    N   c                    s(   g | ]}|d kr | n |  qS )r   r   ).0	layer_idxr&   offsetr   r   
<listcomp>S   s    
z2resolve_visual_encoder_outputs.<locals>.<listcomp>)dim)lentorchcat)r&   r'   r(   r)   num_loaded_layershs_pooluses_last_layerr   r-   r   resolve_visual_encoder_outputs4   s   
r8   )abcr   r   typingr   r   r3   transformersr   1sglang.multimodal_gen.runtime.utils.logging_utilsr   r!   loggerr   r
   Tensorlistr$   nn	LayerNormr8   r   r   r   r   <module>   s&   !

