o
    پi*                     @   s   d Z ddlZddlmZmZmZmZ ddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlm Z  e!e"Z#G dd dej$Z%e%Z&dS )zJInference-only Sarashina2Vision model compatible with HuggingFace weights.    N)IterableListOptionalTuple)nn)LlamaConfig)LogitsProcessor)PoolerPoolingType)QuantizationConfig)MultimodalDataItemMultimodalInputs/MultiModalityDataPaddingPatternMultimodalTokensgeneral_mm_embed_routine)ForwardBatch)default_weight_loader)LlamaForCausalLM)Qwen2VisionTransformer)
add_prefixc                       s   e Zd ZdZ		ddee deddf fddZd	ee	 d
e
fddZdd ZdejdejdejfddZdee dejfddZ	d d	ejdejdededejf
ddZdeeeejf  fddZ  ZS )!Sarashina2VisionForCausalLMz
    Sarashina2Vision model that combines:
    - Llama text backbone (sbintuitions/sarashina2-7b)
    - Qwen2VL vision encoder
    N quant_configprefixreturnc                    s:  t    || _t|d|}t|dd }|d ur)t|t|dd|td|d| _nd | _t|j	| _
t|drZ|jdkrZtdi |j}t|d	rN|j|_t||td
|d| _nt|d	rc|j|_t||td
|d| _t|dd| _t|dd| _t|dd| _t|d	r|j| jj_t|| _ttjdd| _d S )Ntext_configvision_configrms_norm_epsgh㈵>visual)norm_epsr   r   
model_typellama
vocab_sizellm)r   r   image_token_index   start_image_token_indexi end_image_token_indexi T)pooling_type	normalize )super__init__configgetattrr   r   r   r   	LayerNormhidden_sizenormhasattrr   r   __dict__r!   r   r"   r#   r%   r&   r   logits_processorr	   r
   LASTpooler)selfr,   r   r   r   r   llama_config	__class__r)   W/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/sarashina2_vision.pyr+   0   sL   







z$Sarashina2VisionForCausalLM.__init__	input_ids	mm_inputsc                 C   s   t  }|||S )z@Pad input tokens with multimodal data hashes for RadixAttention.)r   pad_input_tokens)r6   r;   r<   patternr)   r)   r:   pad_input_idso   s   z)Sarashina2VisionForCausalLM.pad_input_idsc                 C   s
   | j  S )z-Get input embeddings from the language model.)r"   get_input_embeddings)r6   r)   r)   r:   r@   t   s   
z0Sarashina2VisionForCausalLM.get_input_embeddingspixel_valuesimage_grid_thwc                 C   s(   | j du r	td|  ||}| |S )z6Extract image embeddings using the vision transformer.NVisual encoder not initialized)r   
ValueErrorr0   )r6   rA   rB   hidden_statesr)   r)   r:   get_image_embedsx   s   

z,Sarashina2VisionForCausalLM.get_image_embedsitemsc                 C   s   | j du r	tdtjdd |D dd| j j}tjdd |D dd}| dks2J | | dks>J | | ||S )	z0Extract image features for SGLang compatibility.NrC   c                 S      g | ]}|j qS r)   )feature.0itemr)   r)   r:   
<listcomp>       zASarashina2VisionForCausalLM.get_image_feature.<locals>.<listcomp>r   dimc                 S   rH   r)   )rB   rJ   r)   r)   r:   rM      rN      )r   rD   torchcattypedtyperP   rF   )r6   rG   rA   rB   r)   r)   r:   get_image_feature   s   
z-Sarashina2VisionForCausalLM.get_image_featureF	positionsforward_batchget_embeddingc                 C   s:   t ||| jj| |d}|r| ||S | ||| jj|S )zForward pass through the model.)r;   rX   language_modelmultimodal_modelrW   )r   r"   modelr5   r3   lm_head)r6   r;   rW   rX   rY   rE   r)   r)   r:   forward   s   	z#Sarashina2VisionForCausalLM.forwardweightsc                 C   s  t |  }t }i }i }|D ]\}}d|v r6|dd}||v r6|| }	t|	dt}
|
|	| || qd|v rO|dd}||i ||< ||| d< qd|v rh|d	d}||i ||< ||| d
< qd|v r|dd}||i ||< ||| d< qd|v r|dd}||i ||< ||| d< qd|v r|dd}||i ||< ||| d< q||v r|| }	t|	dt}
|
|	| || q| D ]I\}}d|v rd
|v rd|v r| d}||v r|d |d
 |d }}}t	j
|||gdd}|| }	t|	dt}
|
|	| || q| D ]?\}}d|v r\d|v r\| d}||v r\|d |d }}t	j
||gdd}|| }	t|	dt}
|
|	| || qdS )zLoad model weights.z
.attn.qkv.z.attn.qkv_proj.weight_loaderz.self_attn.q_proj.weightz.q_proj.weightr   qz.self_attn.k_proj.weightz.k_proj.weightkz.self_attn.v_proj.weightz.v_proj.weightvz.mlp.gate_proj.weightz.gate_proj.weightgatez.mlp.up_proj.weightz.up_proj.weightupz.qkv_proj.weightr   rO   z.gate_up_proj.weightN)dictnamed_parameterssetreplacer-   r   addgetrG   rR   rS   )r6   r_   params_dictloaded_paramsqkv_weightsgate_up_weightsnameloaded_weightmapped_nameparamr`   baseweights_dictqkv_namera   rb   rc   qkvgate_up_namerd   re   gate_upr)   r)   r:   load_weights   s   











z(Sarashina2VisionForCausalLM.load_weights)Nr   )F)__name__
__module____qualname____doc__r   r   strr+   r   intr   r?   r@   rR   TensorrF   r   rV   r   boolr^   r   r   rz   __classcell__r)   r)   r8   r:   r   )   sD    	?

$r   )'r~   loggingtypingr   r   r   r   rR   r   transformersr   "sglang.srt.layers.logits_processorr   sglang.srt.layers.poolerr	   r
   *sglang.srt.layers.quantization.base_configr   sglang.srt.managers.mm_utilsr   r   r   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.models.llamar   sglang.srt.models.qwen2_vlr   sglang.srt.utilsr   	getLoggerr{   loggerModuler   
EntryClassr)   r)   r)   r:   <module>   s&   
 d