o
    پi1                     @   s   d Z ddlmZ ddlmZmZmZ ddlZddlm	Z	 ddl
mZ ddlmZmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZ ddlmZ G dd de	jZeZ dS )a  
Support for lightonai/LightOnOCR-2-1B.

LightOnOCR is a vision-language OCR model that combines:
- Pixtral vision encoder (24 layers, 1024 hidden dim)
- Spatial merge projection with RMSNorm + PatchMerger (2x2 = 4x token reduction)
- Qwen3 language decoder (28 layers, 1024 hidden dim)

Key differences from PixtralForConditionalGeneration:
- Uses Qwen3ForCausalLM instead of MistralLarge3ForCausalLM as the language model
- Has an RMSNorm applied to vision encoder output before patch merging
- Does not use image break/end tokens (single contiguous image token range)
- HuggingFace checkpoint uses a vision_projection namespace for norm, patch_merger,
  and adapter weights

References:
- https://huggingface.co/lightonai/LightOnOCR-2-1B
    )fields)IterableListTupleN)RMSNorm)/MultiModalityDataPaddingPatternMultimodalTokensgeneral_mm_embed_routine)MultimodalDataItemMultimodalInputs)ForwardBatch)default_weight_loader)PATCH_MERGEPatchMergerPixtralHFVisionModelVisionEncoderArgsVisionLanguageAdapter)Qwen3ForCausalLMc                       s   e Zd ZdZdZededededB fddZd	d
def fddZ	de
e defddZde
e dejfddZdejjfddZdejdejdefddZdejdejdB fddZdd Zd eeeejf  fd!d"Z  ZS )#"LightOnOCRForConditionalGenerationa]  
    LightOnOCR model for SGLang inference.

    Architecture:
    - Pixtral-based vision encoder (PixtralHFVisionModel, 24 layers)
    - RMSNorm on vision encoder output
    - Spatial merge via PatchMerger (2x2 = 4x token reduction)
    - VisionLanguageAdapter projection to text hidden size
    - Qwen3-based decoder (28 layers) with QK norms
    TmodalityireturnNc                 C   s   | drd S td)Nimagez Only image modality is supported)
startswith
ValueError)clsr   r    r   P/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/lightonocr.pyget_placeholder_strH   s   
z6LightOnOCRForConditionalGeneration.get_placeholder_str )prefixr   c                   s
  t    || _|d}|j}dd ttD   fdd|  D }d|vr2t	|dd|d< d|vr>t	|dd	|d< d
|vrJt	|dd|d
< t
|d< tdi || _t|d d| _t| jjdd| _t| jj| jjd| _t| j|jjd| _t|j|d| _d S )Nquant_configc                 S   s   h | ]}|j qS r   )name).0fieldr   r   r   	<setcomp>U       z>LightOnOCRForConditionalGeneration.__init__.<locals>.<setcomp>c                    s   i | ]\}}| v r||qS r   r   )r"   keyvaluedataclass_fieldsr   r   
<dictcomp>V   s
    z?LightOnOCRForConditionalGeneration.__init__.<locals>.<dictcomp>image_token_idigP spatial_merge_size   adapter_biasmultimodal_projector_biasTmm_projector_id)r    gh㈵>)eps)vision_encoder_dimr,   dim)configr    r   )super__init__r5   getvision_configr   r   to_dictitemsgetattrr   vision_argsr   vision_encoderr   hidden_sizevision_projection_normr   r,   patch_mergerr   text_configvision_language_adapterr   language_model)selfr5   r   kwargsr    r9   r=   	__class__r(   r   r7   N   s>   




z+LightOnOCRForConditionalGeneration.__init__	input_ids	mm_inputsc                 C   s   t  }|||S N)r   pad_input_tokens)rE   rI   rJ   patternr   r   r   pad_input_ids   s   z0LightOnOCRForConditionalGeneration.pad_input_idsr;   c                    s  dd |D }g }|D ]>}|j r/d|j v r/|j d }|D ]}|t|d t|d f qq|j}t|jd D ]}||jd |jd f q9qt|dkrXtj|dd}	n|d }	| j	|	|d	}
|

d|
jd }
| |
}
| jj  fd
d|D }| j|
|d	}
| |
}|S )z>Process images through vision encoder and projection pipeline.c                 S   s   g | ]}|j qS r   )feature)r"   itemr   r   r   
<listcomp>   r%   zHLightOnOCRForConditionalGeneration.get_image_feature.<locals>.<listcomp>image_sizesr      r3   )rR   c                    s    g | ]\}}|  |  fqS r   r   )r"   hw
patch_sizer   r   rQ      s    )model_specific_dataappendintrO   rangeshapelentorchcatr>   viewr@   r=   rY   rA   rC   )rE   r;   imagesimage_sizes_listrP   sizes_tensorsizeimg_pixel_valuesimage_featuresimg_patch_dimsimage_embedsr   rX   r   get_image_feature   s2   
 


z4LightOnOCRForConditionalGeneration.get_image_featurec                 C   s   | j S rK   )rD   rE   r   r   r   get_language_model   s   z5LightOnOCRForConditionalGeneration.get_language_model	positionsforward_batchc                 C   s   t ||| j| |dS )N)rI   rq   rD   multimodal_modelrp   )r   rD   )rE   rI   rp   rq   r   r   r   forward   s   z*LightOnOCRForConditionalGeneration.forwardhidden_statesc                 C   s   | j |S rK   )rD   compute_logits)rE   rt   r   r   r   ru      s   z1LightOnOCRForConditionalGeneration.compute_logitsc                 C   s
   | j  S rK   )rD   get_embed_and_headrn   r   r   r   rv      s   
z5LightOnOCRForConditionalGeneration.get_embed_and_headweightsc                    sh   t | j t | j t | j t | j  g d fdd}| j|  dS )a&  Load weights from HuggingFace checkpoint.

        HF checkpoint weight layout (after stripping ``model.`` prefix):
        - ``vision_encoder.*`` -> self.vision_encoder
        - ``vision_projection.norm.*`` -> self.vision_projection_norm
        - ``vision_projection.patch_merger.*`` -> self.patch_merger
        - ``vision_projection.linear_1.*`` -> self.vision_language_adapter.w_in
        - ``vision_projection.linear_2.*`` -> self.vision_language_adapter.w_out
        - ``language_model.*`` -> self.language_model (Qwen3ForCausalLM)
        )).attention.qkv_projz.attention.q_projq)rx   z.attention.k_projk)rx   z.attention.v_projv).feed_forward.gate_up_projz.feed_forward.gate_projr   )r|   z.feed_forward.up_projrS   c               
   3   s   D ]C\} }|  dr| tdd  } |  dr| tdd  }d}D ];\}}}||v ra|||}|v ra| }t|dt}	t  |	||| W d    n1 sXw   Y  d} nq&|sd|v rn|dd}|v r| }t|dt}	t  |	|| W d    n1 sw   Y  q|  dr2| tdd  }
|
 d	r|
td	d  }|v r҈| }t  t|| W d    n1 sw   Y  q|
 d
r|
td
d  }|v r| }t  t|| W d    n1 sw   Y  q|
dddd}| v r1 | }t  t|| W d    n	1 s,w   Y  q|  drBd| tdd   } | |fV  qd S )Nzmodel.zvision_encoder.Fweight_loaderTz.attention.o_projz.attention.projzvision_projection.zpatch_merger.znorm.z	linear_1.zw_in.z	linear_2.zw_out.zlanguage_model.)r   r_   replacer<   r   r`   no_grad)r!   rW   trimmedloaded
param_nameweight_nameshard_idtransformedparamr}   	remainingadapter_dict	norm_dictpatch_merger_dictstacked_params_mappingvision_encoder_dictrw   r   r   llm_weights_generator   s   









zNLightOnOCRForConditionalGeneration.load_weights.<locals>.llm_weights_generatorN)dictr>   named_parametersrA   r@   rC   rD   load_weights)rE   rw   r   r   r   r   r      s   Hz/LightOnOCRForConditionalGeneration.load_weights)__name__
__module____qualname____doc__merge_by_field_configclassmethodstrr\   r   r7   r   r
   rN   r	   r`   Tensorrm   nnModulero   r   rs   ru   rv   r   r   r   __classcell__r   r   rG   r   r   :   s.    1)

$r   )!r   dataclassesr   typingr   r   r   r`   torch.nnr   sglang.srt.layers.layernormr   sglang.srt.managers.mm_utilsr   r   "sglang.srt.managers.schedule_batchr	   r
   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.models.pixtralr   r   r   r   r   sglang.srt.models.qwen3r   r   r   
EntryClassr   r   r   r   <module>   s    q