o
    پi                  
   @   s  d Z ddlmZmZ ddlmZmZmZmZm	Z	m
Z
 ddlZddlmZ ddlm  mZ ddlmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- dZ.dZ/eG dd dZ0G dd dej1Z2G dd dej1Z3dej4de5e6e7e7f  de7de5ej4 fddZ8G d d! d!ej1Z9d"e5ej4 dej4fd#d$Z:G d%d& d&ej1Z;G d'd( d(ej1Z<G d)d* d*ej1Z=d+ej4dej4dej4fd,d-Z>d.e7d/e7d0e7d1e?dej4f
d2d3Z@d4ej4d5ej4d+ej4de6ej4ej4f fd6d7ZAG d8d9 d9ej1ZBG d:d; d;ej1ZCG d<d= d=ej1ZDG d>d? d?ej1ZEG d@dA dAej1ZFdBe
ej4eej4 f dCeee7  dDeej1 dEe7dej4f
dFdGZGG dHdI dIej1ZHG dJdK dKeHZIe2eIgZJdS )Lz3
Using mistral-community/pixtral-12b as reference.
    )	dataclassfields)IterableListOptionalSetTupleUnionN)PixtralVisionConfigPretrainedConfig)PixtralRotaryEmbeddinggenerate_block_attention_mask)position_ids_in_meshgrid)
SiluAndMul)VisionAttention)RMSNorm)MergedColumnParallelLinearRowParallelLinear)QuantizationConfig)/MultiModalityDataPaddingPatternMultimodalTokensgeneral_mm_embed_routine)MultimodalDataItemMultimodalInputs)default_weight_loader)MistralLarge3ForCausalLMFpatch_mergec                   @   s   e Zd ZU eed< eed< eed< eed< eed< eed< eed< eed< eed	< d
Zeed< dZeed< dZ	eed< dZ
eed< dS )VisionEncoderArgshidden_sizenum_channels
image_size
patch_sizeintermediate_sizenum_hidden_layersnum_attention_heads
rope_thetaimage_token_idTadapter_bias   spatial_merge_sizeFadd_pre_mm_projector_layer_norm mm_projector_idN)__name__
__module____qualname__int__annotations__floatr'   boolr)   r*   r,   str r5   r5   M/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/pixtral.pyr   6   s   
 r   c                       s   e Zd ZdZededededB fddZdd	d
ef fddZde	e de
fddZdeeeejf  fddZdejjfddZde	e dejfddZdd ZdejdejdB fddZdd Z  ZS ) PixtralForConditionalGenerationTmodalityireturnNc                 C   s   | drd S td)Nimagez Only image modality is supported)
startswith
ValueError)clsr8   r9   r5   r5   r6   get_placeholder_strJ   s   
z3PixtralForConditionalGeneration.get_placeholder_strr+   prefixrA   c                   s   t    || _dd ttD   fdd| jj  D }tdi || _t	| jj
|dd| _t| j| _| jjrGt| jjdd| _| jjtkrYt| jj| jjd	d
| _t| j| jj
jd| _d S )Nc                 S   s   h | ]}|j qS r5   )name).0fieldr5   r5   r6   	<setcomp>T       z;PixtralForConditionalGeneration.__init__.<locals>.<setcomp>c                    s   i | ]\}}| v r||qS r5   r5   )rC   keyvaluedataclass_fieldsr5   r6   
<dictcomp>U   s
    z<PixtralForConditionalGeneration.__init__.<locals>.<dictcomp>quant_config)configrL   h㈵>epsF)vision_encoder_dimr)   use_mlp_biasdimr5   )super__init__rM   r   r   vision_configto_dictitemsvision_argsr   text_configgetlanguage_modelVisionTransformervision_encoderr*   r   r   pre_mm_projector_normr,   PATCH_MERGEPatchMergerr)   patch_mergerVisionLanguageAdaptervision_language_adapter)selfrM   rA   kwargsrZ   	__class__rI   r6   rV   Q   s.   

z(PixtralForConditionalGeneration.__init__	input_ids	mm_inputsc                 C   s   t  }|||S N)r   pad_input_tokens)rf   rj   rk   patternr5   r5   r6   pad_input_idsr   s   z-PixtralForConditionalGeneration.pad_input_idsweightsc              	      s   dt ttjf fdddt ttjf fdddt ttjf fdd dt ttjf fdd	t| j | jjt	krDt| j
 nt | jjrRt| j nt t| j  f	d
d}| j|  d S )Nweightc                 S      | d  dS )Nr   r_   r<   rq   r5   r5   r6   is_vision_encoder_weightsw      zOPixtralForConditionalGeneration.load_weights.<locals>.is_vision_encoder_weightsc                 S   rr   )Nr   re   rs   rt   r5   r5   r6   is_vision_lang_adapter_weightsz   rv   zTPixtralForConditionalGeneration.load_weights.<locals>.is_vision_lang_adapter_weightsc                 S   rr   )Nr   rc   rs   rt   r5   r5   r6   is_patch_merger}   rv   zEPixtralForConditionalGeneration.load_weights.<locals>.is_patch_mergerc                 S   rr   )Nr   r`   rs   rt   r5   r5   r6   is_pre_mm_projector_norm   rv   zNPixtralForConditionalGeneration.load_weights.<locals>.is_pre_mm_projector_normc               	   3   s   D ]\} }| |fr<d | ddd  }d|v rq| }t  t|| W d    n1 s6w   Y  q | |frld | ddd  }| }t  t|| W d    n1 sfw   Y  q| |frd | ddd  }| }t  t|| W d    n1 sw   Y  q| |frd | ddd  }| }t  t|| W d    n1 sw   Y  q| |fV  qd S )N.r(   zfake_quantizer.qscale_act)joinsplittorchno_gradr   )rB   wtrimmed_nameparam	rx   ry   ru   rw   patch_merger_dictpre_mm_projector_norm_dictvision_encoder_dictvision_lang_adapter_dictrp   r5   r6   llm_weights_generator   sD   



zKPixtralForConditionalGeneration.load_weights.<locals>.llm_weights_generator)tupler4   r}   Tensordictr_   named_parametersrZ   r,   ra   rc   r*   r`   re   r]   load_weights)rf   rp   r   r5   r   r6   r   v   s    &z,PixtralForConditionalGeneration.load_weightsc                 C   s   | j S rl   )r]   rf   r5   r5   r6   get_language_model   s   z2PixtralForConditionalGeneration.get_language_modelrY   c                    s~   dd |D }|  |}| jjr|d|jd }| |}| jjtkr8| jj  fdd|D }| j	||d}| 
|}|S )Nc                 S   s   g | ]}|j qS r5   )feature)rC   itemr5   r5   r6   
<listcomp>   rF   zEPixtralForConditionalGeneration.get_image_feature.<locals>.<listcomp>c                    s<   g | ]}t |jd  D ]}|jd   |jd   fqqS )r   r   )rangeshape)rC   img_r!   r5   r6   r      s    )image_sizes)r_   rZ   r*   viewr   r`   r,   ra   r!   rc   re   )rf   rY   imagesimage_featuresimg_patch_dimsimage_embedsr5   r   r6   get_image_feature   s   



z1PixtralForConditionalGeneration.get_image_featurec                 C   s   t ||| j| |dS )N)rj   forward_batchr]   multimodal_model	positions)r   r]   )rf   rj   r   r   r5   r5   r6   forward   s   z'PixtralForConditionalGeneration.forwardhidden_statesc                 C   s   | j |S rl   )r]   compute_logits)rf   r   r5   r5   r6   r      s   z.PixtralForConditionalGeneration.compute_logitsc                 C   s
   | j  S rl   )r]   get_embed_and_headr   r5   r5   r6   r      s   
z2PixtralForConditionalGeneration.get_embed_and_head)r-   r.   r/   merge_by_field_configclassmethodr4   r0   r?   rV   r   r   ro   r   r   r}   r   r   nnModuler   r   r   r   r   r   __classcell__r5   r5   rh   r6   r7   G   s     !C	
r7   c                	       s   e Zd ZdZ	ddedededdf fdd	Zd
ejde	e
eef  dejfddZd
ejde	e
eef  dejfddZ  ZS )rb   z<
    Learned merging of spatial_merge_size ** 2 patches
    FrQ   r)   rR   r:   Nc                    s8   t    ||d  }|| _|| _tj|||d| _d S )N   bias)rU   rV   r)   mlp_input_dimr   Linearmerging_layer)rf   rQ   r)   rR   r   rh   r5   r6   rV      s   
zPatchMerger.__init__xr   c                 C   s:   t dd |D |jd ksJ | ||}| |}|S )Nc                 S      g | ]\}}|| qS r5   r5   rC   hr   r5   r5   r6   r          z'PatchMerger.forward.<locals>.<listcomp>r   )sumr   permuter   )rf   r   r   r5   r5   r6   r      s    
zPatchMerger.forwardc                 C   sL   t ||| jd}g }|D ]}|jd }||d|  qtj|ddS )a  
        Args:
            x: (N, D) where N is flattened and concatenated patch tokens
                for all images
            image_sizes: list of tuple of (height, width) in tokens for
                each image
        Returns:
            image_features: reorders patch tokens so each grid of
                (spatial_merge_size, spatial_merge_size) is contiguous.
                now (N / spatial_merge_size ** 2, D * spatial_merge_size ** 2)
        )r   r   r)   r   r   rS   )get_sub_gridsr)   r   appendr   tr}   cat)rf   r   r   	sub_gridspermuted_tensorgrid	n_patchesr5   r5   r6   r     s   
zPatchMerger.permuteF)r-   r.   r/   __doc__r0   r3   rV   r}   r   listr   r   r   r   r5   r5   rh   r6   rb      s4    
rb   r   r   r)   r:   c                 C   s   dd |D }| j d }g }|}t| |D ]<\}}|| \}	}
||	|
|dddd d d d d d d f }tjjj|||d}|d|||d}|	|d  q|S )Nc                 S   r   r5   r5   r   r5   r5   r6   r   0  r   z!get_sub_grids.<locals>.<listcomp>r   r   r   r(   )kernel_sizestride)
r   	enumerater|   r   r   r}   r   
functionalunfoldr   )r   r   r)   tokens_per_imagedall_img_sub_gridssub_grid_sizeimage_indeximage_tokensr   r   
image_gridr   r5   r5   r6   r   *  s"   

r   c                       s   e Zd Zdef fddZedefddZedej	j
fddZedejfd	d
ZedejfddZdeej dejfddZ  ZS )r^   argsc                    sv   t    || _tj|j|j|j|jdd| _t	|jdd| _
t|| _| jj| jj }|d dks6J dd | _d S )NFin_channelsout_channelsr   r   r   rN   rO   r   r   zROPE requires even head_dim)rU   rV   r   r   Conv2dr   r   r!   
patch_convr   ln_preTransformertransformerr$   
_freqs_cis)rf   r   head_dimrh   r5   r6   rV   H  s   


zVisionTransformer.__init__r:   c                 C   s   | j j| j j S rl   )r   r    r!   r   r5   r5   r6   max_patches_per_sideY  s   z&VisionTransformer.max_patches_per_sidec                 C      t |  jS rl   next
parametersdevicer   r5   r5   r6   r   ]     zVisionTransformer.devicec                 C   r   rl   r   r   dtyper   r5   r5   r6   r   a  r   zVisionTransformer.dtypec                 C   sV   | j d u rt| jj| jj | j| j| jjd| _ | j j| jkr(| j j| jd| _ | j S )N)rT   heightwidththetar   )	r   precompute_freqs_cis_2dr   r   r$   r   r%   r   tor   r5   r5   r6   	freqs_cise  s   
zVisionTransformer.freqs_cisr   c           
         s    fdd|D }dd |D }t j|dd}|j}|d|d } |}||}t| j} j|dddf |dddf f }t	r]dd	l
m} |jjjd
d |D }nddlm}	 |	dd |D |} j|||dS )a  
        Args:
            images: list of N_img images of variable sizes,
                each of shape (B, C, H, W)
        Returns:
            image_features: tensor of token features for
                all tokens of all images of shape (N_toks, D)
        c                    s   g | ]}  | jqS r5   )r   r   r   )rC   r   r   r5   r6   r     s    z-VisionTransformer.forward.<locals>.<listcomp>c                 S   s    g | ]}| d dd dqS )r   r   r(   )flattenr   rC   pr5   r5   r6   r          r(   rS   r   Nr   opsc                 S       g | ]}|j d  |j d  qS r   r   r   r   r5   r5   r6   r     r   r   c                 S   r   r   r   r   r5   r5   r6   r     r   maskr   )r}   r   r   r   r   position_meshgridr   r   r   USE_XFORMERS_OPSxformersr   fmha	attn_biasBlockDiagonalMaskfrom_seqlens,transformers.models.pixtral.modeling_pixtralr   r   )
rf   r   patch_embeds_listpatch_embedspatch_embeds_shaper   r   xopsr   r   r5   r   r6   r   t  s&   

&
zVisionTransformer.forward)r-   r.   r/   r   rV   propertyr0   r   r}   typesDevicer   r   r   r   r   r   r   r5   r5   rh   r6   r^   G  s    r^   r   c                 C   s   t dd | D }|S )Nc              	   S   sF   g | ]}t jt jt |jd  t |jd ddddddqS )r   r   ij)indexingrS   r   )r}   stackmeshgridaranger   reshaper   r5   r5   r6   r     s    	z%position_meshgrid.<locals>.<listcomp>)r}   r   )r   r   r5   r5   r6   r     s   	r   c                
       sV   e Zd ZdZ	ddddedee deddf fd	d
Zde	j
de	j
fddZ  ZS )PixtralHFMLPz5MLP for PixtralHFVisionModel using SGLang components.Nr+   r@   rM   rL   rA   r:   c                   sf   t    |jd usJ t|j|j|jgd|| dd| _t|j|jd|| dd| _t | _	d S )NFz.gate_up_proj)
input_sizeoutput_sizesr   rL   rA   z
.down_proj)r  output_sizer   rL   rA   )
rU   rV   r"   r   r   gate_up_projr   	down_projr   act_fn)rf   rM   rL   rA   rh   r5   r6   rV     s"   

zPixtralHFMLP.__init__r   c                 C   s*   |  |\}}| |}| |\}}|S rl   )r  r  r  )rf   r   gate_up_outputr   gate_upoutr5   r5   r6   r     s   
zPixtralHFMLP.forwardrl   )r-   r.   r/   r   r   r   r   r4   rV   r}   r   r   r   r5   r5   rh   r6   r    s    r  c                       s<   e Zd Zdedef fddZdejdejfddZ  Z	S )	rd   r   rT   c                    sP   t    t|tsJ tj|j||jd| _t	 | _
tj|||jd| _d S )Nr   )rU   rV   
isinstancer   r   r   r   r'   w_inGELUgeluw_out)rf   r   rT   rh   r5   r6   rV     s   

zVisionLanguageAdapter.__init__r   r:   c                 C   s   |  | | |S rl   )r  r  r  rf   r   r5   r5   r6   r     s   zVisionLanguageAdapter.forward)
r-   r.   r/   r   r0   rV   r}   r   r   r   r5   r5   rh   r6   rd     s    rd   c                       sx   e Zd ZdZ	ddddededee ded	df
 fd
dZ	de
jdee
j deee
je
jf  d	e
jfddZ  ZS )PixtralHFTransformerBlockzCTransformer block for PixtralHFVisionModel using SGLang components.Nr+   r@   rM   layer_idrL   rA   r:   c                   sv   t    || _t|jdd| _t|j|j|jd|ddddd| dd| _t	||| dd	| _
t|jdd| _d S )
NrN   rO   Tg        Fz
.attention)	embed_dim	num_headsprojection_sizeuse_qkv_parallelrL   dropoutuse_context_forwardflatten_batchqkv_bias	proj_biasrA   z.feed_forward)rL   rA   )rU   rV   r  r   r   attention_normr   r$   	attentionr  feed_forwardffn_norm)rf   rM   r  rL   rA   rh   r5   r6   rV     s(   
z"PixtralHFTransformerBlock.__init__r   attention_maskposition_embeddingsc                 C   st   |j \}}}| |d||||}| j||d |d}|| }| |d||||}	| |	}
||
 }|S )Nr   )r*  
cu_seqlensr+  )r   r&  r   r'  r)  r(  )rf   r   r*  r+  
batch_sizeseq_len
hidden_dimattn_normalizedattention_outputffn_normalized
ffn_outputoutputr5   r5   r6   r     s"   
z!PixtralHFTransformerBlock.forwardrl   )r-   r.   r/   r   r   r0   r   r   r4   rV   r}   r   r   r   r   r5   r5   rh   r6   r    s2    "r  r   c                    sl   |j   dks	J | j|jd |jd fks%J | j|jd |jd ff fddt|jD }| j| S )zd
    freqs_cis: complex - (seq_len, head_dim / 2)
    x: complex - (bsz, seq_len, head_dim / 2)
    r(   r   c                    s,   g | ]\}}|d ks| d  kr|nd qS r(   r5   )rC   r9   r   ndimr5   r6   r   G  s   , z*_reshape_for_broadcast.<locals>.<listcomp>)r7  r   r   r   )r   r   r   r5   r6  r6   _reshape_for_broadcast<  s   
r8  rT   r   r   r   c           
      C   s   d|t d| d |    }t j||jd}t j||jd}t ||ddd  }t ||ddd  }t j|dddddf d|d|dddddf |ddgdd}	t t |	|	S )	z
    freqs_cis: 2D complex tensor of shape (height, width, dim // 2)
        to be indexed by (height, width) position tuples
    g      ?r   r   r   Nr(   r   rS   )	r}   r	  r2   r   outerr   repeatpolar	ones_like)
rT   r   r   r   freqsr   r   freqs_hfreqs_wfreqs_2dr5   r5   r6   r   K  s   r   xqxkc                 C   s   t |  jg | jd d ddR  }t | jg |jd d ddR  }|jt jks4J t||}t || 	d}t || 	d}|
| |
|fS )Nr   r      )r}   view_as_complexr2   r
  r   r   	complex64r8  view_as_realr   type_as)rA  rB  r   xq_xk_xq_outxk_outr5   r5   r6   apply_rotary_emb_vitg  s   ,,
rL  c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )FeedForwardr   c                    s^   t    |jd usJ tj|j|jdd| _tj|j|jdd| _tj|j|jdd| _d S NFr   )	rU   rV   r"   r   r   r   w1w2w3rf   r   rh   r5   r6   rV   v  s
   
zFeedForward.__init__r   r:   c                 C   s    |  t| || | S rl   )rP  FsilurO  rQ  r  r5   r5   r6   r   }  s    zFeedForward.forward	r-   r.   r/   r   rV   r}   r   r   r   r5   r5   rh   r6   rM  u  s    rM  c                       D   e Zd Zdef fddZdejdejdejdejfdd	Z  ZS )
	Attentionr   c                    s   t    || _|j|j rJ |j| _|j|j | _tj|j|jdd| _	tj|j|jdd| _
tj|j|jdd| _tj|j|jdd| _d S rN  )rU   rV   r   r   r$   n_headsr   r   r   wqwkwvworR  rh   r5   r6   rV     s   
zAttention.__init__r   r   r   r:   c                 C   s  |j \}}}| || || |}}}	|||| j| j}|||| j| j}|	||| j| j}	t|||d\}}trPddl	m
}
 |
j|||	|d}n"|dd}|dd}|	dd}	tjj|||	|d}|dd}|||| j| j }| |S )N)r   r   r   )r   r(   r   )	attn_mask)r   rY  rZ  r[  r
  rX  r   rL  r   r   r   memory_efficient_attention	transposer   r   scaled_dot_product_attentionr\  )rf   r   r   r   batchpatchesr   qkvr  r  r5   r5   r6   r     s    "
zAttention.forwardrU  r5   r5   rh   r6   rW    s    rW  c                       rV  )
TransformerBlockr   c                    sB   t    t|| _t|| _t|jdd| _t|jdd| _	d S )NrN   rO   )
rU   rV   rW  r'  rM  r(  r   r   r&  r)  rR  rh   r5   r6   rV     s
   


zTransformerBlock.__init__r   r   r   r:   c           	      C   sz   |  |d|jd }||j}| jj|||d}|| }| |d|jd }||j}| j|}|| }|S )Nr   r   )r&  r   r   r'  r   r)  r(  )	rf   r   r   r   attention_norm_xrr   
ffn_norm_hr  r5   r5   r6   r     s   zTransformerBlock.forwardrU  r5   r5   rh   r6   rf    s    rf  c                       sH   e Zd Zdef fddZdejdejdejdB dejfd	d
Z  ZS )r   r   c                    s:   t    tj | _t|jD ]
}| jt	| qd S rl   )
rU   rV   r}   r   
ModuleListlayersr   r#   r   rf  )rf   r   r   rh   r5   r6   rV     s
   
zTransformer.__init__r   r   r   Nr:   c                 C   s   | j D ]	}||||d}q|S )Nr   )rk  )rf   r   r   r   layerr5   r5   r6   r     s   
zTransformer.forwardrU  r5   r5   rh   r6   r     s    r   c                       s   e Zd ZdZ	dddddedee dee ded	df
 fd
dZ		dde
jdee
j deee
je
jf  ded	ee
jee
j f f
ddZ  ZS )PixtralHFTransformerz=Transformer for PixtralHFVisionModel using SGLang components.Nr+   num_hidden_layers_overriderA   rM   rL   ro  rA   r:   c                   sB   t     j}|d ur|}t fddt|D | _d S )Nc              	      s&   g | ]}t  | d | dqS )z.layers.)rM   r  rL   rA   )r  rC   	layer_idxrM   rA   rL   r5   r6   r     s    z1PixtralHFTransformer.__init__.<locals>.<listcomp>)rU   rV   r#   r   rj  r   rk  rf   rM   rL   ro  rA   r#   rh   rr  r6   rV     s   

zPixtralHFTransformer.__init__Fr   r*  r+  return_all_hidden_statesc           	      C   sL   |}|r|gnd}t | jD ]\}}||||}|r|| q|r$|S |S )a  Forward pass through transformer layers.

        Args:
            x: Input tensor
            attention_mask: Optional attention mask
            position_embeddings: Optional position embeddings for rotary attention
            return_all_hidden_states: Whether to return all hidden states

        Returns:
            Either the final hidden state, or a list of all hidden states if
            return_all_hidden_states is True
        N)r   rk  r   )	rf   r   r*  r+  rt  r   all_hidden_statesr9   rl  r5   r5   r6   r     s   
zPixtralHFTransformer.forwardrl   r   )r-   r.   r/   r   r
   r   r   r0   r4   rV   r}   r   r   r3   r	   r   r   r   r5   r5   rh   r6   rm    s:    rm  outputsfeature_sample_layers	post_normr#   c                    s   |du rt  tr d  |dur|   S t  ts td|D ]}|dk s,||kr7td| d| dq" fdd	|D }tj|dd
}|durP||}|S )zCResolve outputs from visual encoder based on feature_sample_layers.Nr   zDExpected outputs to be a list when feature_sample_layers is providedr   zFeature sample layer index z is out of range [0, ]c                    s   g | ]} | qS r5   r5   rp  rv  r5   r6   r   3  s    z2resolve_visual_encoder_outputs.<locals>.<listcomp>rS   )r  r   r=   r}   r   )rv  rw  rx  r#   rq  selected_outputscombined_outputsr5   rz  r6   resolve_visual_encoder_outputs  s.   

r}  c                       s   e Zd ZdZdZdee defddZ	dddd	d
e	de
e de
e deddf
 fddZedd Zedd Z		d dejdeeeef  dede
ee  deejef f
ddZdeeeejf  dee fddZ  ZS )!PixtralHFVisionModelzFHugging Face Pixtral Vision Model implemented using SGLang components.
   rj   rk   c                 C   s   | j ||S rl   )input_padderrm   )rf   rj   rk   r5   r5   r6   ro   C  rv   z"PixtralHFVisionModel.pad_input_idsNr+   rn  rM   rL   ro  rA   r:   c                   s   t    || _|j| _|j| _tj|j|j|j|jdd| _	t
|jdd| _t|||| dd| _|j}t| jj|jkrNtd| dt| jj d	t|| _t | _d S )
NFr   rN   rO   z.transformerrn  zThe original encoder only has z layers, but you requested z layers.)rU   rV   rM   r    r!   r   r   r   r   r   r   r   rm  r   r#   lenrk  r=   r   patch_positional_embeddingr   r  rs  rh   r5   r6   rV   F  s6   


zPixtralHFVisionModel.__init__c                 C   r   rl   r   r   r5   r5   r6   r   s  r   zPixtralHFVisionModel.dtypec                 C   r   rl   r   r   r5   r5   r6   r   w  r   zPixtralHFVisionModel.deviceFpixel_valuesr   output_hidden_statesrw  c                    s    |j j jd} fddt||D }tjdd |D dd} |d}t	| j
 j d j}	 ||	}
tdd |D |}|pO|d	u} j|||
|d
}d	}t|tru|}|d	u rk|d }nt||d	 jj}n|}|rtdd||dS |S )a  
        Args:
            pixel_values: [batch_size, C, H, W], padded if multiple images
            image_sizes: list of (H, W) for each image in the batch
            output_hidden_states: Whether to return all hidden states.
            feature_sample_layers: Layer indices whose features should be
                concatenated and used as the visual encoder output. If none
                are provided, the last layer is used.

        Returns:
            A tuple containing:
              - hidden_states: Final model outputs (or selected layers if feature_sample_layers given)
              - hidden_states tuple (optional): All hidden states if output_hidden_states=True
        )r   r   c                    s6   g | ]\}\}}|d d| j  d| j  f qS ).Nr   )rC   embedr   r   r   r5   r6   r     s    
 z0PixtralHFVisionModel.forward.<locals>.<listcomp>c                 S   s   g | ]}| d jqS r5  )r   Tr   r5   r5   r6   r     r   r   rS   )	max_widthc                 S   r   r   r   r   r5   r5   r6   r     r   N)rt  r   VisualOutputr5   )last_hidden_stater   )r   r   r   r   zipr}   r   r   	unsqueezer   r    r!   r  _get_pixtral_attention_maskr   r  r   r}  rM   r#   type)rf   r  r   r  rw  embeds_orig	embeds_2d	embeds_1dembeds_featurizedposition_idsposition_embeddingr*  rt  transformer_outputsru  r  r5   r   r6   r   {  sb   




	zPixtralHFVisionModel.forwardrp   c                 C   s   t |  }g d}|D ]O\}}|D ]%\}}}||v r7|||}	|	|v r7||	 }
t|
dt}||
||  n$qd|v rH|dd}||v rH|}||v r[|| }
t|
dt}||
| qdS )zILoad weights from a HuggingFace checkpoint with proper parameter mapping.)).attention.qkv_projz.attention.q_projrc  )r  z.attention.k_projrd  )r  z.attention.v_projre  ).feed_forward.gate_up_projz.feed_forward.gate_projr   )r  z.feed_forward.up_projr(   weight_loaderz.attention.o_projz.attention.projN)r   r   replacegetattrr   )rf   rp   params_dictstacked_params_mappingrB   loaded_weight
param_nameweight_nameshard_idtransformed_namer   r  alt_namer5   r5   r6   r     s4   	
z!PixtralHFVisionModel.load_weightsrl   )FN)r-   r.   r/   r   DEFAULT_IMAGE_TOKEN_IDr   r0   r   ro   r
   r   r   r4   rV   r  r   r   r}   r   r   r   r3   r	   r   r   r   r   r   r   r5   r5   rh   r6   r~  >  sJ    -



,\r~  c                   @   s   e Zd ZdS )PixtralVisionModelN)r-   r.   r/   r5   r5   r5   r6   r    s    r  )Kr   dataclassesr   r   typingr   r   r   r   r   r	   r}   torch.nnr   torch.nn.functionalr   rS  transformersr
   r   r   r   r   r  r   sglang.srt.layers.activationr   "sglang.srt.layers.attention.visionr   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   *sglang.srt.layers.quantization.base_configr   sglang.srt.managers.mm_utilsr   r   "sglang.srt.managers.schedule_batchr   r   $sglang.srt.model_loader.weight_utilsr   !sglang.srt.models.mistral_large_3r   r   ra   r   r   r7   rb   r   r   r   r0   r   r^   r   r  rd   r  r8  r2   r   rL  rM  rW  rf  r   rm  r}  r~  r  
EntryClassr5   r5   r5   r6   <module>   s     G
Y
,M

+>

) A