o
    پiuP                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZmZmZ d dl	Z
d dlZd dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dl m!Z! e "e#Z$dZ%dddddiZ&G dd dej'Z(G dd dej'Z)e)gZ*dS )    N)Iterable)ListOptionalTuple)nn)PretrainedConfig)QuantizationConfig)/MultiModalityDataPaddingPatternMultimodalTokensgeneral_mm_embed_routine)ModalityMultimodalDataItemMultimodalInputs)ForwardBatch)default_weight_loader)Idefics2VisionTransformer)LlamaForCausalLM)AudioEmbeddingzsiglip-so400m-patch14-448        )vit_image_sizevit_patch_sizetoken_compression_factorc                       s   e Zd ZdZ		ddedee dededdf
 fd	d
Z	dde	j
de	j
fddZde	j
de	jde	jdee	j
 fddZ  ZS )Phi4MMImageEncoderzImage embedding. configquant_configprefix	model_dirreturnNc              
      s  t    t|dr|jn|j}d| _t|jdd| _| jj	j
j}| \}}tt|}	|	d |ks<J d| d|	d dkrLtd	| _|	d
7 }	|}
|	d d | _|	| _|
| _d | _d | _d| _d| _d| _d| _d| _d| _tjddd| _d
| _ | jd | _| j| jksJ d| jsJ dt!t"#d
d
| j| j d  g| _$t!t"#d
d
d
| j| j d  g| _%|}d}t&|
| j d  |g}t'd
|D ]}|(t) t&||g qtj*| | _+|j,| _,d | _-d| _.d S )Nn_embdpatchF)r   require_post_normr   zposition embedding size z is not squarer   )r      r   r#   r#   Tsub_glbr   avg_pool_2d)kernel_sizestridezDuse_hd_transform and with_learnable_separator should have same valuez,learnable separator is only for hd transform)/super__init__hasattrr    hidden_sizetype_featurer   vision_configimg_processor
embeddingsposition_embeddingweightsizeintmathsqrtr   ReflectionPad2dimg_processor_paddingnum_img_tokensbase_feat_height_targetimage_dim_out	img_sizesimage_attention_maskuse_hd_transformwith_learnable_separatorhd_transform_orderfreeze_img_processor	crop_sizeimage_token_compression_cls	AvgPool2dimage_token_compressionbase_feat_height_reduction	Parametertorchzerosglb_GNsub_GNLinearrangeextendGELU
Sequentialimg_projection
vocab_sizeimg_featuresuse_out_place_operations)selfr   r   r   r   r+   	pe_weightLDHr:   dim_projectiondepthlayers_	__class__ L/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/phi4mm.pyr)   <   sh   

zPhi4MMImageEncoder.__init__
img_embedsc                 C   s   | j ||d}|}| jd u}t| dd d u}|s|r_tt|d}|d|||d}|dddd}|r>| 	|}|rE| |}|dddd}|d|d|d |d}|S )N)patch_attention_maskr7   r#   r      r   )
r.   rD   getattrr3   r4   r5   r2   viewpermuter7   )rT   ra   attention_maskimg_featurepatch_featureuse_token_compressionuse_paddingwidthr_   r_   r`   get_img_features   s*   


z#Phi4MMImageEncoder.get_img_featurespixel_valuesimage_sizesr<   c           &   	   C   s  t | j }|j}|j}|}|j\}}	}
}}|}|dd}| ||t	j
dd|}| j}| j}| j}tt|jd  }}||krM||ks[J d| d| d| d||d|| | j}| j}|}g }g }t|t	jr{|dd}t|D ]}|| \}}|| }|| }|| }||d	df }|d|||d|| ||| || ddd
dddd|| || || |  }| jd|| dd}t	j||gdddd|| | }||dd	f }|d	| }|||||||| ||| || ddd
ddd|d|| |  }|d|||| || dddd
dddd|| | || | || | }|d	urt|dkr||d|d dd	ddd	df d|||| || ddd
ddd|| | || | }t|dd	d	df   }t|ddd	d	f   } |d	d	d	|d	| f }| jd|dd}!t||d	|d dd	ddd	df   |d  ||  }"n"| jd|| | dd}!t|| d | j  d |d | |  }"t	j||!gdddd|| | }| j!dkr!|"t	j|| j#|gdd n| j!dkr6|"t	j|| j#|gdd n	t$d| j! d|"|d jd ksXJ d|" d|d jd  |"|" qg }#|D ]}$| |$||}%|#"|%%d qb|#S )a  
        process image and return vision embeddings.

        pixel_values: (num_images, num_crops, c, h, w)
        image_sizes: [[h1, w1], [h2, w2]]
        image_attention_mask: num_images x num_crops x 32 x 32
        output: (num_images, num_img_tokens, hidden_size)
        r   r#   zbase_feat_height: z&,"                f" base_feat_width: z, "                f"expect z features for hd transformrc   r   Nrd         dimglb_subr$   zhd_transform_order = z+, "                        "not implementedz
temp_len: z2, output_imgs[-1].shape[1]: "                    ")&nextrP   
parametersdevicedtypeshapeflattenrn   typerG   
BoolTensortor9   rA   rE   r3   npr5   rf   r:   
isinstanceTensorrL   reshape
contiguousrg   rJ   repeatcatlensumitemr8   r?   appendrI   NotImplementedErrorsqueeze)&rT   ro   rp   r<   img_projection_paramstarget_devicetarget_dtyper;   
num_images	num_cropschwbsrR   r9   base_resolutionrE   base_feat_heightbase_feat_widthCrX   output_imgs
output_len_bsB_global_img_featureglb_imgtemp_glb_GNsub_imgreshaped_image_attention_maskuseful_heightuseful_widthtemp_sub_GNtemp_lenimg_set_tensor_output_imgimg_feature_projr_   r_   r`   forward   s4  




"

.zPhi4MMImageEncoder.forward)r   r   N)__name__
__module____qualname____doc__r   r   r   strr)   rG   FloatTensorrn   r   listr   __classcell__r_   r_   r]   r`   r   9   s<    M
!r   c                
       s   e Zd Zg dddgdZedZ		d"ded	ee	 d
e
f fddZdee dejfddZdee dejfddZdejdejdededejf
ddZdee defddZde
defddZdeee
ejf  fd d!Z  ZS )#Phi4MMForCausalLM)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projzd^language_model\.model\.layers\.(\d+)\.(?:self_attn|mlp)\.(?:qkv_proj|o_proj|down_proj|gate_up_proj)Nr   r   r   r   c                    s   t    t|||d| _t||d|jd| _t|jd t	r.d|jd d i|jd }nd|jd i}t
|fi || _d S )N)r   r   r   zmodel.vision_embed_tokens)r   r   audio_embd_layerembedding_cls)r(   r)   r   language_modelr   _name_or_pathvision_encoderr   
embd_layerdictr   embed_tokens_extend)rT   r   r   r   embedding_configr]   r_   r`   r)     s"   
zPhi4MMForCausalLM.__init__itemsr   c                 C   s|   t | j j}tjdd |D dd|}tjdd |D dd}tjdd |D dd}| |||}t||S )Nc                 S      g | ]}|j qS r_   )feature.0r   r_   r_   r`   
<listcomp>      z7Phi4MMForCausalLM.get_image_feature.<locals>.<listcomp>r   rs   c                 S   s   g | ]
}t |d r|jqS )r<   )r*   r<   r   r_   r_   r`   r     s    c                 S   r   r_   )rp   r   r_   r_   r`   r     r   )rv   r   rw   ry   rG   r   r|   )rT   r   ry   ro   r<   rp   image_embedsr_   r_   r`   get_image_feature  s   z#Phi4MMForCausalLM.get_image_featurec                    s@   t j }|j |j fdd|D }t|S )Nc                    s>   g | ]}j |j t|d r|j nddqS )audio_attention_maskN)audio_featuresr   )r   r   r~   r|   r*   r   r   rx   ry   rT   r_   r`   r     s    z7Phi4MMForCausalLM.get_audio_feature.<locals>.<listcomp>)rv   r   rw   rx   ry   rG   r   r|   )rT   r   embed_tokens_extend_paramaudio_embedsr_   r   r`   get_audio_feature  s   z#Phi4MMForCausalLM.get_audio_feature	input_ids	positionsforward_batchkwargsc                 K   s(   t ||| jtj| jtj| ji|d}|S )N)r   r   r   data_embedding_funcsr   )r
   r   r   IMAGEr   AUDIOr   )rT   r   r   r   r   hidden_statesr_   r_   r`   r     s   zPhi4MMForCausalLM.forward	mm_inputsc                 C   s   t  }|||S r   )r	   pad_input_tokens)rT   r   r   patternr_   r_   r`   pad_input_ids  s   zPhi4MMForCausalLM.pad_input_idsmodule_namec                 C   s   t | j|S r   )boollora_patternmatch)rT   r   r_   r_   r`   should_apply_lora  s   z#Phi4MMForCausalLM.should_apply_loraweightsc                    s   g d}dddddd}g d d	t d
tf fdd}t|  }|D ]j\}}||r,q#| D ]\}}	||rA|||	} nq0|dd}|dd}|D ]\}
}}||vrZqP|||
}|| }|j}||||  n||}|d u rd|vrt	
d q#t|dt}||| q#d S )N)).self_attn.qkv_projz.self_attn.q_projq)r   z.self_attn.k_projk)r   z.self_attn.v_projvz0embed_tokens_extend.audio_projection_for_vision.z%embed_tokens_extend.audio_projection.zembed_tokens_extend.zvision_encoder.zlanguage_model.model.)z>model.embed_tokens_extend.audio_embed.audio_projection.vision.z>model.embed_tokens_extend.audio_embed.audio_projection.speech.z&model.embed_tokens_extend.audio_embed.z&model.embed_tokens_extend.image_embed.zmodel.)zimg_processor.encoder.layers.26zimg_processor.headzimg_processor.post_layernormnamer   c                    s   t  fddD S )Nc                 3   s    | ]}| v V  qd S r   r_   )r   substrr   r_   r`   	<genexpr>  s    zGPhi4MMForCausalLM.load_weights.<locals>._should_skip.<locals>.<genexpr>)anyr   	skip_listr   r`   _should_skip  s   z4Phi4MMForCausalLM.load_weights.<locals>._should_skipzself_attn.out_projzself_attn.projzbase_layer.r   loraz-Warning: {name} not found in model parametersweight_loader)r   r   r   named_parametersr   
startswithreplacer   getloggerwarningre   r   )rT   r   stacked_params_mappingprefix_mappingr   params_dictr   loaded_weightold_namenew_name
param_nameweight_nameshard_idparamr   r_   r   r`   load_weights  sJ   



zPhi4MMForCausalLM.load_weights)Nr   )r   r   r   packed_modules_mappingrecompiler   r   r   r   r   r)   r   r   rG   r   r   r   r   objectr   r3   r   r   r   r   r   r   r  r   r_   r_   r]   r`   r   y  s@    
$r   )+loggingr4   r  collections.abcr   typingr   r   r   numpyr   rG   r   transformersr   sglang.srt.layers.quantizationr   sglang.srt.managers.mm_utilsr	   r
   "sglang.srt.managers.schedule_batchr   r   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.models.idefics2r   sglang.srt.models.llamar   sglang.srt.models.phi4mm_audior   	getLoggerr   r   SIGLIP_NAME#VISION_ENCODER_TO_PROCESSING_CONFIGModuler   r   
EntryClassr_   r_   r_   r`   <module>   s>   
	  B 
"