o
    i                     @   s  U d dl Z d dlmZmZmZ d dlmZmZmZm	Z	 d dl
Zd dlZd dlmZ d dlmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z(m)Z)m*Z*m+Z+ d dl,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6m7Z7m8Z8m9Z9m:Z: d dl;m<Z< d dl=m>Z>m?Z? ddl@mAZA ddlBmCZCmDZDmEZE ddlFmGZG ddlHmIZImJZJmKZK dZLdZMdZNdZOddddd iZPd!eQd"eQd#eQd$eQfd%d&ZRd?d(eQfd)d*ZSG d+d, d,ejTZUG d-d. d.e>ZVG d/d0 d0e>ZWG d1d2 d2e>ZXeWeXB ZYe	eZd3< d@d4d5Z[G d6d7 d7e7Z\G d8d9 d9e4e\ Z]G d:d; d;e6e\ Z^e&j_e^e\e]d<G d=d> d>ejTeDeEZ`dS )A    N)IterableMappingSequence)	AnnotatedAnyLiteral	TypeAlias)BatchFeaturePretrainedConfigProcessorMixinSequenceFeatureExtractorSiglipVisionConfig)
VllmConfig)BaseDummyOptions)get_pp_group)LogitsProcessor)QuantizationConfig)ParallelLMHead)
LlamaModel)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItemsNestedTensors)AudioProcessorItemsImageEmbeddingItemsImageProcessorItems	ImageSizeMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilder)BaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdateResolvedPromptUpdate)IntermediateTensors)TensorSchemaTensorShape   )Idefics2VisionTransformer)MultiModalEmbeddingsSupportsLoRASupportsMultiModal)AudioEmbedding)AutoWeightsLoaderWeightsMappermaybe_prefixiJ iK ih zsiglip-so400m-patch14-448        )vit_image_sizevit_patch_sizetoken_compression_factor
orig_widthorig_heighttarget_heighttarget_widthc                 C   sP   ||  }|| }||k rd}|t ||  }||fS |t | |  }d}||fS )Nr   )int)r9   r:   r;   r<   ratio_widthratio_heightpadding_widthpadding_height rB   W/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/phi4mm.py_get_padding_sizeL   s   rD   	layer_idxc                 K   sX   dddddddd}t di ||}| d	k r|j|  d
 }n| d
 }t|d|d}|S )Ni  r3   i  siglip_vision_model      r4   )hidden_size
image_sizeintermediate_size
model_typenum_attention_headsnum_hidden_layers
patch_sizer   r*   F)configrequire_post_normnum_hidden_layers_overriderB   )r   rO   r+   )rF   kwargsvision_configmodel_configrO   vision_modelrB   rB   rC   get_navit_vision_model[   s$   
rX   c                       s   e Zd ZdZ		ddededB dededdf
 fd	d
Z	ddej	dej	fddZ
dej	dejdejdeej	 fddZ  ZS )Phi4MMImageEncoderzImage embedding. rQ   quant_configNprefix	model_dirreturnc              
      s  t    t|dr|jn|j}t|jtr'|jdd| _	|jdd| _
nd| _	d| _
t| j	d| _| jjjj}| \}}tt|}	|	d |ksUJ d| d	|	d d
kretd| _|	d7 }	|}
|	d d | _|	| _|
| _d | _d | _d| _d| _d| _d| _d| _d| _ tj!ddd| _"d| _#| jd | _| j| jksJ d| jsJ dt$t%&dd| j| j#d  g| _'t$t%&ddd| j| j#d  g| _(|}d}t)|
| j#d  |g}t*d|D ]}|+t, t)||g qtj-| | _.|j/| _/d | _0d| _1d S )Nn_embdrF   type_featurepatch)rF   r5   zposition embedding size z is not squarer   )r   r*   r   r*   r*   Tsub_glbFr3   avg_pool_2d)kernel_sizestridezDuse_hd_transform and with_learnable_separator should have same valuez,learnable separator is only for hd transform)2super__init__hasattrr_   rJ   
isinstanceimg_processordictgetrF   ra   rX   
embeddingsposition_embeddingweightsizer=   mathsqrtnnReflectionPad2dimg_processor_paddingnum_img_tokensbase_feat_height_targetimage_dim_out	img_sizesimage_attention_maskuse_hd_transformwith_learnable_separatorhd_transform_orderfreeze_img_processor	crop_sizeimage_token_compression_cls	AvgPool2dimage_token_compressionbase_feat_height_reduction	Parametertorchzerosglb_GNsub_GNLinearrangeextendGELU
Sequentialimg_projection
vocab_sizeimg_featuresuse_out_place_operations)selfrQ   r[   r\   r]   rJ   	pe_weightLDHry   dim_projectiondepthlayers_	__class__rB   rC   rh   x   sl   

zPhi4MMImageEncoder.__init__
img_embedsc                 C   s   | j ||d}| jdkrf|}| jd u}t| dd d u}|s|rdtt|d}|d|||d}|	dddd}|rC| 
|}|rJ| |}|	dddd}|d|d|d |d}|S t)	N)patch_attention_maskrb   rv   r*   rE   r      r5   )rk   ra   r   getattrr=   rr   rs   rq   viewpermuterv   NotImplementedError)r   r   attention_maskimg_featurepatch_featureuse_token_compressionuse_paddingwidthrB   rB   rC   get_img_features   s2   



z#Phi4MMImageEncoder.get_img_featurespixel_valuesimage_sizesr{   c           %   	   C   s  t | jtjr| jd jj}| jd jj}n
| jjj}| jjj}|}|j\}}}	}
}|}|dd}| 	||
tjdd|}| j}| j}| j}tt|jd  }}||kr`||ksnJ d| d| d| d||d|| | j}| j}|}g }g }t |tjr|dd}t|D ]}|| \}
}|
| }
|| }|
| }||d	df }|d|||d|| ||| || ddd
dddd|| || || |  }| jd|| dd}tj||gdddd|| | }||dd	f }|d	| }|||||||| ||| || ddd
ddd|d|| |  }|d|
||| || dddd
dddd|
| | || | || | }|d	urt|dkr||d|d dd	ddd	df d|
||| || ddd
ddd|
| | || | }t|dd	d	df    }t|ddd	d	f    }|d	d	d	|d	|f }| jd|dd} t||d	|d dd	ddd	df    |d  ||  }!n"| jd|
| | dd} t|
| d | j! d |
d | |  }!tj|| gdddd|| | }| j"dkr4|#tj|| j$|gdd n| j"dkrI|#tj|| j$|gdd n	t%d| j" d|!|d jd kskJ d|! d|d jd  |#|! qg }"|D ]}#| |#||}$|"#|$&d qu|"S )a  
        process image and return vision embeddings.

        pixel_values: (num_images, num_crops, c, h, w)
        image_sizes: [[h1, w1], [h2, w2]]
        image_attention_mask: num_images x num_crops x 32 x 32
        output: (num_images, num_img_tokens, hidden_size)
        r   r*   zbase_feat_height: z, base_feat_width: z	, expect z features for hd transformrE   r5   Nr         dimglb_subrc   zhd_transform_order = z, not implementedz
temp_len: z, output_imgs[-1].shape[1]: )'rj   r   rt   r   biasdevicedtypeshapeflattenr   typer   
BoolTensortorx   r   r   r=   nprs   r   ry   Tensorr   reshape
contiguousr   r   repeatcatlensumitemrw   r~   appendr   r   squeeze)%r   r   r   r{   target_devicetarget_dtyperz   
num_images	num_cropschwbsr   rx   base_resolutionr   base_feat_heightbase_feat_widthCr   output_imgs
output_len_bsB_global_img_featureglb_imgtemp_glb_GNsub_imgreshaped_image_attention_maskuseful_heightuseful_widthtemp_sub_GNtemp_lenimg_set_tensor_output_imgimg_feature_projrB   rB   rC   forward   s8  






"

.zPhi4MMImageEncoder.forward)rZ   rZ   N)__name__
__module____qualname____doc__r
   r   strrh   r   FloatTensorr   r   listr   __classcell__rB   rB   r   rC   rY   u   s<    S
&rY   c                
   @   s   e Zd ZU dZed ed< eeje	ej B e
ddddddhd	f ed< eeje
dd
f ed< ee	e e
df ed< eeje
ddddf ed< dS )Phi4MMImagePixelInputsaX  
    Dimensions:
        - bn: Batch size * number of images
        - p: Number of patches (1 + num_patches)
        - c: Number of channels (3)
        - h: Height of each image patch
        - w: Width of each image patch
        - nc: Number of crops
        - H_mask: Height of attention mask
        - W_mask: Width of attention mask
    r   r   bnpr   r   r   dynamic_dimsr5   r   rw   nc    r{   N)r   r   r   r   r   __annotations__r   r   r   r   r)   r=   rB   rB   rB   rC   r     s2   
 r   c                   @   sH   e Zd ZU dZed ed< eeje	ej B e
ddddhdf ed< dS )	Phi4MMAudioFeatureInputsz^
    Dimensions:
        - bn: Batch size * number of audios
        - t: Time frames (M)
    audio_featuresr   r   tP   r   N)r   r   r   r   r   r   r   r   r   r   r)   rB   rB   rB   rC   r     s   
 r   c                   @   s8   e Zd ZU dZed ed< eeeddddf ed< d	S )
Phi4MMAudioEmbeddingInputsz
    Dimensions:
        - b: Batch size
        - n: Number of audios
        - f: Audio feature size
        - h: Hidden size (must match language model backbone)
    audio_embedsr   bnfr   dataN)	r   r   r   r   r   r   r   r   r)   rB   rB   rB   rC   r     s   
 r   Phi4MMAudioInputsc                    s   d   tfdddd D sJ dfddtD }t fd	dD | < d ||}d}D ]$fd
dtD }t||j   | < ||< |j  7 }q=|S )z<
    cat along dim, while pad to max for all other dims
    r   c                 3   s    | ]	}|   kV  qd S r   r   .0r   )ndimrB   rC   	<genexpr>  s    zcat_with_pad.<locals>.<genexpr>r*   Nz3All tensors must have the same number of dimensionsc                    s"   g | ] t  fd dD qS )c                 3       | ]}|j   V  qd S r   r   r   irB   rC   r        z*cat_with_pad.<locals>.<listcomp>.<genexpr>)max)r   )tensorsr  rC   
<listcomp>  s   " z cat_with_pad.<locals>.<listcomp>c                 3   r  r   r  r   r   rB   rC   r    r  c                    s   g | ]
}t d  j| qS r   )slicer   )r   d)r   rB   rC   r
        )r   allr   r   new_fullr  r   )r	  r   padding_valueout_sizeoutputindexslicesrB   )r   r  r   r	  rC   cat_with_pad  s    r  c                   @   s0  e Zd Zedee fddZedee fddZ	d-dedB de	fdd	Z
d
edefddZdd Zdeee	dB f fddZde	de	de	de	de	f
ddZ	d.de	de	de	de	de	de	fddZddd e	d!e	dedB de	fd"d#Z	d-dedB defd$d%Zd&e	d'ede	fd(d)Zd*e	de	fd+d,ZdS )/Phi4MMProcessingInfor^   c                 C      dd t dD S )Nc                 S      g | ]
}d |d  dqS )<|image_r*   |>rB   r   r  rB   rB   rC   r
  )  r  z5Phi4MMProcessingInfo.image_tokens.<locals>.<listcomp>d   r   r   rB   rB   rC   image_tokens'     z!Phi4MMProcessingInfo.image_tokensc                 C   r  )Nc                 S   r  )<|audio_r*   r  rB   r  rB   rB   rC   r
  -  r  z5Phi4MMProcessingInfo.audio_tokens.<locals>.<listcomp>r  r  r  rB   rB   rC   audio_tokens+  r!  z!Phi4MMProcessingInfo.audio_tokensN	processorc                 C   s   |d u r|   }|j}|jS r   )get_hf_processorimage_processor
dynamic_hd)r   r$  r&  rB   rB   rC   get_dynamic_hd/  s   z#Phi4MMProcessingInfo.get_dynamic_hdrT   c                 K   s   | j di |jS )NrB   )r%  audio_processor)r   rT   rB   rB   rC   get_feature_extractor8  s   z*Phi4MMProcessingInfo.get_feature_extractorc                 C   s   |   }t|jd|  dS )Nscipy)	target_sraudio_resample_methodexpected_hidden_size)r*  r    sampling_rate_get_expected_hidden_size)r   feature_extractorrB   rB   rC   get_data_parser;  s   z$Phi4MMProcessingInfo.get_data_parserc                 C   s
   d d dS )N)audioimagerB   r  rB   rB   rC   get_supported_mm_limitsD  s   
z,Phi4MMProcessingInfo.get_supported_mm_limitsr9   r:   rK   max_nummin_numc                    s   t |t| }t |t| }||  krP|| }t fddtd d D }	t|	dd d}	|  j}
|
||	|||}||d  }||d  }n|| }|| }||f}|||fS )Nc                 3   sD    | ]}t d  d  D ]}||  kr|| kr||fV  qqdS )r*   Nr  )r   r  jr6  r7  rB   rC   r  U  s    zAPhi4MMProcessingInfo._find_target_aspect_ratio.<locals>.<genexpr>r*   c                 S   s   | d | d  S )Nr   r*   rB   )xrB   rB   rC   <lambda>[  s    z@Phi4MMProcessingInfo._find_target_aspect_ratio.<locals>.<lambda>)keyr   )	rr   ceilfloatsetr   sortedr%  r&  find_closest_aspect_ratio)r   r9   r:   rK   r6  r7  
w_crop_num
h_crop_numaspect_ratiotarget_ratiosr&  target_aspect_ratior<   r;   rB   r9  rC   _find_target_aspect_ratioG  s,   
	
z.Phi4MMProcessingInfo._find_target_aspect_ratior5   dynamic_hd_sizer6   r7   r8   c                 C   s  || dks
J d|| | dksJ d| j ||||dd\}}}	|d | |	ks9J |d  d| d|	 |d | |ksOJ |d  d| d| || dkr[|	| dks]J t||||	\}
}|dksr|
dksrJ d|	| }|| }||kr|
dksJ d	|t||  }|}n|
|kr|dksJ d
|t|
|  }|}n|}|}|| }|| }|| dkr|d7 }|| dkr|d7 }|| }|}|| }|| d }d}|| }|| | | | S )av  
        compute the number of tokens an image is expected to take up considering
        the image encoder architecture and exclude output features containing
        only padding pixels

        for siglip, vit_image_size=448, vit_patch_size=14, so output will be
        32x32 feature map
        NOTE right now, Phi4MM uses hard-coded token_compression_factor=2
        r   z2vit_image_size must be divisible by vit_patch_sizezNvit_image_size // vit_patch_size must be divisible by token_compression_factorr*   )r7  z * z != z)padding_width or padding_height must be 0zpadding_height not 0zpadding_width not 0r5   )rG  rD   rr   floor)r   r9   r:   rH  r6   r7   r8   rF  r;   r<   rA   r@   target_feat_widthtarget_feat_heightnon_pad_feat_widthnon_pad_feat_height
feat_widthfeat_heightnum_hd_patch_tokensnum_hd_newline_tokensvit_feature_sizenum_global_image_tokensnum_sep_tokensnum_global_image_newline_tokensrB   rB   rC   _compute_num_image_tokensp  sz   
z.Phi4MMProcessingInfo._compute_num_image_tokensr$  image_widthimage_heightc                C   s`   |   }|j}|d u rt}t| }|d }|d }|d }	| j|d}
| j|||
|||	d}|S )Nr6   r7   r8   rW  )rH  r6   r7   r8   )get_hf_configrk   SIGLIP_NAME#VISION_ENCODER_TO_PROCESSING_CONFIGr(  rV  )r   rX  rY  r$  	hf_configvision_encoder_nameprepro_configr6   r7   r8   rH  image_num_tokensrB   rB   rC   get_num_image_tokens  s$   	z)Phi4MMProcessingInfo.get_num_image_tokensc                 C   sF   |   }|j}|d u rt}t| }|d }|| j|d }t||dS )Nr6   rW  )heightr   )rZ  rk   r[  r\  r(  r   )r   r$  r]  r^  r_  r6   max_siderB   rB   rC   !get_image_size_with_most_features  s   z6Phi4MMProcessingInfo.get_image_size_with_most_features	audio_lensrc                 C   sz   |dkr||d  }nd|  krdk rn n|d9 }n|dk r't d| d}d}|| | d }|dk r;td|S )	a  
        Compute the output size of the `extract_features` method.

        Args:
            audio_len (int): Length of the input waveform in samples.
            sr (float): Sampling rate of the waveform, either 16000 or 8000.

        Returns:
            tuple (int, int): Output size as (T, D), where:
                T: Number of time frames.
                D: Number of Mel filterbank bins (80).
        i>  i@  r5   zUnsupported sample rate i     r*   z(Waveform too short for given parameters.)RuntimeError
ValueError)r   re  rf  
win_length
hop_length
num_framesrB   rB   rC   get_audio_num_frames  s   
z)Phi4MMProcessingInfo.get_audio_num_framesaudio_framesc                 C   sj   |   }|jd d }d}|| }|| }|dkr|n|d }|| }|| }|dkr/|}|S |d }|S )zj
        Compute the audio embedding size based on the audio frames and
        compression rate.
        audio_embd_layercompression_rater*   r   )rZ  
embd_layer)r   rn  r]  rp  qformer_compression_rateinteger	remainderresultrB   rB   rC   _compute_audio_embed_size  s   z.Phi4MMProcessingInfo._compute_audio_embed_sizer   )r5   )r   r   r   propertyr   r   r   r#  r   r=   r(  objectr   r*  r2  r   r5  rG  rV  ra  r   rd  r>  rm  rv  rB   rB   rB   rC   r  &  sr    
		
0
^

#r  c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )Phi4MMDummyInputsBuilder	mm_countsr^   c                 C   sF   | dd}| dd}| jjd | }| jjd | }d|| S )Nr3  r   r4  rZ   )rm   infor   r#  join)r   rz  
num_audiosr   r   r#  rB   rB   rC   get_dummy_text/  s
   z'Phi4MMDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sr   | dd}| dd}| j \}}|r| dnd }|r#| dnd }	| j||||d| jt||	dd}
|
S )Nr3  r   r4  )r   rb  r   	overrides)lengthr}  r  )r4  r3  )rm   r{  rd  _get_dummy_images_get_dummy_audios_AUDIO_MAX_SOUNDFILE_SIZE)r   r  rz  r  r}  r   r<   r;   image_overridesaudio_overridesmm_datarB   rB   rC   get_dummy_mm_data8  s$   z*Phi4MMDummyInputsBuilder.get_dummy_mm_datar   )
r   r   r   r   r   r=   r~  r   r   r  rB   rB   rB   rC   ry  .  s    
ry  c                
       s   e Zd Zdedeeef deeef deeef def
 fddZded	eeef deeef fd
dZ	de
d	eeef dedee fddZdededef fddZ  ZS )Phi4MMMultiModalProcessorpromptr  	mm_kwargs
tok_kwargsr^   c           
         s   |sj  |}|}tt|gdddS j jdi |j|dg  }r6fdd|D |d< t	 
||||}fdd|d D }||d	< |d
  fdd|D }	 fddt|	D |d
< |S )N)	input_idspt)tensor_typeaudiosc                    s   g | ]}| fqS rB   rB   )r   r   )rf  rB   rC   r
  f  s    z@Phi4MMMultiModalProcessor._call_hf_processor.<locals>.<listcomp>c                    s$   g | ]} j j|d  |d dqS )r   r*   )rX  rY  )r{  ra  )r   img_sizer  rB   rC   r
  l  s    r   rw   input_audio_embedsc                    s   g | ]} j t|qS rB   )r{  rm  r   )r   r3  )r   rf  rB   rC   r
  u  s    c                    s    g | ]\}} |d |f qS r   rB   )r   idxrq   )r   rB   rC   r
  x  s    rB   )r{  get_tokenizerencode_apply_hf_processor_tokens_onlyr	   rl   r*  r/  rm   rg   _call_hf_processor	enumerate)
r   r  r  r  r  
prompt_ids
audio_dataprocessed_outputsrw   feature_sizesr   )r   r   rf  rC   r  X  s,   



z,Phi4MMMultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   s0   t tdtdtdtdtddS )Nr4  r3  )input_image_embedsr{   r   rw   r  )rl   r   batched)r   r  r  rB   rB   rC   _get_mm_fields_config~  s   z/Phi4MMMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    s   j j}j j}j jd	i | j jd	i |dtffdd}dtf fdd}td|j|dtd|j|dgS )
Nitem_idxc                    sP    dttf}t|tr|| }n|| }jj|j|j	 d}t
g| S )Nr4  )rX  rY  r$  )	get_itemsr   r   rj   get_feature_sizeget_image_sizer{  ra  r   rb  _IMAGE_PLACEHOLDER_TOKEN_ID)r  imagesnum_image_tokensrK   )hf_processorr  r   rB   rC   get_image_replacement_phi4mm  s   


zSPhi4MMMultiModalProcessor._get_prompt_updates.<locals>.get_image_replacement_phi4mmc                    s<    dt}|| }j| j}j|}tg| S )Nr3  )r  r   get_audio_lengthr{  rm  r/  rv  _AUDIO_PLACEHOLDER_TOKEN_ID)r  r  re  rn  audio_embed_size)r1  r  r   rB   rC   get_audio_replacement_phi4mm  s   

zSPhi4MMMultiModalProcessor._get_prompt_updates.<locals>.get_audio_replacement_phi4mmr4  )modalitytargetreplacementr3  rB   )r{  r   r#  r*  r%  r=   r$   __getitem__)r   r  r  r  r   r#  r  r  rB   )r1  r  r  r   rC   _get_prompt_updates  s"   z-Phi4MMMultiModalProcessor._get_prompt_updatescached_updatenew_item_idxc                    sV   t  ||}|jdkr| jj}||| }|S |jdkr)| jj}||| }|S )Nr4  r3  )rg   _recompute_cached_prompt_updater  r{  r   with_targetr#  )r   r  r  
new_updater   r#  r   rB   rC   r    s   

z9Phi4MMMultiModalProcessor._recompute_cached_prompt_update)r   r   r   r   r   rx  r	   r  r   r  r   r   r   r   r%   r  r&   r=   r  r   rB   rB   r   rC   r  W  sF    


&




4r  )r{  dummy_inputsc                       sx  e Zd ZdZdgdgdZeddiddd	d
ddZedede	dedB fddZ
dddedef fddZdededB fddZdededefddZdededB fdd Zdedefd!d"Zd#edeej fd$d%Zdedefd&d'Z		d6d(ejdB d)ejd*edB d+ejdB dedejfd,d-Zd.ejdejdB fd/d0Zd1ee eejf  ddfd2d3Z!de"fd4d5Z#  Z$S )7Phi4MMForCausalLMzA
    Implements the Phi-4-multimodal-instruct model in vLLM.
    qkv_projgate_up_proj)r  r  zbase_layer.rZ   z0embed_tokens_extend.audio_projection_for_vision.z%embed_tokens_extend.audio_projection.zembed_tokens_extend.zvision_encoder.)z>model.embed_tokens_extend.audio_embed.audio_projection.vision.z>model.embed_tokens_extend.audio_embed.audio_projection.speech.z&model.embed_tokens_extend.audio_embed.z&model.embed_tokens_extend.image_embed.)orig_to_new_substrorig_to_new_prefixr  r  r^   Nc                 C   s4   | drd| dS | drd| dS td)Nr4  r  r  r3  r"  z)Only image or audio modality is supported)
startswithri  )clsr  r  rB   rB   rC   get_placeholder_str  s
   

z%Phi4MMForCausalLM.get_placeholder_str)r\   vllm_configr\   c                   s  t    |jj}|jj}|sJ d|j}|| _|| _|| _t jdks)J d| 	|ddh t
||d|jd| _W d    n1 sFw   Y  t|jd trbd	|jd d	 i|jd }nd	| jjd	 i}| 	|d
 t|fi || _W d    n1 sw   Y  | | t|t|dd| _W d    n1 sw   Y  t|j|j|t|dd| _|jr| j| jj| _t|dd}t|j|d| _d S )Nzmultimodal_config is requiredr*   z"pipeline parallel is not supportedr4  videozmodel.vision_embed_tokens)r\   r]   ro  embedding_clsr3  model)r  r\   lm_head)r[   r\   logit_scaleg      ?)scale) rg   rh   rV   r]  multimodal_configr[   rQ   r   
world_size_mark_tower_modelrY   _name_or_pathvision_encoderrj   rq  rl   r/   embed_tokens_extend_mark_language_modelr   r2   r  r   r   rJ   r  tie_word_embeddingstie_weightsembed_tokensr   r   logits_processor)r   r  r\   rQ   r  r[   embedding_configr  r   rB   rC   rh     sT   



zPhi4MMForCausalLM.__init__rT   c                 K   s\   | dd}| dd}|du r|du rdS |dur td|dS |dur*td|dS td)aL  
        Parse and validate the audio input to the model.  This handles both
        audio features and audio embeddings, but only the former is used for
        now.

        Args:
            kwargs (object): Keyword arguments.

        Returns:
            Optional[Phi4MMAudioInputs]: Parsed and validated audio inputs.
        r  Nr   r   )r   r   )r   r   z This line should be unreachable.)popr   r   AssertionError)r   rT   r   r   rB   rB   rC   _parse_and_validate_audio_input.  s   z1Phi4MMForCausalLM._parse_and_validate_audio_inputaudio_inputaudio_projection_modec                    sF   |d dkr
|d S |d }t j j fdd|D }|S )ad  
        Create the audio embeddings from the audio input, where the audio input
        is pairs of audio features and audio embed lengths.  The audio input is
        created by `input_mapper_for_phi4mm_audio`.

        Args:
            audio_input (Phi4MMAudioInputs): Audio input.

        Returns:
            NestedTensors: Audio embeddings
        r   r   r   r   c                    s    g | ]}j | d qS )r  )r  r   )r   featuresr  r   r   rB   rC   r
  c  s    z:Phi4MMForCausalLM._process_audio_input.<locals>.<listcomp>)nextr  
parametersr   )r   r  r  r   r   rB   r  rC   _process_audio_inputM  s   z&Phi4MMForCausalLM._process_audio_inputc                 K   sf   | d}|d u rd S | d}| d}| d}|d ur&|d ur&|d us*J dtd||||dS )Nr  r   r{   rw   zMissing image inputsr   )r   r   r   r{   rw   )rm   r   )r   rT   r   r   r{   rw   rB   rB   rC   _parse_and_validate_image_inputl  s$   




z1Phi4MMForCausalLM._parse_and_validate_image_inputc                 K   sZ   i }|D ]&}|dv rd|vr| j di ||d< |dv r*d|vr*| jdi ||d< q|S )N)r  image_embedsr  )r  r   r  rB   )r  r  )r   rT   
modalities	input_keyrB   rB   rC   %_parse_and_validate_multimodal_inputs  s   z7Phi4MMForCausalLM._parse_and_validate_multimodal_inputsimage_inputc                 C   s@   t | j j}|d |}|d }|d }| |||}|S )Nr   r   r{   )r  r  r  r   r   )r   r  r   r   r   r{   r  rB   rB   rC   _process_image_input  s   z&Phi4MMForCausalLM._process_image_inputc           
      K   s   | j di |}|sg S d}d}|D ],}|dkr)d}|d }| |}|t|7 }|dkr>|d }| j||d}	|t|	7 }q|S )NrB   speechr  visionr  r  )r  r  tupler  )
r   rT   r  multimodal_embeddingsr  r  r  image_embeddingsr  audio_embeddingsrB   rB   rC   embed_multimodal  s&   
z"Phi4MMForCausalLM.embed_multimodalr  	positionsintermediate_tensorsinputs_embedsc                 K   s"   |d urd }| j ||||d}|S )N)r  )r  )r   r  r  r  r  rT   hidden_statesrB   rB   rC   r     s   zPhi4MMForCausalLM.forwardr  c                 C   s   |  | j|}|S r   )r  r  )r   r  logitsrB   rB   rC   compute_logits  s   z Phi4MMForCausalLM.compute_logitsweightsc                 C   s   t | dgd}|j|| jdS )Nlora)skip_substrs)mapper)r0   load_weightshf_to_vllm_mapper)r   r  loaderrB   rB   rC   r    s   zPhi4MMForCausalLM.load_weightsc                 C   s   t jdddgddgdS )z<
        Get the module prefix in multimodal models
        zmodel.audio_projection_for_visionaudio_projectionr  r  )language_model	connectortower_model)r   from_string_fieldr  rB   rB   rC   get_mm_mapping  s
   z Phi4MMForCausalLM.get_mm_mapping)NN)%r   r   r   r   packed_modules_mappingr1   r  classmethodr   r=   r  r   rh   rx  r   r  r   r  r   r  rl   r  r   r   r   r  r,   r  r'   r   r  r   r  r  r   r  r   rB   rB   r   rC   r    s    	3



 

 r  )rE   r  )arr   collections.abcr   r   r   typingr   r   r   r   numpyr   r   torch.nnrt   transformersr	   r
   r   r   r   vllm.configr   vllm.config.multimodalr   vllm.distributedr   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.vocab_parallel_embeddingr    vllm.model_executor.models.llamar   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   r   vllm.multimodal.parser   r   r   r   r   r    vllm.multimodal.processingr!   $vllm.multimodal.processing.processorr"   r#   r$   r%   r&   vllm.sequencer'   vllm.utils.tensor_schemar(   r)   idefics2_vision_modelr+   
interfacesr,   r-   r.   phi4mm_audior/   utilsr0   r1   r2   r  r  r  r[  r\  r=   rD   rX   ModulerY   r   r   r   r   r   r  r  ry  r  register_processorr  rB   rB   rB   rC   <module>   s   
 	
  Q&
  
)|