o
    
۾iY                  	   @   s  d Z ddlmZ ddlmZmZmZ ddlmZm	Z	 ddl
ZddlZddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7m8Z8m9Z9 ddl:m;Z; ddl<m=Z=m>Z>m?Z?m@Z@mAZA ddlBmCZC ddlDmEZE ddlFmGZGmHZH ddlImJZJmKZKmLZL ddlMmNZNmOZOmPZPmQZQmRZR G d d! d!eGZSG d"d# d#ejTZUG d$d% d%ejTZVG d&d' d'ejTZWG d(d) d)ejTZXG d*d+ d+ejTZYG d,d- d-ejTZZG d.d/ d/ejTZ[G d0d1 d1eKZ\G d2d3 d3Z]G d4d5 d5e?Z^G d6d7 d7e=e^ Z_G d8d9 d9e>e^ Z`e4jae`e^e_d:G d;d< d<eJeQeOeRePZbdS )=z<Inference-only CogAgent model compatible with THUDM weights.    )	Namespace)IteratorMappingSequence)	AnnotatedLiteralN)nn)	LayerNorm)
transforms)InterpolationMode)BatchFeaturePreTrainedTokenizer
TensorType)
ImageInput)	TextInput)
VllmConfig)BaseDummyOptions)$get_tensor_model_parallel_world_size)
SiluAndMul
get_act_fn)MMEncoderAttention)Conv2dLayer)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)QuantizationConfig)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFeatureSpecMultiModalFieldConfigMultiModalKwargsItems)MultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)ChatGLMConfig)TensorSchemaTensorShape   )ChatGLMBaseModelChatGLMModelGLMTransformer)MultiModalEmbeddingsSupportsLoRASupportsMRoPESupportsMultiModal
SupportsPPc                   @   s>   e Zd ZU dZdZed ed< eej	e
ddddf ed< d	S )
GLMVImagePixelInputsz
    Dimensions:
        - b: Batch size
        - c: Number of channels (3)
        - h: Height of image
        - w: Width of image
    pixel_valuestypeb   hwdataN)__name__
__module____qualname____doc__r9   r   __annotations__r   torchTensorr-    rF   rF   T/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/glm4v.pyr7   B   s   
  r7   c                       s2   e Zd Z fddZdejdejfddZ  ZS )EVA2CLIPPatchEmbeddingc                    sP   t    t|j|j|j|jd| _tt	
d|j| _t|j|j| _d S )N)kernel_sizestrider.   )super__init__r   in_channelshidden_size
patch_sizeprojr   	ParameterrD   zeroscls_embedding	Embeddingnum_positionsposition_embedding)selfconfig	__class__rF   rG   rL   P   s   
zEVA2CLIPPatchEmbedding.__init__imagesreturnc                 C   st   |j | jjj| jjjd}| |}|ddd}| j|j	d dd}t
j||fdd}|| jjd7 }|S )
        Parameters:
        images : torch.Tensor
            Input image tensor with shape (B, C, H, W)

        Returns:
        torch.Tensor
            Transformed tensor with shape (B, L, D)
        )devicedtype   r.   r   dim)torP   weightr^   r_   flatten	transposerS   expandshaperD   catrV   	unsqueeze)rW   r[   x	cls_tokenrF   rF   rG   forward[   s   

zEVA2CLIPPatchEmbedding.forward)r?   r@   rA   rL   rD   rE   rn   __classcell__rF   rF   rY   rG   rH   O   s    rH   c                       F   e Zd Z		ddedB def fddZdejdejfd	d
Z  Z	S )EVA2CLIPAttentionN quant_configprefixc                    s   t    |j| _t | _|j| j | _|j|j | _| jd | _t	|j| j|j|| dd| _
t|j|j|| dd| _t| j| j| j| dd| _tj|j| _d S )Ng      z.query_key_valuers   rt   z.densez.attnrt   )rK   rL   rN   r   tp_size	num_headsnum_heads_per_rankhead_dimscaler   query_key_valuer   denser   attnrD   r   Dropoutdropout_proboutput_dropoutrW   rX   rs   rt   rY   rF   rG   rL   o   s4   
zEVA2CLIPAttention.__init__rl   r\   c           	      C   sL   |  |\}}|jddd\}}}| |||}| |\}}| |}|S )Nr;   ra   rb   )r|   chunkr~   r}   r   )	rW   rl   qkv_qkvoutoutputrF   rF   rG   rn      s   
zEVA2CLIPAttention.forwardNrr   
r?   r@   rA   r   strrL   rD   rE   rn   ro   rF   rF   rY   rG   rq   n   s    #rq   c                       rp   )EVA2CLIPMLPNrr   rs   rt   c                    sX   t    || _t|j| _t|j|j|| dd| _	t
|j|j|| dd| _d S )Nz.fc1ru   z.fc2)rK   rL   rX   r   
hidden_actactivation_fnr   rN   intermediate_sizefc1r   fc2r   rY   rF   rG   rL      s   
zEVA2CLIPMLP.__init__rl   r\   c                 C   s*   |  |\}}| |}| |\}}|S N)r   r   r   rW   rl   r   rF   rF   rG   rn      s   
zEVA2CLIPMLP.forwardr   r   rF   rF   rY   rG   r      s    r   c                       8   e Zd Z		d	dedB def fddZdd Z  ZS )
EVA2CLIPTransformerLayerNrr   rs   rt   c                    s^   t    t|j|jd| _t||| dd| _t||| dd| _	t|j|jd| _
d S )N)epsz
.attentionru   z.mlp)rK   rL   r	   rN   layer_norm_epsinput_layernormrq   	attentionr   mlppost_attention_layernormr   rY   rF   rG   rL      s   
z!EVA2CLIPTransformerLayer.__init__c                 C   s<   |}|  | |}|| }|}| | |}|| }|S r   )r   r   r   r   )rW   hidden_statesattention_inputattention_output	mlp_input
mlp_outputr   rF   rF   rG   rn      s   z EVA2CLIPTransformerLayer.forwardr   r?   r@   rA   r   r   rL   rn   ro   rF   rF   rY   rG   r          r   c                       r   )
EVA2CLIPTransformerNrr   rs   rt   c                    s2   t    t fddt jD | _d S )Nc                    s$   g | ]}t   d | dqS )z.layers.ru   )r   ).0	layer_idxrX   rt   rs   rF   rG   
<listcomp>   s    z0EVA2CLIPTransformer.__init__.<locals>.<listcomp>)rK   rL   r   
ModuleListrangenum_hidden_layerslayersr   rY   r   rG   rL      s   

zEVA2CLIPTransformer.__init__c                 C   s   | j D ]}||}q|S r   )r   )rW   r   layer_modulerF   rF   rG   rn      s   

zEVA2CLIPTransformer.forwardr   r   rF   rF   rY   rG   r      r   r   c                       r   )
EVA2CLIPGLUNrr   rs   rt   c                    s   t    t||jd|| dd| _t|j| _t | _	t
 | _t|j|jgd d|| dd| _t|j|jd|| dd| _dS )a  
        The original implementation is the same as:
        ```python
        self.dense_h_to_4h = ColumnParallelLinear(
            config.hidden_size,
            config.ffn_hidden_size,
            bias=False,
            quant_config=quant_config,
        )

        self.gate_proj = ColumnParallelLinear(
            config.hidden_size,
            config.ffn_hidden_size,
            bias=False,
            quant_config=quant_config,
        )
        ```
        ```
        gate_proj_output, _ = self.gate_proj(x)
        dense_h_to_4h_output, _ = self.dense_h_to_4h(x)
        x = torch.cat([gate_proj_output, dense_h_to_4h_output], dim=-1)
        ```

        We merge two ColumnParallelLinear into one MergedColumnParallelLinear:
        ```
        self.merged_proj = MergedColumnParallelLinear(
            config.hidden_size,
            [config.ffn_hidden_size] * 2,
            bias=False,
            quant_config=quant_config,
        )
        ```
        ```
        x, _ = self.merged_proj(x)
        ```
        F.linear_proj)biasrs   rt   r`   z.merged_projz.dense_4h_to_hN)rK   rL   r   rN   linear_projr   r	   norm1GELUact1r   act2r   ffn_hidden_sizemerged_projr   dense_4h_to_h)rW   rX   in_featuresrs   rt   rY   rF   rG   rL      s2   
+

zEVA2CLIPGLU.__init__c                 C   sH   |  |\}}| | |}| |\}}| |}| |\}}|S r   )r   r   r   r   r   r   r   rF   rF   rG   rn   8  s   
zEVA2CLIPGLU.forwardr   r   rF   rF   rY   rG   r      s    Gr   c                       rp   )EVA2CLIPModelNrr   rs   rt   c                    s   t    tdi |j}t|| _t||| dd| _t||j	|| dd| _
t|j	|j	ddd| _ttdd|j	| _ttdd|j	| _|j| _d S )	Nz.transformerru   r   )r   rs   rt   r`   )rM   out_channelsrI   rJ   r.   rF   )rK   rL   r   vision_configrH   patch_embeddingr   transformerr   rN   r   r   convr   rQ   rD   rR   boieoiscaling_factor)rW   rX   rs   rt   r   rY   rF   rG   rL   B  s*   

zEVA2CLIPModel.__init__r[   r\   c           	      C   s   |  |}| |}|ddddf }|j\}}}t|d }|||||dddd}| |}|ddd}| 	|}| j
|jd dd}| j|jd dd}tj|||fdd}|| j }|S )	r]   Nr.   g      ?r   r;   r`   ra   rb   )r   r   ri   intviewpermuter   rf   rg   r   r   rh   r   rD   rj   r   )	rW   r[   rl   r:   sr<   	grid_sizer   r   rF   rF   rG   rn   ^  s   





zEVA2CLIPModel.forwardr   r   rF   rF   rY   rG   r   A  s    r   c                       s,   e Zd Zdddedef fddZ  ZS )
GLM4VModelrr   rv   vllm_configrt   c                   s2   t  j||d |j}t| j|| dd| _d S )N)r   rt   z.visionrv   )rK   rL   rs   r   rX   vision)rW   r   rt   rs   rY   rF   rG   rL   {  s
   zGLM4VModel.__init__)r?   r@   rA   r   r   rL   ro   rF   rF   rY   rG   r   z  s    $r   c                	       sp   e Zd ZdZdededdf fddZ			ddeee B dB d	e	ee	 B dB d
e
eB dB defddZ  ZS )GLM4VProcessorz_
    This model doesn't define its own HF processor,
    so we implement our own one here.
    rX   	tokenizerr\   Nc                    sX   t    || _|| _|j}|d }ttj||ftj	dt
 tjdddg| _d S )N
image_size)interpolation)g3<4'?gwgM?gy{ ?)gB91?gwt.?g	U?)meanstd)rK   rL   rX   r   r   r
   ComposeResizer   BICUBICToTensor	Normalizeimage_transform)rW   rX   r   r   r   rY   rF   rG   rL     s"   

zGLM4VProcessor.__init__textr[   return_tensorsc                    s   |d u rg }t |ts|g}|d u rg }t |ts|g} |}t|dkr*i }n fdd|D }dt|i}ti |||dS )Nr   c                    s   g | ]}  |qS rF   )r   )r   imagerW   rF   rG   r     s    z+GLM4VProcessor.__call__.<locals>.<listcomp>r8   )tensor_type)
isinstancelistr   lenrD   stackr   )rW   r   r[   r   text_inputsimage_inputsr8   rF   r   rG   __call__  s*   


zGLM4VProcessor.__call__)NNN)r?   r@   rA   rB   r+   r   rL   r   r   r   r   r   r   r   ro   rF   rF   rY   rG   r     s*    
r   c                   @   s\   e Zd Zdd ZdedefddZdeee	dB f fdd	Z
de	fd
dZde	fddZdS )GLM4VProcessingInfoc                 C   s   | j tS r   )ctxget_hf_configr+   r   rF   rF   rG   r     s   z!GLM4VProcessingInfo.get_hf_configkwargsr\   c                 K   s"   | j jtf|  |  d|S )N)rX   r   )r   init_processorr   r   get_tokenizer)rW   r   rF   rF   rG   get_hf_processor  s   z$GLM4VProcessingInfo.get_hf_processorNc                 C   s   ddiS )Nr   r.   rF   r   rF   rF   rG   get_supported_mm_limits  s   z+GLM4VProcessingInfo.get_supported_mm_limitsc                 C   s2   |   }|j}|d }|d }|| d }|| S )Nr   rO   r`   )r   r   )rW   	hf_configr   r   rO   grid_lengthrF   rF   rG   get_num_image_tokens  s   z(GLM4VProcessingInfo.get_num_image_tokensc                 C   s   |   d S )Nr`   )r   r   rF   rF   rG   get_num_image_feature_tokens  s   z0GLM4VProcessingInfo.get_num_image_feature_tokens)r?   r@   rA   r   objectr   r   r   r   r   r   r   r   rF   rF   rF   rG   r     s    	r   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )GLM4VDummyInputsBuilder	mm_countsr\   c                 C   s   | dd}d}|| S )Nr   r   /<|begin_of_image|><|endoftext|><|end_of_image|>)get)rW   r   
num_images	base_textrF   rF   rG   get_dummy_text  s   z&GLM4VDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc           
      C   sP   | j  }|j}|d  }}|dd}|r|dnd }	d| j||||	diS )Nr   r   r   )widthheightr   	overrides)infor   r   r   _get_dummy_images)
rW   r   r   r   r   r   target_widthtarget_heightr   image_overridesrF   rF   rG   get_dummy_mm_data  s   
z)GLM4VDummyInputsBuilder.get_dummy_mm_datar   )
r?   r@   rA   r   r   r   r   r   r    r  rF   rF   rF   rG   r     s    
r   c                
   @   s   e Zd Zdededeeef deeef def
ddZde	deeef deee
f fd	d
Zdedeeef dedee fddZdS )GLM4VMultiModalProcessorprompt_textmm_itemshf_processor_mm_kwargstokenization_kwargsr\   c                 C   s   dS )NFrF   )rW   r  r  r  r  rF   rF   rG   _hf_processor_applies_updates  s   z6GLM4VMultiModalProcessor._hf_processor_applies_updates	hf_inputsc                 C   s   t tddS )Nr   )r8   )dictr"   batched)rW   r	  r  rF   rF   rG   _get_mm_fields_config  s   z.GLM4VMultiModalProcessor._get_mm_fields_configout_mm_kwargsc                    sJ   j  }|j |j|jdtf fdd}td g|dgS )Nitem_idxc                    s$   j  }g| } g| g S r   )r   r   )r  num_image_tokensimage_tokensboi_token_ideoi_token_idimage_token_idrW   rF   rG   get_replacement   s   

zEGLM4VMultiModalProcessor._get_prompt_updates.<locals>.get_replacementr   )modalitytargetreplacement)r   r   r  pad_token_idr  r   r(   )rW   r  r  r  r   r  rF   r  rG   _get_prompt_updates  s   
z,GLM4VMultiModalProcessor._get_prompt_updatesN)r?   r@   rA   r   r$   r   r   boolr  r   r"   r  r#   r   r)   r  rF   rF   rF   rG   r    s8    


	



r  )r   dummy_inputsc                       sJ  e Zd ZdgdgddgdZdefddZeded	eded
B fddZ	de
ddededee
 dd
f fddZdeded
B fddZdedejfddZdee deeeeeef  fddZdee dee deejef fddZejZdedefd d!Z	
	
d(d"ejd
B d#ejd$ed
B d%ejd
B dedejeB fd&d'Z  Z S ))GLM4VForCausalLMr|   dense_h_to_4h	gate_proj)r|   r  r   r\   c                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        ztransformer.encoderztransformer.vision.linear_projztransformer.vision.transformer)language_model	connectortower_model)r   from_string_fieldr   rF   rF   rG   get_mm_mapping=  s
   zGLM4VForCausalLM.get_mm_mappingr  iNc                 C   s   | drdS td)Nr   r   z Only image modality is supported)
startswith
ValueError)clsr  r%  rF   rF   rG   get_placeholder_strG  s   
z$GLM4VForCausalLM.get_placeholder_strrr   )rt   transformer_typer   rt   r*  c                   sT   | j |tdtid t j|||d W d    |  d S 1 s!w   Y  |  d S )Nr   )language_targetstower_targets)r   rt   r*  )_mark_composite_modelr1   r   rK   rL   )rW   r   rt   r*  rY   rF   rG   rL   N  s   
zGLM4VForCausalLM.__init__r   c                 K   s<   | dd }|d ur| jjd  }}td|||ddS d S )Nr8   r   )r<   r=   )r9   r>   resolve_bindings)poprX   r   r7   )rW   r   r8   
expected_h
expected_wrF   rF   rG   _parse_and_validate_image_inputb  s   z0GLM4VForCausalLM._parse_and_validate_image_inputimage_inputc                 C   s    |d j | jjd}| j|S )Nr>   )r_   )rd   rX   r_   r   r   )rW   r3  r8   rF   rF   rG   _process_image_inputq  s   z%GLM4VForCausalLM._process_image_inputmm_featuresc           	      c   s    | j }|jj}t|dd dD ]4}|jj}|jdkr=|jd j \}}}|dks1J d| |||| || fV  qt	d|j d S )	Nc                 S   s   | j jS r   )mm_positionoffset)frF   rF   rG   <lambda>{  s    z3GLM4VForCausalLM.iter_mm_grid_thw.<locals>.<lambda>)keyr   image_grid_thwr.   zImage must have 1 frame, got zUnsupported modality: )
rX   r   spatial_merge_sizesortedr6  r7  r  r>   tolistr'  )	rW   r5  r   r<  
mm_featurer7  tr<   r=   rF   rF   rG   iter_mm_grid_thwv  s   
z!GLM4VForCausalLM.iter_mm_grid_thwinput_tokensc                 C   s>  g }d}|  |D ]K\}}}}|| }	t|dkr!|d  d nd}
|tt|	d|	f|
  t|||fdd}|||	 |
  |d || |  d }q	|t|k rt|| }	t|dkro|d  d nd}
|tt|	d|	f|
  tj	|dddd}| d t| 
 }t||fS )Nr   ra   r.   r;   )axis)rA  r   maxappendnpbroadcast_toarangeindicesreshapeconcatenateitemrD   
from_numpy)rW   rB  r5  llm_pos_ids_liststr7  
llm_grid_t
llm_grid_h
llm_grid_wtext_lenst_idxgrid_indicesllm_positionsmrope_position_deltarF   rF   rG   get_mrope_input_positions  s6     z*GLM4VForCausalLM.get_mrope_input_positionsc                 K   s*   | j di |}|d u rg S | |}|S )NrF   )r2  r4  )rW   r   r3  vision_embeddingsrF   rF   rG   embed_multimodal  s
   
z!GLM4VForCausalLM.embed_multimodal	input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s    |d urd }|  ||||}|S r   )r   )rW   r[  r\  r]  r^  r   r   rF   rF   rG   rn     s   zGLM4VForCausalLM.forward)NN)!r?   r@   rA   packed_modules_mappingr   r$  classmethodr   r   r)  r   r   r9   rL   r   r7   r2  rD   rE   r4  r   r!   r   tuplerA  rX  r5   embed_input_idsr2   rZ  r*   rn   ro   rF   rF   rY   rG   r  /  sl    	




$r  )crB   argparser   collections.abcr   r   r   typingr   r   numpyrF  rD   r   torch.nnr	   torchvisionr
   torchvision.transformsr   transformersr   r   r   transformers.image_utilsr   $transformers.tokenization_utils_baser   vllm.configr   vllm.config.multimodalr   vllm.distributedr   %vllm.model_executor.layers.activationr   r   $vllm.model_executor.layers.attentionr   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   r   r   r   'vllm.model_executor.layers.quantizationr   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.inputsr    r!   r"   r#   vllm.multimodal.parser$   vllm.multimodal.processingr%   r&   r'   r(   r)   vllm.sequencer*   vllm.transformers_utils.configsr+   vllm.utils.tensor_schemar,   r-   chatglmr/   r0   r1   
interfacesr2   r3   r4   r5   r6   r7   ModulerH   rq   r   r   r   r   r   r   r   r   r   r  register_processorr  rF   rF   rF   rG   <module>   sh   	.Q9A ,


