o
    -ie                  	   @   s  d Z ddlZddlmZ ddlmZmZ ddlmZm	Z	 ddl
Z
ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5m6Z6m7Z7 ddl8m9Z9 ddl:m;Z;m<Z<m=Z=m>Z>m?Z? ddl@mAZA ddlBmCZC ddlDmEZEmFZF ddlGmHZHmIZImJZJ ddlKmLZLmMZMmNZNmOZOmPZP G d d! d!eEZQG d"d# d#ejRZSG d$d% d%ejRZTG d&d' d'ejRZUG d(d) d)ejRZVG d*d+ d+ejRZWG d,d- d-ejRZXG d.d/ d/ejRZYG d0d1 d1eIZZG d2d3 d3Z[G d4d5 d5e=Z\G d6d7 d7e;e\ Z]G d8d9 d9e<e\ Z^e2j_e^e\e]d:G d;d< d<eHeOeMePeNZ`dS )=z<Inference-only CogAgent model compatible with THUDM weights.    N)	Namespace)MappingSequence)	AnnotatedLiteral)nn)	LayerNorm)
transforms)InterpolationMode)BatchFeaturePreTrainedTokenizer
TensorType)
ImageInput)	TextInput)
VllmConfig)BaseDummyOptions)$get_tensor_model_parallel_world_size)
SiluAndMul
get_act_fn)MMEncoderAttention)Conv2dLayer)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)QuantizationConfig)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFeatureSpecMultiModalFieldConfigMultiModalKwargsItems)MultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)ChatGLMConfig)TensorSchemaTensorShape   )ChatGLMBaseModelChatGLMModelGLMTransformer)MultiModalEmbeddingsSupportsLoRASupportsMRoPESupportsMultiModal
SupportsPPc                   @   s>   e Zd ZU dZdZed ed< eej	e
ddddf ed< d	S )
GLMVImagePixelInputsz
    Dimensions:
        - b: Batch size
        - c: Number of channels (3)
        - h: Height of image
        - w: Width of image
    pixel_valuestypeb   hwdataN)__name__
__module____qualname____doc__r8   r   __annotations__r   torchTensorr,    rE   rE   ]/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/glm4v.pyr6   B   s   
  r6   c                       s2   e Zd Z fddZdejdejfddZ  ZS )EVA2CLIPPatchEmbeddingc                    sP   t    t|j|j|j|jd| _tt	
d|j| _t|j|j| _d S )N)kernel_sizestrider-   )super__init__r   in_channelshidden_size
patch_sizeprojr   	ParameterrC   zeroscls_embedding	Embeddingnum_positionsposition_embedding)selfconfig	__class__rE   rF   rK   P   s   
zEVA2CLIPPatchEmbedding.__init__imagesreturnc                 C   st   |j | jjj| jjjd}| |}|ddd}| j|j	d dd}t
j||fdd}|| jjd7 }|S )
        Parameters:
        images : torch.Tensor
            Input image tensor with shape (B, C, H, W)

        Returns:
        torch.Tensor
            Transformed tensor with shape (B, L, D)
        )devicedtype   r-   r   dim)torO   weightr]   r^   flatten	transposerR   expandshaperC   catrU   	unsqueeze)rV   rZ   x	cls_tokenrE   rE   rF   forward[   s   

zEVA2CLIPPatchEmbedding.forward)r>   r?   r@   rK   rC   rD   rm   __classcell__rE   rE   rX   rF   rG   O   s    rG   c                       F   e Zd Z		ddedB def fddZdejdejfd	d
Z  Z	S )EVA2CLIPAttentionN quant_configprefixc                    s   t    |j| _t | _|j| j | _|j|j | _| jd | _t	|j| j|j|| dd| _
t|j|j|| dd| _t| j| j| j| _tj|j| _d S )Ng      z.query_key_valuerr   rs   z.dense)rJ   rK   rM   r   tp_size	num_headsnum_heads_per_rankhead_dimscaler   query_key_valuer   denser   attnrC   r   Dropoutdropout_proboutput_dropoutrV   rW   rr   rs   rX   rE   rF   rK   o   s.   
zEVA2CLIPAttention.__init__rk   r[   c           	      C   sL   |  |\}}|jddd\}}}| |||}| |\}}| |}|S )Nr:   r`   ra   )rz   chunkr|   r{   r   )	rV   rk   qkv_qkvoutoutputrE   rE   rF   rm      s   
zEVA2CLIPAttention.forwardNrq   
r>   r?   r@   r   strrK   rC   rD   rm   rn   rE   rE   rX   rF   rp   n   s     rp   c                       ro   )EVA2CLIPMLPNrq   rr   rs   c                    sX   t    || _t|j| _t|j|j|| dd| _	t
|j|j|| dd| _d S )Nz.fc1rt   z.fc2)rJ   rK   rW   r   
hidden_actactivation_fnr   rM   intermediate_sizefc1r   fc2r   rX   rE   rF   rK      s   
zEVA2CLIPMLP.__init__rk   r[   c                 C   s*   |  |\}}| |}| |\}}|S N)r   r   r   rV   rk   r   rE   rE   rF   rm      s   
zEVA2CLIPMLP.forwardr   r   rE   rE   rX   rF   r      s    r   c                       8   e Zd Z		d	dedB def fddZdd Z  ZS )
EVA2CLIPTransformerLayerNrq   rr   rs   c                    s^   t    t|j|jd| _t||| dd| _t||| dd| _	t|j|jd| _
d S )N)epsz
.attentionrt   z.mlp)rJ   rK   r   rM   layer_norm_epsinput_layernormrp   	attentionr   mlppost_attention_layernormr   rX   rE   rF   rK      s   
z!EVA2CLIPTransformerLayer.__init__c                 C   s<   |}|  | |}|| }|}| | |}|| }|S r   )r   r   r   r   )rV   hidden_statesattention_inputattention_output	mlp_input
mlp_outputr   rE   rE   rF   rm      s   z EVA2CLIPTransformerLayer.forwardr   r>   r?   r@   r   r   rK   rm   rn   rE   rE   rX   rF   r          r   c                       r   )
EVA2CLIPTransformerNrq   rr   rs   c                    s2   t    t fddt jD | _d S )Nc                    s$   g | ]}t   d | dqS )z.layers.rt   )r   ).0	layer_idxrW   rs   rr   rE   rF   
<listcomp>   s    z0EVA2CLIPTransformer.__init__.<locals>.<listcomp>)rJ   rK   r   
ModuleListrangenum_hidden_layerslayersr   rX   r   rF   rK      s   

zEVA2CLIPTransformer.__init__c                 C   s   | j D ]}||}q|S r   )r   )rV   r   layer_modulerE   rE   rF   rm      s   

zEVA2CLIPTransformer.forwardr   r   rE   rE   rX   rF   r      r   r   c                       r   )
EVA2CLIPGLUNrq   rr   rs   c                    s   t    t||jd|| dd| _t|j| _t | _	t
 | _t|j|jgd d|| dd| _t|j|jd|| dd| _dS )a  
        The original implementation is the same as:
        ```python
        self.dense_h_to_4h = ColumnParallelLinear(
            config.hidden_size,
            config.ffn_hidden_size,
            bias=False,
            quant_config=quant_config,
        )

        self.gate_proj = ColumnParallelLinear(
            config.hidden_size,
            config.ffn_hidden_size,
            bias=False,
            quant_config=quant_config,
        )
        ```
        ```
        gate_proj_output, _ = self.gate_proj(x)
        dense_h_to_4h_output, _ = self.dense_h_to_4h(x)
        x = torch.cat([gate_proj_output, dense_h_to_4h_output], dim=-1)
        ```

        We merge two ColumnParallelLinear into one MergedColumnParallelLinear:
        ```
        self.merged_proj = MergedColumnParallelLinear(
            config.hidden_size,
            [config.ffn_hidden_size] * 2,
            bias=False,
            quant_config=quant_config,
        )
        ```
        ```
        x, _ = self.merged_proj(x)
        ```
        F.linear_proj)biasrr   rs   r_   z.merged_projz.dense_4h_to_hN)rJ   rK   r   rM   linear_projr   r   norm1GELUact1r   act2r   ffn_hidden_sizemerged_projr   dense_4h_to_h)rV   rW   in_featuresrr   rs   rX   rE   rF   rK      s2   
+

zEVA2CLIPGLU.__init__c                 C   sH   |  |\}}| | |}| |\}}| |}| |\}}|S r   )r   r   r   r   r   r   r   rE   rE   rF   rm   5  s   
zEVA2CLIPGLU.forwardr   r   rE   rE   rX   rF   r      s    Gr   c                       ro   )EVA2CLIPModelNrq   rr   rs   c                    s   t    tdi |j}t|| _t||| dd| _t||j	|| dd| _
t|j	|j	ddd| _ttdd|j	| _ttdd|j	| _|j| _d S )	Nz.transformerrt   r   )r   rr   rs   r_   )rL   out_channelsrH   rI   r-   rE   )rJ   rK   r   vision_configrG   patch_embeddingr   transformerr   rM   r   r   convr   rP   rC   rQ   boieoiscaling_factor)rV   rW   rr   rs   r   rX   rE   rF   rK   ?  s*   

zEVA2CLIPModel.__init__rZ   r[   c           	      C   s   |  |}| |}|ddddf }|j\}}}t|d }|||||dddd}| |}|ddd}| 	|}| j
|jd dd}| j|jd dd}tj|||fdd}|| j }|S )	r\   Nr-   g      ?r   r:   r_   r`   ra   )r   r   rh   intviewpermuter   re   rf   r   r   rg   r   rC   ri   r   )	rV   rZ   rk   r9   sr;   	grid_sizer   r   rE   rE   rF   rm   [  s   





zEVA2CLIPModel.forwardr   r   rE   rE   rX   rF   r   >  s    r   c                       s,   e Zd Zdddedef fddZ  ZS )
GLM4VModelrq   rs   vllm_configrs   c                   s2   t  j||d |j}t| j|| dd| _d S )N)r   rs   z.visionr   )rJ   rK   rr   r   rW   vision)rV   r   rs   rr   rX   rE   rF   rK   x  s
   zGLM4VModel.__init__)r>   r?   r@   r   r   rK   rn   rE   rE   rX   rF   r   w  s    $r   c                	       sp   e Zd ZdZdededdf fddZ			ddeee B dB d	e	ee	 B dB d
e
eB dB defddZ  ZS )GLM4VProcessorz_
    This model doesn't define its own HF processor,
    so we implement our own one here.
    rW   	tokenizerr[   Nc                    sX   t    || _|| _|j}|d }ttj||ftj	dt
 tjdddg| _d S )N
image_size)interpolation)g3<4'?gwgM?gy{ ?)gB91?gwt.?g	U?)meanstd)rJ   rK   rW   r   r   r	   ComposeResizer
   BICUBICToTensor	Normalizeimage_transform)rV   rW   r   r   r   rX   rE   rF   rK     s"   

zGLM4VProcessor.__init__textrZ   return_tensorsc                    s   |d u rg }t |ts|g}|d u rg }t |ts|g} |}t|dkr*i }n fdd|D }dt|i}ti |||dS )Nr   c                    s   g | ]}  |qS rE   )r   )r   imagerV   rE   rF   r     s    z+GLM4VProcessor.__call__.<locals>.<listcomp>r7   )tensor_type)
isinstancelistr   lenrC   stackr   )rV   r   rZ   r   text_inputsimage_inputsr7   rE   r   rF   __call__  s*   


zGLM4VProcessor.__call__)NNN)r>   r?   r@   rA   r*   r   rK   r   r   r   r   r   r   r   rn   rE   rE   rX   rF   r     s*    
r   c                   @   s\   e Zd Zdd ZdedefddZdeee	dB f fdd	Z
de	fd
dZde	fddZdS )GLM4VProcessingInfoc                 C   s   | j tS r   )ctxget_hf_configr*   r   rE   rE   rF   r     s   z!GLM4VProcessingInfo.get_hf_configkwargsr[   c                 K   s"   | j jtf|  |  d|S )N)rW   r   )r   init_processorr   r   get_tokenizer)rV   r   rE   rE   rF   get_hf_processor  s   z$GLM4VProcessingInfo.get_hf_processorNc                 C   s   ddiS )Nr   r-   rE   r   rE   rE   rF   get_supported_mm_limits  s   z+GLM4VProcessingInfo.get_supported_mm_limitsc                 C   s2   |   }|j}|d }|d }|| d }|| S )Nr   rN   r_   )r   r   )rV   	hf_configr   r   rN   grid_lengthrE   rE   rF   get_num_image_tokens  s   z(GLM4VProcessingInfo.get_num_image_tokensc                 C   s   |   d S )Nr_   )r   r   rE   rE   rF   get_num_image_feature_tokens  s   z0GLM4VProcessingInfo.get_num_image_feature_tokens)r>   r?   r@   r   objectr   r   r   r   r   r   r   r   rE   rE   rE   rF   r     s    	r   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )GLM4VDummyInputsBuilder	mm_countsr[   c                 C   s   | dd}d}|| S )Nr   r   /<|begin_of_image|><|endoftext|><|end_of_image|>)get)rV   r   
num_images	base_textrE   rE   rF   get_dummy_text  s   z&GLM4VDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc           
      C   sP   | j  }|j}|d  }}|dd}|r|dnd }	d| j||||	diS )Nr   r   r   )widthheightr   	overrides)infor   r   r   _get_dummy_images)
rV   r   r   r   r   r   target_widthtarget_heightr   image_overridesrE   rE   rF   get_dummy_mm_data  s   
z)GLM4VDummyInputsBuilder.get_dummy_mm_datar   )
r>   r?   r@   r   r   r   r   r   r   r  rE   rE   rE   rF   r     s    
r   c                
   @   s   e Zd Zdededeeef deeef def
ddZde	deeef deee
f fd	d
Zdedeeef dedee fddZdS )GLM4VMultiModalProcessorprompt_textmm_itemshf_processor_mm_kwargstokenization_kwargsr[   c                 C   s   dS )NFrE   )rV   r  r  r  r  rE   rE   rF   _hf_processor_applies_updates  s   z6GLM4VMultiModalProcessor._hf_processor_applies_updates	hf_inputsc                 C   s   t tddS )Nr   )r7   )dictr!   batched)rV   r  r  rE   rE   rF   _get_mm_fields_config
  s   z.GLM4VMultiModalProcessor._get_mm_fields_configout_mm_kwargsc                    sJ   j  }|j |j|jdtf fdd}td g|dgS )Nitem_idxc                    s$   j  }g| } g| g S r   )r   r   )r  num_image_tokensimage_tokensboi_token_ideoi_token_idimage_token_idrV   rE   rF   get_replacement  s   

zEGLM4VMultiModalProcessor._get_prompt_updates.<locals>.get_replacementr   )modalitytargetreplacement)r   r   r  pad_token_idr  r   r'   )rV   r  r  r  r   r  rE   r  rF   _get_prompt_updates  s   
z,GLM4VMultiModalProcessor._get_prompt_updatesN)r>   r?   r@   r   r#   r   r   boolr  r   r!   r  r"   r   r(   r  rE   rE   rE   rF   r     s8    


	



r  )r   dummy_inputsc                       s   e Zd ZdgdgddgdZdefddZeded	eded
B fddZ	de
ddededee
 dd
f fddZdeded
B fddZdedejfddZdee dee deejef fddZejZdedefddZ	
	
d&d ejd!ejd"ed
B d#ejd
B dedejeB fd$d%Z  ZS )'GLM4VForCausalLMrz   dense_h_to_4h	gate_proj)rz   r  r   r[   c                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        ztransformer.encoderztransformer.vision.linear_projztransformer.vision.transformer)language_model	connectortower_model)r   from_string_fieldr   rE   rE   rF   get_mm_mapping:  s
   zGLM4VForCausalLM.get_mm_mappingr  iNc                 C   s   | drdS td)Nr   r   z Only image modality is supported)
startswith
ValueError)clsr  r$  rE   rE   rF   get_placeholder_strD  s   
z$GLM4VForCausalLM.get_placeholder_strrq   )rs   transformer_typer   rs   r)  c                   sT   | j |tdtid t j|||d W d    |  d S 1 s!w   Y  |  d S )Nr   )language_targetstower_targets)r   rs   r)  )_mark_composite_modelr0   r   rJ   rK   )rV   r   rs   r)  rX   rE   rF   rK   K  s   
zGLM4VForCausalLM.__init__r   c                 K   s<   | dd }|d ur| jjd  }}td|||ddS d S )Nr7   r   )r;   r<   )r8   r=   resolve_bindings)poprW   r   r6   )rV   r   r7   
expected_h
expected_wrE   rE   rF   _parse_and_validate_image_input_  s   z0GLM4VForCausalLM._parse_and_validate_image_inputimage_inputc                 C   s    |d j | jjd}| j|S )Nr=   )r^   )rc   rW   r^   r   r   )rV   r2  r7   rE   rE   rF   _process_image_inputn  s   z%GLM4VForCausalLM._process_image_inputinput_tokensmm_featuresc           (   	   C   s  t |ddh}dd |dg D }dd |dg D }| j}|j}|j}|j}	|jj}
g }|s5|rg }d}|D ]0}||krDd}n||	krJd}||krX|du rX|	d q;||krf|du rf|	d	 q;|	d
 q;g }t
t|dd D ]\}}t|}|d d }|d d d }|	|||f qxd}d}|D ]\}}}t|dkr|d  d nd}|dkr|| \}}}|||
 ||
 }}} t|ddd||   }!t|ddd|d|  }"t| ddd||d }#|	t|!|"|#g|  |d7 }q|d	kr|g|| dd  R \}}}|||
 ||
 }}} t|D ]C}$t|$ddd||   }!t|ddddd|  }"t| dddd|d }#|	t|!|"|#g|  q3|d7 }|d7 }q|| }%|	t|%dddd|  d}qnt|}%|	t|%dddd tj|dddd}&|& d t|  }'|&|'fS )Nimage_grid_thwvideo_grid_thwc                 S      g | ]}|  qS rE   tolistr   itemrE   rE   rF   r   |      z>GLM4VForCausalLM.get_mrope_input_positions.<locals>.<listcomp>c                 S   r8  rE   r9  r;  rE   rE   rF   r   }  r=  FTr   videor   c                 S   s   | d S )Nr-   rE   )rk   rE   rE   rF   <lambda>  s    z<GLM4VForCausalLM.get_mrope_input_positions.<locals>.<lambda>r   r`   r-   r:   ra   )r    gather_kwargsr   rW   r  video_start_token_idvideo_end_token_idr   spatial_merge_sizeappend	itertoolsgroupby	enumerater   r   maxrC   aranger   rg   re   r   r   tensorri   reshaper<  )(rV   r4  r5  r   r6  r7  r   r  rA  rB  rC  llm_pos_ids_listinput_token_typevideo_check_flgtokeninput_type_groupkey
group_iter
group_liststart_index	end_indexvideo_frame_nummm_data_idxmodality_type	start_idxend_idxst_idxtr;   r<   
llm_grid_t
llm_grid_h
llm_grid_wt_indexh_indexw_indext_idxtext_lenllm_positionsmrope_position_deltarE   rE   rF   get_mrope_input_positionss  s   














P z*GLM4VForCausalLM.get_mrope_input_positionsc                 K   s*   | j di |}|d u rg S | |}|S )NrE   )r1  r3  )rV   r   r2  vision_embeddingsrE   rE   rF   embed_multimodal  s
   
z!GLM4VForCausalLM.embed_multimodal	input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s    |d urd }|  ||||}|S r   )r   )rV   rj  rk  rl  rm  r   r   rE   rE   rF   rm     s   zGLM4VForCausalLM.forward)NN)r>   r?   r@   packed_modules_mappingr   r#  classmethodr   r   r(  r   r   r8   rK   r   r6   r1  rC   rD   r3  r   r    tuplerg  r4   embed_input_idsr1   ri  r)   rm   rn   rE   rE   rX   rF   r  ,  sd    	



 r  )arA   rE  argparser   collections.abcr   r   typingr   r   rC   r   torch.nnr   torchvisionr	   torchvision.transformsr
   transformersr   r   r   transformers.image_utilsr   $transformers.tokenization_utils_baser   vllm.configr   vllm.config.multimodalr   vllm.distributedr   %vllm.model_executor.layers.activationr   r   9vllm.model_executor.layers.attention.mm_encoder_attentionr   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   r   r   r   'vllm.model_executor.layers.quantizationr   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.inputsr   r    r!   r"   vllm.multimodal.parser#   vllm.multimodal.processingr$   r%   r&   r'   r(   vllm.sequencer)   vllm.transformers_utils.configsr*   vllm.utils.tensor_schemar+   r,   chatglmr.   r/   r0   
interfacesr1   r2   r3   r4   r5   r6   ModulerG   rp   r   r   r   r   r   r   r   r   r   r  register_processorr  rE   rE   rE   rF   <module>   sh   	+Q9A ,


