o
    i                     @   sD  d dl mZmZmZ d dlmZ d dlmZ d dlm	Z	m
Z
mZ d dlZd dlmZ d dlm  mZ d dlmZmZmZmZ d dlmZmZ d dlmZ d d	lmZmZ d d
lm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z*m+Z+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5 d dl6m7Z7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z>m?Z?m@Z@ d dlAmBZB d dlCmDZDmEZEmFZFmGZGmHZHmIZI d dlJmKZK d dlLmMZMmNZN ddlOmPZPmQZQmRZRmSZS ddlTmUZUmVZVmWZWmXZX e eYZZG dd  d eMZ[G d!d" d"eFZ\G d#d$ d$eDe\ Z]G d%d& d&eEe\ Z^G d'd( d(ej_Z`G d)d* d*ejaZbG d+d, d,ejaZcG d-d. d.ejaZdG d/d0 d0ejaZeG d1d2 d2ejaZfG d3d4 d4ejaZgG d5d6 d6ejaZhG d7d8 d8ejaZiG d9d: d:ejaZjG d;d< d<ejaZkG d=d> d>ZlG d?d@ d@ejaZme<jne^e\e]dAG dBdC dCejaeQeReSZodS )D    )IterableMappingSequence)cached_property)islice)	AnnotatedAnyLiteralN)BatchFeatureChameleonConfigChameleonProcessorChameleonVQVAEConfig)CacheConfig
VllmConfig)BaseDummyOptions)get_pp_group$get_tensor_model_parallel_world_size)init_logger)
SiluAndMul)	Attention)Conv2dLayer)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loaderrow_parallel_weight_loader)set_weight_attrs)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)MultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModal
SupportsPPSupportsQuant)is_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                   @   s:   e Zd ZU dZed ed< eeje	ddddf ed< d	S )
ChameleonImagePixelInputsz
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height of each image
        - w: Width of each image
    pixel_valuestypebn   hwdataN)
__name__
__module____qualname____doc__r	   __annotations__r   torchTensorr0    rI   rI   Z/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/chameleon.pyr:   N   s   
  r:   c                   @   sJ   e Zd Zdd ZdefddZdeeedB f fdd	Z	defd
dZ
dS )ChameleonProcessingInfoc                 C   s   | j tS N)ctxget_hf_configr   selfrI   rI   rJ   rN   \   s   z%ChameleonProcessingInfo.get_hf_configkwargsc                 K   s   | j jtfi |S rL   )rM   get_hf_processorr   )rP   rQ   rI   rI   rJ   rR   _   s   z(ChameleonProcessingInfo.get_hf_processorreturnNc                 C   s   ddiS )Nimager1   rI   rO   rI   rI   rJ   get_supported_mm_limitsb   s   z/ChameleonProcessingInfo.get_supported_mm_limitsc                 C   s   |   }|jS rL   )rR   image_seq_length)rP   	processorrI   rI   rJ   get_num_image_tokense   s   z,ChameleonProcessingInfo.get_num_image_tokens)rB   rC   rD   rN   objectrR   r   strintrU   rX   rI   rI   rI   rJ   rK   [   s
    rK   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )ChameleonDummyInputsBuilder	mm_countsrS   c                 C   s$   | dd}| j }|j}|| S )NrT   r   )getinforR   image_token)rP   r]   
num_imagesrW   r`   rI   rI   rJ   get_dummy_textk   s   
z*ChameleonDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc           	      C   sJ   | j  }|jj }}|dd}|r|dnd }d| j||||diS )NrT   r   )widthheightra   	overrides)r_   rN   	vq_config
resolutionr^   _get_dummy_images)	rP   rc   r]   rd   configre   rf   ra   image_overridesrI   rI   rJ   get_dummy_mm_datas   s   
z-ChameleonDummyInputsBuilder.get_dummy_mm_datarL   )
rB   rC   rD   r   rZ   r[   rb   r   r$   rm   rI   rI   rI   rJ   r\   j   s    
r\   c                
       s   e Zd Zdedeeef deeef deeef def
 fddZdee	 dee	 fd	d
Z
dedeeef deeef fddZdedeeef dedee fddZ  ZS )ChameleonMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrS   c                    sF   |s| j  |}| |}tt|gdddS t j||||dS )N)	input_idspt)tensor_type)ro   rp   rq   rr   )r_   get_tokenizerencode_apply_hf_processor_tokens_onlyr
   dictsuper_call_hf_processor)rP   ro   rp   rq   rr   
prompt_ids	__class__rI   rJ   r{      s   
z/ChameleonMultiModalProcessor._call_hf_processorprompt_tokensc                 C   s&   | j  }| }||j }||g S rL   )r_   rv   	get_vocab	sep_token)rP   r   	tokenizervocabsep_token_idrI   rI   rJ   rx      s   


z<ChameleonMultiModalProcessor._apply_hf_processor_tokens_only	hf_inputshf_processor_mm_kwargsc                 C   s   t tddS )NrT   )r;   )ry   r%   batched)rP   r   r   rI   rI   rJ   _get_mm_fields_config   s   z2ChameleonMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                 C   s~   | j jdi |}| j  }| }||j }||j }||j }	| j  }
|g|
 }td|gt	j
|g| |	g |ddgS )NrT   )embed_token_id)modalitytargetreplacementrI   )r_   rR   rv   r   image_start_tokenr`   image_end_tokenrX   r+   r-   select_token_id)rP   r   r   r   rW   r   r   image_start_idimage_token_idimage_end_idnum_image_tokensimage_tokensrI   rI   rJ   _get_prompt_updates   s"   





z0ChameleonMultiModalProcessor._get_prompt_updates)rB   rC   rD   rZ   r   rY   r
   r{   listr[   rx   r%   r   r'   r&   r   r,   r   __classcell__rI   rI   r}   rJ   rn      sB    







rn   c                       s$   e Zd Z fddZdd Z  ZS )ChameleonLayerNormc                    sJ   t  j|g|R i | |d f| _t| jdti t| jdti d S )Nweight_loader)rz   __init__normalized_shaper"   weightr!   bias)rP   hidden_sizeargsrQ   r}   rI   rJ   r      s   zChameleonLayerNorm.__init__c                 C   s*   t j|| jd d dd}|| j | j }|S )Ngh㈵>eps)F
layer_normr   r   r   rP   hidden_statesrI   rI   rJ   forward   s
   zChameleonLayerNorm.forward)rB   rC   rD   r   r   r   rI   rI   r}   rJ   r      s    r   c                       sN   e Zd Z			ddededededB ded	ed
df fddZdd Z  Z	S )ChameleonMLPNF r   intermediate_size
hidden_actquant_configr   prefixrS   c                    sh   t    t||gd ||| dd| _t||||| dd| _|dkr.td| dt | _d S )	N   .gate_up_proj)
input_sizeoutput_sizesr   r   r   z
.down_projr   output_sizer   r   r   siluzUnsupported activation: z!. Only silu is supported for now.)	rz   r   r   gate_up_projr   	down_proj
ValueErrorr   act_fn)rP   r   r   r   r   r   r   r}   rI   rJ   r      s(   
	
zChameleonMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S rL   )r   r   r   )rP   xgate_up_rI   rI   rJ   r      s   
zChameleonMLP.forward)NFr   )
rB   rC   rD   r[   rZ   r   boolr   r   r   rI   rI   r}   rJ   r      s(    r   c                       s   e Zd Z					ddedededeeef d	ed
edB dede	dB deddf fddZ
dejdejdeejejf fddZdejdejdejfddZ  ZS )ChameleonAttention   NFr   r   	num_headsnum_kv_headsrope_parametersmax_position_embeddingsr   r   cache_configr   rS   c
              	      sb  t    || _t }
|| _| j|
 dksJ | j|
 | _|| _| j|
kr/| j|
 dks.J n	|
| j dks8J td| j|
 | _|| j | _	| j| j	 | _
| j| j	 | _| j	d | _|| _t|| j	| j| j|||	 dd| _t| j| j	 ||||	 dd| _t| j| j	f| _t| j| j	f| _t| j	||d| _t| j| j	| j| j|||	 d	d
| _d S )Nr   r1         	.qkv_proj)r   	head_sizetotal_num_headstotal_num_kv_headsr   r   r   z.o_projr   )max_positionr   z.attn)r   r   r   r   )rz   r   r   r   r   r   r   maxr   head_dimq_sizekv_sizescalingr   r   qkv_projr   o_projr   q_normk_normr   
rotary_embr   attn)rP   r   r   r   r   r   r   r   r   r   tp_sizer}   rI   rJ   r     s`   

	
zChameleonAttention.__init__qkc                 C   s|   | d| j| j}| d| j| j}| |}| |}|jg |jd d dR  }|jg |jd d dR  }||fS )Nr   )reshaper   r   r   r   r   viewshape)rP   r   r   rI   rI   rJ   _apply_qk_normK  s   

z!ChameleonAttention._apply_qk_norm	positionsr   c           
      C   sp   |  |\}}|j| j| j| jgdd\}}}| ||\}}| |||\}}| |||}| |\}	}|	S )Nr   dim)r   splitr   r   r   r   r   r   )
rP   r   r   qkvr   r   r   vattn_outputoutputrI   rI   rJ   r   W  s    zChameleonAttention.forward)r   NFNr   )rB   rC   rD   r[   ry   rZ   r   r   r   r   r   rG   rH   tupler   r   r   rI   rI   r}   rJ   r     sR    
	
D
r   c                       sv   e Zd Z			ddededB dedB deddf
 fdd	Zd
ej	dej	dej	dB de
ej	ej	dB f fddZ  ZS )ChameleonDecoderLayerNr   rk   r   r   r   rS   c                       t    |j| _t|dd}t| j|jt|d|j|j||d|| dd	| _t| j|j	|j
|t|dd| dd	| _t|j|jd
| _t|j|jd
| _d S Nr   r   num_key_value_headsFz
.self_attn)	r   r   r   r   r   r   r   r   r   mlp_biasz.mlp)r   r   r   r   r   r   r   rz   r   r   getattrr   num_attention_headsr   	self_attnr   r   r   mlpr   rms_norm_epsinput_layernormpost_attention_layernormrP   rk   r   r   r   r   r}   rI   rJ   r   g  8   

zChameleonDecoderLayer.__init__r   r   residualc                 C   sX   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| |}||fS N)r   r   )r   r   r   r   rP   r   r   r   rI   rI   rJ   r     s   
zChameleonDecoderLayer.forwardNNr   rB   rC   rD   r   r   r   rZ   r   rG   rH   r   r   r   rI   rI   r}   rJ   r   f  s0    %r   c                       sr   e Zd Z			ddededB dedB deddf
 fdd	Zd
ej	dej	dej	dB de
ej	ej	f fddZ  ZS )ChameleonSwinDecoderLayerNr   rk   r   r   r   rS   c                    r   r   r   r   r}   rI   rJ   r     r   z"ChameleonSwinDecoderLayer.__init__r   r   r   c                 C   sL   |}| j ||d}| |}|| }|}| |}| |}|| }||fS r   )r   r   r   r   r   rI   rI   rJ   r     s   


z!ChameleonSwinDecoderLayer.forwardr   r   rI   rI   r}   rJ   r     s0    %r   c                       2   e Zd Zdef fddZdejfddZ  ZS )ChameleonVQVAEVectorQuantizerrk   c                    sF   t    |j| _|j| _t|dd| _t| j| j| _	| j| _
d S )Nbetag      ?)rz   r   num_embeddings	embed_dimembedding_dimr   r   nn	Embedding	embeddingre_embedrP   rk   r}   rI   rJ   r     s   
z&ChameleonVQVAEVectorQuantizer.__init__hidden_statec              
   C   s   | dddd }|d| j}tj|d dddtj| jjd dd dtd	|| jj	dd  }tj
|dd}| ||j}t| | d | jt||  d   }|||   }| dddd }|||fS )
Nr   r   r>   r1   r   T)r   keepdimr   z	bd,dn->bn)permute
contiguousr   r  rG   sumr  r   einsum	transposeargminr   meandetachr   )rP   r  hidden_state_flattened	distancesmin_encoding_indiceshidden_state_quantlossrI   rI   rJ   r     s4   

z%ChameleonVQVAEVectorQuantizer.forward	rB   rC   rD   r   r   rG   rH   r   r   rI   rI   r}   rJ   r     s    	r   c                       r   )#ChameleonVQVAEEncoderConvDownsamplein_channelsc                    s"   t    t||dddd| _d S )Nr>   r   r   kernel_sizestridepadding)rz   r   r   convrP   r  r}   rI   rJ   r     s   

z,ChameleonVQVAEEncoderConvDownsample.__init__r   c                 C   s    t j|dddd}| |}|S )N)r   r1   r   r1   constantr   )padmodevalue)r   r!  r  r   rI   rI   rJ   r     s   
z+ChameleonVQVAEEncoderConvDownsample.forward	rB   rC   rD   r[   r   rG   rH   r   r   rI   rI   r}   rJ   r    s    r  c                       s<   e Zd Z		d
dedef fddZdejfdd	Z  Z	S ) ChameleonVQVAEEncoderResnetBlockNFrk   r  c                    s   t    || _|d u r|n|| _|| _tjjd|ddd| _t	||dddd| _
tjjd|ddd| _tj|j| _t	||dddd| _| j| jkrg| jr[t	||dddd| _d S t	||dddd| _d S d S )	N    ư>T
num_groupsnum_channelsr   affiner>   r1   r  r   )rz   r   r  out_channelsuse_conv_shortcutrG   r  	GroupNormnorm1r   conv1norm2Dropoutdropoutconv2conv_shortcutnin_shortcut)rP   rk   r  r,  r5  r}   rI   rJ   r      s4   




z)ChameleonVQVAEEncoderResnetBlock.__init__r   c                 C   s   |}|  |}|t|9 }| |}| |}|t|9 }| |}| |}| j| jkr@| j	r;| 
|}|| S | |}|| S rL   )r/  rG   sigmoidr0  r1  r3  r4  r  r,  r-  r5  r6  )rP   r   r   rI   rI   rJ   r   C  s   






z(ChameleonVQVAEEncoderResnetBlock.forward)NF)
rB   rC   rD   r   r[   r   rG   rH   r   r   rI   rI   r}   rJ   r%    s    #r%  c                       r   )ChameleonVQVAEEncoderAttnBlockr  c                    sz   t    || _tjjd|ddd| _t||dddd| _t||dddd| _	t||dddd| _
t||dddd| _d S )Nr&  r'  Tr(  r1   r   r  )rz   r   r  rG   r  r.  normr   r   r   r   proj_outr  r}   rI   rJ   r   Y  s"   




z'ChameleonVQVAEEncoderAttnBlock.__init__r   c                 C   s   |}|  |}| |}| |}| |}|j\}}}}	|||||	 ddd}|||||	 }t||}
|
t	|d  }
t
j|
dd}
|||||	 }|
ddd}
t||
||||	}| |}|| S )Nr   r   r1   r   r   )r9  r   r   r   r   r   r
  rG   bmmr[   r   softmaxr:  )rP   r   r   query_states
key_statesvalue_states
batch_sizechannelsrf   re   attn_weightsr   rI   rI   rJ   r   m  s,   






z&ChameleonVQVAEEncoderAttnBlock.forwardr$  rI   rI   r}   rJ   r8  X  s    r8  c                       r   )ChameleonVQVAEEncoderrk   c              	      s  t    t|j| _|j| _|j}|j}|j}|j	}|j
}|j}t||dddd| _|}dt| }	|	| _t | _t| jD ]`}
t }t }||	|
  }|||
  }t| jD ]$}|t|||d |}|jd ur||jv r|jdkr|t| q[t }||_||_|
| jd krt||_|d }| j| q@t | _t|||d| j_|jdkrt|nt | j_ t|||d| j_!t"jj#d|d	d
d| _$t||rd| n|dddd| _%d S )Nr>   r1   r  )r1   )rk   r  r,  vanillar   r&  r'  Tr(  )&rz   r   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsri   r  double_latentlatent_channelsr   conv_inr   in_channel_multiplierr  
ModuleListdownrangeappendr%  attn_resolutions	attn_typer8  Moduleblockr   r  
downsamplemidblock_1Identityattn_1block_2rG   r.  norm_outconv_out)rP   rk   rI  ri   r  rJ  rK  rF  curr_resrM  i_levelrU  r   block_in	block_outi_blockrO  r}   rI   rJ   r     s   










zChameleonVQVAEEncoder.__init__r;   c                 C   s   | | jjj}| |g}t| jD ]C}t| jD ]'}| j| j| |d }t	| j| j
dkr<| j| j
| |}|| q|| jd krV|| j| |d  q|d }| j|}| j|}| j|}| |}|t|9 }| |}|S )Nr   r   r1   )torL  r   dtyperP  rG  rH  rO  rU  rE  r   rQ  rV  rW  rX  rZ  r[  r\  rG   r7  r]  )rP   r;   r   r_  rb  r  last_hidden_staterI   rI   rJ   r     s&   

zChameleonVQVAEEncoder.forwardr  rI   rI   r}   rJ   rC    s    MrC  c                       sF   e Zd Zdef fddZdejdeejejejf fddZ  Z	S )ChameleonVQVAErk   c                    sN   t    t|| _t|| _t|j|jd| _	t|j|jd| _
|   d S )Nr1   )rz   r   rC  encoderr   quantizer   rK  r  
quant_convpost_quant_convevalr  r}   rI   rJ   r     s   


zChameleonVQVAE.__init__r;   rS   c                 C   s.   |  |}| |}| |\}}}|||fS rL   )rg  ri  rh  )rP   r;   r   quantemb_lossindicesrI   rI   rJ   rw     s   


zChameleonVQVAE.encode)
rB   rC   rD   r   r   rG   rH   r   rw   r   rI   rI   r}   rJ   rf    s    rf  c                   @   s   e Zd ZdZdeeef fddZedd Z	edd Z
ed	d
 Zedd Zedd Zedd ZdejdejfddZdS )ChameleonImageVocabularyMappingzM
    A class for mapping discrete image tokens from VQGAN to BPE tokens.
    	vocab_mapc                 C   s   || _ |d| _d S )N<image>)rp  r^   r   )rP   rp  rI   rI   rJ   r     s   z(ChameleonImageVocabularyMapping.__init__c                 C      dd | j  D S )Nc                 S      i | ]\}}||qS rI   rI   .0r   r   rI   rI   rJ   
<dictcomp>      z<ChameleonImageVocabularyMapping.val2name.<locals>.<dictcomp>)rp  itemsrO   rI   rI   rJ   val2name     z(ChameleonImageVocabularyMapping.val2namec                 C   s   t dd | j D S )Nc                 S   s   g | ]\}}| d r|qS )IMGIMG)
startswith)ru  namevalrI   rI   rJ   
<listcomp>  s    z@ChameleonImageVocabularyMapping.image_tokens.<locals>.<listcomp>)sortedrp  rx  rO   rI   rI   rJ   r     s   z,ChameleonImageVocabularyMapping.image_tokensc                    s>   dd t dD  dtdtf fddfddjD S )	Nc                 S   s"   i | ]}t td | t|qS )A)chrordrZ   )ru  irI   rI   rJ   rv    s   " z;ChameleonImageVocabularyMapping.bpe2img.<locals>.<dictcomp>
   old_namerS   c                    s$   d  fdd| tdd D S )Nr   c                 3   s    | ]	}  ||V  qd S rL   )r^   )ru  cimg_tkn_chr_mappingrI   rJ   	<genexpr>  s    
zIChameleonImageVocabularyMapping.bpe2img.<locals>.remap.<locals>.<genexpr>r{  r   )joinrE  )r  r  rI   rJ   remap  s   z6ChameleonImageVocabularyMapping.bpe2img.<locals>.remapc                    s    i | ]}|t  j| qS rI   )r[   ry  )ru  tok)r  rP   rI   rJ   rv  #  s     )rP  rZ   r   rO   rI   )r  r  rP   rJ   bpe2img  s   z'ChameleonImageVocabularyMapping.bpe2imgc                 C   rr  )Nc                 S   rs  rI   rI   rt  rI   rI   rJ   rv  '  rw  z;ChameleonImageVocabularyMapping.img2bpe.<locals>.<dictcomp>)r  rx  rO   rI   rI   rJ   img2bpe%  rz  z'ChameleonImageVocabularyMapping.img2bpec                 C   s(   t t| j t t| j fS rL   )rG   tensorr  r  keysvaluesrO   rI   rI   rJ   bpe2img_search_tensors)  s   z6ChameleonImageVocabularyMapping.bpe2img_search_tensorsc                 C   s>   t jt| j d t jd}| j D ]\}}|||< q|S )Nr1   )rd  )rG   zerosr   r  r  r[   rx  )rP   mappingr   r   rI   rI   rJ   img2bpe_mapping_tensor/  s   
z6ChameleonImageVocabularyMapping.img2bpe_mapping_tensor	img_batchrS   c                 C   s    |j }| j|d }||S )Ncpu)devicer  rc  )rP   r  r  
img_tokensrI   rI   rJ   convert_img2bpe6  s   
z/ChameleonImageVocabularyMapping.convert_img2bpeN)rB   rC   rD   rE   ry   rZ   r[   r   r   ry  r   r  r  r  r  rG   rH   r  rI   rI   rI   rJ   ro    s     






ro  c                       s   e Zd Zdddedef fddZdejdejfd	d
ZdejdejfddZ		ddejdB dejde
dB dejdB deje
B f
ddZ  ZS )ChameleonModelr   r   vllm_configr   c                   s   t    |jj|j |j| _j| _t| jj	| _
tj| _| jjs*tnttj fdd| dd\| _| _| _tj	jd| _tj| _tddgj	| _d S )Nc                    s    | dS )N)rk   r   r   r   rI   r  r   rk   decoder_layerr   rI   rJ   <lambda>S  s    z)ChameleonModel.__init__.<locals>.<lambda>z.layersr  r   r   r   )rz   r   model_config	hf_configr   r   rk   
vocab_sizer   r   embed_tokensro  vocabulary_mapvocabulary_mapping	swin_normr   r   r8   num_hidden_layersstart_layer	end_layerlayersr   r   r9  rf  rh   vqmodelr7   make_empty_intermediate_tensors)rP   r  r   r}   r  rJ   r   =  s2   


zChameleonModel.__init__rs   rS   c                 C   s
   |  |S rL   )r  )rP   rs   rI   rI   rJ   embed_input_idsb  s   
zChameleonModel.embed_input_idsr;   c                 C   s8   |j d }| j|\}}}| j|}||d}|S )z
        Tokenizes images into discrete tokens with VQGAN module. Converts
        obtained image tokens into BPE tokens and wraps with "boi" and "eoi"
        special tokens.
        r   r   )r   r  rw   r  r  r   )rP   r;   r@  r   
image_toksbpe_toksrI   rI   rJ   get_image_tokense  s
   
zChameleonModel.get_image_tokensNr   intermediate_tensorsinputs_embedsc           	      C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }t| j| j| jD ]
}||||\}}q*t  js@t||dS | 	||\}}|S )Nr   r   )r   r   )
r   is_first_rankr  r   r  r  r  is_last_rankr.   r9  )	rP   rs   r   r  r  r   r   layerr   rI   rI   rJ   r   q  s(   

zChameleonModel.forwardrL   )rB   rC   rD   r   rZ   r   rG   rH   r  r  r.   r   r   rI   rI   r}   rJ   r  <  s     %r  )r_   dummy_inputsc                       s   e Zd Zg dddgdZededededB fd	d
Zdddedef fddZ	de
dedB fddZde
defddZ		d"dejdB dejdedB dejdB dejeB f
ddZdejdejdB fddZdeeeejf  dee fd d!Z  ZS )#!ChameleonForConditionalGeneration)q_projk_projv_proj	gate_projup_proj)r   r   r   r  rS   Nc                 C   s   | drdS td)NrT   rq  z Only image modality is supported)r|  r   )clsr   r  rI   rI   rJ   get_placeholder_str  s   
z5ChameleonForConditionalGeneration.get_placeholder_strr   r  r  r   c                   s   t    |jj}|jj}|| _|| _| j|| jjstnt	dt
id t|t|dd| _W d    n1 s8w   Y  t|j|jt|dd| _|jrT| jjj| j_t|dd}t|j|d	| _| jj| _d S )
NrT   )language_targetstower_targetsmodel)r  r   lm_headr  logit_scaleg      ?)scale)rz   r   r  r  multimodal_configrk   _mark_composite_modelr  r   r   rf  r  r9   r  r   r  r   r  tie_word_embeddingsr  r   r   r   logits_processorr  )rP   r  r   rk   r  r  r}   rI   rJ   r     s8   
	
z*ChameleonForConditionalGeneration.__init__rQ   c                 K   s>   | dd }|d u rd S | jj}|j }}td|||ddS )Nr;   )r?   r@   )r<   rA   resolve_bindings)poprk   rh   ri   r:   )rP   rQ   r;   rh   
expected_h
expected_wrI   rI   rJ   _parse_and_validate_image_input  s   
zAChameleonForConditionalGeneration._parse_and_validate_image_inputc                 K   sV   | j di |}|d u rg S | jjd usJ | j|d | jj}| j|}|S )NrA   rI   )r  r  r  r  rc  rk   rd  r  )rP   rQ   image_inputr   vision_embeddingsrI   rI   rJ   embed_multimodal  s   z2ChameleonForConditionalGeneration.embed_multimodalrs   r   r  r  c                 K   s"   |d urd }| j ||||d}|S )N)r  )r  )rP   rs   r   r  r  rQ   r   rI   rI   rJ   r     s   z)ChameleonForConditionalGeneration.forwardr   c                 C   s>   |  | j|}|d ur| jjj}t|jj|d d |f< |S rL   )	r  r  r  r  r   rG   finford  min)rP   r   logitsr   rI   rI   rJ   compute_logits  s
   
z0ChameleonForConditionalGeneration.compute_logitsweightsc                 C   s  g d}t |  }t }|D ]\}}d|v rqd|v s d|v r!q| jjr*d|v r*qd}d|v r9| jjd ur8d}ni|D ].\}}	}
|	|vrEq;||	|}|d	rU||vrUq;t	|| r[q;|| }|j
}||||
  n8|d	rt||vrtq|d
r|dd}||vrtd|| q|}t	|| rq|| }t|dt}||| |r||v rt	|| rq|| }t|dt}||| || q|S )N))r   z.q_projr   )r   z.k_projr   )r   z.v_projr   )r   z
.gate_projr   )r   z.up_projr1   zrotary_emb.inv_freqzrotary_emb.cos_cachedzrotary_emb.sin_cachedzlm_head.weightFr  Tz.biaskv_scalez	.kv_scalez.attn.kv_scalez{Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.r   )ry   named_parameterssetrk   r  r  r  replaceendswithr6   r   loggerwarning_oncer   r    add)rP   r  stacked_params_mappingparams_dictloaded_paramsr}  loaded_weightuse_default_weight_loading
param_nameweight_nameshard_idparamr   remapped_kv_scale_namerI   rI   rJ   load_weights  sp   





z.ChameleonForConditionalGeneration.load_weights)NN)rB   rC   rD   packed_modules_mappingclassmethodrZ   r[   r  r   r   rY   r:   r  r2   r  rG   rH   r.   r   r  r   r   r  r  r   rI   rI   r}   rJ   r    s@    	#


,r  )pcollections.abcr   r   r   	functoolsr   	itertoolsr   typingr   r   r	   rG   torch.nnr  torch.nn.functional
functionalr   transformersr
   r   r   r   vllm.configr   r   vllm.config.multimodalr   vllm.distributedr   r   vllm.loggerr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   vllm.model_executor.layers.convr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr    r!   vllm.model_executor.utilsr"   vllm.multimodalr#   vllm.multimodal.inputsr$   r%   r&   vllm.multimodal.parser'   vllm.multimodal.processingr(   r)   r*   r+   r,   r-   vllm.sequencer.   vllm.utils.tensor_schemar/   r0   
interfacesr2   r3   r4   r5   utilsr6   r7   r8   r9   rB   r  r:   rK   r\   rn   	LayerNormr   rT  r   r   r   r   r   r  r%  r8  rC  rf  ro  r  register_processorr  rI   rI   rI   rJ   <module>   sr     D'`=?.92j5T


