o
    i/                     @   sL  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZ ddlmZ ddl m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z)m*Z* G dd dej+Z,G dd dej+Z-e
G dd dej+Z.G dd dej+e%e&Z/dS )z|Inference-only GraniteMoeShared model.

The architecture is the same as granitemoe but with the addition of shared
experts.
    )Iterable)isliceN)nn)GraniteMoeSharedConfig)support_torch_compile)CacheConfig
VllmConfig)get_pp_group)
SiluAndMul)RMSNorm)MergedColumnParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)ParallelLMHeadVocabParallelEmbedding)IntermediateTensors   )GraniteMoeAttentionGraniteMoeModelGraniteMoeMoE)SupportsLoRA
SupportsPP)AutoWeightsLoadermake_layersmaybe_prefixc                       sJ   e Zd Z		ddededB def fddZdejd	ejfd
dZ	  Z
S )GraniteMoeSharedMLPN configquant_configprefixc                    s   t    |j| _|j| _t| j| jgd d|| dd| _t| j| jd|| dd| _|j	dkr<t
d|j	 d	t | _d S )
N   Fz.input_linear)
input_sizeoutput_sizesbiasr   r    z.output_linear)r$   r   r    siluzUnsupported activation: z!. Only silu is supported for now.)super__init__hidden_sizer"   shared_intermediate_sizer   input_linearr   output_linear
hidden_act
ValueErrorr
   act_fn)selfr   r   r    	__class__ a/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/granitemoeshared.pyr'   '   s,   


zGraniteMoeSharedMLP.__init__hidden_statesreturnc                 C   s*   |  |\}}| |}| |\}}|S N)r*   r.   r+   )r/   r4   _r2   r2   r3   forwardF   s   
zGraniteMoeSharedMLP.forward)Nr   )__name__
__module____qualname__r   r   strr'   torchTensorr8   __classcell__r2   r2   r0   r3   r   &   s    r   c                       s^   e Zd Z			ddededB dedB deddf
 fdd	Zd
ej	dej	dej	fddZ
  ZS )GraniteMoeSharedDecoderLayerNr   r   cache_configr   r    r5   c                    s   t    |j| _t| j|j|j|j|j||| d|jd	| _	t
|j|j|j|j|| dd| _t|dddkr<d n	t||| dd| _t|j|jd	| _t|j|jd	| _|j| _d S )
Nz
.self_attn)	r(   	num_headsmax_positionnum_kv_headsrope_parametersrA   r   r    attention_multiplierz.block_sparse_moe)num_expertstop_kr(   intermediate_sizer   r    r)   r   z.shared_mlpr   r    eps)r&   r'   r(   r   num_attention_headsmax_position_embeddingsnum_key_value_headsrE   rF   	self_attnr   num_local_expertsnum_experts_per_tokrI   block_sparse_moegetattrr   
shared_mlpr   rms_norm_epsinput_layernormpost_attention_layernormresidual_multiplier)r/   r   rA   r   r    r0   r2   r3   r'   N   s@   

z%GraniteMoeSharedDecoderLayer.__init__	positionsr4   c                 C   s   |}|  |}| j||d}||| j  }|}| |}| jd u r'| |}n| }| |}|| | }~||| j  }|S )N)rZ   r4   )rW   rP   rY   rX   rU   rS   clone)r/   rZ   r4   residualmoe_hidden_statesr2   r2   r3   r8   y   s"   



z$GraniteMoeSharedDecoderLayer.forward)NNr   )r9   r:   r;   r   r   r   r<   r'   r=   r>   r8   r?   r2   r2   r0   r3   r@   M   s,    +r@   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z	ddejdB dejde	dB dejdB dejf
ddZ
deeeejf  dee fddZ  ZS )GraniteMoeSharedModelr   r    vllm_configr    c                   s   t    |jj|j |j| _| _j| _j	| _	t
| j	jd| _j| _tj fdd| dd\| _| _| _tjjd| _d S )N)r   c                    s   t  | dS )NrJ   )r@   r_   rA   r   r   r2   r3   <lambda>   s    z0GraniteMoeSharedModel.__init__.<locals>.<lambda>z.layersr_   rK   )r&   r'   model_config	hf_configrA   r   r   pad_token_idpadding_idx
vocab_sizer   r(   embed_tokensembedding_multiplierr   num_hidden_layersstart_layer	end_layerlayersr   rV   norm)r/   r`   r    r0   ra   r3   r'      s(   
zGraniteMoeSharedModel.__init__	input_idsr5   c                 C   s
   |  |S r6   )rh   r/   ro   r2   r2   r3   embed_input_ids   s   
z%GraniteMoeSharedModel.embed_input_idsNrZ   intermediate_tensorsinputs_embedsc                 C   s   t  jr|d ur|}n| |}|| j9 }n
|d usJ |d }t| j| j| jD ]}|||}q)t  js;t	d|iS | 
|}|S )Nr4   )r	   is_first_rankrq   ri   r   rm   rk   rl   is_last_rankr   rn   )r/   ro   rZ   rr   rs   r4   layerr2   r2   r3   r8      s    

zGraniteMoeSharedModel.forwardweightsc                 C   s,  i }|D ]\}}| drKt|dD ]5}|dd| d}|dd| d}|| jddd\}}	||vs;J ||vsAJ |||< |	||< qq| drst|dD ]}|dd| d	}
|| }|
|vsmJ |||
< qWq| d
r|d
d}||vsJ |||< q|||< qt| | S )Nz%.block_sparse_moe.input_linear.weightr   z.block_sparse_moe.experts.z
.w1.weightz
.w3.weightr!   )dimz&.block_sparse_moe.output_linear.weightz
.w2.weightz%.block_sparse_moe.router.layer.weightz.block_sparse_moe.gate.weight)endswithrangesizereplacechunkr   _load_weightsitems)r/   rw   new_weightsnpew1_namew3_namew1_paramw3_paramw2_namew2_param	gate_namer2   r2   r3   load_weights   sJ   









z"GraniteMoeSharedModel.load_weightsr6   )r9   r:   r;   r   r<   r'   r=   r>   rq   r   r8   r   tuplesetr   r?   r2   r2   r0   r3   r^      s     
,r^   c                       s   e Zd ZdZdg diZdddZddd	ed
ef fddZde	j
de	j
fddZ		d"de	j
dB de	j
dedB de	j
dB de	j
f
ddZde	j
de	j
dB fddZdede	jde	jdefddZdeeee	j
f  dee fd d!Z  ZS )#GraniteMoeSharedForCausalLMFqkv_proj)q_projk_projv_projinput_embeddingsoutput_embeddings)rh   lm_headr   r_   r`   r    c                   s   t    |jj}|j}|| _t|t|dd| _t	|j
|j|t|dd| _|jr1| jjj| j_t|j
|j
d| jj d| _d S )Nmodel)r`   r    r   rJ   r   )scale)r&   r'   rc   rd   r   r   r^   r   r   r   rg   r(   r   tie_word_embeddingsrh   weightr   logits_scalinglogits_processor)r/   r`   r    r   r   r0   r2   r3   r'     s(   


z$GraniteMoeSharedForCausalLM.__init__ro   r5   c                 C   s   | j |S r6   )r   rq   rp   r2   r2   r3   rq   %  s   z+GraniteMoeSharedForCausalLM.embed_input_idsNrZ   rr   rs   c                 C   s   |  ||||}|S r6   )r   )r/   ro   rZ   rr   rs   r4   r2   r2   r3   r8   (  s   z#GraniteMoeSharedForCausalLM.forwardr4   c                 C   s   |  | j|}|S r6   )r   r   )r/   r4   logitsr2   r2   r3   compute_logits4  s   z*GraniteMoeSharedForCausalLM.compute_logits
batch_sizedtypedevicec                 C   s    t dtj|| jjf||diS )Nr4   )r   r   )r   r=   zerosr   r(   )r/   r   r   r   r2   r2   r3   make_empty_intermediate_tensors8  s   z;GraniteMoeSharedForCausalLM.make_empty_intermediate_tensorsrw   c                 C   s$   t | | jjr	dgnd d}||S )Nzlm_head.)skip_prefixes)r   r   r   r   )r/   rw   loaderr2   r2   r3   r   C  s
   
z(GraniteMoeSharedForCausalLM.load_weights)NN)r9   r:   r;   fall_back_to_pt_during_loadpacked_modules_mappingembedding_modulesr   r<   r'   r=   r>   rq   r   r8   r   intr   r   r   r   r   r   r   r?   r2   r2   r0   r3   r      sB    


,r   )0__doc__collections.abcr   	itertoolsr   r=   r   $transformers.models.granitemoesharedr   vllm.compilation.decoratorsr   vllm.configr   r   vllm.distributedr	   %vllm.model_executor.layers.activationr
   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   vllm.sequencer   
granitemoer   r   r   
interfacesr   r   utilsr   r   r   Moduler   r@   r^   r   r2   r2   r2   r3   <module>   s0   'Hd