o
    -i~R                     @   s  d Z ddlmZ ddlmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZ dd	lmZmZ dd
lmZmZmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2m3Z3 G dd dej4Z5G dd dej4Z6G dd dej4Z7eG dd dej4Z8G d d! d!ej4e-e.Z9dS )"z Inference-only GraniteMoe model.    )Iterable)islice)AnyN)nn)	Attention)support_torch_compile)CacheConfig
VllmConfig)get_pp_group$get_tensor_model_parallel_world_size tensor_model_parallel_all_gather)FusedMoE)RMSNorm)QKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)sequence_parallel_chunk)IntermediateTensors   )SupportsLoRA
SupportsPP)AutoWeightsLoaderis_pp_missing_parametermake_layersmaybe_prefixc                       sr   e Zd ZdZ					ddedededed	ejdB d
edB dedB def fddZ	dej
dej
fddZ  ZS )GraniteMoeMoEa
  A tensor-parallel MoE implementation for GraniteMoe that shards each
    expert across all ranks.
    Each expert's weights are sharded across all ranks and a fused MoE
    kernel is used for the forward pass, and finally we reduce the outputs
    across ranks.
    NF num_expertstop_khidden_sizeintermediate_sizeparams_dtypequant_configtp_sizeprefixc
           
         s^   t    || _|| _t||d|d |	 dd| _t|||||dd|||	 d| jd| _d S )NFz.gate)biasr(   r)   r+   Tz.experts)r$   r%   r&   r'   r(   reduce_resultsrenormalizer)   r*   r+   is_sequence_parallel)super__init__r&   r/   r   gater   experts)
selfr$   r%   r&   r'   r(   r)   r*   r/   r+   	__class__ b/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/granitemoe.pyr1   K   s0   
	zGraniteMoeMoE.__init__hidden_statesreturnc                 C   sj   |j }|d| j}| jrt|}| |\}}| ||}| jr0t|d}|d }|d | }||S )Nr   )shapeviewr&   r/   r   r2   r3   r   )r4   r9   
orig_shaperouter_logits_final_hidden_states
num_tokensr7   r7   r8   forwards   s   
zGraniteMoeMoE.forward)NNNFr#   )__name__
__module____qualname____doc__inttorchdtyper   strr1   TensorrC   __classcell__r7   r7   r5   r8   r"   C   s2    
(r"   c                       s   e Zd Z						ddededededeeef dB d	edB d
edB de	dB deddf fddZ
dejdejdejfddZ  ZS )GraniteMoeAttention   Nr#   r&   	num_headsnum_kv_headsmax_positionrope_parameterscache_configr)   attention_multiplierr+   r:   c
              	      sF  t    || _t }
|| _| j|
 dksJ | j|
 | _|| _| j|
kr/| j|
 dks.J n	|
| j dks8J td| j|
 | _|| j | _	| j| j	 | _
| j| j	 | _|d ur[|n| j	d | _t|| j	| j| jd||	 dd| _t| j| j	 |d||	 dd| _t| j	||dd	| _t| j| j	| j| j|||	 d
d| _d S )Nr   r   r;   Fz	.qkv_proj)r,   r)   r+   z.o_projT)rR   rS   is_neox_stylez.attn)rQ   rT   r)   r+   )r0   r1   r&   r   total_num_headsrP   total_num_kv_headsmaxrQ   head_dimq_sizekv_sizescalingr   qkv_projr   o_projr   
rotary_embr   attn)r4   r&   rP   rQ   rR   rS   rT   r)   rU   r+   r*   r5   r7   r8   r1      sb   

	
zGraniteMoeAttention.__init__	positionsr9   c           
      C   s`   |  |\}}|j| j| j| jgdd\}}}| |||\}}| |||}| |\}	}|	S )Nr;   dim)r^   splitr[   r\   r`   ra   r_   )
r4   rb   r9   qkvr@   qkvattn_outputoutputr7   r7   r8   rC      s    zGraniteMoeAttention.forward)rO   NNNNr#   )rD   rE   rF   rH   dictrK   r   r   r   floatr1   rI   rL   rC   rM   r7   r7   r5   r8   rN      sF    	
ErN   c                       sJ   e Zd Z	ddededdf fddZdejd	ejdejfd
dZ  Z	S )GraniteMoeDecoderLayerr#   vllm_configr+   r:   Nc                    s   t    |jj}|j}|j}|j}|j| _t| j|j	|j
|j|j||| d|jd	| _t|j|j|j|j||j| dd| _t|j|jd| _t|j|jd| _|j| _d S )Nz
.self_attn)	r&   rP   rR   rQ   rS   rT   r)   r+   rU   z.block_sparse_moe)r$   r%   r&   r'   r)   r/   r+   eps)r0   r1   model_config	hf_configrT   r)   parallel_configr&   rN   num_attention_headsmax_position_embeddingsnum_key_value_headsrS   rU   	self_attnr"   num_local_expertsnum_experts_per_tokr'   use_sequence_parallel_moeblock_sparse_moer   rms_norm_epsinput_layernormpost_attention_layernormresidual_multiplier)r4   ro   r+   configrT   r)   rt   r5   r7   r8   r1      s>   

zGraniteMoeDecoderLayer.__init__rb   r9   c                 C   sT   |}|  |}| j||d}||| j  }|}| |}| |}||| j  }|S )N)rb   r9   )r~   rx   r   r   r|   )r4   rb   r9   residualr7   r7   r8   rC     s   


zGraniteMoeDecoderLayer.forward)r#   )
rD   rE   rF   r	   rK   r1   rI   rL   rC   rM   r7   r7   r5   r8   rn      s     )rn   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z	ddejdejde	dB dejdB dejf
ddZ
deeeejf  dee fddZdeeeejf  dee fddZ  ZS )GraniteMoeModelr#   r+   ro   r+   c                   s   t     jj} j}|| _|| _|j| _t| j|j| _	|j
| _
t|j fdd| dd\| _| _| _t|j|jd| _d S )Nc                    s   t  | dS )Nr   )rn   r   ro   r7   r8   <lambda>0  s    z*GraniteMoeModel.__init__.<locals>.<lambda>z.layersr   rp   )r0   r1   rr   rs   r)   r   
vocab_sizer   r&   embed_tokensembedding_multiplierr    num_hidden_layersstart_layer	end_layerlayersr   r}   normr4   ro   r+   r   r)   r5   r   r8   r1     s"   

zGraniteMoeModel.__init__	input_idsr:   c                 C   s
   |  |S N)r   r4   r   r7   r7   r8   embed_input_ids6  s   
zGraniteMoeModel.embed_input_idsNrb   intermediate_tensorsinputs_embedsc                 C   s   t  jr|d ur|}n| |}|| j9 }n
|d usJ |d }t| j| j| jD ]}|||}q)t  js;t	d|iS | 
|}|S )Nr9   )r
   is_first_rankr   r   r   r   r   r   is_last_rankr   r   )r4   r   rb   r   r   r9   layerr7   r7   r8   rC   9  s    

zGraniteMoeModel.forwardweightsc              	   C   s  g d}t j| ddd| jjd}t|  }t }|D ]\}}| jdurM| j| }rM|| }	t	|	dt
}
| dkr>|n|d }|
|	| || q|D ]B\}}}||vrYqO|||}|d	si|d
rn||vrnqOt|| rtqO|drt||}|du rqO|| }	|	j}
|
|	||  nj|D ]9}|\}}}}||vrq|||}t|| rq|d	s|d
r||vrq|| }	|	j}
|
|	||||d  n.|d	s|d
r||vrqt|| rqt||}|du rq|| }	t	|	dt
}
|
|	| || q|S )z
        This function is copied from `MixtralModel.load_weights`, mainly to
        decouple from mixtral, avoiding impact on support like BNB
        quantization.
        ))r^   q_projrg   )r^   k_projrh   )r^   v_projri   w1w2w3)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer$   Nweight_loaderr   z.bias_biasscale)shard_id	expert_id)r   make_expert_params_mappingr   ry   rl   named_parameterssetr)   get_cache_scalegetattrr   rd   addreplaceendswithr   r   r   )r4   r   stacked_params_mappingexpert_params_mappingparams_dictloaded_paramsnameloaded_weight
scale_nameparamr   
param_nameweight_namer   mappingr   r7   r7   r8   _load_weightsT  s   	









zGraniteMoeModel._load_weightsc                 C   s*  i }|D ]\}}| drKt|dD ]5}|dd| d}|dd| d}|| jddd\}}	||vs;J ||vsAJ |||< |	||< qq| drst|dD ]}|dd| d	}
|| }|
|vsmJ |||
< qWq| d
r|d
d}||vsJ |||< q|||< q| | S )Nz%.block_sparse_moe.input_linear.weightr   z.block_sparse_moe.experts.z
.w1.weightz
.w3.weight   rc   z&.block_sparse_moe.output_linear.weightz
.w2.weightz%.block_sparse_moe.router.layer.weightz.block_sparse_moe.gate.weight)r   rangesizer   chunkr   items)r4   r   new_weightsnpew1_namew3_namew1_paramw3_paramw2_namew2_param	gate_namer7   r7   r8   load_weights  sJ   









zGraniteMoeModel.load_weightsr   )rD   rE   rF   r	   rK   r1   rI   rL   r   r   rC   r   tupler   r   r   rM   r7   r7   r5   r8   r     s"    
$,ir   c                       s   e Zd ZdZdg diZdddZddd	ed
ef fddZde	j
de	j
fddZ		d"de	j
de	j
dedB de	j
dB de	j
f
ddZde	j
de	j
dB fddZdede	jde	jdefddZdeeee	j
f  dee fd d!Z  ZS )#GraniteMoeForCausalLMFr^   )r   r   r   input_embeddingsoutput_embeddings)r   lm_headr#   r   ro   r+   c                   s~   t    |jj}|j}|| _t|t|dd| _t	|j
|j|t|dd| _|jr1| jjj| j_t|j
d| jj d| _d S )Nmodel)ro   r+   r   )r)   r+   r   )r   )r0   r1   rr   rs   r)   r   r   r!   r   r   r   r&   r   tie_word_embeddingsr   weightr   logits_scalinglogits_processorr   r5   r7   r8   r1     s&   


zGraniteMoeForCausalLM.__init__r   r:   c                 C   s   | j |S r   )r   r   r   r7   r7   r8   r     s   z%GraniteMoeForCausalLM.embed_input_idsNrb   r   r   c                 C   s   |  ||||}|S r   )r   )r4   r   rb   r   r   r9   r7   r7   r8   rC     s   zGraniteMoeForCausalLM.forwardr9   c                 C   s   |  | j|}|S r   )r   r   )r4   r9   logitsr7   r7   r8   compute_logits  s   z$GraniteMoeForCausalLM.compute_logits
batch_sizerJ   devicec                 C   s    t dtj|| jjf||diS )Nr9   )rJ   r   )r   rI   zerosr   r&   )r4   r   rJ   r   r7   r7   r8   make_empty_intermediate_tensors!  s   z5GraniteMoeForCausalLM.make_empty_intermediate_tensorsr   c                 C   s$   t | | jjr	dgnd d}||S )Nzlm_head.)skip_prefixes)r   r   r   r   )r4   r   loaderr7   r7   r8   r   ,  s
   
z"GraniteMoeForCausalLM.load_weights)NN)rD   rE   rF   fall_back_to_pt_during_loadpacked_modules_mappingembedding_modulesr	   rK   r1   rI   rL   r   r   rC   r   rH   rJ   r   r   r   r   r   r   rM   r7   r7   r5   r8   r     sB    


,r   ):rG   collections.abcr   	itertoolsr   typingr   rI   r   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr   r	   vllm.distributedr
   r   r   $vllm.model_executor.layers.fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r    vllm.model_executor.models.utilsr   vllm.sequencer   
interfacesr   r   utilsr   r   r    r!   Moduler"   rN   rn   r   r   r7   r7   r7   r8   <module>   s:   FS? I