o
    -iW                     @   s  d Z ddlZddlmZmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZmZmZ dd
lmZmZmZ ddlmZ ddlmZ ddlmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7 G dd dej8Z9G dd dej8Z:G dd dej8Z;eG dd dej8Z<G dd  d ej8e/e0e.Z=dS )!zInference-only Mixtral model.    N)CallableIterable)islice)nn)MixtralConfig)	Attention)support_torch_compile)CacheConfig
VllmConfigget_current_vllm_config)get_ep_groupget_pp_group$get_tensor_model_parallel_world_size)FusedMoE)RMSNorm)QKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)IntermediateTensors   )MixtureOfExpertsSupportsLoRA
SupportsPP)AutoWeightsLoaderPPMissingLayeris_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       s   e Zd ZdZ						ddedededed	ejdB d
edB dedB dedB dede	f fddZ
dejdejfddZ  ZS )
MixtralMoEa  A tensor-parallel MoE implementation for Mixtral that shards each expert
    across all ranks.

    Each expert's weights are sharded across all ranks and a fused MoE
    kernel is used for the forward pass, and finally we reduce the outputs
    across ranks.
    N Fnum_expertstop_khidden_sizeintermediate_sizeparams_dtypequant_configtp_sizedp_sizeprefixenable_eplbc                    s   t    || _t j| _t j| _| j | _	t
 }|j}|
| _|| _|| _|jj| _| j| j | _| j| j	 | _| j| j | _| j| j | _t||d|d |	 dd| _t|||||dd||||	 d| j| jd| _d S )NFz.gate)biasr,   r-   r0   Tz.experts)r(   r)   r*   r+   r,   reduce_resultsrenormalizer-   r.   r/   r0   r1   num_redundant_experts)super__init__r*   r   device_groupep_grouprank_in_groupep_ranksizeep_sizer   parallel_configr1   n_routed_expertsn_logical_expertseplb_configr5   n_redundant_expertsn_physical_expertsn_local_physical_expertsphysical_expert_startphysical_expert_endr   gater   experts)selfr(   r)   r*   r+   r,   r-   r.   r/   r0   r1   vllm_configr>   	__class__ _/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/mixtral.pyr7   S   sN   




	zMixtralMoE.__init__hidden_statesreturnc                 C   s8   |j }|d| j}| |\}}| ||}||S )N)shapeviewr*   rG   rH   )rI   rO   
orig_shaperouter_logits_final_hidden_statesrM   rM   rN   forward   s
   
zMixtralMoE.forward)NNNNr'   F)__name__
__module____qualname____doc__inttorchdtyper   strboolr7   TensorrX   __classcell__rM   rM   rK   rN   r&   J   s<    	
>r&   c                       sp   e Zd Z				ddededededed	edB d
edB deddf fddZde	j
de	j
de	j
fddZ  ZS )MixtralAttention   Nr'   configr*   	num_headsnum_kv_headsmax_positioncache_configr-   r0   rP   c	           
   	      sV  t    || _t }	|| _| j|	 dksJ | j|	 | _|| _| j|	kr/| j|	 dks.J n	|	| j dks8J td| j|	 | _t	|dd | _
| j
d u rT| j| j | _
| j| j
 | _| j| j
 | _| j
d | _t|| j
| j| jd|| dd| _t| j| j
 |d|| dd| _t| j
||jd	d
| _t| j| j
| j| j||| dd| _d S )Nr   r   head_dimg      Fz	.qkv_proj)r2   r-   r0   z.o_projT)ri   rope_parametersis_neox_stylez.attn)rh   rj   r-   r0   )r6   r7   r*   r   total_num_headsrg   total_num_kv_headsmaxrh   getattrrk   q_sizekv_sizescalingr   qkv_projr   o_projr   rl   
rotary_embr   attn)
rI   rf   r*   rg   rh   ri   rj   r-   r0   r.   rK   rM   rN   r7      s`   


	
zMixtralAttention.__init__	positionsrO   c           
      C   s`   |  |\}}|j| j| j| jgdd\}}}| |||\}}| |||}| |\}	}|	S )NrQ   )dim)ru   splitrr   rs   rw   rx   rv   )
rI   ry   rO   qkvrV   qkvattn_outputoutputrM   rM   rN   rX      s    zMixtralAttention.forward)re   NNr'   )rY   rZ   r[   r   r]   r	   r   r`   r7   r^   rb   rX   rc   rM   rM   rK   rN   rd      s>    	
Crd   c                       sn   e Zd Z				ddededB dedB deded	df fd
dZde	j
de	j
de	j
dB d	e	j
fddZ  ZS )MixtralDecoderLayerNr'   Frf   rj   r-   r0   r1   rP   c              
      s   t    |j| _t|| j|j|j|j||| dd| _t|j	|j
|j|j|| d|d| _t|j|jd| _t|j|jd| _d S )Nz
.self_attn)rf   r*   rg   ri   rh   rj   r-   r0   z.block_sparse_moe)r(   r)   r*   r+   r-   r0   r1   eps)r6   r7   r*   rd   num_attention_headsmax_position_embeddingsnum_key_value_heads	self_attnr&   num_local_expertsnum_experts_per_tokr+   block_sparse_moer   rms_norm_epsinput_layernormpost_attention_layernorm)rI   rf   rj   r-   r0   r1   rK   rM   rN   r7      s2   

	zMixtralDecoderLayer.__init__ry   rO   residualc                 C   sX   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| |}||fS )N)ry   rO   )r   r   r   r   )rI   ry   rO   r   rM   rM   rN   rX     s   
zMixtralDecoderLayer.forward)NNr'   F)rY   rZ   r[   r   r	   r   r`   ra   r7   r^   rb   rX   rc   rM   rM   rK   rN   r      s6    "r   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z	ddejdejde	dB dejdB deje	B f
ddZ
deeeeeef  fddZdeeeejf  dee fddZ  ZS )MixtralModelr'   r0   rJ   r0   c                   s   t    |jj|j |j|j}__j_j_	t
jj_|j_|jj_tj fdd| dd\___tjjd_tddgj_d S )Nc                    s   t  | jdS )N)r-   r0   r1   )r   r1   r   rj   rf   r-   rI   rM   rN   <lambda>A  s    z'MixtralModel.__init__.<locals>.<lambda>z.layersr   r   rO   r   )r6   r7   model_config	hf_configrj   r-   r>   rf   
vocab_sizeorg_vocab_sizer   r*   embed_tokensr1   rA   r5   r$   num_hidden_layersstart_layer	end_layerlayersr   r   normr#   make_empty_intermediate_tensors)rI   rJ   r0   r>   rK   r   rN   r7   (  s0   



zMixtralModel.__init__	input_idsrP   c                 C   s
   |  |S N)r   rI   r   rM   rM   rN   embed_input_idsP     
zMixtralModel.embed_input_idsNry   intermediate_tensorsinputs_embedsc           	      C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }t| j| j| jD ]
}||||\}}q*t  js@t||dS | 	||\}}|S )NrO   r   )rO   r   )
r   is_first_rankr   r   r   r   r   is_last_rankr   r   )	rI   r   ry   r   r   rO   r   layerrV   rM   rM   rN   rX   S  s    
zMixtralModel.forwardc                 C   s   t j| ddd| jj| jdS )Nw1w2w3)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer(   r5   )r   make_expert_params_mappingrf   r   r5   rI   rM   rM   rN   get_expert_mappingm  s   zMixtralModel.get_expert_mappingweightsc              
   C   s"  g d}t |  }t }|  }|D ]\}}| jd urE| j| }rE|| }	t|	dt}
| dkr6|n|d }|
|	| |	| q|D ]B\}}}||vrQqG|
||}|dsa|drf||vrfqGt|| rlqG|dr{t||}|d u r{qG|| }	|	j}
|
|	||  nd}|D ]I}|\}}}}||vrqd}|
||}t|| rq|ds|dr||vrq|| }	ttd	tf |	j}
|
|	||||dd
}|r|} n2q|rq|ds|dr||vrqt|| rqt||}|d u rq|| }	t|	dt}
|
|	| |	| q|S )N))ru   q_projr}   )ru   k_projr~   )ru   v_projr   weight_loaderr   z.bias_biasscaleFT.)shard_id	expert_idreturn_success)dictnamed_parameterssetr   r-   get_cache_scalerq   r   rz   addreplaceendswithr"   r   r   typingcastr   ra   )rI   r   stacked_params_mappingparams_dictloaded_paramsexpert_params_mappingnameloaded_weight
scale_nameparamr   
param_nameweight_namer   is_expert_weightmappingr   name_mappedsuccessrM   rM   rN   load_weightsy  s   









zMixtralModel.load_weightsr   )rY   rZ   r[   r
   r`   r7   r^   rb   r   r   rX   listtupler]   r   r   r   r   rc   rM   rM   rK   rN   r   &  s"    (
,r   c                       s  e Zd ZdZdg diZdddZddd	ed
ef fddZde	de	ddfddZ
dejdejfddZ		d#dejdejdedB dejdB dejeB f
ddZdejdejdB fddZdeeeejf  dee fdd Zdeeeee	ef  fd!d"Z  ZS )$MixtralForCausalLMFru   )r   r   r   input_embeddingsoutput_embeddings)r   lm_headr'   r   rJ   r0   c                   s8  t    |jj}|j}|| _|| _t|t|dd| _t	|j
|j|t|dd| _| jjr5| jjj| j_t|j
| _| jj| _g | _g | _d }| jjD ]%}t|trTqLt|ts[J t|drqt|jtrq|j}| j|jj qLt| j| _|d u rt d|j!| _"|j#| _$|j%| _&|j'| _(|j)| _*d| _+d| _,d S )	Nmodel)rJ   r0   r   )r-   r0   r   z+No MixtralMoE layer found  in model.layers.r   r   )-r6   r7   r   r   r-   rf   r   r%   r   r   r   r*   r   tie_word_embeddingsr   weightr   logits_processorr   expert_weights
moe_layersr   
isinstancer!   r   hasattrr   r&   appendrH   lennum_moe_layersRuntimeErrorr@   num_logical_expertsrC   num_physical_expertsrD   num_local_physical_expertsr?   num_routed_expertsrB   r5   num_expert_groupsnum_shared_experts)rI   rJ   r0   rf   r-   example_moer   rK   rM   rN   r7     sT   



zMixtralForCausalLM.__init__r   r   rP   Nc                 C   sr   | j |ksJ || _|| _ || j | _| jjD ]}t|dr6t|jt	r6|j}||_
||_| j|_|j  qd S )Nr   )r   r   r   r5   r   r   r   r   r   r&   rD   rC   rB   rH   update_expert_map)rI   r   r   r   moerM   rM   rN    update_physical_experts_metadata)  s   
z3MixtralForCausalLM.update_physical_experts_metadatar   c                 C   s   | j |S r   )r   r   r   rM   rM   rN   r   <  s   z"MixtralForCausalLM.embed_input_idsry   r   r   c                 C   s   |  ||||}|S r   )r   )rI   r   ry   r   r   rO   rM   rM   rN   rX   ?  s   zMixtralForCausalLM.forwardrO   c                 C   s   |  | j|}|S r   )r   r   )rI   rO   logitsrM   rM   rN   compute_logitsK  s   z!MixtralForCausalLM.compute_logitsr   c                 C   s   t | }||S r   )r    r   )rI   r   loaderrM   rM   rN   r   R  s   
zMixtralForCausalLM.load_weightsc                 C   s
   | j  S r   )r   r   r   rM   rM   rN   r   V  r   z%MixtralForCausalLM.get_expert_mapping)NN)rY   rZ   r[   fall_back_to_pt_during_loadpacked_modules_mappingembedding_modulesr
   r`   r7   r]   r   r^   rb   r   r   rX   r   r   r   r   r   r   r   rc   rM   rM   rK   rN   r     sH    
5


$&r   )>r\   r   collections.abcr   r   	itertoolsr   r^   r   transformersr   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr	   r
   r   vllm.distributedr   r   r   $vllm.model_executor.layers.fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.sequencer   
interfacesr   r   r   utilsr    r!   r"   r#   r$   r%   Moduler&   rd   r   r   r   rM   rM   rM   rN   <module>   s:    
QQ: =