o
    -i/r                     @   s  U d Z ddlZddlmZ ddlmZ ddlmZ ddlZddl	m
  mZ ddlm
Z
 ddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1 ddl2m3Z3m4Z4 ddl5m6Z6m7Z7m8Z8m9Z9m:Z: dZ;dZ<dZ=dZ>ee?Z@deAfddZBdeAfd d!ZCdeDfd"d#ZEdeFeDef dB fd$d%ZGdeHfd&d'ZIG d(d) d)e
jJZKG d*d+ d+e
jJZLG d,d- d-e
jJZMG d.d/ d/e
jJZNeG d0d1 d1e
jJZOG d2d3 d3e
jJe3e4ZPG d4d5 d5ePZQG d6d7 d7ePZReQeRd8ZSeFeDeTeP f eUd9< G d:d; d;ePZVdS )<z(Inference-only Grok (Grok1/Grok2) model.    N)Iterable)islice)Any)nn)	Attention)support_torch_compile)CacheConfig
VllmConfig)get_pp_group$get_tensor_model_parallel_world_size)init_logger)
GeluAndMul)FusedMoE)RMSNorm)MergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)IntermediateTensors   )SupportsLoRA
SupportsPP)AutoWeightsLoaderis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixg;f?g3Ey?g.!	S@      >@returnc                 C   s   t | dt | ddS )Nnum_expertsnum_local_experts   )getattrconfig r,   ]/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/grok1.py_get_num_expertsP   s   r.   c                 C   s   t | d| jS )Nmoe_intermediate_size)r)   intermediate_sizer*   r,   r,   r-   _get_moe_intermediate_sizeT   s   r1   c                 C   s&   t | dd}t| d}|s|rdS dS )z=Detect Grok version from HF config using multiple heuristics.residual_moeFr/   grok2grok1)r)   hasattr)r+   has_residual_moehas_moe_intermediate_sizer,   r,   r-   _get_grok_versionX   s
   
r8   c                 C   s   t | dd }|d u rIt | dd }|d u rd S d|i}t | dd }|d ur(||d< t | dd }|d ur6||d< dD ]}t | |d }|d urH|||< q8|ddkrXt|}d|d< |S )	Nrope_parameters	rope_type
rope_thetascaling_factorfactor) original_max_position_embeddingsextrapolation_factorattn_factor	beta_fast	beta_sloworiginaldefault)r)   getdict)r+   r9   r:   r;   r<   namevaluer,   r,   r-   _get_rope_parametersd   s*   rI   c                 C   s2   t | dt | dd }|d urt|S t | dd S )Nmoe_router_renormalizemoe_renormalizer2   F)r)   bool)r+   explicit_valuer,   r,   r-   _get_moe_renormalize   s   rN   c                       sR   e Zd Z		ddedededB deddf
 fdd	Zd
ejdejfddZ	  Z
S )Grok1MLPN hidden_sizer0   quant_configprefixr%   c                    sP   t    t||gd d|| dd| _t||d|| dd| _t | _d S )N   Fz.gate_up_proj)
input_sizeoutput_sizesbiasrR   rS   z
.down_proj)rU   output_sizerW   rR   rS   )super__init__r   gate_up_projr   	down_projr   act_fn)selfrQ   r0   rR   rS   	__class__r,   r-   rZ      s    
zGrok1MLP.__init__xc                 C   s*   |  |\}}| |}| |\}}|S N)r[   r]   r\   )r^   ra   _r,   r,   r-   forward   s   
zGrok1MLP.forward)NrP   )__name__
__module____qualname__intr   strrZ   torchTensorrd   __classcell__r,   r,   r_   r-   rO      s    rO   c                       s|   e Zd ZdZ						ddededed	ed
edejdB dedB dedB de	de
f fddZdejdejfddZ  ZS )Grok1MoEa  A tensor-parallel MoE implementation for Grok1 that shards each expert
    across all ranks.

    Each expert's weights are sharded across all ranks and a fused MoE
    kernel is used for the forward pass, and finally we reduce the outputs
    across ranks.
            NFrP   r&   top_krQ   r0   router_logit_soft_capparams_dtyperR   tp_sizerenormalizerS   c                    s\   t    || _t||d|d |
 dd| _t|||||d|	||d|
 dd| _|| _d S )NFz.gate)rW   rq   rR   rS   Tgeluz.experts)r&   ro   rQ   r0   rq   reduce_resultsrs   rR   rr   
activationrS   )rY   rZ   rQ   r   gater   expertsrp   )r^   r&   ro   rQ   r0   rp   rq   rR   rr   rs   rS   r_   r,   r-   rZ      s0   
	
zGrok1MoE.__init__hidden_statesr%   c                 C   sX   |j }|d| j}| |\}}| jdkr!| jt|| j  }| ||}||S )Nr   )shapeviewrQ   rw   rp   Ftanhrx   )r^   ry   
orig_shaperouter_logitsrc   final_hidden_statesr,   r,   r-   rd      s   

zGrok1MoE.forward)rn   NNNFrP   )re   rf   rg   __doc__rh   floatrj   dtyper   rL   ri   rZ   rk   rd   rl   r,   r,   r_   r-   rm      s<    	
)rm   c                       s   e Zd Z						ddededededeeef dB d	edB d
edB deddf fddZ	de
jde
jde
jfddZ  ZS )Grok1Attention   NrP   rQ   	num_headsnum_kv_headsmax_positionr9   cache_configrR   rS   r%   c
              
      s  t    || _|	| _t }
|| _| j|
 dksJ | j|
 | _|| _| j|
kr2| j|
 dks1J n	|
| j dks;J td| j|
 | _	|| j | _
| j| j
 | _| j	| j
 | _| j
d | _t|| j
| j| jd|| dd| _t| j| j
 |d|| dd| _t| j
||dd	| _tt|	d
dd}t|	dd }|dvrtd| t| j| j
| j| j	|||| dd| _| jrt| jdd| _d S d| _d S )Nr   r   g      Fz	.qkv_proj)rW   rR   rS   z.o_projT)r   r9   is_neox_styleattn_logit_softcappingr$   rn   attn_logit_softcapping_method)Nr~   z`Grok attention logit softcapping method '%s' is not supported; falling back to default behavior..attn)r   r   rR   logits_soft_caprS   attn_output_multiplier      ?)rY   rZ   rQ   r+   r   total_num_headsr   total_num_kv_headsmaxr   head_dimq_sizekv_sizescalingr   qkv_projr   o_projr   
rotary_embr)   loggerwarning_oncer   attnattn_multiplier)r^   rQ   r   r   r   r9   r   rR   rS   r+   rr   attn_logits_soft_capr   r_   r,   r-   rZ      sz   

	
zGrok1Attention.__init__	positionsry   c           
      C   sj   |  |\}}|j| j| j| jgdd\}}}| |||\}}| |||}| |\}	}|	| j9 }	|	S )Nrz   )dim)r   splitr   r   r   r   r   r   )
r^   r   ry   qkvrc   qkvattn_outputoutputr,   r,   r-   rd   >  s    
zGrok1Attention.forward)r   NNNrP   N)re   rf   rg   rh   rF   ri   r   r   r   rZ   rj   rk   rd   rl   r,   r,   r_   r-   r      sB    	Rr   c                
       sn   e Zd Z			ddedB dedB deddf fddZd	ejd
ejdejdB de	ejejf fddZ
  ZS )Grok1DecoderLayerNrP   r   rR   rS   r%   c           	         sd  t    |j| _d| _|d ur&t|ddd  | _| js&t|dr&|j| _t| j|j|j	|j
t|||| d|d	| _t|}t|dd	}t|}t|}t|||j|tt|d
td||| dd| _t|dd| _dtd | _t|j|jd| _t|j|jd| _t|j|jd| _t|j|jd| _d | _| jrt|j|j || dd| _d S d S )NFis_fp8_w8a8c                   S   s   dS )NFr,   r,   r,   r,   r-   <lambda>Y  s    z,Grok1DecoderLayer.__init__.<locals>.<lambda>is_fp8r   )	rQ   r   r   r   r9   r   rR   rS   r+   num_experts_per_tokrT   router_logit_softcappingrn   z
.moe_block)r&   ro   rQ   r0   rp   rR   rs   rS   r2   r   g       @epsz.mlp)rQ   r0   rR   rS   )!rY   rZ   rQ   use_fp8r)   r5   r   r   num_attention_headsmax_position_embeddingsnum_key_value_headsrI   r   r.   r1   rN   rm   r   DEFAULT_ROUTER_LOGIT_SOFTCAP	moe_blockr2   mathsqrtresidual_moe_scaler   rms_norm_epspre_attn_normpost_attn_normpre_moe_normpost_moe_normmlprO   r0   )	r^   r+   r   rR   rS   r&   r   r/   rK   r_   r,   r-   rZ   M  sl   
zGrok1DecoderLayer.__init__r   ry   residualc                 C   s   |d u r|}|  |}n|  ||\}}| j||d}| |}| ||\}}| jr@| jd us2J | || | | j }n| |}| |}||fS )N)r   ry   )	r   r   r   r   r2   r   r   r   r   )r^   r   ry   r   r,   r,   r-   rd     s$   


zGrok1DecoderLayer.forward)NNrP   )re   rf   rg   r   r   ri   rZ   rj   rk   tuplerd   rl   r,   r,   r_   r-   r   L  s,    Br   c                       s   e Zd Zdddddddeded	ed
ededeeef dB f fddZdejdejfddZ		ddejdejde
dB dejdB deje
B f
ddZdeeeeeef  fddZdeeeejf  dee fddZ  ZS )
Grok1ModelrP   linearlinear_1linear_vN)rS   ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_nameweight_name_remappingvllm_configrS   r   r   r   r   c                   s   t    |jj|j |j| _| _j| _|| _	|| _
|| _|p%i | _j| _tdt| _t| jjd| _tj fdd| dd\| _| _| _tjjd| _tdd	gj| _d S )
Nembedding_multiplier_scale)rR   c                    s   t  | dS )NrR   rS   )r   rS   r   r+   rR   r,   r-   r     s    z%Grok1Model.__init__.<locals>.<lambda>z.layersr   r   ry   r   )rY   rZ   model_config	hf_configr   rR   r+   pad_token_idpadding_idxr   r   r   r   
vocab_sizer)   "DEFAULT_EMBEDDING_MULTIPLIER_SCALEr   r   rQ   embed_tokensr"   num_hidden_layersstart_layer	end_layerlayersr   r   normr!   make_empty_intermediate_tensors)r^   r   rS   r   r   r   r   r_   r   r-   rZ     s:   




zGrok1Model.__init__	input_idsr%   c                 C   s   |  |}|| j }|S rb   )r   r   )r^   r   ry   r,   r,   r-   embed_input_ids  s   

zGrok1Model.embed_input_idsr   intermediate_tensorsinputs_embedsc           	      C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }t| j| j| jD ]
}||||\}}q*t  js@t||dS | 	||\}}|S )Nry   r   )ry   r   )
r
   is_first_rankr   r   r   r   r   is_last_rankr   r   )	r^   r   r   r   r   ry   r   layerrc   r,   r,   r-   rd     s    
zGrok1Model.forwardc                 C   s$   t | j}tj| | j| j| j|dS )N)r   r   r   r&   )r.   r+   r   make_expert_params_mappingr   r   r   )r^   r&   r,   r,   r-   get_expert_mapping	  s   
zGrok1Model.get_expert_mappingweightsc              	   C   sN  g d}t |  }t }|  }|D ]\}}| j D ]\}}	||v r+|||	}q| jd urZ| j| }
rZ||
 }t	|dt
}| dkrK|n|d }||| ||
 q|D ]G\}}}||vrfq\|||}|dsv|dr{||vr{q\t|| rq\|drt||}|d u rq\||vrq\|| }|j}||||  n{|D ]9}|\}}}}||vrq|||}t|| rq|ds|dr||vrq|| }|j}||||||d  n?|ds|dr||vrqt|| rqt||}|d u rqd|v r
|dd	}||vrq|| }t	|dt
}||| || q|S )
N))r   q_projr   )r   k_projr   )r   v_projr   )mlp.gate_up_projzmlp.gate_projr   )r   zmlp.up_projr   weight_loaderr   z.bias_biasscale)shard_id	expert_idz
norm.scaleweight)rF   named_parameterssetr   r   itemsreplacerR   get_cache_scaler)   r   r   addendswithr    r   r   )r^   r   stacked_params_mappingparams_dictloaded_paramsexpert_params_mappingrG   loaded_weightold_patternnew_pattern
scale_nameparamr   
param_nameweight_namer   mappingr   r,   r,   r-   load_weights  s   	











zGrok1Model.load_weightsrb   )re   rf   rg   r	   ri   rF   rZ   rj   rk   r   r   rd   listr   rh   r   r   r   r  rl   r,   r,   r_   r-   r     sD    3

,r   c                       s&  e Zd ZU dZdZdg diZdZeed< dZ	eed< d	Z
eed
< deeef fddZdddedef fddZdejdejfddZ		d%dejdejdedB dejdB dejeB f
ddZdejdejdB fddZd eeeejf  dee fd!d"Zdeeeeeef  fd#d$Z  ZS )&GrokBaseForCausalLMz-Base class for Grok models with shared logic.Fr   r   r   r   r   r   r   r   r   r   r%   c                 C      i S )zFReturn weight name remapping for this version. Override in subclasses.r,   r^   r,   r,   r-   get_weight_name_remapping     z-GrokBaseForCausalLM.get_weight_name_remappingrP   r   r   rS   c                   s   t    |jj}|j}|| _|| _t|t|d| j| j	| j
|  d| _t|j|j|t|dd| _| jjr>| jjj| j_t|dt| _t|j| jt|dd d| _| jj| _d S )Nmodel)r   rS   r   r   r   r   lm_headr   output_multiplier_scalefinal_logit_softcapping)r   soft_cap)rY   rZ   r   r   rR   r+   r   r#   r   r   r   r
  r  r   r   rQ   r  tie_word_embeddingsr   r   r)   DEFAULT_OUTPUT_MULTIPLIER_SCALEr  r   logits_processorr   )r^   r   rS   r+   rR   r_   r,   r-   rZ     s>   
	
zGrokBaseForCausalLM.__init__r   c                 C   s   | j |S rb   )r  r   )r^   r   r,   r,   r-   r     s   z#GrokBaseForCausalLM.embed_input_idsNr   r   r   c                 C   s   |  ||||}|S rb   )r  )r^   r   r   r   r   ry   r,   r,   r-   rd     s   zGrokBaseForCausalLM.forwardry   c                 C   s   |  | j|}|S rb   )r  r  )r^   ry   logitsr,   r,   r-   compute_logits  s   z"GrokBaseForCausalLM.compute_logitsr   c                 C   s(   | j jrdgnd }t| |d}||S )Nr  )skip_prefixes)r+   r  r   r  )r^   r   r  loaderr,   r,   r-   r    s   
z GrokBaseForCausalLM.load_weightsc                 C   s
   | j  S rb   )r  r   r	  r,   r,   r-   r     s   
z&GrokBaseForCausalLM.get_expert_mapping)NN)re   rf   rg   r   fall_back_to_pt_during_loadpacked_modules_mappingr   ri   __annotations__r   r   rF   r
  r	   rZ   rj   rk   r   r   rd   r  r   r   r   r  r  rh   r   rl   r,   r,   r_   r-   r    s>   
 	)

$&
r  c                   @   s2   e Zd ZdZdZdZdZdeeef fddZ	dS )	Grok1ForCausalLMzGrok1-specific implementation.r   r   r   r%   c                 C   r  rb   r,   r	  r,   r,   r-   r
    r  z*Grok1ForCausalLM.get_weight_name_remappingN)
re   rf   rg   r   r   r   r   rF   ri   r
  r,   r,   r,   r-   r    s    r  c                   @   sD   e Zd ZdZg dddgdZdZdZdZd	ee	e	f fd
dZ
dS )Grok2ForCausalLMzGrok2-specific implementation.r  	gate_projup_proj)r   r[   w1w2w3r%   c                 C   s
   dddS )Nz.attn.z.moe_block.)z.self_attn.z.block_sparse_moe.r,   r	  r,   r,   r-   r
    s   z*Grok2ForCausalLM.get_weight_name_remappingN)re   rf   rg   r   r  r   r   r   rF   ri   r
  r,   r,   r,   r-   r    s    r  )r4   r3   _GROK_VERSIONSc                   @   s(   e Zd ZdZdddedefddZdS )	GrokForCausalLMzAFactory class that dispatches to version-specific implementation.rP   r   r   rS   c                C   sV   |j j}t|}t|}|d u rtd| t| j| _| j|j |||dS )NzUnsupported Grok version: )r   rS   )	r   r   r8   r"  rE   
ValueErrorrF   r  update)clsr   rS   r+   versioninstance_clsr,   r,   r-   __new__  s   
zGrokForCausalLM.__new__N)re   rf   rg   r   r	   ri   r)  r,   r,   r,   r-   r#    s    r#  )Wr   r   collections.abcr   	itertoolsr   typingr   rj   torch.nn.functionalr   
functionalr}   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr   r	   vllm.distributedr
   r   vllm.loggerr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.sequencer   
interfacesr   r   utilsr   r    r!   r"   r#   DEFAULT_ATTN_OUTPUT_MULTIPLIERr  r   r   re   r   rh   r.   r1   ri   r8   rF   rI   rL   rN   ModulerO   rm   r   r   r   r  r  r  r"  typer  r#  r,   r,   r,   r-   <module>   sb   		 @af Md