o
    
۾ip                     @   s  d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZmZ d dlmZmZmZmZmZmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7 ddl8m9Z9m:Z:m;Z; ddl<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZC G dd dejDZEG dd dejjDZFG dd  d ejjDZGe
G d!d" d"ejDZHG d#d$ d$ejDe;e9e:ZIdS )%    )IterableN)nn)GptOssConfig)support_torch_compile)CacheConfig
VllmConfig)get_dp_groupget_ep_groupget_pcp_groupget_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size tensor_model_parallel_all_gather)	Attention)FusedMoE)FusedMoEParallelConfig)RMSNorm)QKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)rocm_unquantized_gemm)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)sequence_parallel_chunk)current_platform)IntermediateTensors)cdiv)AttentionType   )SupportsEagle3SupportsLoRA
SupportsPP)AutoWeightsLoaderWeightsMapperextract_layer_indexis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                	       sZ   e Zd Z			ddededB dedB def fddZd	ej	d
ej	dej	fddZ
  ZS )OAIAttentionN configquant_configcache_configprefixc                    s  t    t|| _|j| _|j| _|j| _|j| _t| j|j	t
j|jd d|jd |jd |jd |jd |jddd	dd
| _t }t
jt
j|j| dd| _| j| j | | _| j| j | | _| jd | _t| j| j| j| j|| dd| _t| j| j | j|| dd| _|j| | _|j| | _| jd dkr|jnd }t| j| j| j| j|||tj | d| jd
| _!d S )N
rope_thetayarnfactor original_max_position_embeddings	beta_fast	beta_slowtruncateT)r2   	rope_typer4   r5   r6   r7   r8   )max_positiondtyperope_parametersis_neox_styleF)requires_gradg      	.qkv_proj)hidden_size	head_sizetotal_num_headstotal_num_kv_headsr/   r1   z.o_proj)
input_sizeoutput_sizer/   r1      r   .attn)num_kv_headsr0   r/   per_layer_sliding_window	attn_typer1   sinks)"super__init__r'   	layer_idxhead_dimnum_attention_headsnum_key_value_headsr@   r   max_position_embeddingstorchfloat32r<   get
rotary_embr   r   	ParameteremptyrK   q_sizekv_sizescalingr   qkv_projr   o_projnum_local_attention_headsnum_local_key_value_headssliding_windowr   r    DECODERattn)selfr.   r/   r0   r1   tp_sizer`   	__class__ V/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/gpt_oss.pyrM   6   st   

	
zOAIAttention.__init__hidden_states	positionsreturnc           
      C   sh   |  |\}}|j| j| j| jgdd\}}}| |||\}}| }| |||}| |\}	}|	S )N)dim)r\   splitrY   rZ   rV   
contiguousrb   r]   )
rc   ri   rj   qkv_qkvattn_outputoutputrg   rg   rh   forward   s    zOAIAttention.forward)NNr-   )__name__
__module____qualname__r   r   r   strrM   rS   Tensorrw   __classcell__rg   rg   re   rh   r,   5   s(    Lr,   c                       sD   e Zd Z	ddededef fddZdejdejfd	d
Z	  Z
S )MLPBlockr-   vllm_configrN   r1   c                    s   t    |jj}|j}|j}|j| _|| _|j	| _
|j| _|j| _t r*t nd| _tj|j|j	| _|j| j dksAJ t|j	|j|j|jdd|| dddd| jd| _d S )Nr!   r   Tz.expertsF	swigluoai)num_expertstop_kr@   intermediate_sizereduce_resultsrenormalizer/   r1   apply_router_weight_on_inputhas_bias
activationis_sequence_parallel)rL   rM   model_config	hf_configr/   parallel_configuse_sequence_parallel_moer   rN   num_local_expertsr   r@   num_experts_per_tokexperts_per_tokendistis_initializedget_world_size
world_sizerS   r   Linearrouterr   r   experts)rc   r   rN   r1   r.   r/   r   re   rg   rh   rM      s4   
zMLPBlock.__init__xrk   c                 C   s   |j d }| jrt|}t r%t| |d d d | jf | jj| jj	}n| |}| j
||dd d d | jf }| jrJt| d}|d | }|S )Nr   )ri   router_logits)shaper   r   r   is_rocmr   r@   r   weightbiasr   r   ro   )rc   r   
num_tokensgrg   rg   rh   rw      s   
"
 zMLPBlock.forwardr-   )rx   ry   rz   r   intr{   rM   rS   r|   rw   r}   rg   rg   re   rh   r~      s    $r~   c                       sT   e Zd Z	ddededef fddZdejdejd	ejd
B dejfddZ	  Z
S )TransformerBlockr-   r   r/   r1   c                    sv   t    |jj}|j}t|| _t|| d||d| _t	|| j| dd| _
t|jdd| _t|jdd| _d S )NrG   )r1   r/   r0   z.mlpr1   h㈵>eps)rL   rM   r   r   r0   r'   rN   r,   rb   r~   mlpr   r@   input_layernormpost_attention_layernorm)rc   r   r/   r1   r.   r0   re   rg   rh   rM      s   

zTransformerBlock.__init__ri   rj   residualNrk   c                 C   sV   |d u r|}|  |}n|  ||\}}| ||}| ||\}}| |}||fS N)r   rb   r   r   )rc   ri   rj   r   rv   rg   rg   rh   rw      s   
zTransformerBlock.forwardr   )rx   ry   rz   r   r   r{   rM   rS   r|   rw   r}   rg   rg   re   rh   r      s$    r   c                       s&  e Zd Zdddedef fddZdejdejfd	d
Z		ddejdB dejde	dB dejdB dejf
ddZ
dededededeeeejf  deeedf  dee fddZdededededeeeejf  deeedf  dee fddZdeeeejf  dee fddZ  ZS )GptOssModelr-   r   r   r1   c                   s   t    jj _j _j _ jj j_t jj	 jj _
t jj fdd| dd\ _ _ _t jjdd _tddg jj _ttd	f   _d S )
Nc                    s   t |  jdS )N)r1   r/   )r   r/   r   rc   r   rg   rh   <lambda>  s
    z&GptOssModel.__init__.<locals>.<lambda>z.layersr   r   r   ri   r   .)rL   rM   r   r   r.   r/   r   r@   r   
vocab_size	embeddingr*   num_hidden_layersstart_layer	end_layerlayersr   normr)   make_empty_intermediate_tensorstupler   aux_hidden_state_layersrc   r   r1   re   r   rh   rM      s&   

	zGptOssModel.__init__	input_idsrk   c                 C   s
   |  |S r   )r   rc   r   rg   rg   rh   embed_input_ids  s   
zGptOssModel.embed_input_idsNrj   intermediate_tensorsinputs_embedsc                 C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }g }t| j| jD ]!}| j| }	|| jv rC||d u r>|n||  |	|||\}}q*t  j	sWt
||dS | ||\}}
t|dkri||fS |S )Nri   r   )ri   r   r   )r   is_first_rankr   ranger   r   r   r   appendis_last_rankr   r   len)rc   r   rj   r   r   r   r   aux_hidden_statesilayerrq   rg   rg   rh   rw     s(   


zGptOssModel.forwardep_rank_endep_rank_startheads_per_rank
head_startweightsstacked_params_mapping.c                 C   s.  t |  }t }d}	| jj}
| jj}tjt	 t
 jt
 jt jt jd\}}| jj}||	 }t||}||	 }|| }t|d | |}|D ]\}}t|| rSqHd|v r|
rb|||df }n|d d d| d| df }|| }t|dt}||||d d d || qHd	|v r|
r|||df }n|d||	 ||	 f }|| }t|dt}||||d d d || qHd
|v r||d| d }|
r|||df }n|d d d| d| df }|| }t|dt}||||d d d || qHd|v rB||d|d  }|
r|||df }n|d|d |d f }|| }t|dt}||||d d d || qHd|v rz|
rS|||df }n|d d d| d| f }|| }t|dt}||||d d d || qHd|v r|| }t|dt}|
r|||df }n	|dkr|  ||||d d d || qHd|v r|| }|d||}|j| || qH|D ].\}}}||vrאq|||}|| }t|dt}|tkr||| n||||  n||vr qH|| }t|dt}||| || qH|S )N    rd   dp_sizedp_rankpcp_sizepcp_rankr!   .w13_weight_scale.rF   weight_loader)weight_nameshard_id	expert_id.w2_weight_scale.w13_weightrl   
.w2_weight	.w13_bias.w2_biasr   rK   )dictnamed_parameterssetr   enable_expert_parallelr.   r   r   flatten_tp_across_dp_and_pcpr   r   r   rank_in_groupr
   r   r   minr(   getattrr   addviewro   zero_narrowdatacopy_replace)rc   r   r   r   r   r   r   params_dictloaded_paramsmxfp4_blockuse_epr   rd   tp_rankr   intermediate_size_block per_rank_intermediate_size_blockper_rank_intermediate_sizetp_rank_starttp_rank_endnamer   narrow_weightparamr   
param_namer   r   rg   rg   rh   _load_weights_mxfp45  s  	






















zGptOssModel._load_weights_mxfp4c                 C   s  t |  }t }| jj}	tjt t j	t j
t j	t j
d\}
}| jj}t||
}|| }t|d | |}|D ]0\}}t|| rEq:d|v r}|	rT|||df }n|d d d d d| d| f }|ddd }|| }|| || q:d|v r|	r|||df }n|d d ||d d f }|ddd }|| }|| || q:d|v r|	r|||df }n|d d d| d| f }|| }|| || q:d	|v r|	r|||df }n|dkr|  || }|| || q:d
|v r || }|d||}|j| || q:|D ].\}}}||vr.q"|||}|| }t|dt}|tkrI||| n||||  n||vrWq:|| }t|dt}||| || q:|S )Nr   r!   r   .rF   r   r   r   r   rK   r   )r   r   r   r   r   r   r   r   r   r   r   r
   r.   r   r   r   r(   permutero   r   r   r   r   r   r   r   r   )rc   r   r   r   r   r   r   r   r   r   rd   r   r   r   r   r   r   r   r   r   r   r   r   r   rg   rg   rh   _load_weights_other  s   	


"














zGptOssModel._load_weights_otherc                 C   s   g d}t  }t }| jj| }|| }t j}t j}| jj}	|	| }
||
 }|d |
 }t| jdr:| jj	d nd }|dkrJ| 
||||||S | ||||||S )N))r?   z.q_projrr   )r?   z.k_projrs   )r?   z.v_projrt   r!   quantization_configquant_methodmxfp4)r   r   r.   rP   r	   r   rankr   hasattrr   r   r   )rc   r   r   r   rd   r   r   ep_sizeep_rankr   experts_per_rankr   r   r   rg   rg   rh   load_weightsc  s@   
	zGptOssModel.load_weightsNN)rx   ry   rz   r   r{   rM   rS   r|   r   r   rw   r   r   r   listr   r   r   r  r}   rg   rg   re   rh   r      sj    
!
 >
,qr   c                       s>  e Zd ZU dZeed< dg diZeddiddd	d
ddd	ddd	dZ	d-de	de
f fddZdeedf ddfddZdeedf fddZdejdejfddZ		d.dejdB d ejd!edB d"ejdB dejf
d#d$Zd%ejdejfd&d'Zdeee
e
ee
f  fd(d)Zd*eee
ejf  dee
 fd+d,Z  ZS )/GptOssForCausalLMTis_3d_moe_weightr\   )q_projk_projv_projz.self_attn.z.attn.z.embedding.weightr   r   r   r   r   r   )	z.embed_tokens.weightz.gate_up_proj_blocksz.down_proj_blocksz.gate_up_proj_scalesz.down_proj_scalesz.gate_up_projz
.down_projz.gate_up_proj_biasz.down_proj_bias)orig_to_new_substrorig_to_new_suffixr-   r   r1   c                    sh   t    || _|jj| _t|t|dd| _t	| jj
| jjt|dd| _t| jj
| _| jj| _d S )Nmodel)r   r1   lm_headr   )rL   rM   r   r   r   r.   r   r+   r  r   r   r@   r  r   logits_processorr   r   re   rg   rh   rM     s   

zGptOssForCausalLM.__init__r   .rk   Nc                 C   s   || j _d S r   )r  r   )rc   r   rg   rg   rh   set_aux_hidden_state_layers     z-GptOssForCausalLM.set_aux_hidden_state_layersc                 C   s   t | jj}d|d |d fS )NrF      )r   r  r   )rc   
num_layersrg   rg   rh   "get_eagle3_aux_hidden_state_layers  s   z4GptOssForCausalLM.get_eagle3_aux_hidden_state_layersr   c                 C   s   | j |S r   )r  r   r   rg   rg   rh   r     r  z!GptOssForCausalLM.embed_input_idsrj   r   r   c                 C   s   |  ||||S r   )r  )rc   r   rj   r   r   rg   rg   rh   rw     s   zGptOssForCausalLM.forwardri   c                 C   s   |  | j|}|S r   )r  r  )rc   ri   logitsrg   rg   rh   compute_logits  s   z GptOssForCausalLM.compute_logitsc                 C   s   t j| ddd| jjddS )N	gate_proj	down_projup_projr   )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer   num_redundant_experts)r   make_expert_params_mappingr.   r   )rc   rg   rg   rh   get_expert_mapping  s   z$GptOssForCausalLM.get_expert_mappingr   c                 C   s*   t | | jjr	dgnd d}|j|| jdS )Nzlm_head.)skip_prefixes)mapper)r%   r.   tie_word_embeddingsr  hf_to_vllm_mapper)rc   r   loaderrg   rg   rh   r    s
   zGptOssForCausalLM.load_weightsr   r  )rx   ry   rz   r	  bool__annotations__packed_modules_mappingr&   r%  r   r{   rM   r   r   r  r  rS   r|   r   r   rw   r  r  r!  r   r   r  r}   rg   rg   re   rh   r    sT   
 
	,r  )Jcollections.abcr   rS   torch.distributeddistributedr   r   transformersr   vllm.compilation.decoratorsr   vllm.configr   r   vllm.distributedr   r	   r
   r   r   r   r   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.fused_moer   +vllm.model_executor.layers.fused_moe.configr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr    vllm.model_executor.layers.utilsr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr    vllm.model_executor.models.utilsr   vllm.platformsr   vllm.sequencer   vllm.utils.math_utilsr   vllm.v1.attention.backendr    
interfacesr"   r#   r$   utilsr%   r&   r'   r(   r)   r*   r+   Moduler,   r~   r   r   r  rg   rg   rg   rh   <module>   sF   $	$Y8+   #