o
    
۾i1                     @   sl  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+m,Z,m-Z- G dd dej.Z/G dd dej.Z0G dd dej.Z1e
G dd dej.Z2G dd dej.e'Z3dS ) zBInference-only GPT-NeoX model compatible with HuggingFace weights.    )Iterable)isliceN)nn)GPTNeoXConfig)support_torch_compile)CacheConfig
VllmConfig)get_pp_group$get_tensor_model_parallel_world_size)
get_act_fn)	Attention)ColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)IntermediateTensors   )
SupportsPP)AutoWeightsLoaderis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                	       Z   e Zd Z			ddededB dedB def fddZd	ej	d
ej	dej	fddZ
  ZS )GPTNeoXAttentionN configcache_configquant_configprefixc                    s   t    |j| _|j| _| j| j | _t|dd| _t }| j| dks'J | j| | _	t
|j| j| j| j|| dd| _t|j|j| j|| dd| _t|dd}t| j||jd	| _| jd
 }t| j	| j|||| dd| _d S )Nattention_biasTr   z.query_key_value)biasr#   r$   z.densemax_position_embeddingsi    )max_positionrope_parametersg      z.attn)r"   r#   r$   )super__init__num_attention_headstotal_num_headshidden_size	head_sizegetattrr&   r
   	num_headsr   query_key_valuer   denser   r)   
rotary_embr   attn)selfr!   r"   r#   r$    tensor_model_parallel_world_sizer'   scaling	__class__ W/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/gpt_neox.pyr+   <   sL   

zGPTNeoXAttention.__init__position_idshidden_statesreturnc           
      C   sT   |  |\}}|jddd\}}}| |||\}}| |||}| |\}	}|	S )N   )chunksdim)r2   chunkr4   r5   r3   )
r6   r=   r>   qkv_qkvattn_outputoutputr;   r;   r<   forwardl   s   zGPTNeoXAttention.forwardNNr    __name__
__module____qualname__r   r   r   strr+   torchTensorrL   __classcell__r;   r;   r9   r<   r   ;   s(    0r   c                       s<   e Zd Z		d
dededB def fddZdd	 Z  ZS )
GPTNeoXMLPNr    r!   r#   r$   c                    sR   t    t|j|j|| dd| _t|j|j|| dd| _t|j	| _
d S )Nz.dense_h_to_4hr#   r$   z.dense_4h_to_h)r*   r+   r   r.   intermediate_sizedense_h_to_4hr   dense_4h_to_hr   
hidden_actact)r6   r!   r#   r$   r9   r;   r<   r+   z   s   
zGPTNeoXMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S N)rY   r\   rZ   )r6   r>   rF   r;   r;   r<   rL      s   
zGPTNeoXMLP.forward)Nr    )	rO   rP   rQ   r   r   rR   r+   rL   rU   r;   r;   r9   r<   rV   y   s    rV   c                	       r   )GPTNeoXLayerNr    r!   r"   r#   r$   c                    sl   t    |j| _tj|j|jd| _tj|j|jd| _t	|||| dd| _
t||| dd| _d S )Nepsz
.attentionr$   z.mlp)r*   r+   use_parallel_residualr   	LayerNormr.   layer_norm_epsinput_layernormpost_attention_layernormr   	attentionrV   mlp)r6   r!   r"   r#   r$   r9   r;   r<   r+      s   
zGPTNeoXLayer.__init__r=   r>   r?   c                 C   sj   |  |}| j||d}| jr!| |}| |}|| | }|S || }| |}| |}|| }|S )N)r=   r>   )re   rg   rb   rf   rh   )r6   r=   r>   
attn_inputrJ   	mlp_input
mlp_outputr;   r;   r<   rL      s   


	

zGPTNeoXLayer.forwardrM   rN   r;   r;   r9   r<   r^      s(    r^   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z	ddejdB dejde	dB dejdB deje	B f
ddZ
deeeejf  dee fddZ  ZS )GPTNeoXModelr    ra   vllm_configr$   c                   s   t    |jj|j |j| _tjj	| _
tj fdd| dd\| _| _| _tjj	jd| _tdgj	| _d S )Nc                    s   t  | dS )Nra   )r^   ra   r"   r!   r#   r;   r<   <lambda>   s    z'GPTNeoXModel.__init__.<locals>.<lambda>z.layersra   r_   r>   )r*   r+   model_config	hf_configr"   r#   r!   r   
vocab_sizer.   embed_inr   num_hidden_layersstart_layer	end_layerlayersr   rc   rd   final_layer_normr   make_empty_intermediate_tensors)r6   rm   r$   r9   rn   r<   r+      s(   

zGPTNeoXModel.__init__	input_idsr?   c                 C   s
   |  |S r]   )rs   r6   rz   r;   r;   r<   embed_input_ids   s   
zGPTNeoXModel.embed_input_idsNr=   intermediate_tensorsinputs_embedsc                 C   sn   t  jr|d ur|}n
| |}n|d }t| j| j| jD ]}|||}qt  js0td|iS | 	|}|S )Nr>   )
r	   is_first_rankr|   r   rw   ru   rv   is_last_rankr   rx   )r6   rz   r=   r}   r~   r>   layerr;   r;   r<   rL      s   
zGPTNeoXModel.forwardweightsc                 C   s   t |  }t }|D ]l\}}d|v sd|v sd|v rqd|v s$d|v r%qt|| r+q|| }d|v rgt|dd }| jj}|d urg|j}	||	d | |dd	f |	|d
 d   }|	||d
 }|
|	}t|dt}
|
|| || q|S )Nzattention.biaszattention.masked_biaszrotary_emb.inv_freqzrotary_emb.cos_cachedzrotary_emb.sin_cachedr2   
output_dimr@   rA   r   weight_loader)dictnamed_parameterssetr   r0   r!   r,   shapeview	transposereshaper   add)r6   r   params_dictloaded_paramsnameloaded_weightparamr   r1   loaded_weight_shaper   r;   r;   r<   load_weights   s<   



zGPTNeoXModel.load_weightsr]   )rO   rP   rQ   r   rR   r+   rS   rT   r|   r   rL   r   tupler   r   rU   r;   r;   r9   r<   rl      s     
,rl   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z		ddejdB dejde	dB dejdB deje	B f
ddZ
dejdejdB fddZdeeeejf  dee fddZ  ZS )GPTNeoXForCausalLMr    ra   rm   r$   c                   s   t    |jj}|j}|| _|| _t|t|dd| _t	|j
|j|t|dd| _| jjr5| jjj| j_t|j
| _| jj| _d S )Ngpt_neox)rm   r$   	embed_outrW   )r*   r+   rp   rq   r#   r!   rl   r   r   r   rr   r.   r   tie_word_embeddingsrs   weightr   logits_processorry   )r6   rm   r$   r!   r#   r9   r;   r<   r+   &  s&   

zGPTNeoXForCausalLM.__init__rz   r?   c                 C   s   | j |S r]   )r   r|   r{   r;   r;   r<   r|   <  s   z"GPTNeoXForCausalLM.embed_input_idsN	positionsr}   r~   c                 C   s   |  ||||}|S r]   )r   )r6   rz   r   r}   r~   r>   r;   r;   r<   rL   ?  s   zGPTNeoXForCausalLM.forwardr>   c                 C   s   |  | j|}|S r]   )r   r   )r6   r>   logitsr;   r;   r<   compute_logitsK  s   z!GPTNeoXForCausalLM.compute_logitsr   c                 C   s   t | }||S r]   )r   r   )r6   r   loaderr;   r;   r<   r   R  s   
zGPTNeoXForCausalLM.load_weights)NN)rO   rP   rQ   r   rR   r+   rS   rT   r|   r   rL   r   r   r   r   r   rU   r;   r;   r9   r<   r   %  s,    

,r   )4__doc__collections.abcr   	itertoolsr   rS   r   transformersr   vllm.compilation.decoratorsr   vllm.configr   r   vllm.distributedr	   r
   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   vllm.sequencer   
interfacesr   utilsr   r   r   r   r   Moduler   rV   r^   rl   r   r;   r;   r;   r<   <module>   s4   	>1]