o
    پi                     @   s   d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZmZ d dlmZ d dlmZ G dd deZG dd deZegZdS )    )OptionalN)nn)PretrainedConfig)get_pp_group)is_nsa_enable_prefill_cp)RMSNorm)RowParallelLinear)QuantizationConfig)VocabParallelEmbedding)ForwardBatchPPProxyTensors)DeepseekV2DecoderLayerDeepseekV2Model)MistralLarge3ForCausalLM)
add_prefixc                       sh   e Zd Z		ddedee defddZ		ddej	d	ej	d
e
dej	dee dej	f fddZ  ZS )MistralLarge3ModelN configquant_configprefixc                    s   t j|   | _ j| _t jdksJ t | _t | _	t
 j jtdd| _t  fddt| jjD | _d| _| jj| _t| jjd | jjdtd	dd
| _t j jd| _g | _t dd | _d S )N   embed_tokens)r   c                    s(   g | ]}t  td | |dqS )zlayers.)r   r   r   layer_id)r   r   ).0ir   r   r    [/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/mistral_large_3_eagle.py
<listcomp>+   s    z/MistralLarge3Model.__init__.<locals>.<listcomp>r      Ffc)biasr   r   input_is_parallel)epsllama_4_scaling)r   Module__init__r   
vocab_sizer   
world_sizepp_groupr   nsa_enable_prefill_cpr
   hidden_sizer   r   
ModuleListrangenum_hidden_layerslayersstart_layer	end_layerr   r    r   rms_norm_epsnormlayers_to_capturegetattrllama_4_scaling_configselfr   r   r   r   r   r   r&      s:   


zMistralLarge3Model.__init__	input_ids	positionsforward_batchinput_embedspp_proxy_tensorsreturnc                    sZ   |d u r	|  |}| tj||jjfdd\}}t |||||}t|tj	s+J |S )N)dim)
r   r    torchcat	spec_infohidden_statessuperforward
isinstanceTensor)r8   r9   r:   r;   r<   r=   _output	__class__r   r   rF   D   s   

zMistralLarge3Model.forward)Nr   )NN)__name__
__module____qualname__r   r   r	   strr&   rA   rH   r   r   rF   __classcell__r   r   rK   r   r      s2    
3r   c                       sH   e Zd ZejddddB Zdddded	ee d
ef fddZ	  Z
S )MistralLarge3ForCausalLMEaglezmodel.fc.weightzmodel.fc.input_scalezmodel.fc.weight_scale)zeagle_linear\.weightzeagle_linear\.qscale_actzeagle_linear\.qscale_weightNr   )r   r   r   r   r   c                   s"   ||_ t| _t j|||d d S )N)r   r   r   )r   r   	model_clsrE   r&   r7   rK   r   r   r&   _   s   z&MistralLarge3ForCausalLMEagle.__init__)rM   rN   rO   r   	remappingr   r   r	   rP   r&   rQ   r   r   rK   r   rR   X   s    
rR   )typingr   rA   r   transformersr   sglang.srt.distributedr   %sglang.srt.layers.attention.nsa.utilsr   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   *sglang.srt.layers.quantization.base_configr	   *sglang.srt.layers.vocab_parallel_embeddingr
   ,sglang.srt.model_executor.forward_batch_infor   r   sglang.srt.models.deepseek_v2r   r   !sglang.srt.models.mistral_large_3r   sglang.srt.utilsr   r   rR   
EntryClassr   r   r   r   <module>   s"   C
