o
    پi                     @   s@   d dl mZ d dlZd dlZd dlmZ G dd deZeZdS )    )IterableN)DeepseekV3ForCausalLMc                       s   e Zd Zi ddddddddd	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-
Zd.eeeejf  d/e	e f fd0d1Z
d.eeeejf  d/eeeejf  fd2d3Z  ZS )4MistralLarge3ForCausalLMz%layers\.(\d+)\.attention_norm\.weightz&model.layers.\1.input_layernorm.weightz#layers\.(\d+)\.attention\.wq\.(\w+)z#model.layers.\1.self_attn.q_proj.\2z%layers\.(\d+)\.attention\.wq_a\.(\w+)z%model.layers.\1.self_attn.q_a_proj.\2z*layers\.(\d+)\.attention\.q_a_norm\.weightz.model.layers.\1.self_attn.q_a_layernorm.weightz%layers\.(\d+)\.attention\.wq_b\.(\w+)z%model.layers.\1.self_attn.q_b_proj.\2z/layers\.(\d+)\.attention\.wkv_a_with_mqa\.(\w+)z/model.layers.\1.self_attn.kv_a_proj_with_mqa.\2z+layers\.(\d+)\.attention\.kv_a_norm\.weightz/model.layers.\1.self_attn.kv_a_layernorm.weightz&layers\.(\d+)\.attention\.wkv_b\.(\w+)z&model.layers.\1.self_attn.kv_b_proj.\2z#layers\.(\d+)\.attention\.wo\.(\w+)z#model.layers.\1.self_attn.o_proj.\2z6layers\.(\d+)\.attention\.k_fake_quantizer\.qscale_actz3model.layers.\1.self_attn.mla_attn.mla_attn.k_scalez6layers\.(\d+)\.attention\.q_fake_quantizer\.qscale_actz3model.layers.\1.self_attn.mla_attn.mla_attn.q_scalez6layers\.(\d+)\.attention\.v_fake_quantizer\.qscale_actz3model.layers.\1.self_attn.mla_attn.mla_attn.v_scalezlayers\.(\d+)\.ffn_norm\.weightz/model.layers.\1.post_attention_layernorm.weightz&layers\.(\d+)\.feed_forward\.w1\.(\w+)z model.layers.\1.mlp.gate_proj.\2z&layers\.(\d+)\.feed_forward\.w2\.(\w+)z model.layers.\1.mlp.down_proj.\2z&layers\.(\d+)\.feed_forward\.w3\.(\w+)zmodel.layers.\1.mlp.up_proj.\2zlayers\.(\d+)\.gate\.weightzmodel.layers.\1.mlp.gate.weightz/model.layers.\1.mlp.shared_experts.gate_proj.\2z/model.layers.\1.mlp.shared_experts.down_proj.\2z-model.layers.\1.mlp.shared_experts.up_proj.\2z+model.layers.\1.mlp.experts.\2.gate_proj.\3z+model.layers.\1.mlp.experts.\2.down_proj.\3z)model.layers.\1.mlp.experts.\2.up_proj.\3z0model.layers.\1.mlp.gate.e_score_correction_biaszmodel.norm.weightzmodel.embed_tokens.weightzlm_head.weight)
z(layers\.(\d+)\.shared_experts\.w1\.(\w+)z(layers\.(\d+)\.shared_experts\.w2\.(\w+)z(layers\.(\d+)\.shared_experts\.w3\.(\w+)z(layers\.(\d+)\.experts\.(\d+)\.w1\.(\w+)z(layers\.(\d+)\.experts\.(\d+)\.w2\.(\w+)z(layers\.(\d+)\.experts\.(\d+)\.w3\.(\w+)zlayers\.(\d+)\.router_biasesznorm\.weightztok_embeddings\.weightzoutput\.weightweightsreturnc                    s   t  | |S )N)superload_weights_iterable_remap_mistral_to_ds)selfr   	__class__ U/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/mistral_large_3.pyr   -   s   z%MistralLarge3ForCausalLM.load_weightsc                 c   s    |D ]K\}}| j  D ]\}}t||}|r!t|||} nqddl}|d| d q|dr=tdd|}n|drItd	d
|}||fV  qdS )z2Remap Mistral parameters to DeepseekV2 parameters.r   NzUnrecognized weight: z. Skipping.z.qscale_actz\.qscale_act$z.input_scalez.qscale_weightz\.qscale_weight$z.weight_scale)	remappingitemsre	fullmatchsubloggingwarningendswith)r
   r   nameloaded_weightkvmatchr   r   r   r   r	   0   s"   

z6MistralLarge3ForCausalLM._iterable_remap_mistral_to_ds)__name__
__module____qualname__r   r   tuplestrtorchTensorsetr   r	   __classcell__r   r   r   r   r      sj    	( r   )	collections.abcr   regexr   r!   sglang.srt.models.deepseek_v2r   r   
EntryClassr   r   r   r   <module>   s   C