"""Inference-only LoopCoder model compatible with HuggingFace weights."""

import logging
from typing import Iterable, Optional, Tuple

import torch
from torch import nn
from transformers import PretrainedConfig

from sglang.srt.distributed import get_tensor_model_parallel_world_size
from sglang.srt.layers.layernorm import RMSNorm
from sglang.srt.layers.linear import (
    ColumnParallelLinear,
    QKVParallelLinear,
    RowParallelLinear,
)
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.quantization.base_config import QuantizationConfig
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.layers.rotary_embedding import get_rope
from sglang.srt.layers.vocab_parallel_embedding import (
    ParallelLMHead,
    VocabParallelEmbedding,
)
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.models.llama import LlamaMLP as LoopCoderMLP
from sglang.srt.utils import add_prefix, make_layers

logger = logging.getLogger(__name__)

fddZ  ZS )LoopGateProjectionau  Gate projection for mixed attention in Loop 2+.

    Computes: g = sigmoid(linear(Q)) for each head independently.
    This gate determines how much to use Loop1's KV (global) vs current loop's KV (local).

    Supports tensor parallelism: each GPU handles a subset of heads.
    The weight matrix has shape [num_heads, head_dim] and is split along the head dimension.
    N total_num_headshead_dimquant_configprefixc              	      s\   t    || _|| _t }| j| dksJ | j| | _t|| jdd|td|d| _d S )Nr   TF	gate_proj)biasgather_outputr   r   )	super__init__r   r   r   	num_headsr	   r   r   )selfr   r   r   r   tp_size	__class__ V/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/iquest_loopcoder.pyr!   8   s   
zLoopGateProjection.__init__queryreturnc           
    def forward(self, query: torch.Tensor) -> torch.Tensor:
        """Compute gate values from query tensor.

        Args:
            query: [num_heads, num_tokens, head_dim]
                where num_heads is the number of heads on this TP rank
                and num_tokens = batch * seq_len

        Returns:
            gate: [num_tokens, num_heads * head_dim] (flattened format matching q shape)
        """
        num_heads, num_tokens, head_dim = query.shape
        assert num_heads == self.num_heads, f"Expected {self.num_heads} heads, got {num_heads}"

        # Project every (head, token) query to one logit per head, then keep the
        # diagonal entry, i.e. the logit that belongs to the head the query came from.
        query_flat = query.reshape(-1, head_dim)
        gate_logits_flat, _ = self.gate_proj(query_flat)
        gate_logits = gate_logits_flat.reshape(num_heads, num_tokens, self.num_heads)
        gate_logits = torch.diagonal(gate_logits, dim1=0, dim2=2)  # [num_tokens, num_heads]
        gate_logits = gate_logits.transpose(0, 1).unsqueeze(-1)    # [num_heads, num_tokens, 1]

        gate = torch.sigmoid(gate_logits)
        gate = gate.transpose(0, 1)            # [num_tokens, num_heads, 1]
        gate = gate.expand(-1, -1, head_dim)   # [num_tokens, num_heads, head_dim]
        gate = gate.reshape(num_tokens, -1)    # [num_tokens, num_heads * head_dim]
        return gate


class LoopCoderAttention(nn.Module):
    def __init__(
        self,
        config: PretrainedConfig,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
        layer_id: int = 0,
        max_position: int = 8192,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        self.layer_id = layer_id
        tp_size = get_tensor_model_parallel_world_size()
        self.total_num_heads = num_heads
        assert self.total_num_heads % tp_size == 0
        self.num_heads = self.total_num_heads // tp_size
        self.total_num_kv_heads = num_kv_heads
        if self.total_num_kv_heads >= tp_size:
            # KV heads are partitioned across tensor-parallel ranks.
            assert self.total_num_kv_heads % tp_size == 0
        else:
            # KV heads are replicated across tensor-parallel ranks.
            assert tp_size % self.total_num_kv_heads == 0
        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
        self.head_dim = hidden_size // self.total_num_heads
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim
        self.scaling = self.head_dim**-0.5

        # Loop configuration: the stack is run loop_num times; loops after the
        # first attend within a sliding window of loop_window_size tokens.
        self.loop_num = getattr(config, "loop_num", 2)
        self.loop_window_size = getattr(config, "loop_window_size", 64)

        self.qkv_proj = QKVParallelLinear(
            hidden_size,
            self.head_dim,
            self.total_num_heads,
            self.total_num_kv_heads,
            bias=False,
            quant_config=quant_config,
            prefix=add_prefix("qkv_proj", prefix),
        )
        self.o_proj = RowParallelLinear(
            self.total_num_heads * self.head_dim,
            hidden_size,
            bias=False,
            quant_config=quant_config,
            prefix=add_prefix("o_proj", prefix),
        )

        rope_theta = getattr(config, "rope_theta", 10000)
        rope_scaling = getattr(config, "rope_scaling", None)
        max_position_embeddings = getattr(
            config, "max_position_embeddings", max_position
        )
        self.rotary_emb = get_rope(
            self.head_dim,
            rotary_dim=self.head_dim,
            max_position=max_position_embeddings,
            base=rope_theta,
            rope_scaling=rope_scaling,
        )

        # One RadixAttention per loop: each loop gets its own KV-cache slot
        # (unique layer id) and its own sliding-window setting. Loop 0 is global
        # (no window); later loops are windowed.
        self.attn = nn.ModuleList()
        total_layers = config.num_hidden_layers
        for loop_idx in range(self.loop_num):
            sliding_window_size = -1 if loop_idx == 0 else self.loop_window_size
            unique_layer_id = loop_idx * total_layers + layer_id
            self.attn.append(
                RadixAttention(
                    self.num_heads,
                    self.head_dim,
                    self.scaling,
                    num_kv_heads=self.num_kv_heads,
                    layer_id=unique_layer_id,
                    sliding_window_size=sliding_window_size,
                    quant_config=quant_config,
                    prefix=add_prefix(f"attn.{loop_idx}", prefix),
                )
            )

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        forward_batch: ForwardBatch,
        loop_idx: int = 0,
        gate_proj: Optional[LoopGateProjection] = None,
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        q, k = self.rotary_emb(positions, q, k)

        if loop_idx == 0:
            # Loop 1: plain global attention; its KV is written to the cache.
            attn_output = self.attn[0](q, k, v, forward_batch)
        else:
            # Loop 2+: read Loop 1's cached KV without overwriting it (global
            # view) and attend within this loop's sliding window (local view),
            # then mix the two outputs with the per-head gate.
            global_attn_output = self.attn[0](
                q, k, v, forward_batch, save_kv_cache=False
            )
            local_attn_output = self.attn[loop_idx](q, k, v, forward_batch)
            assert gate_proj is not None, "gate_proj must be provided for loop_idx > 0"

            num_tokens = q.shape[0]
            q_reshaped = q.view(num_tokens, self.num_heads, self.head_dim).transpose(0, 1)
            gate = gate_proj(q_reshaped)
            attn_output = gate * global_attn_output + (1 - gate) * local_attn_output

        output, _ = self.o_proj(attn_output)
        return output


class LoopCoderDecoderLayer(nn.Module):
    def __init__(
        self,
        config: PretrainedConfig,
        layer_id: int = 0,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        super().__init__()
        self.hidden_size = config.hidden_size
        self.layer_id = layer_id
        self.self_attn = LoopCoderAttention(
            config=config,
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            num_kv_heads=config.num_key_value_heads,
            layer_id=layer_id,
            max_position=getattr(config, "max_position_embeddings", 8192),
            quant_config=quant_config,
            prefix=add_prefix("self_attn", prefix),
        )
        self.mlp = LoopCoderMLP(
            hidden_size=self.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
            quant_config=quant_config,
            prefix=add_prefix("mlp", prefix),
        )
        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        forward_batch: ForwardBatch,
        loop_idx: int = 0,
        gate_proj: Optional[LoopGateProjection] = None,
    ) -> torch.Tensor:
        # Self-attention block (pre-norm residual).
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states = self.self_attn(
            positions=positions,
            hidden_states=hidden_states,
            forward_batch=forward_batch,
            loop_idx=loop_idx,
            gate_proj=gate_proj,
        )
        hidden_states = residual + hidden_states

        # MLP block (pre-norm residual).
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
        return hidden_states


class IQuestLoopCoderModel(nn.Module):
    def __init__(
        self,
        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        super().__init__()
        self.config = config
        self.quant_config = quant_config
        self.vocab_size = config.vocab_size
        self.embed_tokens = VocabParallelEmbedding(
            config.vocab_size,
            config.hidden_size,
            prefix=add_prefix("embed_tokens", prefix),
        )
        self.loop_num = getattr(config, "loop_num", 2)
        self.window_size = getattr(config, "loop_window_size", 64)
        head_dim = config.hidden_size // config.num_attention_heads

        # One gate projection per loop (the loop-0 entry is never used).
        gate_result = make_layers(
            self.loop_num,
            lambda idx, prefix: LoopGateProjection(
                total_num_heads=config.num_attention_heads,
                head_dim=head_dim,
                quant_config=quant_config,
                prefix=prefix,
            ),
            prefix=add_prefix("gate_projections", prefix),
        )
        # make_layers may return just the modules or (modules, start, end).
        if isinstance(gate_result, tuple):
            self.gate_projections = gate_result[0]
        else:
            self.gate_projections = gate_result

        layer_result = make_layers(
            config.num_hidden_layers,
            lambda idx, prefix: LoopCoderDecoderLayer(
                config=config,
                layer_id=idx,
                quant_config=quant_config,
                prefix=prefix,
            ),
            prefix=add_prefix("layers", prefix),
        )
        if isinstance(layer_result, tuple):
            self.layers, self.start_layer, self.end_layer = layer_result
        else:
            self.layers = layer_result
            self.start_layer, self.end_layer = 0, config.num_hidden_layers

        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        forward_batch: ForwardBatch,
        input_embeds: torch.Tensor = None,
    ) -> torch.Tensor:
        if input_embeds is not None:
            hidden_states = input_embeds
        else:
            hidden_states = self.embed_tokens(input_ids)

        # Run the full decoder stack loop_num times. Loop 0 uses plain global
        # attention; later loops receive their gate projection so attention can
        # mix loop 0's cached KV with the current loop's windowed KV.
        for loop_idx in range(self.loop_num):
            for layer_idx in range(self.start_layer, self.end_layer):
                layer = self.layers[layer_idx]
                gate_proj = self.gate_projections[loop_idx] if loop_idx > 0 else None
                hidden_states = layer(
                    positions, hidden_states, forward_batch, loop_idx, gate_proj
                )

        hidden_states = self.norm(hidden_states)
        return hidden_states

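# Illustrative execution order for a hypothetical config with loop_num = 2 and
# num_hidden_layers = N (the numbers are examples, not defaults of any shipped
# checkpoint), assuming the per-loop KV-slot layout set up above:
#   loop 0: layers 0..N-1 run with RadixAttention KV slots 0..N-1 (global, cached)
#   loop 1: the same layers run again with KV slots N..2N-1 (local, sliding
#           window), re-reading loop 0's cached KV and gating the two outputs.
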
class IQuestLoopCoderForCausalLM(nn.Module):
    def __init__(
        self,
        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        super().__init__()
        self.config = config
        self.quant_config = quant_config
        self.model = IQuestLoopCoderModel(
            config=config,
            quant_config=quant_config,
            prefix=add_prefix("model", prefix),
        )
        if config.tie_word_embeddings:
            self.lm_head = self.model.embed_tokens
        else:
            self.lm_head = ParallelLMHead(
                config.vocab_size,
                config.hidden_size,
                quant_config=quant_config,
                prefix=add_prefix("lm_head", prefix),
            )
        self.logits_processor = LogitsProcessor(config)

    @torch.no_grad()
    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        forward_batch: ForwardBatch,
        input_embeds: torch.Tensor = None,
    ) -> torch.Tensor:
        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
        return self.logits_processor(
            input_ids, hidden_states, self.lm_head, forward_batch
        )

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]
        params_dict = dict(self.named_parameters())

        for name, loaded_weight in weights:
            if "rotary_emb.inv_freq" in name:
                continue

            # The standalone loop gates are stored as gate_projections.* in the
            # checkpoint; map them onto the nested gate_proj submodule and load
            # them directly.
            if name.startswith("gate_projections."):
                if name.endswith(".weight"):
                    sglang_name = name.replace(".weight", ".gate_proj.weight")
                elif name.endswith(".bias"):
                    sglang_name = name.replace(".bias", ".gate_proj.bias")
                else:
                    continue
                if sglang_name in params_dict:
                    param = params_dict[sglang_name]
                    weight_loader = getattr(param, "weight_loader", default_weight_loader)
                    weight_loader(param, loaded_weight)
                continue

            for param_name, weight_name, shard_id in stacked_params_mapping:
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                if name in params_dict:
                    param = params_dict[name]
                    weight_loader = param.weight_loader
                    weight_loader(param, loaded_weight, shard_id)
                break
            else:
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                if name in params_dict:
                    param = params_dict[name]
                    weight_loader = getattr(
                        param, "weight_loader", default_weight_loader
                    )
                    weight_loader(param, loaded_weight)


EntryClass = IQuestLoopCoderForCausalLM
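
# Config fields read above, for reference (any example values are placeholders,
# not defaults shipped with a particular checkpoint): loop_num (number of passes
# over the decoder stack), loop_window_size (sliding-window width for loops
# after the first), plus the usual Llama-style fields such as hidden_size,
# num_attention_heads, num_key_value_heads, num_hidden_layers, intermediate_size,
# hidden_act, rms_norm_eps, rope_theta, max_position_embeddings and
# tie_word_embeddings.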