o
    
۾iW                     @  s  d Z ddlmZ ddlmZ ddlmZ ddlmZ ddl	Z	ddl	m
Z
 ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0m1Z1m2Z2 G dd de
j3Z4G dd de
j3Z5G dd de
j3Z6eddddd d!G d"d# d#e
j3Z7G d$d% d%e
j3Z8dS )&zCInference-only LoopCoder model compatible with HuggingFace weights.    )annotations)Iterable)replace)AnyN)nn)PretrainedConfig)support_torch_compile)CacheConfig
VllmConfig)$get_tensor_model_parallel_world_size)	Attention)RMSNorm)ColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)LlamaMLP)IntermediateTensors)AttentionType   )AutoWeightsLoaderextract_layer_indexmake_layersmaybe_prefixc                      s>   e Zd Zddddejddfd" fddZ	d#d$d d!Z  ZS )%LoopCoderAttentioni   N r   configr   hidden_sizeint	num_headsnum_kv_headsmax_positioncache_configCacheConfig | Nonequant_configQuantizationConfig | Noneprefixstr	attn_typedual_chunk_attention_configdict[str, Any] | None	layer_idxreturnNonec                   s  t    || _|| _t }|| _| j| dksJ | j| | _|| _| j|kr2| j| dks1J n	|| j dks;J td| j| | _	|| j | _
| j| j
 | _| j	| j
 | _| j
d | _|
| _t|dd| _t|dd| _|j}t|| j
| j| jd|| d	d
| _t| j| j
 |d|| dd
| _t| j
||j|
d| _t | _|}t| jD ]X}t|}|| | }|d| d| }|dkr|}n|d urt|| jd}nt | jdd}| j!t"| j| j
| jf| j	|||	| dd|
r|dkr||
dni  qd S )Nr   r   g      loop_num   loop_window_size@   Fz	.qkv_proj)biasr*   r,   z.o_proj)r'   rope_parametersr/   zlayers.)sliding_windowauto)r:   cache_dtypez.attn)r&   r(   r*   r.   r,   )r1   r/   )#super__init__r1   r#   r   total_num_headsr%   total_num_kv_headsmaxr&   head_dimq_sizekv_sizescalingr/   getattrr4   r6   num_hidden_layersr   qkv_projr   o_projr   r9   
rotary_embr   
ModuleListattnranger   r   r	   appendr   )selfr"   r#   r%   r&   r'   r(   r*   r,   r.   r/   r1   tp_sizetotal_layersbase_cache_configloop_idxbase_layer_idxunique_layer_idxunique_prefixloop_cache_config	__class__ _/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/iquest_loopcoder.pyr>   =   s   

	

zLoopCoderAttention.__init__	positionstorch.Tensorhidden_statesrS   	gate_projLoopGateProjection | Nonec                 C  sF  |dkr8| j d }| |\}}|j| j| j| jgdd\}}	}
| |||	\}}	|||	|
}| |\}}|S | j d }| j | }| |\}}|j| j| j| jgdd\}}	}
| |||	\}}	|j\}}| j}| j	}|
|||dd}||d d }|||	|
}|d usJ d||}|| |d|   }| |\}}|S )Nr   )dimr   z+gate_proj must be provided for loop_idx > 0)rL   rH   splitrC   rD   rJ   rI   shaper%   rB   view	transpose)rO   r\   r^   rS   r_   rL   qkv_qkvattn_outputoutputglobal_attn
local_attn
num_tokensr%   rB   
q_reshapedglobal_attn_outputlocal_attn_outputgaterZ   rZ   r[   forward   s0   
 

 
zLoopCoderAttention.forward)r"   r   r#   r$   r%   r$   r&   r$   r'   r$   r(   r)   r*   r+   r,   r-   r.   r-   r/   r0   r1   r$   r2   r3   N
r\   r]   r^   r]   rS   r$   r_   r`   r2   r]   )__name__
__module____qualname__r   DECODERr>   ru   __classcell__rZ   rZ   rX   r[   r    <   s    vr    c                      s6   e Zd Z				dd fddZ	ddddZ  ZS )LoopCoderDecoderLayerNr!   r   r"   r   r(   r)   r*   r+   r,   r-   r1   r$   r2   r3   c                   s   t    |j| _t|dd }|| _t|ddrtj}ntj}t|| j|j	|j
|j||| d||| jd| _t| j|j|j|| dd| _t|j|jd| _t|j|jd| _d S )	Nr/   	is_causalTz
.self_attn)r"   r#   r%   r'   r&   r(   r*   r,   r.   r/   r1   z.mlp)r#   intermediate_size
hidden_actr*   r,   eps)r=   r>   r#   rF   r1   r   r{   ENCODER_ONLYr    num_attention_headsmax_position_embeddingsnum_key_value_heads	self_attnr   r   r   mlpr   rms_norm_epsinput_layernormpost_attention_layernorm)rO   r"   r(   r*   r,   r1   r/   r.   rX   rZ   r[   r>      sB   
zLoopCoderDecoderLayer.__init__r\   r]   r^   rS   r_   r`   c                 C  sL   |}|  |}| j||||d}|| }|}| |}| |}|| }|S )N)r\   r^   rS   r_   )r   r   r   r   )rO   r\   r^   rS   r_   residualrZ   rZ   r[   ru      s   


zLoopCoderDecoderLayer.forward)NNr!   r   )r"   r   r(   r)   r*   r+   r,   r-   r1   r$   r2   r3   rv   rw   )rx   ry   rz   r>   ru   r|   rZ   rZ   rX   r[   r}      s    1r}   c                      s2   e Zd ZdZ		dd fddZdddZ  ZS )LoopGateProjectiona}  Gate projection for mixed attention in Loop 2+.

    Computes: g = sigmoid(linear(Q)) for each head independently.
    This gate determines how much to use Loop1's KV (global) vs current
    loop's KV (local).

    Supports tensor parallelism: each GPU handles a subset of heads.
    The weight matrix has shape [num_heads, head_dim] and is split along
    the head dimension.
    Nr!   r?   r$   rB   r*   r+   r,   r-   c                   s\   t    || _|| _t }| j| dksJ | j| | _t|| jdd|| dd| _d S )Nr   TFz
.gate_proj)r8   gather_outputr*   r,   )r=   r>   r?   rB   r   r%   r   r_   )rO   r?   rB   r*   r,   rP   rX   rZ   r[   r>   #  s   
zLoopGateProjection.__init__queryr]   r2   c           
      C  s   |j \}}}|| jksJ d| j d| |d|}| |\}}|||| j}tj|ddd}|dd}|d}t|}	|	dd}	|		dd|}	|	||| }	|	S )an  Compute gate values from query tensor.

        Args:
            query: [num_heads, num_tokens, head_dim] (vLLM flattened format)
                where num_heads is the number of heads on this TP rank
                and num_tokens = batch * seq_len

        Returns:
            gate: [num_tokens, num_heads * head_dim] (flattened format matching q shape)
        z	Expected z heads, got ra   r   r5   )dim1dim2r   )
rd   r%   reshaper_   torchdiagonalrf   	unsqueezesigmoidexpand)
rO   r   r%   rp   rB   
query_flatgate_logits_flatrh   gate_logitsrt   rZ   rZ   r[   ru   :  s*   

zLoopGateProjection.forward)Nr!   )r?   r$   rB   r$   r*   r+   r,   r-   )r   r]   r2   r]   )rx   ry   rz   __doc__r>   ru   r|   rZ   rZ   rX   r[   r     s    r   ra   )	input_idsr\   intermediate_tensorsinputs_embeds)dynamic_arg_dimsc                      sJ   e Zd Zdedd fd	d
ZdddZ		d d!ddZd"ddZ  ZS )#IQuestLoopCoderModelr!   )r,   decoder_layer_typevllm_configr
   r,   r-   r   type[nn.Module]c                  s  t    |jj|j |j jd ur)tdr)jj	ks)J d
jj	| _| _j| _tjj| dd| _t| jdd| _t| jdd| _jj tj	fd	d
| dd\}}| _tj	 fdd
| dd\| _| _| _tjjd| _d S )Nmax_window_layerszSliding window for some but all layers is not supported. This model uses sliding window but `max_window_layers` = {} is less than `num_hidden_layers` = {}. Please open an issue to discuss this feature.z.embed_tokensr*   r,   r4   r5   r6   r7   c                   s   t  j| dS )N)r?   rB   r*   r,   )r   r   r,   )r"   rB   r*   rZ   r[   <lambda>  s    z/IQuestLoopCoderModel.__init__.<locals>.<lambda>z.gate_projectionsr   c                   s   t  | t| dS )N)r"   r(   r*   r,   r1   )r}   r   r   )r(   r"   r*   rZ   r[   r     s    z.layersr   )r=   r>   model_config	hf_configr(   r*   r:   hasattrr   rG   formatr"   
vocab_sizer   r#   embed_tokensrF   r4   window_sizer   r   gate_projectionsstart_layer	end_layerlayersr   r   norm)rO   r   r,   r   rh   rX   )r(   r"   rB   r*   r[   r>   q  sJ   

zIQuestLoopCoderModel.__init__r   r]   r2   c                 C  s
   |  |S rv   )r   rO   r   rZ   rZ   r[   embed_input_ids  s   
z$IQuestLoopCoderModel.embed_input_idsNtorch.Tensor | Noner\   r   IntermediateTensors | Noner   "torch.Tensor | IntermediateTensorsc                 C  s   |d ur|}n|  |}t| jD ])}t| j| j| j D ]\}}| j| }	|dkr0| j|	 nd }
|||||
}qq| |}|S )Nr   )	r   rM   r4   	enumerater   r   r   r   r   )rO   r   r\   r   r   r^   rS   r1   layeractual_layer_idxr_   rZ   rZ   r[   ru     s   



zIQuestLoopCoderModel.forwardweights"Iterable[tuple[str, torch.Tensor]]set[str]c                 C  s  g d}t | jdd}t }|D ]\}}d|v rq| jd urH| j| }rH|| }t|dt}	| dkr9|n|d }|	|| || q|D ]I\}
}}d|v rTqJ||vrYqJ|	||
}|
dri||vriqJ|
d	rxt||}|d u rxqJ|| }t|dt}	|	tkr|	|| n|	|||  n[|d
r|
dr|	dd}n|
dr|	dd}nq||v r|| }t|dt}	|	|| || qq|
dr||vrqt||}|d u rq|| }t|dt}	|	|| || q|S )N))rH   q_projri   )rH   k_projrj   )rH   v_projrk   )gate_up_projr_   r   )r   up_projr   F)remove_duplicatezrotary_emb.inv_freqweight_loaderr   r   z.biasscalezgate_projections.z.weightz.gate_proj.weightz.gate_proj.bias)dictnamed_parameterssetr*   get_cache_scalerF   r   rb   addr   endswithr   
startswith)rO   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
scale_nameparamr   
param_nameweight_nameshard_id	vllm_namerZ   rZ   r[   load_weights  sv   











z!IQuestLoopCoderModel.load_weights)r   r
   r,   r-   r   r   r   r]   r2   r]   NN
r   r   r\   r]   r   r   r   r   r2   r   r   r   r2   r   )	rx   ry   rz   r}   r>   r   ru   r   r|   rZ   rZ   rX   r[   r   h  s    
Cr   c                      sR   e Zd Zddd fddZd ddZ		d!d"ddZd#ddZd$ddZ  ZS )%IQuestLoopCoderForCausalLMr!   r   r   r
   r,   r-   c                  sv   t    |jj}|j}|| _|| _t|t|dd| _|j	r%| jj
| _nt|j|j|t|dd| _t|j| _d S )Nmodel)r   r,   lm_headr   )r=   r>   r   r   r*   r"   r   r   r   tie_word_embeddingsr   r   r   r   r#   r   logits_processor)rO   r   r,   r"   r*   rX   rZ   r[   r>      s"   

z#IQuestLoopCoderForCausalLM.__init__r   r]   r2   c                 C  s   | j |S rv   )r   r   r   rZ   rZ   r[   r   8  s   z*IQuestLoopCoderForCausalLM.embed_input_idsNr   r\   r   r   r   r   c                 C  s   |  ||||}|S rv   )r   )rO   r   r\   r   r   r^   rZ   rZ   r[   ru   ;  s   z"IQuestLoopCoderForCausalLM.forwardr^   c                 C  s   |  | j|}|S rv   )r   r   )rO   r^   logitsrZ   rZ   r[   compute_logitsG  s   z)IQuestLoopCoderForCausalLM.compute_logitsr   r   r   c                 C  s$   t | | jjr	dgnd d}||S )Nzlm_head.)skip_prefixes)r   r"   r   r   )rO   r   loaderrZ   rZ   r[   r   N  s
   
z'IQuestLoopCoderForCausalLM.load_weights)r   r
   r,   r-   r   r   r   )r^   r]   r2   r   r   )	rx   ry   rz   r>   r   ru   r   r   r|   rZ   rZ   rX   r[   r     s    

r   )9r   
__future__r   collections.abcr   dataclassesr   typingr   r   r   transformersr   vllm.compilation.decoratorsr   vllm.configr	   r
   vllm.distributedr   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r    vllm.model_executor.models.llamar   vllm.sequencer   vllm.v1.attention.backendr   utilsr   r   r   r   Moduler    r}   r   r   r   rZ   rZ   rZ   r[   <module>   sJ    EQ 0