o
    i<                     @   s  d dl Z d dlmZ d dl mZ d dlmZmZmZ d dlm	Z	m
Z
mZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ ddlmZmZmZmZ ddl m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m*Z*m+Z+ ddl,m-Z- ee.Z/de j0de j0de j0de j0de j0de j0de1ddfddZ2de j0de j0de j0de j0de j0de j0de1ddfddZ3ede2dge3d  G d!d" d"ej4e%Z5dS )#    N)	rearrange)nn)CacheConfigModelConfigget_current_vllm_config)divideget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)ForwardContextget_forward_context)init_logger)sharded_weight_loader)set_weight_attrs)direct_register_custom_op)AttentionMetadata)GDNAttentionMetadata   )FusedRMSNormGated	chunk_kdafused_kda_gatefused_recurrent_kda)ColumnParallelLinearReplicatedLinearRowParallelLinear)	MambaBase)MambaStateDtypeCalculatorMambaStateShapeCalculator)causal_conv1d_fncausal_conv1d_update)QuantizationConfigq_proj_statesk_proj_statesv_proj_statesg1betacore_attn_out
layer_namereturnc           	      C   s*   t  }|j| }|j| |||||d d S )N)r    r!   r"   r#   r$   r%   )r   no_compile_layers_forward)	r    r!   r"   r#   r$   r%   r&   forward_contextself r,   T/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/layers/kda.pykda_attention)   s   	

r.   c                 C   s   d S )Nr,   )r    r!   r"   r#   r$   r%   r&   r,   r,   r-   kda_attention_fake>   s   	r/   )op_nameop_funcmutates_args	fake_implc                       s  e Zd ZedefddZdeejejejejf fddZ	deee
df ee
df ee
df ee
df f fddZ							
	d"de
de
ded	B ded	B ded	B dededd	f fddZdejdejdejdd	fddZdejdejdejdejdejdejdd	fd d!Z  ZS )#KimiDeltaAttentionr'   c                 C   s   dS )Ngdn_attentionr,   r+   r,   r,   r-   
mamba_typeS   s   zKimiDeltaAttention.mamba_typec                 C   s0   | j d u s
| jd u rtdt| j j| jjS )Nz)model_config and cache_config must be set)model_configcache_config
ValueErrorr   kda_state_dtypedtypemamba_cache_dtyper6   r,   r,   r-   get_state_dtypeW   s
   z"KimiDeltaAttention.get_state_dtype.c                 C   s   t j| j| j| j| jdS )N)conv_kernel_size)r   kda_state_shapetp_size	num_headshead_dim	conv_sizer6   r,   r,   r-   get_state_shape`   s   z"KimiDeltaAttention.get_state_shapeNh㈵> 	layer_idxhidden_sizequant_configr9   r8   rms_norm_epsprefixc              	      s  t    t | _t | _|| _|| _|| _|d u rt	d|j
}	|	d | _|	d | _|| _|| _| j| j dks;J t| j| j| _| j| j }
|	d | _t| j|
d|| dd| _t| j|
d|| d	d| _t| j|
d|| d
d| _t| j| jd|| dd| _t| j|
d|| dd| _ttjt|
| jtjd| _t| jdt di t| j| jd|| dd| _!t| j|
dtj| dd| _"t| j|
dtj| dd| _#t| j|
dtj| dd| _$| j"j%j&'d| j"j%_&| j#j%j&'d| j#j%_&| j$j%j&'d| j$j%_&ttjdd| jdtjd| _(t| j(dt di t| j| jd|| dd| _)t| j|
d|| dd| _*t+| j|dd| _,t-|
| jd|| dd| _.t/ j0}||j1v rkt	d| | |j1|< d S )Nzmodel_config must be providedrC   rB   r   short_conv_kernel_sizeFz.q_proj)biasrJ   rL   z.k_projz.v_projz	.f_a_projz	.f_b_proj)r<   weight_loaderz.b_projz	.q_conv1d)
input_sizeoutput_sizerN   params_dtyperL   z	.k_conv1dz	.v_conv1dr      z	.g_a_projz	.g_b_projsigmoid)eps
activationz.o_projzDuplicate layer name: )2super__init__r	   rA   r   tp_rankrI   r8   r9   r:   linear_attn_configrC   rB   rH   rL   r   local_num_headsrD   r   q_projk_projv_projr   f_a_projf_b_projr   	Parametertorchemptyfloat32dt_biasr   r   b_projq_conv1dk_conv1dv_conv1dweightdata	unsqueezeA_logg_a_projg_b_projr   o_normr   o_projr   compilation_configstatic_forward_context)r+   rH   rI   rJ   r9   r8   rK   rL   kwargs
kda_configprojection_sizerr   	__class__r,   r-   rX   g   s   



zKimiDeltaAttention.__init__hidden_states	positionsoutputc              	   C   s   | d}| |d }| |d }| |d }| |d   }| | |d d }	t	|	| j
| j| jd}	|d}|	d}	| | |d d }
t|
d| jd}tjd|| j| jf|j|jd}tjj||||	||| j | ||}t|d}| |d |d d < d S )Nr   )g_biasz... (h d) -> ... h ddr   )r<   devicez1 n h d -> n (h d))sizer\   r]   r^   rf   floatrT   r`   r_   r   rm   rC   re   rl   ro   rn   r   rb   zerosr[   r<   r   opsvllmr.   rL   rp   rq   )r+   ry   rz   r{   
num_tokensqkvr$   r#   g_proj_statesg2r%   r,   r,   r-   forward   s8   


	
zKimiDeltaAttention.forwardr    r!   r"   r#   r$   r%   c                    s  t  }|j}|d u rd S t|tsJ | j }t|tsJ |j}	|j}
|j}|j	} j
|j }|d | }|d | }|d | }|d | }|d | }|\}}}}|dd}|dd}|dd} jj jjd jjd} jj jjd jjd} jj jjd jjd}|jdkr|dd}|dd}|dd}t|| jjd||	||
|d	dd}t|| jjd||	||
|d	dd}t|| jjd||	||
|d	dd}n.|d |j	 }t||| jjd|dd	}t||| jjd|dd	}t||| jjd|dd	}t fd
d|||f\}}}|jdkrT||	  }d||< ||  }t||||||dd|
d	\}}|||< nt||||||d|
d |jd  |d	\}}|dd |f |dd |f< d S )Nr   rS   r   silu)rV   conv_stateshas_initial_statecache_indicesquery_start_locmetadataT)rV   conv_state_indicesvalidate_datac                    s   t | d jdS )Nzn (h d) -> 1 n h dr}   )r   rC   )xr6   r,   r-   <lambda>  s    z-KimiDeltaAttention._forward.<locals>.<lambda>)	r   r   r   gr$   initial_stateoutput_final_stateuse_qk_l2norm_in_kernel
cu_seqlens)	r   r   r   r   r$   r   r   r   ssm_state_indices)r   attn_metadata
isinstancedictrL   r   r   non_spec_query_start_locnon_spec_state_indices_tensornum_actual_tokenskv_cachevirtual_engine	transposerg   rj   viewr   rh   ri   num_prefillsr   rN   r   map
contiguousr   r   num_decodes)r+   r    r!   r"   r#   r$   r%   r*   r   r   r   r   r   constant_cachesconv_state_qconv_state_kconv_state_vrecurrent_stateq_conv_weightsk_conv_weightsv_conv_weightsr   r   r   decode_conv_indiceszero_idxr   core_attn_out_non_speclast_recurrent_stater,   r6   r-   r)     s  	




		




zKimiDeltaAttention._forward)NNNrF   rG   )__name__
__module____qualname__propertystrr7   tuplerb   r<   r>   intrE   r   r   r   r   rX   Tensorr   r)   __classcell__r,   r,   rw   r-   r4   R   sp    
	.

 
&r4   )6rb   einopsr   r   vllm.configr   r   r   vllm.distributedr   r   r	   vllm.forward_contextr
   r   vllm.loggerr   -vllm.model_executor.model_loader.weight_utilsr   vllm.model_executor.utilsr   vllm.utils.torch_utilsr   vllm.v1.attention.backendr   #vllm.v1.attention.backends.gdn_attnr   fla.ops.kdar   r   r   r   linearr   r   r   mamba.abstractr   mamba.mamba_utilsr   r   mamba.ops.causal_conv1dr   r   quantization.base_configr   r   loggerr   r   r.   r/   Moduler4   r,   r,   r,   r-   <module>   sx   

