o
    پiK                     @   s  d dl mZ d dlmZ d dlZd dlZd dlmZ d dlm	Z	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZmZmZmZ d d	lmZmZ d d
lmZmZmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z-m.Z. d dl/m0Z0 G dd dej1Z2G dd dej1Z3G dd dej1Z4G dd dej1Z5G dd dej1Z6G dd dej1Z7e7Z8dS )    )Iterable)castN)JetBlockConfigJetNemotronConfig)'fused_recurrent_gated_delta_rule_update)RMSNorm)HybridLinearAttnBackendMambaAttnBackendBase)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessorLogitsProcessorOutput)EmbeddingPoolerOutputPoolerPoolingType)QuantizationConfig)RadixAttention)get_rope)ParallelLMHead)ForwardBatch)default_weight_loader)Qwen2MLP
Qwen2Model)
add_prefixc                       sV   e Zd Z		ddededededB deddf fd	d
ZdejdejfddZ	  Z
S )&DynamicShortConvolutionKernelGeneratorN 
input_sizehidden_sizeoutput_sizequant_configprefixreturnc                    sL   t    t||d|td|d| _t | _t||d|td|d| _d S )NFw1biasr!   r"   Tw2)	super__init__r
   r   r$   nnSiLUactr'   )selfr   r   r    r!   r"   	__class__ R/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/jet_nemotron.pyr)   %   s    

z/DynamicShortConvolutionKernelGenerator.__init__xc                 C   s*   |  |\}}| |}| |\}}|S N)r$   r,   r'   )r-   r2   _r0   r0   r1   forwardA   s   
z.DynamicShortConvolutionKernelGenerator.forwardNr   )__name__
__module____qualname__intr   strr)   torchTensorr5   __classcell__r0   r0   r.   r1   r   $   s"    r   c                       s   e Zd Z		ddedededededB ded	df fd
dZdejdejdejdejd	e	ejejf f
ddZ
dejdejd	ejfddZdejdejd	eej fddZdeej d	ejfddZ  ZS )DynamicShortConvolutionNr   r   kernel_sizegenerator_input_sizegenerator_reductionr!   r"   r#   c                    s@   t    || }t|||| |td|d| _|| _|| _d S )Nkernel_generator)r   r   r    r!   r"   )r(   r)   r   r   rC   r   r@   )r-   r   r@   rA   rB   r!   r"   generator_hidden_sizer.   r0   r1   r)   I   s   
	
z DynamicShortConvolution.__init__r2   
conv_stategenerator_inputseq_lensc                   s   | j ||dt d  fddttD | }t|d}|dddd| jd  df }|jd| jdd	}t|d
}| |}tj|d| j	| jd}| | j ||d}|| j
dd}| j||d}tj|}||fS )ag  
        Args:
            x: (cu_seq_len, hidden_size)
            conv_state: (batch_size, hidden_size, kernel_size - 1)
            generator_input: (cu_seq_len, generator_input_size)
            seq_lens: (batch_size,)

        Returns:
            out: (cu_seq_len, hidden_size)
            conv_state: (batch_size, hidden_size, kernel_size - 1)
        )rG   zb d k -> b k dc                    s"   g | ]}t  | | gqS r0   )r<   cat.0irE   x_seqsr0   r1   
<listcomp>w      " z3DynamicShortConvolution.forward.<locals>.<listcomp>zb l d -> b d lN   )	dimensionsizestepzb d l k -> b l d kzl (d k) -> l d k)dkdim)_continuous_to_seqseinops	rearrangerangelen_seqs_to_batchr@   unfoldrC   r   sum_batch_to_continuousr*   
functionalsilu)r-   r2   rE   rF   rG   new_conv_statekernelsoutr0   rL   r1   r5   a   s@   zDynamicShortConvolution.forwardc                   s$   t  fddt dD S )Nc                    s"   g | ]}| |  d f qS r3   r0   rI   rG   r2   r0   r1   rN      rO   z@DynamicShortConvolution._batch_to_continuous.<locals>.<listcomp>r   )r<   rH   r\   rS   r-   r2   rG   r0   rg   r1   ra      s   $z,DynamicShortConvolution._batch_to_continuousc                   s    fddt  dD S )Nc                    s4   g | ]} d |    d |d     qS )NrP   )r`   rI   rg   r0   r1   rN      s    &z?DynamicShortConvolution._continuous_to_seqs.<locals>.<listcomp>r   )r\   rS   rh   r0   rg   r1   rY      s   z+DynamicShortConvolution._continuous_to_seqsseqsc                 C   s   t jjj|dddS )NTleft)batch_firstpadding_side)r*   utilsrnnpad_sequence)r-   ri   r0   r0   r1   r^      s
   z&DynamicShortConvolution._seqs_to_batchr6   )r7   r8   r9   r:   r   r;   r)   r<   r=   tupler5   ra   listrY   r^   r>   r0   r0   r.   r1   r?   H   s`    
=

r?   c                       \   e Zd Z		ddedededB deddf
 fdd	Zd
ej	dej	de
dej	fddZ  ZS )JetBlockNr   configlayer_idr!   r"   r#   c                    s&  t    || _td	i | jj| jj|  }| jj}|j}|j}|| }	t	||j
 }
||
 }|j}t||	|	||||gd|td|d| _t||dd| _ttj|tjd| _tt|| _t|td|||||jd| _t|
t|jd| _|| _|| _|
| _ || _!|| _|	| _"|| _#d S )
NFqkvabz_projr%   )r&   dtypedynamic_conv1d)r!   r"   r   r@   rA   rB   epsr0   )$r(   r)   rt   r   efficient_attention_configlayer_typesr   	num_headshead_dimr:   expand_v	conv_sizer   r   rv   r   o_projr*   	Parameterr<   emptyfloat32A_logdt_biasr?   dconv_generator_reductionry   RMSNormGatedfloatnorm_epso_norm
head_k_dim
head_v_dimru   total_k_dimtotal_v_dim)r-   rt   ru   r!   r"   jet_block_configr   r~   r   r   r   r   r   r.   r0   r1   r)      s^   
	
zJetBlock.__init__	positionshidden_statesforward_batchc                 C   s  t |jtsJ t |jjtsJ |jj}|j}|j| j}| 	|\}}|j
| j| j| j| j| j| jgdd\}	}
}}}}tj|	}	tj|	d| j| jd}	tj|
}
tj|
d| j| jd}
|j}t |tjslJ | j|||j| j d d d f ||jd ur|jn	tj|jftjdd\}}|||j| j d d d f< tj|d| j| jd}| j    tj!| | j"  }tj#|}t$|	%d|
%d|%d|%d|%d|j&|jt'tj(|j)dd		*d}tj|d| jd
}| +||}t|d}| ,|\}}|S )NrQ   rW   zl (h d) -> l h d)hrU   rw   )rE   rF   rG   r   T)	qrV   vgbetainitial_state_sourceinitial_state_indices
cu_seqlensuse_qk_l2norm_in_kernel)r   zl h d -> l (h d))-
isinstanceattn_backendr   linear_attn_backendr	   forward_metadatareq_to_token_poolmamba2_layer_cacheru   rv   splitr   r   r~   r*   rb   rc   rZ   r[   r   convr<   r=   ry   mamba_cache_indicesextend_seq_lensones
batch_sizelongr   r   r   expsoftplusr   sigmoidr   	unsqueezetemporalr   
LongTensorquery_start_locsqueezer   r   )r-   r   r   r   r   r   layer_cacheqkvabzr4   r   rV   r   ar   z
conv_cacherd   r   or0   r0   r1   r5      s|   

&
zJetBlock.forwardr6   r7   r8   r9   r   r:   r   r;   r)   r<   r=   r   r5   r>   r0   r0   r.   r1   rs      s.    Brs   c                       rr   )JetNemotronAttentionNr   rt   ru   r!   r"   r#   c                    s*  t    || _| jj| jj | _| jj| j | _| jj| j | _t	| jj| j| jj| jjd|t
d|d| _t| jj| j | jjd|t
d|d| _t| j| j| jjt| jj| jjd| _| jj|  dkrl d}nd	krx| jjd	 d
 }n	 tt| jj| j| jd | jj|||t
d|d| _d S )NTqkv_projr%   Fr   )
rotary_dimmax_positionbaserope_scalingattnrQ   swawindow_sizeg      )num_kv_headsru   sliding_window_sizer!   r"   )r(   r)   rt   r   num_attention_headsr   q_sizenum_key_value_headskv_sizer   r   r   r   r   r   max_position_embeddingsr:   
rope_thetar   
rotary_embr}   r|   NotImplementedErrorr   r   )r-   rt   ru   r!   r"   r   r.   r0   r1   r)   T  s^   
	



zJetNemotronAttention.__init__r   r   r   c                 C   sb   |  |\}}|j| j| j| jgdd\}}}| |||\}}| ||||}	| |	\}
}|
S )NrQ   rW   )r   r   r   r   r   r   r   )r-   r   r   r   qkvr4   r   rV   r   attn_outputoutputr0   r0   r1   r5     s    zJetNemotronAttention.forwardr6   r   r0   r0   r.   r1   r   S  s.    @r   c                       s   e Zd Z				ddedejjdB dededB de	d	df fd
dZ
dejdejdedejdB d	eejejdB f f
ddZ  ZS )JetNemotronDecoderLayerNr   r   rt   
alt_streamru   r!   r"   r#   c                    s   t    |j|   dkrn dkrn n  t||td||d| _ndkr5t||td||d| _n	 tt|j	|j
|j|td|d| _t|j	|jd| _t|j	|jd| _d S )	Nr   r   	self_attn)r!   r"   ru   jetmlp)r   intermediate_size
hidden_actr!   r"   rz   )r(   r)   r}   r   r   r   rs   r   r   r   r   r   r   r   rms_norm_epsinput_layernormpost_attention_layernorm)r-   rt   r   ru   r!   r"   r.   r0   r1   r)     s:   


z JetNemotronDecoderLayer.__init__r   r   r   residualc                 C   sN   |}|  |}| j|||d}|| }|}| |}| |}|| }|d fS )N)r   r   r   )r   r   r   r   )r-   r   r   r   r   r0   r0   r1   r5     s   


zJetNemotronDecoderLayer.forward)Nr   Nr   )r7   r8   r9   r   r<   cudaStreamr:   r   r;   r)   r=   r   rp   r5   r>   r0   r0   r.   r1   r     s:    
*r   c                       s   e Zd Z		ddededB deddf fddZe 			dd
ej	dej	de
dej	dB dedeeB fddZdejfddZdeeeej	f  fddZ  ZS )JetNemotronForCausalLMNr   rt   r!   r"   r#   c                    sz   t    || _|| _t||td|td| _|jr | jj	| _
nt|j|j|td|d| _
t|| _ttjdd| _d S )Nmodel)r!   r"   decoder_layer_typelm_head)r!   r"   T)	normalize)r(   r)   rt   r!   r   r   r   r   tie_word_embeddingsembed_tokensr   r   
vocab_sizer   r   logits_processorr   r   LASTpooler)r-   rt   r!   r"   r.   r0   r1   r)     s&   

zJetNemotronForCausalLM.__init__F	input_idsr   r   input_embedsget_embeddingc                 C   s2   |  ||||}|s| ||| j|S | ||S r3   )r   r   r   r   )r-   r   r   r   r   r   r   r0   r0   r1   r5     s   	
zJetNemotronForCausalLM.forwardc                 C   s   | j jS r3   )r   r   )r-   r0   r0   r1   get_input_embeddings$  s   z+JetNemotronForCausalLM.get_input_embeddingsweightsc                 C   s   g d}t |  }|D ]@\}}|D ](\}}}||dvrq|||}	|	|vr*q||	 }
t|
d}||
||  n|}	||	 }
t|
dt}||
| qd S )N))r   q_projr   )r   k_projrV   )r   v_projr   )gate_up_proj	gate_projr   )r   up_projrP   )rv   r   r   )rv   r   rP   )rv   r      )rv   a_proj   )rv   b_proj   )rv   g_proj   .weight_loader)dictnamed_parametersr   replacegetattrr   )r-   r   stacked_params_mappingparams_dictweight_nameloaded_weightparam_name_partshard_weight_name_partshard_id
param_nameparamr   r0   r0   r1   load_weights'  s2   

z#JetNemotronForCausalLM.load_weightsr6   )NF)r7   r8   r9   r   r   r;   r)   r<   no_gradr=   r   boolr   r   r5   r*   Moduler   r   rp   r	  r>   r0   r0   r.   r1   r     s<    $r   )9collections.abcr   typingr   rZ   r<   torch.nnr*   sglang.srt.configs.jet_nemotronr   r   /sglang.srt.layers.attention.fla.fused_recurrentr   /sglang.srt.layers.attention.fla.layernorm_gatedr   r   6sglang.srt.layers.attention.hybrid_linear_attn_backendr   r	   sglang.srt.layers.layernormsglang.srt.layers.linearr
   r   r   r   "sglang.srt.layers.logits_processorr   r   sglang.srt.layers.poolerr   r   r   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   *sglang.srt.layers.vocab_parallel_embeddingr   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.models.qwen2r   r   sglang.srt.utilsr   r  r   r?   rs   r   r   r   
EntryClassr0   r0   r0   r1   <module>   s:    $t OKj