o
    پi,                     @   s   d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZ G dd deZG dd deZG dd dejZ G dd deZ!e!gZ"dS )    )IterableN)nn)NemotronHConfig)get_pp_group)RMSNorm)ColumnParallelLinear)LogitsProcessor)QuantizationConfig)ParallelLMHeadVocabParallelEmbedding)ForwardBatch)NemotronHAttentionDecoderLayerNemotronHForCausalLMNemotronHMoEDecoderLayer)get_global_server_args)
add_prefixc                          e Zd Z				ddedededB deded	ed
df fddZddde	j
de	j
de	j
dB ded
ee	j
e	j
f f
 fddZ  ZS )!NemotronHMTPAttentionDecoderLayerN Fconfig	layer_idxquant_configprefixhas_start_projectionshas_end_normreturnc              	         t  j||||d || _|| _|r@t|j|jd| _t|j|jd| _t	|jd |jddt
|dr5|jntj|| dd| _|rPt|jt|d	d
d| _d S d S N)r   r   r   r   )eps   FTdtypez.eh_proj)
input_sizeoutput_sizebiasgather_outputparams_dtyper   r   layer_norm_epsilongh㈵>super__init__r   r   r   hidden_sizer&   enormhnormr   hasattrr    torchbfloat16eh_projgetattrfinal_layernormselfr   r   r   r   r   r   	__class__ T/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/nemotron_h_mtp.pyr)   )   s4   	
z*NemotronHMTPAttentionDecoderLayer.__init__residualinputs_embedshidden_statesr:   forward_batchc          	         |   | j r| |}| |}tj||gdd}| |\}}t j|||d\}}| jr:|d ur5|| }d }| 	|}||fS N)dim)r<   r:   r=   
r   r+   r,   r.   catr0   r(   forwardr   r2   	r4   r;   r<   r:   r=   inputs_embeds_normedprevious_hidden_states_normedfused_r5   r7   r8   rD   R   $   



z)NemotronHMTPAttentionDecoderLayer.forwardNr   FF__name__
__module____qualname__r   intr	   strboolr)   r.   Tensorr   tuplerD   __classcell__r7   r7   r5   r8   r   (   s@    .r   c                       r   )NemotronHMTPMoEDecoderLayerNr   Fr   r   r   r   r   r   r   c              	      r   r   r'   r3   r5   r7   r8   r)   t   s4   	
z$NemotronHMTPMoEDecoderLayer.__init__r9   r;   r<   r:   r=   c          	         r>   r?   rB   rE   r5   r7   r8   rD      rJ   z#NemotronHMTPMoEDecoderLayer.forwardrK   rL   r7   r7   r5   r8   rV   s   s@    -rV   c                       s|   e Zd Z		ddededB deddf fddZd	ejdejfd
dZ		dd	ejdejde
dejdB dejf
ddZ  ZS )NemotronHMultiTokenPredictorNr   r   r   r   r   c              	      sH  t    || _|j| _|j| _|j| _t|dd| _| jdks$J d|j	| _
t| j
| _| jdks5J t| j|j| _t | _| j| j }t|D ]U}|| j }| j
| }|dk}|| jd k}	| d| }
t||||
||	d}|dkrtdi || jt|< qL|dkrtdi || jt|< qLtd	| d
| j
 dd S )Nnum_nextn_predict_layers   z1Only one MTP layer is supported for NemotronH-MTPr   z.layers.)r   r   r   r   r   r   *EzPattern char 'z' in z not implementedr7   )r(   r)   r   
vocab_sizeorg_vocab_sizenum_hidden_layersmtp_start_layer_idxr1   num_mtp_layersmtp_hybrid_override_patternpattern_strlenpattern_lenr   r*   embed_tokensr   
ModuleDictlayersrangedictr   rQ   rV   NotImplementedError)r4   r   r   r   total_layersistep_rel_idxcharis_start_of_stepis_end_of_steplayer_prefixcommon_kwargsr5   r7   r8   r)      sP   



	z%NemotronHMultiTokenPredictor.__init__	input_idsc                 C   s   | j d us	J d|  |S )Nz?embed_tokens not initialized - must be shared from target model)re   )r4   rs   r7   r7   r8   get_input_embeddings   s   
z1NemotronHMultiTokenPredictor.get_input_embeddingsr<   r=   r;   c                 C   sH   |d u r	|  |}d }t| jD ]}| jt| ||||d\}}q|S )N)r;   r<   r:   r=   )rt   rh   rd   rg   rQ   )r4   rs   r<   r=   r;   r:   rl   r7   r7   r8   rD      s   
z$NemotronHMultiTokenPredictor.forwardNr   N)rM   rN   rO   r   r	   rQ   r)   r.   rS   rt   r   rD   rU   r7   r7   r5   r8   rW      s2    <rW   c                       s   e Zd Z		ddededB defddZe 	ddej	d	ej	d
e
dej	dB dej	f
ddZ	ddeeeej	f  def fddZ  ZS )NemotronHForCausalLMMTPNr   r   r   r   c                 C   s~   t j|  || _|| _t | _t|j|_	|j|_
t||td|d| _t| jj| jj|td|t jd| _t|| _d S )Nmodel)r   r   r   lm_head)r   r   use_attn_tp_group)r   Moduler)   r   r   r   pp_grouprc   ra   r^   hybrid_override_patternrW   r   rx   r
   r\   r*   r   enable_dp_lm_headry   r   logits_processor)r4   r   r   r   r7   r7   r8   r)     s&   z NemotronHForCausalLMMTP.__init__rs   	positionsr=   input_embedsr   c                 K   s*   |j j}| ||||}| ||| j|S rv   )	spec_infor<   rx   r   ry   )r4   rs   r   r=   r   kwargsr<   r7   r7   r8   rD   9  s   	
zNemotronHForCausalLMMTP.forwardFweightsis_mtpc                    s   t  j|dd d S )NT)r   )r(   load_weights)r4   r   r   r5   r7   r8   r   N  s   z$NemotronHForCausalLMMTP.load_weightsru   rv   )F)rM   rN   rO   r   r	   rQ   r)   r.   no_gradrS   r   rD   r   rT   rR   r   rU   r7   r7   r5   r8   rw     s:    
"rw   )#collections.abcr   r.   r   sglang.srt.configsr   sglang.srt.distributedr   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   "sglang.srt.layers.logits_processorr   sglang.srt.layers.quantizationr	   *sglang.srt.layers.vocab_parallel_embeddingr
   r   ,sglang.srt.model_executor.forward_batch_infor   sglang.srt.models.nemotron_hr   r   r   sglang.srt.server_argsr   sglang.srt.utilsr   r   rV   r{   rW   rw   
EntryClassr7   r7   r7   r8   <module>   s&   KJY
>