o
    پivQ                     @   s  d dl Z d dlmZmZmZmZmZmZ d dlZd dlm	Z	 d dl
mZ d dlmZmZ d dlmZ d dlmZmZ d dlmZ d d	lmZmZ d d
lmZmZmZ d dlmZ d dlm Z m!Z!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5m6Z6m7Z7 e 8e9Z:e6 Z;G dd de	j<Z=G dd de	j<Z>de>iZ?G dd de	j<Z@G dd de	j<ZAeAZBdS )    N)AnyIterableListOptionalSetTuple)nn)FalconH1Config)get_pp_group$get_tensor_model_parallel_world_size)
SiluAndMul)HybridLinearAttnBackendMamba2AttnBackend)MambaMixer2)LayerCommunicatorLayerScatterModes)get_attention_tp_rankget_attention_tp_sizeis_dp_attention_enabled)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)RadixAttention)get_rope)ParallelLMHeadVocabParallelEmbedding)ForwardBatch)default_weight_loader)get_global_server_args)
add_prefixis_cudamake_layersc                       sf   e Zd Z			ddededededee d	ee d
ede	ddf fddZ
		dde	fddZ  ZS )FalconH1MLPN Thidden_sizeintermediate_size
hidden_actlayer_idmlp_multipliersquant_configprefixreduce_resultsreturnc	           	         s   t    t||gd d|td|d| _t||d|td||d| _|dkr/td| d	t | _	|| _
|| _t | _|\| _| _d S )
N   Fgate_up_proj)biasr,   r-   	down_proj)r2   r,   r-   r.   siluzUnsupported activation: z!. Only silu is supported for now.)super__init__r   r"   r1   r   r3   
ValueErrorr   act_fnr*   r(   r   tp_sizegate_multiplierdown_multiplier)	selfr'   r(   r)   r*   r+   r,   r-   r.   	__class__ O/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/falcon_h1.pyr6   -   s2   

zFalconH1MLP.__init__Fuse_reduce_scatterc                 C   s^   |  |\}}|d d d | j| j f  | j9  < | |}| j||d\}}|| j }|S )N)skip_all_reduce)r1   r(   r9   r:   r8   r3   r;   )r<   xforward_batchrA   gate_up_r?   r?   r@   forwardU   s   &


zFalconH1MLP.forward)Nr&   T)NF)__name__
__module____qualname__intstrr   floatr   r   boolr6   rG   __classcell__r?   r?   r=   r@   r%   ,   s8    	
+r%   c                       s   e Zd Z			ddededee dedeej	j
 ddf fd	d
Zdd ZdejdejdedejfddZdejdejdeej dedef
ddZ  ZS )#FalconH1HybridAttentionDecoderLayerNr&   configr*   r,   r-   
alt_streamr/   c                    s  t    || _|j| _t | _t | _t | _	|j
| _| j| j dks&J | j| j | _|j| _| j| jkrB| j| j dksAJ n
| j| j dksLJ td| j| j | _|jp^| j| j | _| j| j | _| j| j | _| jd | _t|dd| _t|dd| _t|dd | _t|d	d| _|| _t| j| j| j| j| j| jd
t d| _t|j| j| j| jd|| j| jd| _ t!| j| j |jd|d| j| jd| _"t#| j| j| j| j|| dd| _$|j%d u rt&|j'|j n|j%| _(t)|j*|j|j+|j,|j-|j.|j/|j0| dd	| _1d| _2d}d}t3j4||j5| j2||d| _6t7| j|j8|j/||j9|t:d|d| _;t<|j|j.d| _=t<|j|j.d| _>t<| j|j.d| _?t<| j|j.d| _@tA| j6| j=| j>d
d| _B|| _C|jD| _D|jE| _E|jF| _F|jG| _G|jH| _I| j1jJ|jK | _L|jM| _N| O  d S )Nr      g      
rope_thetai'  max_position_embeddingsi    rope_scalingpartial_rotary_factorT)	head_size
rotary_dimmax_positionrV   baserW   is_neox_styledtypeF)r2   r,   tp_rankr9   )r2   r,   r.   r^   r9   z.attn)num_kv_headsr*   r-   z.mixer)	cache_paramsr'   use_conv_biasuse_biasn_groupsrms_norm_eps
activationuse_rms_normr-   )r*   
num_layersis_layer_sparseis_previous_layer_sparseis_next_layer_sparsemlp)r'   r(   r)   r*   r+   r,   r-   eps)layer_scatter_modesinput_layernormpost_attention_layernormallow_reduce_scatter)Pr5   r6   rQ   r'   r   attn_tp_rankr   attn_tp_sizer   r9   num_attention_headstotal_num_heads	num_headsnum_key_value_headstotal_num_kv_headsmaxr_   head_dimq_sizekv_sizescalinggetattrrT   rU   rV   rW   r*   r   torchget_default_dtype
rotary_embr   qkv_projr   o_projr   attnmamba_d_ssmrK   mamba_expandd_ssmr   mamba2_cache_paramsmamba_conv_biasmamba_proj_biasmamba_n_groupsrd   r)   mamba_rms_normmambarh   r   init_newnum_hidden_layersrn   r%   r(   r+   r"   feed_forwardr   ro   pre_ff_layernormq_normk_normr   layer_communicatorrR   key_multiplierssm_out_multiplierssm_in_multiplierattention_in_multiplierattention_out_multiplierattn_out_multiplierrc   mamba_d_stategroups_time_state_sizessm_multiplierszxbcdt_multipliers_init_mup_vector)r<   rQ   r*   r,   r-   rR   ri   rj   r=   r?   r@   r6   i   s   




z,FalconH1HybridAttentionDecoderLayer.__init__c                 C   sd  d| j  d| j  | jj | j }td|}|ddd| j | j f  | jd 9  < |dd| j | j d| j  | j f  | jd 9  < |ddd| j  | j d| j  | j | j f  | jd 9  < |ddd| j  | j | j d| j  d| j  | j f  | jd 9  < |ddd| j  d| j  | j df  | jd 9  < | jd|dd	 dS )
u  
        Non learnable per-block scaling vector composed of element-wise
        multipliersapplied to each separate contiguous block of the output
        of the linear projection (in_proj) before further processing
        (gating, convolution, SSM):

            - Z block:  [0 : d_ssm]                      → zxbcdt_multipliers[0]
            - X block:  [d_ssm : 2 * d_ssm]              → zxbcdt_multipliers[1]
            - B block:  [2 * d_ssm : 2 * d_ssm + G * S]  → zxbcdt_multipliers[2]
            - C block:  [2 * d_ssm + G * S : 2 * d_ssm + 2 * G * S]
                        → zxbcdt_multipliers[3]
            - dt block: [2 * d_ssm + 2 * G * S : end]    → zxbcdt_multipliers[4]

        where:
            - d_ssm:     Dimension of state-space model latent
            - G:         Number of groups (n_groups)
            - S:         SSM state size per group
            - All indices are divided by tp_size to support tensor parallelism
        r0   rS   Nr         
mup_vectorF)
persistent)	r   r   rQ   mamba_n_headsr9   r   onesr   register_buffer)r<   vector_shaper   r?   r?   r@   r      sX   *"z4FalconH1HybridAttentionDecoderLayer._init_mup_vector	positionshidden_statesrD   c                 C   sl   |  |\}}|j| j| j| jgdd\}}}|| j }| |||\}}| ||||}	| |	\}
}|
S )N)dim)r   splitr{   r|   r   r   r   r   )r<   r   r   rD   qkvrF   qkvattn_outputoutputr?   r?   r@   self_attention0  s    
z2FalconH1HybridAttentionDecoderLayer.self_attentionresidualkwargsc           
      K   s   | j |||\}}|j sP| j||| j |d}|| j }|j}t|t	s)J t|j
ts1J t|}|j
j| j|| j || j| jd || j }|| }| j |||\}}| j |}	| |||	}| j |||\}}||fS )N)r   r   rD   )r*   r   )r   prepare_attnforward_modeis_idler   r   r   attn_backend
isinstancer   linear_attn_backendr   r   
empty_likerG   r   r   r*   r   r   prepare_mlpshould_use_reduce_scatterr   postprocess_layer)
r<   r   r   r   rD   r   attention_hidden_statesr   mamba_hidden_statesrA   r?   r?   r@   rG   @  sH   



z+FalconH1HybridAttentionDecoderLayer.forward)Nr&   N)rH   rI   rJ   r	   rK   r   r   rL   r   cudaStreamr6   r   Tensorr   r   r   rG   rO   r?   r?   r=   r@   rP   g   sN    
 6
rP   	falcon_h1c                       sf   e Zd Z		ddedee deddf fddZ	dd	ej	d
ej	de
deej	 dej	f
ddZ  ZS )FalconH1ModelNr&   rQ   r,   r-   r/   c                    s   t    | _trtj nd  j| _tj	j
j	t d| _dtdtf fdd}tj|| dd| _tj
jd| _d	| _d S )
N)org_num_embeddingsuse_attn_tp_groupidxr-   c                    s    t j|   }|| | dS )N)r,   r-   rR   )ALL_DECODER_LAYER_TYPESlayers_block_type)r   r-   layer_classrR   rQ   r,   r?   r@   	get_layer  s   z)FalconH1Model.__init__.<locals>.get_layerz.layersr-   rl   r   )r5   r6   rQ   _is_cudar   r   r   embedding_multiplierr   
vocab_sizer'   r   embed_tokensrK   rL   r$   r   layersr   rd   final_layernorminfer_count)r<   rQ   r,   r-   r   r=   r   r@   r6   }  s    


zFalconH1Model.__init__	input_idsr   rD   inputs_embedsc           
      C   s   |d ur
|| j  }n| || j  }d }tt| jD ]}| j| }||||||d\}}q|j sF|d u r>| |}|S | ||\}}	|S )N)r*   r   r   r   rD   )r   r   rangelenr   r   r   r   )
r<   r   r   rD   r   r   r   ilayerrF   r?   r?   r@   rG     s&   


zFalconH1Model.forwardNr&   N)rH   rI   rJ   r	   r   r   rL   r6   r   r   r   rG   rO   r?   r?   r=   r@   r   |  s0    *r   c                       s   e Zd ZdZ		ddedee deddf fdd	Ze	
 	dd
e	jde	jdedee	j fddZdd Zdd Z	ddeeee	jf  dedee fddZ  ZS )FalconH1ForCausalLMFNr&   rQ   r,   r-   r/   c                    s   t    || _t | _| jjr| jjsJ || _t||t	d|d| _
|jr-| j
j| _nt|j|j||jt	d|t jd| _| j | _|j| _t|| jd| _d S )Nmodelr   lm_head)r,   r   r-   r   )logit_scale)r5   r6   rQ   r
   pp_groupis_first_rankis_last_rankr,   r   r"   r   tie_word_embeddingsr   r   r   r   r'   r!   enable_dp_lm_headrM   lm_head_multiplierr   logits_processor)r<   rQ   r,   r-   r=   r?   r@   r6     s.   
zFalconH1ForCausalLM.__init__r   r   rD   r   c                 K   s"   |  ||||}| ||| j|S r   )r   r   r   )r<   r   r   rD   r   r   r   r?   r?   r@   rG     s   	
zFalconH1ForCausalLM.forwardc                 C   s   | j jj| jjfS r   )r   r   weightr   )r<   r?   r?   r@   get_embed_and_head  s   z&FalconH1ForCausalLM.get_embed_and_headc                 C   s8   | j j`| j`|| j j_|| j_tj  tj  d S r   )r   r   r   r   r   r   empty_cachesynchronize)r<   embedheadr?   r?   r@   set_embed_and_head  s   

z&FalconH1ForCausalLM.set_embed_and_headweightsis_mtpc                 C   s   g d}t |  }t }|D ]m\}}d|v rqd|v r"|dd}d|v r,|dd}|D ]/\}}	}
|	|vr8q.||	|}|drH||vrHq.||vrMq.|| }t|d	}||||
  n|drh||vrhq|| }t|d	t}||| || q|S )
N))r   q_projr   )r   k_projr   )r   v_projr   )r1   	gate_projr   )r1   up_projrS   zrotary_emb.inv_freqz.self_attn.z
.self_attnr&   A_logAz.biasweight_loader)dictnamed_parameterssetreplaceendswithr~   r    add)r<   r   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr   r?   r?   r@   load_weights  s:   	

z FalconH1ForCausalLM.load_weightsr   r   )F)rH   rI   rJ   fall_back_to_pt_during_loadr	   r   r   rL   r6   r   no_gradr   r   rG   r   r   r   r   rN   r   r
  rO   r?   r?   r=   r@   r     sD    	r   )Cloggingtypingr   r   r   r   r   r   r   r   sglang.srt.configs.falcon_h1r	   sglang.srt.distributedr
   r   sglang.srt.layers.activationr   6sglang.srt.layers.attention.hybrid_linear_attn_backendr   r   'sglang.srt.layers.attention.mamba.mambar   sglang.srt.layers.communicatorr   r   sglang.srt.layers.dp_attentionr   r   r   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr    sglang.srt.server_argsr!   sglang.srt.utilsr"   r#   r$   	getLoggerrH   loggerr   Moduler%   rP   r   r   r   
EntryClassr?   r?   r?   r@   <module>   sB     
;  Jv