"""Inference-only FalconH1 model."""

from collections.abc import Iterable
from itertools import islice

import torch
from torch import nn
from transformers import FalconH1Config

from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, ModelConfig, VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.distributed.parallel_state import get_pp_group
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
                                               QKVParallelLinear,
                                               RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2
from vllm.model_executor.layers.mamba.mamba_utils import (
    MambaStateCopyFunc, MambaStateCopyFuncCalculator,
    MambaStateDtypeCalculator, MambaStateShapeCalculator)
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import (
    default_weight_loader, maybe_remap_kv_scale_name)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA,
                         SupportsMambaPrefixCaching, SupportsPP)
from .utils import (PPMissingLayer, is_pp_missing_parameter,
                    make_empty_intermediate_tensors_factory, make_layers,
                    maybe_prefix)


class FalconH1MLP(nn.Module):

    def __init__(
        self,
        config: FalconH1Config,
        quant_config: QuantizationConfig | None = None,
        bias: bool = False,
        prefix: str = "",
    ) -> None:
        super().__init__()
        self.gate_up_proj = MergedColumnParallelLinear(
            input_size=config.hidden_size,
            output_sizes=[config.intermediate_size] * 2,
            bias=bias,
            quant_config=quant_config,
            prefix=f"{prefix}.gate_up_proj",
        )
        self.down_proj = RowParallelLinear(
            input_size=config.intermediate_size,
            output_size=config.hidden_size,
            bias=bias,
            quant_config=quant_config,
            prefix=f"{prefix}.down_proj",
        )
        self.tp_size = get_tensor_model_parallel_world_size()
        self.intermediate_size = config.intermediate_size
        self.gate_multiplier, self.down_multiplier = config.mlp_multipliers
        if config.hidden_act != "silu":
            raise ValueError(f"Unsupported activation: {config.hidden_act}. "
                             "Only silu is supported for now.")
        self.act_fn = SiluAndMul()

    def forward(self, x):
        gate_up, _ = self.gate_up_proj(x)
        # Scale only the gate half of the fused projection before the
        # SiLU-and-multiply activation.
        gate_up[:, :self.intermediate_size // self.tp_size] *= (
            self.gate_multiplier)
        x = self.act_fn(gate_up)
        x, _ = self.down_proj(x)
        x = x * self.down_multiplier
        return x
Zdd Z	de
jde
jdB fddZ  ZS )FalconH1SSMDecoderLayerNr,   r-   model_configcache_configr.   r0   r1   c                    s   t    || _t | _|jd u rt|j|j n|j| _	t
di d|jd|jd|jd| j	d|jd|jd|jd|jd	|jd
|jd|jd|d|d|d|jd| d| _| jj|j | _|j| _|   d S )Nr9   ssm_state_sizeconv_kernel_sizer:   use_conv_biasuse_biasn_groups	num_headshead_dimrms_norm_eps
activationrT   rU   r.   use_rms_normr0   z.mixerrG   )r7   r8   r-   r
   r=   mamba_d_ssmintmamba_expandr9   d_ssmr   mamba_d_statemamba_d_convmamba_conv_biasmamba_proj_biasmamba_n_groupsmamba_n_headsmamba_d_headr]   rA   mamba_rms_normmambarZ   groups_time_state_sizessm_multiplierszxbcdt_multipliers_init_mup_vector)rD   r-   rT   rU   r.   r0   rE   rG   rH   r8   h   sV   

	

z FalconH1SSMDecoderLayer.__init__c                 C   sd  d| j  d| j  | jj | j }td|}|ddd| j | j f  | jd 9  < |dd| j | j d| j  | j f  | jd 9  < |ddd| j  | j d| j  | j | j f  | jd 9  < |ddd| j  | j | j d| j  d| j  | j f  | jd 9  < |ddd| j  d| j  | j df  | jd 9  < | jd|dd	 dS )
u  
        Non learnable per-block scaling vector composed of element-wise
        multipliersapplied to each separate contiguous block of the output
        of the linear projection (in_proj) before further processing
        (gating, convolution, SSM):

            - Z block:  [0 : d_ssm]                      → zxbcdt_multipliers[0]
            - X block:  [d_ssm : 2 * d_ssm]              → zxbcdt_multipliers[1]
            - B block:  [2 * d_ssm : 2 * d_ssm + G * S]  → zxbcdt_multipliers[2]
            - C block:  [2 * d_ssm + G * S : 2 * d_ssm + 2 * G * S]
                        → zxbcdt_multipliers[3]
            - dt block: [2 * d_ssm + 2 * G * S : end]    → zxbcdt_multipliers[4]

        where:
            - d_ssm:     Dimension of state-space model latent
            - G:         Number of groups (n_groups)
            - S:         SSM state size per group
            - All indices are divided by tp_size to support tensor parallelism
        r2   r    Nr         
mup_vectorF)
persistent)	rc   rm   r-   ri   r=   torchonesro   register_buffer)rD   vector_shapers   rG   rG   rH   rp      sL   *"		z(FalconH1SSMDecoderLayer._init_mup_vectorhidden_statesresidualc                 K   s   | j || jd}||fS )N)rs   )rl   rs   )rD   ry   rz   kwargsoutputrG   rG   rH   rL      s
   zFalconH1SSMDecoderLayer.forwardNNNr,   )rM   rN   rO   r   r   r   r   rQ   r8   rp   ru   TensorrL   rR   rG   rG   rE   rH   rS   g   s0    )8rS   c                       s~   e Zd Z			ddededB dedB deddf
 fdd	Zd
ej	dej	dej	fddZ
d
ej	dej	dej	dB fddZ  ZS )FalconH1AttentionDecoderLayerNr,   r-   rU   r.   r0   r1   c              	      s  t    t|dd t|dd}|j| _t }|j| _| j| dks%J | j| | _|j	| _
| j
|kr>| j
| dks=J n	|| j
 dksGJ td| j
| | _t|dd d u r^|j| j n|j| _| j| j | _| j| j | _| jd | _|| _t|d	| j}|| j |jd
< t| j||jdd d| _t|j| j| j| j
d|| dd| _t| j| j |jd|| dd| _t| j| j| j| j||| dd| _|j| _d S )Ng   vH7B)default_thetamax_position_embeddingsi    r   r    r\   g      attn_rotary_embpartial_rotary_factorT)	head_sizemax_positionrope_parametersis_neox_styledtypeFz	.qkv_proj)r/   r.   r0   z.o_projz.attn)num_kv_headsrU   r.   r0   )r7   r8   r   getattrr9   r
   num_attention_headstotal_num_headsr[   num_key_value_headstotal_num_kv_headsmaxr   r\   q_sizekv_sizescalingr   r   r   
rotary_embr   qkv_projr   o_projr   attnkey_multiplier)rD   r-   rU   r.   r0   r   r=   
rotary_dimrE   rG   rH   r8      sp   

	
	z&FalconH1AttentionDecoderLayer.__init__	positionsry   c                 K   sj   |  |\}}|j| j| j| jgdd\}}}|| j }| |||\}}| |||}	| |	\}
}|
S )N)dim)r   splitr   r   r   r   r   r   )rD   r   ry   r{   qkvrK   qkvattn_outputr|   rG   rG   rH   self_attention!  s    
z,FalconH1AttentionDecoderLayer.self_attentionrz   c                 K   s   | j ||d}||fS )Nr   ry   )r   )rD   r   ry   rz   r{   rG   rG   rH   rL   0  s
   z%FalconH1AttentionDecoderLayer.forward)NNr,   )rM   rN   rO   r   r   r   rQ   r8   ru   r~   r   rL   rR   rG   rG   rE   rH   r      s:    J
r   c                       sj   e Zd ZdZ				ddedededB dedB dedB d	e	d
df fddZ
dejdejfddZ  ZS )FalconH1ParallelHybrida  
    A hybrid decoder layer for FalconH1 where the input is processed
    in parallel through both the self-attention branch and the SSM (Mamba)
    branch. Their outputs are then summed to produce the final hidden state.

    This layer uses:
      - FalconH1AttentionDecoderLayer for the multi-head self-attention branch.
      - FalconH1SSMDecoderLayer for the state-space (Mamba) branch.
    Nr,   r-   	layer_idxrT   rU   r.   r0   r1   c           	         s   t    t||||d| _|j| }|dd d|  }t|||||d| _|j| _|j	| _	|j
| _
|j| _t||| dd| _t|j|jd| _t|j|jd| _d S )N)r-   rU   r.   r0   .r   )r-   rT   rU   r.   r0   z.feed_forwardr.   r0   eps)r7   r8   r   	self_attnnum_hidden_layersr   rS   rl   ssm_out_multiplierssm_in_multiplierattention_in_multiplierattention_out_multiplierattn_out_multiplierr+   feed_forwardr   r9   r]   input_layernormpre_ff_layernorm)	rD   r-   r   rT   rU   r.   r0   ssm_layer_idx
ssm_prefixrE   rG   rH   r8   I  s2   
	
	zFalconH1ParallelHybrid.__init__r   ry   c                 K   s   |}|  |}| jd||| j |d|\}}| jd|| j |d|\}}|| j || j  }|| }|}| |}| |}|| }|S )N)r   ry   rz   )ry   rz   rG   )	r   r   r   rl   r   r   r   r   r   )rD   r   ry   r{   rz   attn_hiddenrK   
ssm_hiddenrG   rG   rH   rL   v  s0   





zFalconH1ParallelHybrid.forwardr}   )rM   rN   rO   __doc__r   ra   r   r   r   rQ   r8   ru   r~   rL   rR   rG   rG   rE   rH   r   >  s4    -r   c                       sz   e Zd Zdddedef fddZdejdejfd	d


@support_torch_compile
class FalconH1Model(nn.Module):

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
        model_config = vllm_config.model_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config

        self.config = config
        self.vocab_size = config.vocab_size

        if get_pp_group().is_first_rank:
            self.embed_tokens = VocabParallelEmbedding(self.vocab_size,
                                                       config.hidden_size)
            # Embeddings are scaled before entering the decoder stack.
            self.embedding_multiplier = config.embedding_multiplier
        else:
            self.embed_tokens = PPMissingLayer()
            self.embedding_multiplier = 1.0

        def get_layer(prefix: str):
            layer_idx = int(prefix.rsplit(".", 1)[1])
            layer_class = FalconH1ParallelHybrid
            return layer_class(
                config,
                layer_idx,
                model_config=model_config,
                cache_config=cache_config,
                quant_config=quant_config,
                prefix=prefix,
            )

        self.start_layer, self.end_layer, self.layers = make_layers(
            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers")
        self.make_empty_intermediate_tensors = (
            make_empty_intermediate_tensors_factory(
                ["hidden_states", "residual"], config.hidden_size))
        if get_pp_group().is_last_rank:
            self.final_layernorm = RMSNorm(config.hidden_size,
                                           eps=config.rms_norm_eps)
        else:
            self.final_layernorm = PPMissingLayer()

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.embed_tokens(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor:
        if get_pp_group().is_first_rank:
            if inputs_embeds is not None:
                hidden_states = inputs_embeds * self.embedding_multiplier
            else:
                hidden_states = (self.get_input_embeddings(input_ids) *
                                 self.embedding_multiplier)
        else:
            assert intermediate_tensors is not None
            hidden_states = intermediate_tensors["hidden_states"]

        for layer in islice(self.layers, self.start_layer, self.end_layer):
            hidden_states = layer(positions=positions,
                                  hidden_states=hidden_states)

        if not get_pp_group().is_last_rank:
            return IntermediateTensors({"hidden_states": hidden_states})

        hidden_states = self.final_layernorm(hidden_states)
        return hidden_states


class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
                          IsHybrid, SupportsMambaPrefixCaching):
    packed_modules_mapping = {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
    }
    embedding_modules = {
        "embed_tokens": "input_embeddings",
        "lm_head": "output_embeddings",
    }

    @classmethod
    def get_mamba_state_dtype_from_config(
        cls,
        vllm_config: "VllmConfig",
    ) -> tuple[torch.dtype, torch.dtype]:
        return MambaStateDtypeCalculator.mamba2_state_dtype(
            vllm_config.model_config.dtype,
            vllm_config.cache_config.mamba_cache_dtype,
            vllm_config.cache_config.mamba_ssm_cache_dtype,
        )

    @classmethod
    def get_mamba_state_shape_from_config(
        cls,
        vllm_config: "VllmConfig",
    ) -> tuple[tuple[int, int], tuple[int, int, int]]:
        """Calculate shapes for Mamba's convolutional and state caches.

        Args:
            vllm_config: vLLM config

        Returns:
            Tuple containing:
            - conv_state_shape: Shape for convolutional state cache
            - temporal_state_shape: Shape for state space model cache
        """
        parallel_config = vllm_config.parallel_config
        hf_config = vllm_config.model_config.hf_config
        intermediate_size = (int(hf_config.mamba_expand *
                                 hf_config.hidden_size)
                             if hf_config.mamba_d_ssm is None else
                             hf_config.mamba_d_ssm)
        return MambaStateShapeCalculator.mamba2_state_shape(
            intermediate_size=intermediate_size,
            tp_world_size=parallel_config.tensor_parallel_size,
            n_groups=hf_config.mamba_n_groups,
            num_heads=hf_config.mamba_n_heads,
            head_dim=hf_config.mamba_d_head,
            state_size=hf_config.mamba_d_state,
            conv_kernel=hf_config.mamba_d_conv,
        )

    @classmethod
    def get_mamba_state_copy_func(
            cls) -> tuple[MambaStateCopyFunc, MambaStateCopyFunc]:
        return MambaStateCopyFuncCalculator.mamba2_state_copy_func()

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        config = vllm_config.model_config.hf_config
        self.vllm_config = vllm_config
        self.model_config = vllm_config.model_config
        scheduler_config = vllm_config.scheduler_config
        self.quant_config = vllm_config.quant_config

        super().__init__()
        self.config = config
        self.scheduler_config = scheduler_config
        self.model = FalconH1Model(vllm_config=vllm_config,
                                   prefix=maybe_prefix(prefix, "model"))
        self.tie_word_embeddings = config.tie_word_embeddings
        if get_pp_group().is_last_rank:
            self.lm_head = ParallelLMHead(
                config.vocab_size,
                config.hidden_size,
                prefix=maybe_prefix(prefix, "lm_head"),
            )
            self.lm_head_multiplier = config.lm_head_multiplier
            if self.tie_word_embeddings:
                self.lm_head = self.lm_head.tie_weights(
                    self.model.embed_tokens)
            # Logits are scaled by the checkpoint's lm_head multiplier.
            self.logits_processor = LogitsProcessor(
                config.vocab_size,
                config.vocab_size,
                scale=config.lm_head_multiplier)
        else:
            self.lm_head = PPMissingLayer()
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.model.get_input_embeddings(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
        **kwargs,
    ):
        hidden_states = self.model(input_ids, positions, intermediate_tensors,
                                   inputs_embeds)
        return hidden_states

    def compute_logits(self,
                       hidden_states: torch.Tensor) -> torch.Tensor | None:
        logits = self.logits_processor(self.lm_head, hidden_states)
        return logits

    def load_weights(
            self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]

        params_dict = dict(self.named_parameters())
        loaded_params: set[str] = set()
        for name, loaded_weight in weights:
            if "rotary_emb.inv_freq" in name:
                continue
            if "A_log" in name:
                name = name.replace("A_log", "A")
            # The SSM decoder layer wraps its MambaMixer2 in an attribute
            # that is itself called "mamba".
            if "mamba" in name:
                name = name.replace("mamba", "mamba.mamba")
            if "scale" in name:
                name = maybe_remap_kv_scale_name(name, params_dict)
                if name is None:
                    continue
            for param_name, weight_name, shard_id in stacked_params_mapping:
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                if is_pp_missing_parameter(name, self):
                    continue
                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                if name.endswith(".bias") and name not in params_dict:
                    continue
                if is_pp_missing_parameter(name, self):
                    continue
                # A tied lm_head is materialized from embed_tokens instead.
                if self.tie_word_embeddings and "lm_head" in name:
                    continue
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)
            loaded_params.add(name)
        if self.tie_word_embeddings:
            loaded_params.add("lm_head.weight")
        return loaded_params
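

# Worked example (illustrative helper with assumed formulas, not part of the
# original module) of the two cache shapes described in
# get_mamba_state_shape_from_config for a single tensor-parallel rank: the
# conv cache covers the x/B/C channels over conv_kernel - 1 steps, and the
# temporal cache holds one (head_dim x state_size) SSM state per head.
def _example_mamba2_state_shapes(
    intermediate_size: int = 1024,
    n_groups: int = 1,
    num_heads: int = 16,
    head_dim: int = 64,
    state_size: int = 128,
    conv_kernel: int = 4,
) -> tuple[tuple[int, int], tuple[int, int, int]]:
    conv_state_shape = (intermediate_size + 2 * n_groups * state_size,
                        conv_kernel - 1)
    temporal_state_shape = (num_heads, head_dim, state_size)
    # Defaults give ((1280, 3), (16, 64, 128)).
    return conv_state_shape, temporal_state_shape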