o
    
۾ie                  
   @   s  d Z ddlmZ ddlZddlmZ ddlmZ ddlmZ ddl	m
Z
mZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6 ddl7m8Z8m9Z9m:Z:m;Z;m<Z< G dd dej=Z>G dd dej=Z?G dd dej=Z@e?e>d ZAeG d!d" d"ej=ZBG d#d$ d$ej=e1e3e5e2e6e4	ZCdS )%z&Inference-only GraniteMoeHybrid model.    )IterableN)nn)GraniteMoeHybridConfig)support_torch_compile)CacheConfigModelConfig
VllmConfig)$get_tensor_model_parallel_world_size)get_pp_group)	Attention)RMSNorm)QKVParallelLinearRowParallelLinear)LogitsProcessor)MambaMixer2)MambaStateCopyFuncMambaStateCopyFuncCalculatorMambaStateDtypeCalculatorMambaStateShapeCalculator)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)IntermediateTensors   )GraniteMoeMoE)GraniteMoeSharedMLP)HasInnerStateIsHybridSupportsLoRASupportsMambaPrefixCaching
SupportsPPSupportsQuant)AutoWeightsLoaderis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       sj   e Zd Z				ddedededB dedB dedB ded	df fd
dZ	de
jde
jdB fddZ  ZS )!GraniteMoeHybridMambaDecoderLayerN config	layer_idxmodel_configcache_configquant_configprefixreturnc                    s   t    || _|j| _|j| _t|j|j|j|j|j |j	|j
|j|j|j|j|j|||| dd| _d | _t|dddkrRt|j|j|j|j|| dd| _t|dddkr\d n	t||| dd	| _t|j|jd
| _t|j|jd
| _d S )Nz.mixer)hidden_sizessm_state_sizeconv_kernel_sizeintermediate_sizeuse_conv_biasuse_biasn_groups	num_headshead_dimrms_norm_eps
activationr-   r.   r/   r0   num_local_expertsr   .block_sparse_moenum_expertstop_kr2   r5   r/   r0   shared_intermediate_size.shared_mlpr/   r0   eps)super__init__r+   r2   residual_multiplierr   mamba_d_statemamba_d_convmamba_expandmamba_conv_biasmamba_proj_biasmamba_n_groupsmamba_n_headsmamba_d_headr;   
hidden_actmambablock_sparse_moegetattrr   r=   num_experts_per_tokr5   r   
shared_mlpr   input_layernormpost_attention_layernormselfr+   r,   r-   r.   r/   r0   	__class__ _/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/granitemoehybrid.pyrH   8   sR   
	
z*GraniteMoeHybridMambaDecoderLayer.__init__hidden_statesresidualc                 K   s   |}|  |}| |}||| j  }|}| |}| jd u r*| jd ur)| |}n| jd urA| }| |}|| | }~n| |}||| j  }||fS N)rX   rS   rI   rY   rW   rT   clone)r[   r`   ra   kwargsoutputmoe_hidden_statesr^   r^   r_   forwardp   s$   








z)GraniteMoeHybridMambaDecoderLayer.forwardNNNr*   __name__
__module____qualname__r   intr   r   r   strrH   torchTensorrg   __classcell__r^   r^   r\   r_   r)   7   s2    8r)   c                       sv   e Zd Z				ddedededB dedB dedB ded	df fd
dZ	de
jde
jde
jdB d	e
jfddZ  ZS )%GraniteMoeHybridAttentionDecoderLayerNr*   r+   r,   r-   r.   r/   r0   r1   c                    s   t    |j| _|j| _t|||| dd| _d | _t|dddkr6t|j	|j
|j|j|| dd| _t|dddkr@d n	t||| dd	| _t|j|jd
| _t|j|jd
| _d S )Nz
.self_attn)r.   r/   r0   r=   r   r>   r?   rB   rC   rD   rE   )rG   rH   r2   rI   GraniteMoeHybridAttention	self_attnrT   rU   r   r=   rV   r5   r   rW   r   r;   rX   rY   rZ   r\   r^   r_   rH      s:   
	z.GraniteMoeHybridAttentionDecoderLayer.__init__	positionsr`   ra   c                 C   s   |}|  |}| j||d}||| j  }|}| |}| jd u r,| jd ur+| |}n| jd urC| }| |}|| | }~n| |}||| j  }||fS )N)ru   r`   )rX   rt   rI   rY   rW   rT   rc   )r[   ru   r`   ra   rf   r^   r^   r_   rg      s*   







z-GraniteMoeHybridAttentionDecoderLayer.forwardrh   ri   r^   r^   r\   r_   rr      s:    ,rr   c                       sh   e Zd Z				ddededB dedB dedB deddf fd	d
Zde	j
de	j
de	j
fddZ  ZS )rs   Nr*   r+   r-   r.   r/   r0   r1   c              	      sH  t    d| _|j| _|j| _|j| _|j| _| j| j | _|j	| _
t }| j| dks/J | j| | _| j
|krD| j
| dksCJ n	|| j
 dksMJ td| j
| | _	t| j| j| j| j
| j|| dd| _t| j| j| j|| dd| _|jdkrt| j|j|jdd| _nd | _t| j| j| j| j	||| d	d
| _d S )NTr   r   	.qkv_proj)biasr/   r0   z.o_projrope)max_positionrope_parametersis_neox_stylez.attn)num_kv_headsr.   r/   r0   )rG   rH   causalr2   attention_biasattention_multipliernum_attention_headstotal_num_headsr:   num_key_value_headstotal_num_kv_headsr	   r9   maxr   qkv_projr   o_projposition_embedding_typer   max_position_embeddingsrz   
rotary_embr   attn)r[   r+   r-   r.   r/   r0   tp_sizer\   r^   r_   rH      s`   




z"GraniteMoeHybridAttention.__init__ru   r`   c                 C   s   |  |\}}|j| j| j | j| j | j| j gdd\}}}| jd ur.| |||\}}| |||}~~~| |d }|S )Ndimr   )r   splitr9   r:   r   r   r   r   )r[   ru   r`   qkv_querykeyvaluer^   r^   r_   rg   %  s   



	z!GraniteMoeHybridAttention.forwardrh   )rj   rk   rl   r   r   r   r   rn   rH   ro   rp   rg   rq   r^   r^   r\   r_   rs      s2    Ers   )	attentionrS   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z		ddejdB dejde	dB dejdB dejf
ddZ
deeeeeef  fddZdeeeejf  dee fddZ  ZS )GraniteMoeHybridModelr*   r0   vllm_configr0   c                   s   t    |jj|j|j |j| _| _j| _t| jj	| _
j| _dtf fdd}tj|| dd\| _| _| _tddgj	| _tj	jd| _d S )	Nr0   c                    s6   t | ddd }tj|  }|| | dS )N.r   rD   )rm   rsplitALL_DECODER_LAYER_TYPESlayer_types)r0   r,   layer_classr.   r+   r-   r/   r^   r_   	get_layerY  s   z1GraniteMoeHybridModel.__init__.<locals>.get_layerz.layersr   r`   ra   rE   )rG   rH   r-   	hf_configr.   r/   r+   
vocab_sizer   r2   embed_tokensembedding_multiplierrn   r'   num_hidden_layersstart_layer	end_layerlayersr&   make_empty_intermediate_tensorsr   r;   norm)r[   r   r0   r   r\   r   r_   rH   F  s*   

zGraniteMoeHybridModel.__init__	input_idsr1   c                 C   s
   |  |S rb   )r   r[   r   r^   r^   r_   embed_input_idsn  s   
z%GraniteMoeHybridModel.embed_input_idsNru   intermediate_tensorsinputs_embedsc           
      C   s   t  jr|d ur|}n
| |}|| j }d }n|d u r td|d }|d }d}t| jD ]\}}	t|	tr<|d7 }|	|||d\}}q/t  j	sQt
||dS | |}|S )Nz%Intermediate tensors may not be None!r`   ra   r   r   )ru   r`   ra   )r`   ra   )r
   is_first_rankr   r   RuntimeError	enumerater   
isinstancerr   is_last_rankr   r   )
r[   r   ru   r   r   r`   ra   num_attnilayerr^   r^   r_   rg   q  s.   



zGraniteMoeHybridModel.forwardc                    s.   dd d| j j} fddt|D S )N	gate_proj	down_projup_projc              	      sV   g | ]'}d fd fdffD ]\}}|fv rdndd| d| d||fqqS )w1w2w3zblock_sparse_moe.experts.w13_zblock_sparse_moe.experts.w2_zblock_sparse_moe.experts.r   r^   ).0	expert_idshard_idweight_nameckpt_down_proj_nameckpt_gate_proj_nameckpt_up_proj_namer^   r_   
<listcomp>  s     
z<GraniteMoeHybridModel.get_expert_mapping.<locals>.<listcomp>)r+   r=   range)r[   r@   r^   r   r_   get_expert_mapping  s   
z(GraniteMoeHybridModel.get_expert_mappingweightsc              	      sB  g d}t  t   fdd}fdd}fdd} fdd	}|D ]\}}d
|v r?|d
d}jd urej| }	re|}
|
 dkrV|
n|
d }
||	|
 |	 q1|||rkq1|	dsu|	drt
|dD ];}|dd| d}|dd| d}|| jddd\}}||dd||d|d ||dd||d|d q|q1|	ds|	drt
|dD ]}|dd| d}|| }||dd||d|d qq1|	dr|dd }||| q1d!}|D ]\}}}||v r||||||d" d#}q|s||| q1S )$N))rv   z.q_projq)rv   z.k_projk)rv   z.v_projvc                    s,   |  }t |dt}|||  |  d S Nweight_loaderrU   r   add)npparamr   loaded_paramsparams_dictr^   r_   _load  s   
z1GraniteMoeHybridModel.load_weights.<locals>._loadc                    s<   t | s|  }t|dt}||||  |  d S d S r   )r%   rU   r   r   )r   r   r   r   r   )r   r   r[   r^   r_   _load_shard  s   
z7GraniteMoeHybridModel.load_weights.<locals>._load_shardc                    s4   |  }t |dt}||||||d  |  d S )Nr   r   r   r   )r   r   namer   r   r   r   r   r^   r_   _load_expert  s   z8GraniteMoeHybridModel.load_weights.<locals>._load_expertc              	      sv    D ]6}|\}}}}|| vrq|  ||}t|rq| }|j}	d}
|	d ur2|	|||||dd}
|
r8|  S qd S )NFT)r   r   return_success)replacer%   r   )r   loaded_weightmapping
param_namer   r   r   name_mappedr   r   success)expert_params_mappingr   r[   r^   r_   _load_quant_expert  s.   
	z>GraniteMoeHybridModel.load_weights.<locals>._load_quant_expertA_logAr   z%.block_sparse_moe.input_linear.weightz+.block_sparse_moe.input_linear.weight_scalez.block_sparse_moe.experts.z
.w1.weightz
.w3.weight   r   z.input_linear.z.experts.w13_r   r   r   z&.block_sparse_moe.output_linear.weightz,.block_sparse_moe.output_linear.weight_scalez
.w2.weightz.output_linear.z.experts.w2_r   z%.block_sparse_moe.router.layer.weightz.block_sparse_moe.gate.weightF)r   T)dictnamed_parameterssetr   r   r/   get_cache_scaler   r   endswithr   sizechunk)r[   r   stacked_params_mappingr   r   r   r   r   r   
scale_namer   ew1_namew3_namew1_paramw3_paramw2_namew2_param	gate_nameloadedr   r   r   r^   )r   r   r   r[   r_   load_weights  s   












z"GraniteMoeHybridModel.load_weightsNN)rj   rk   rl   r   rn   rH   ro   rp   r   r   rg   listtuplerm   r   r   r   r   rq   r^   r^   r\   r_   r   D  s$    (
$,r   c                
       s@  e Zd Zg ddgdgdgdZdddZed	d
deejejf fddZ	ed	d
deee
e
f ee
e
e
f f fddZedeeef fddZddd	edef fddZdejdejfddZ		d&dejdB dejdedB dejdB fddZd ejdejdB fd!d"Zd#eeeejf  dee fd$d%Z  ZS )'GraniteMoeHybridForCausalLM)q_projk_projv_projconv1din_projinput_linear)r   r  r  r  input_embeddingsoutput_embeddings)r   lm_headr   r   r1   c                 C   s   t |jj|jj|jjS rb   )r   mamba2_state_dtyper-   dtyper.   mamba_cache_dtypemamba_ssm_cache_dtype)clsr   r^   r^   r_   !get_mamba_state_dtype_from_config^  s
   z=GraniteMoeHybridForCausalLM.get_mamba_state_dtype_from_configc              	   C   s>   |j }|jj}|j|j }tj||j|j|j	|j
|j|jdS )a3  Calculate shapes for Mamba's convolutional and state caches.

        Args:
            vllm_config: vLLM config

        Returns:
            Tuple containing:
            - conv_state_shape: Shape for convolutional state cache
            - temporal_state_shape: Shape for state space model cache
        )r5   tp_world_sizer8   r9   r:   
state_sizeconv_kernel)parallel_configr-   r   rL   r2   r   mamba2_state_shapetensor_parallel_sizerO   rP   rQ   rJ   rK   )r  r   r  r   r5   r^   r^   r_   !get_mamba_state_shape_from_configi  s   z=GraniteMoeHybridForCausalLM.get_mamba_state_shape_from_configc                 C   s   t  S rb   )r   mamba2_state_copy_func)r  r^   r^   r_   get_mamba_state_copy_func  s   z5GraniteMoeHybridForCausalLM.get_mamba_state_copy_funcr*   r   r0   c                   s   t    |jj}|| _|j| _|j}|j| _|| _|| _t|t	|dd| _
t|j|j| jt	|dd| _|jr@| j
jj| j_t|j|jd| jj d| _| j
j| _d S )Nmodel)r   r0   r  rD   r   )scale)rG   rH   r-   r   r   scheduler_configr/   r+   r   r(   r  r   r   r2   r  tie_word_embeddingsr   weightr   logits_scalinglogits_processorr   )r[   r   r0   r+   r  r\   r^   r_   rH     s4   


z$GraniteMoeHybridForCausalLM.__init__r   c                 C   s   | j |S rb   )r  r   r   r^   r^   r_   r     s   z+GraniteMoeHybridForCausalLM.embed_input_idsNru   r   r   c                 K   s   |  ||||}|S rb   )r  )r[   r   ru   r   r   rd   r`   r^   r^   r_   rg     s   z#GraniteMoeHybridForCausalLM.forwardr`   c                 C   s   |  | j|}|S rb   )r  r  )r[   r`   logitsr^   r^   r_   compute_logits  s   z*GraniteMoeHybridForCausalLM.compute_logitsr   c                 C   s   t | }||S rb   )r$   r   )r[   r   loaderr^   r^   r_   r     s   
z(GraniteMoeHybridForCausalLM.load_weightsr   )rj   rk   rl   packed_modules_mappingembedding_modulesclassmethodr   ro   r  r  rm   r  r   r  r   rn   rH   rp   r   r   rg   r  r   r   r   rq   r^   r^   r\   r_   r   F  sT    

!

,r   )D__doc__collections.abcr   ro   r   transformersr   vllm.compilation.decoratorsr   vllm.configr   r   r   vllm.distributedr	   vllm.distributed.parallel_stater
   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   +vllm.model_executor.layers.logits_processorr   -vllm.model_executor.layers.mamba.mamba_mixer2r   ,vllm.model_executor.layers.mamba.mamba_utilsr   r   r   r   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   vllm.sequencer   
granitemoer   granitemoesharedr   
interfacesr   r   r    r!   r"   r#   utilsr$   r%   r&   r'   r(   Moduler)   rr   rs   r   r   r   r^   r^   r^   r_   <module>   sV    	XP`  
