# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Inference-only LFM2 model: a hybrid of grouped-query attention layers and
gated short-convolution layers."""

from collections.abc import Iterable
from itertools import islice

import torch
import torch.nn as nn
from transformers import Lfm2Config

from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, ModelConfig, VllmConfig
from vllm.distributed import (get_pp_group,
                              get_tensor_model_parallel_world_size)
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
                                               QKVParallelLinear,
                                               RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.mamba.mamba_utils import (
    MambaStateCopyFunc, MambaStateCopyFuncCalculator,
    MambaStateDtypeCalculator, MambaStateShapeCalculator)
from vllm.model_executor.layers.mamba.short_conv import ShortConv
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.sequence import IntermediateTensors

from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP,
                         SupportsQuant)
from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
                    is_pp_missing_parameter,
                    make_empty_intermediate_tensors_factory, make_layers,
                    maybe_prefix)
dZde	j
de	j
fddZ  ZS )Lfm2MLPN dimff_dimmultiple_ofauto_adjust_ff_dimffn_dim_multiplierquant_configprefixc                    s   t    |r#td| d }|d urt|| }||| d |  }t||gd d|| dd| _t||d|| dd| _t | _d S )	N      r   F.w1)
input_sizeoutput_sizesbiasr1   r2   z.w2r6   output_sizer8   r1   r2   )	super__init__intr   w1r   w2r   act_fn)selfr,   r-   r.   r/   r0   r1   r2   	__class__ \/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/lfm2.pyr<   3   s*   

zLfm2MLP.__init__xreturnc                 C   s*   |  |\}}| |}| |\}}|S N)r>   r@   r?   )rA   rF   gate_up_rD   rD   rE   forwardU   s   
zLfm2MLP.forward)Nr+   )__name__
__module____qualname__r=   boolfloatr   strr<   torchTensorrK   __classcell__rD   rD   rB   rE   r*   2   s&    "r*   c                       st   e Zd Z				ddededededed	ed
edB dedB deddf fddZde	j
de	j
de	j
fddZ  ZS )Lfm2Attention    Nr+   config	layer_idxhidden_size	num_headsnum_kv_headsmax_position_embeddingscache_configr1   r2   rG   c
              	      sx  t    || _|| _|| _t }
|| _| j|
 dksJ | j|
 | _|| _| j|
kr5| j|
 dks4J n	|
| j dks>J t	d| j|
 | _| j| j | _
| j| j
 | _| j| j
 | _| j
d | _|| _t| j| j
| j| jd||	 dd| _t| j| j
 | jd||	 dd| _t| j
| j|jd	d
| _t| j| j
| j| j||	 dd| _t| j
|jd| _t| j
|jd| _d S )Nr   r   g      F	.qkv_proj)rY   	head_sizetotal_num_headstotal_num_kv_headsr8   r1   r2   z	.out_projr9   T)max_positionrope_parametersis_neox_stylez.attn)r[   r]   r2   eps)r;   r<   rX   rY   r[   r   r`   rZ   ra   maxhead_dimq_sizekv_sizescalingr\   r   qkv_projr   out_projr   rc   
rotary_embr   attnr   norm_epsq_layernormk_layernorm)rA   rW   rX   rY   rZ   r[   r\   r]   r1   r2   tp_sizerB   rD   rE   r<   ]   sd   

	
zLfm2Attention.__init__	positionshidden_statesc                 C   s   |j \}}| |\}}|j| j| j| jgdd\}}}||| j| j }||| j	| j }| 
|}| |}| |||\}}||| j| j }||| j	| j }| |||}	| |	\}
}|
S )N)r,   )shaperl   splitri   rj   viewrZ   rh   
contiguousr[   rq   rr   rn   ro   rm   )rA   rt   ru   n_tokensrJ   qkvqkvattn_outputoutputrD   rD   rE   rK      s   
 

zLfm2Attention.forward)rV   NNr+   )rL   rM   rN   r   r=   r   r   rQ   r<   rR   rS   rK   rT   rD   rD   rB   rE   rU   \   sB    	
FrU   c                       s   e Zd Z				ddedededB dedB dedB ded	df fd
dZ	de
jde
jde
jdB d	ee
je
jf fddZ  ZS )Lfm2AttentionDecoderLayerNr+   rW   rX   model_configr]   r1   r2   rG   c                    s   t    || _|| _|| _t|dd}t|||j|j|j	|||| dd	| _
t|j|j|j|j|j|| dd| _t|j|jd| _t|j|jd| _d S )Nr\   rV   z
.self_attn)	rW   rX   rY   rZ   r[   r\   r]   r1   r2   .feed_forwardr,   r-   r.   r/   r0   r1   r2   re   )r;   r<   r2   rW   rX   getattrrU   rY   num_attention_headsnum_key_value_heads	self_attnr*   	block_dimblock_ff_dimblock_multiple_ofblock_auto_adjust_ff_dimblock_ffn_dim_multiplierfeed_forwardr   rp   operator_normffn_norm)rA   rW   rX   r   r]   r1   r2   r\   rB   rD   rE   r<      s6   
		z"Lfm2AttentionDecoderLayer.__init__rt   ru   residualc                 K   sT   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| ||fS )N)rt   ru   )r   r   r   r   )rA   rt   ru   r   kwargsrD   rD   rE   rK      s   z!Lfm2AttentionDecoderLayer.forwardNNNr+   )rL   rM   rN   r   r=   r   r   r   rQ   r<   rR   rS   tuplerK   rT   rD   rD   rB   rE   r      s:    (r   c                       sj   e Zd Z				ddedededB dedB dedB ded	df fd
dZ	de
jde
jdB fddZ  ZS )Lfm2ShortConvDecoderLayerNr+   rW   rX   r   r]   r1   r2   rG   c              	      s~   t    || _t||j|||| dd| _t|j|j|j	|j
|j|| dd| _t|j|jd| _t|j|jd| _d S )Nz.conv)rW   r,   rX   r   r]   r2   r   r   re   )r;   r<   rX   r   conv_dim
short_convr*   r   r   r   r   r   r   r   rY   rp   r   r   )rA   rW   rX   r   r]   r1   r2   rB   rD   rE   r<      s*   
			z"Lfm2ShortConvDecoderLayer.__init__ru   r   c                 K   s`   |d u r|}|  |}n|  ||\}}t|}| || | ||\}}| |}||fS rH   )r   rR   
empty_liker   r   r   )rA   ru   r   r   r   rD   rD   rE   rK     s   

z!Lfm2ShortConvDecoderLayer.forwardr   )rL   rM   rN   r   r=   r   r   r   rQ   r<   rR   rS   rK   rT   rD   rD   rB   rE   r      s2     r   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z		ddejdejde	dB dejdB dejf
ddZ
deeeejf  dee fddZ  ZS )	Lfm2Modelr+   r2   vllm_configr2   c                   s   t    |jj|j|j |j_j_tjj	jd_
dtf fdd}tj|| dd\___tddgj	_t jrZtj	jd	_d S t _d S )
N)org_num_embeddingsr2   c                    s8   t | }jj| dk}|rtnt}|| | dS )Nfull_attentionr1   r2   )r%   rW   layer_typesr   r   )r2   rX   is_attnlayer_classr]   rW   r   r1   rA   rD   rE   	get_layer9  s   
z%Lfm2Model.__init__.<locals>.get_layerz.layersr   ru   r   re   )r;   r<   r   	hf_configr]   r1   rW   
vocab_sizer   rY   embed_tokensrQ   r(   num_hidden_layersstart_layer	end_layerlayersr'   make_empty_intermediate_tensorsr
   is_last_rankr   rp   embedding_normr$   )rA   r   r2   r   rB   r   rE   r<   )  s(   

zLfm2Model.__init__	input_idsrG   c                 C   s
   |  |S rH   )r   rA   r   rD   rD   rE   embed_input_idsT  s   
zLfm2Model.embed_input_idsNrt   intermediate_tensorsinputs_embedsc           	      C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }t| j| j| jD ]}||||d\}}q*t  jsAt||dS | 	||\}}|S )Nru   r   )rt   ru   r   )ru   r   )
r
   is_first_rankr   r   r   r   r   r   r   r   )	rA   r   rt   r   r   ru   r   layerrJ   rD   rD   rE   rK   W  s(   
zLfm2Model.forwardweightsc                 C   s   g d}t |  }t }|D ]P\}}d|v r|ddd}|D ]$\}}}	||vr*q |||}t|| r6q || }
|
j}||
||	  nt|| rKq|| }
t|
dt}||
| || q|S )N))r^   z.q_projr}   )r^   z.k_projr~   )r^   z.v_projr   )r5   r5   r   )r5   z.w3r   z.conv.z.short_conv.r   weight_loader)	dictnamed_parameterssetreplacer&   r   r   r   add)rA   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr   rD   rD   rE   load_weightsv  s.   


zLfm2Model.load_weightsNN)rL   rM   rN   r	   rQ   r<   rR   rS   r   r   rK   r   r   r   r   rT   rD   rD   rB   rE   r   '  s"    +
,r   c                       s,  e Zd Zg dddgdgdZdddZed	d
deejdf fddZ	ed	d
deee
e
f  fddZedee fddZddd	ededdf fddZdejdejfddZ		d'dejdejdedB dejdB dejf
dd Zd!ejdejfd"d#Zd$eeeejf  dee fd%d&Z  ZS )(Lfm2ForCausalLM)q_projk_projv_projr>   w3in_proj)rl   r>   r   input_embeddingsoutput_embeddings)r   lm_headr   r	   rG   .c                 C   s   t |jj|jjS rH   )r   short_conv_state_dtyper   dtyper]   mamba_cache_dtype)clsr   rD   rD   rE   !get_mamba_state_dtype_from_config  s   z1Lfm2ForCausalLM.get_mamba_state_dtype_from_configc                 C   s$   |j }|jj}tj|j|j|jdS )zCalculate shapes for LFM2's convolutional cache.

        Args:
            vllm_config: vLLM config

        Returns:
            Tuple containing:
            - conv_state_shape: Shape for convolutional state cache
        )tp_world_sizeintermediate_sizeconv_kernel)parallel_configr   r   r   short_conv_state_shapetensor_parallel_sizer   conv_L_cache)r   r   r   r   rD   rD   rE   !get_mamba_state_shape_from_config  s   z1Lfm2ForCausalLM.get_mamba_state_shape_from_configc                 C   s   t  S rH   )r   short_conv_state_copy_func)r   rD   rD   rE   get_mamba_state_copy_func  s   z)Lfm2ForCausalLM.get_mamba_state_copy_funcr+   r   r2   Nc                   s   |j j}|j}|j}|jdkrtdt   || _t	|t
|dd| _t jrAt|j|j|t
|dd| _| j| jj| _nt | _t|j| _| jj| _d S )NallzcLfm2 currently does not support 'all' prefix caching, please use '--mamba-cache-mode=align' insteadmodel)r   r2   r   r   )r   r   r1   r]   mamba_cache_modeNotImplementedErrorr;   r<   rW   r   r)   r   r
   r   r   r   rY   r   tie_weightsr   r$   r   logits_processorr   )rA   r   r2   rW   r1   r]   rB   rD   rE   r<     s0   


zLfm2ForCausalLM.__init__r   c                 C   s   | j |S rH   )r   r   r   rD   rD   rE   r     s   zLfm2ForCausalLM.embed_input_idsrt   r   r   c                 K   s   |  ||||}|S rH   )r   )rA   r   rt   r   r   r   ru   rD   rD   rE   rK     s   zLfm2ForCausalLM.forwardru   c                 C   s   |  | j|}|S rH   )r   r   )rA   ru   logitsrD   rD   rE   compute_logits  s   zLfm2ForCausalLM.compute_logitsr   c                 C   s$   t | | jjr	dgnd d}||S )Nzlm_head.)skip_prefixes)r#   rW   tie_word_embeddingsr   )rA   r   loaderrD   rD   rE   r   	  s
   
zLfm2ForCausalLM.load_weightsr   )rL   rM   rN   packed_modules_mappingembedding_modulesclassmethodr   rR   r   r   r=   r   r   r   r	   rQ   r<   rS   r   r   rK   r   r   r   r   rT   rD   rD   rB   rE   r     sR    	 !
,r   )Ecollections.abcr   	itertoolsr   rR   torch.nnnntransformersr   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr   r   r	   vllm.distributedr
   r   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   ,vllm.model_executor.layers.mamba.mamba_utilsr   r   r   r   +vllm.model_executor.layers.mamba.short_convr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   vllm.sequencer   
interfacesr   r   r    r!   r"   utilsr#   r$   r%   r&   r'   r(   r)   Moduler*   rU   r   r   r   r   rD   rD   rD   rE   <module>   s>   $*[:6
q
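# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): this module is not imported directly;
# vLLM's model registry resolves it when a checkpoint's config lists
# "Lfm2ForCausalLM" among its architectures. The checkpoint name below is an
# assumption for illustration, not something this file defines.
#
#     from vllm import LLM, SamplingParams
#
#     llm = LLM(model="LiquidAI/LFM2-1.2B")  # assumed checkpoint name
#     out = llm.generate(["Hello"], SamplingParams(max_tokens=32))
#     print(out[0].outputs[0].text)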