o
    i	r                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z" ddl#m$Z$m%Z% G dd deZ&G dd de&Z'dS )    N)PretrainedConfig)envs)
LoRAConfig)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)divide)BaseLayerWithLoRA)get_lora_op_configs)FusedMoE)_get_config_dtype_str)MarlinExperts)TritonExperts)FusedMoEModularMethod)UnfusedOAITritonExperts)FusedMoEModularKernel)MoEPrepareAndFinalizeNoEP   )_get_lora_devicetry_get_optimal_moe_lora_configc                       s  e Zd Zdeddf fddZdeeedB f deeedB f fddZd	ed
ededededededefddZ	dd Z
dedefddZdedefddZ	d?dedededB ddfddZdejdejfdd Zd!ejfd"d#Zd$ejdejfd%d&Zd'ejdejfd(d)Zd*efd+d,Zd*ed-ejeej B d.ejeej B fd/d0Zd1d2 Zd3d4 Zed5d6 Zed7d8 Zedefd9d:Ze 	d?d;e!j"ded<ededB def
d=d>Z#  Z$S )@FusedMoEWithLoRA
base_layerreturnNc                    sX   t    || _| jjrJ dt | _t | _t|| _	|j
jr#dnd| _|   d S )Nz5EP support for Fused MoE LoRA is not implemented yet.   r   )super__init__r   use_epr   tp_sizer   tp_rankr   device
moe_configis_act_and_mul_w13_slices_inject_lora_into_fused_moeselfr   	__class__ P/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/lora/layers/fused_moe.pyr   -   s   


zFusedMoEWithLoRA.__init__configc                 C   sZ   i }|  D ]$\}}| r$|drd|dd   }n| }n|}|||< q|S )Nblock_BLOCK_SIZE__)itemsislower
startswithsplitupper)r$   r)   normalized_configkeyvaluenormalized_keyr'   r'   r(   _normalize_keys<   s   


z FusedMoEWithLoRA._normalize_keys	op_prefix	num_lorasrank
num_slicesMlayertop_kconfig_dtypec	              
   C   s   t jr(|j}	|j}
td| d|||	|||
d}td| d|||	|||
d}n(tjt|j	 |j
	 |||||jjjd}|d| dd}|d| dd}| |}| |}||fS )Nfused_moe_lora__shrink)op_type	max_lorasbatchhidden_sizer:   r;   moe_intermediate_size_expand)w1_shapew2_shaper:   r>   dtyper<   block_shape)rB   )r   VLLM_TUNED_CONFIG_FOLDERrE   intermediate_size_per_partitionr	   	functoolspartialr   
w13_weightsize	w2_weightquant_methodmoe_quant_configrK   r7   )r$   r8   r9   r:   r;   r<   r=   r>   r?   rE   intermediate_sizeshrink_configexpand_configget_config_funcr'   r'   r(   _get_lora_moe_configsI   sP   
	






z&FusedMoEWithLoRA._get_lora_moe_configsc                    s  i  j jj   j jj}tj jddrj jj}nt }t|j j	|j j j
}|jr>t|jttfs=J nt|jtsFJ  fdd} fdd} fdd}|j}|j |j|_|j |j|_|j |j|_tj j|j _d S )	Nsupports_internal_mkFc                    s    fdd}|S )Nc                     sN   |d d< |d d< |d d< |d d< |d d<  | i |}|S )Nhidden_statestopk_idstopk_weights
expert_mapapply_router_weight_on_inputr'   )argskwargsresult)funcmoe_state_dictr'   r(   wrapper   s   zTFusedMoEWithLoRA._inject_lora_into_fused_moe.<locals>.fwd_decorator.<locals>.wrapperr'   r=   rc   re   )rd   )rc   r(   fwd_decorator   s   zCFusedMoEWithLoRA._inject_lora_into_fused_moe.<locals>.fwd_decoratorc                        fdd}|S )Nc                     s|  | \}}}d }d }d }d }t |jdddd}	tj}
|d}t||
}jd jd }jd	j	|j
||	d
\}}d}|d u oV| | jjj	 k}j|||d jjj	j||\}}}}|d< |d< |d< |d< |d ur|j	d}|j	d}jj|d|jd |jj|||||||jj|d  | i |}|d< |S )Nr[   r]   r\   r^   FrJ   use_fp8_w8a8use_int8_w8a16use_int4_w4a16r   w13r8   r9   r:   r;   r<   r=   r>   r?      BLOCK_SIZE_Msorted_token_ids_loraexpert_ids_loranum_tokens_post_padded_loratoken_lora_mappingr-   )fully_shardedru   intermediate_cache2)r   rJ   r   VLLM_FUSED_MOE_CHUNK_SIZErQ   minw13_lora_a_stackedshaperY   rC   r!   r   local_num_expertspunica_wrappermoe_lora_align_block_sizeadapter_enabledviewadd_lora_fused_moew13_lora_b_stackedrv   )r`   ra   r,   outputinputr[   r]   curr_topk_idsr^   r?   
CHUNK_SIZE
num_tokensr<   max_lora_rankrV   rW   SPARSITY_FACTORnaive_block_assignmentru   rr   rs   rt   rb   rc   r=   rd   r$   r>   r'   r(   re      s   




zTFusedMoEWithLoRA._inject_lora_into_fused_moe.<locals>.act_decorator.<locals>.wrapperr'   rf   rd   r$   r>   rc   r=   r(   act_decorator   s   ]zCFusedMoEWithLoRA._inject_lora_into_fused_moe.<locals>.act_decoratorc                    rh   )Nc                     s*  d }d }t |jdddd}tj}|d}t||}jd jd }jdj	|d||d	\}	}
d
 }d }d }
d}|d urY|j	d}|j	d}d }| d }tjjj}jj||jj||||||	|
jdjjr|j nd|d  | i |}|S )Nr[   r]   Fri   r   rm   w2r   ro   rr   rs   rt   ru   r-   rw   T)rv   offsetru   )r   rJ   r   rx   rQ   ry   w2_lora_a_stackedr{   rY   rC   getr   r   r   rE   r   r}   r   w2_lora_b_stackedr   rv   r   )r`   ra   r[   r]   r?   r   r   r<   r   rV   rW   rr   rs   rt   ru   rw   intermediate_cache3shard_size_w2rb   r   r'   r(   re   	  sr   



zXFusedMoEWithLoRA._inject_lora_into_fused_moe.<locals>.moe_sum_decorator.<locals>.wrapperr'   rf   r   r   r(   moe_sum_decorator  s   AzGFusedMoEWithLoRA._inject_lora_into_fused_moe.<locals>.moe_sum_decorator)r   r>   ensure_moe_quant_config_initrS   rT   getattrmoe_mkr   r   select_gemm_implshared_expertsuse_mxfp4_w4a16
isinstancefused_expertsr   r   r   forward
activationmoe_sumr   )r$   quant_configm_fused_moe_fnprepare_finalizerg   r   r   r   r'   r   r(   r"   ~   sB   



`Dz,FusedMoEWithLoRA._inject_lora_into_fused_moerC   lora_configc                    sP   t  fddtjD _tjjj jjj	f j
jdf_d S )Nc                 3   sJ    | ] }t jjjjs jnt jjjjf j	j
d V  qdS rJ   r   N)torchzerosr   r|   rv   r   r   r   rE   
lora_dtyper   .0r,   r   rC   r$   r'   r(   	<genexpr>^  s    
z:FusedMoEWithLoRA._create_lora_a_weights.<locals>.<genexpr>r   )tupleranger!   rz   r   r   r   r|   r   rM   r   r   r   r$   rC   r   r'   r   r(   _create_lora_a_weightsY  s   

z'FusedMoEWithLoRA._create_lora_a_weightsc                    f   t  fddtjD _tjjjjsjj	nt
jj	j jf jjdf_d S )Nc                 3   s6    | ]}t jjjjj jf jjd V  qdS r   r   r   r   r|   rM   r   r   r   r   r   r'   r(   r   {  s    
z:FusedMoEWithLoRA._create_lora_b_weights.<locals>.<genexpr>r   r   r   r!   r   r   r   r   r|   rv   rE   r   r   r   r   r   r   r   r'   r   r(   _create_lora_b_weightsz     


z'FusedMoEWithLoRA._create_lora_b_weightsmodel_configc                 C   s  |j | _ |j| _tjdg|d  tj| jd| _| || | 	|| g | _
g | _t|D ]^}t| jjD ]U}| j
| jd | |  | j
| jd | |  | j| jd | |  | j| jd | |  | jdkr| j
| jd | |  | j| jd | |  q6q.dS )Initializes lora matrices.r   r   r   r   N)rC   fully_sharded_lorasrv   r   tensorintr   r   r   r   lora_a_stackedlora_b_stackedr   r   r|   appendrz   r   r   r   r!   )r$   rC   r   r   lora_id
experts_idr'   r'   r(   create_lora_weights  sB   
z$FusedMoEWithLoRA.create_lora_weights
w13_lora_ac                 C   sr   | j dks| js
|S |jd }|| j  dksJ | jd jd }| j| }| jd | }|dd||ddf S D
        Applies to FusedMoEWithLoRA and FusedMoE3DWithLoRA
        r   r   r   N)r   rv   r{   rz   r   )r$   r   current_lora_rank
shard_size	start_idxend_idxr'   r'   r(   _slice_w13_a  s   

zFusedMoEWithLoRA._slice_w13_a
w13_lora_bc                 C   sH   | j dkr|S | jj}| j| }| jd | }|d d ||d d f S Nr   r   r   rM   r   )r$   r   r   r   r   r'   r'   r(   _slice_w13_b  s   

zFusedMoEWithLoRA._slice_w13_b	w2_lora_ac                 C   sH   | j dkr|S | jj}| j| }| jd | }|dddd||f S )r   r   Nr   )r$   r   r   r   r   r'   r'   r(   _slice_w2_a  s   

zFusedMoEWithLoRA._slice_w2_a	w2_lora_bc                 C   sV   | j dks| js
|S | jd jd }| j| }| jd | }|dd||ddf S r   )r   rv   r   r{   r   )r$   r   r   r   r   r'   r'   r(   _slice_w2_b  s   
zFusedMoEWithLoRA._slice_w2_bindexc                 C   sV   t | jD ]}d| j| |< d| j| |< qd| jd |< d| jd |< d| j|< dS )z+Resets the lora weights at index back to 0.r   N)r   r!   rz   r   r   r   r   )r$   r   posr'   r'   r(   
reset_lora  s   zFusedMoEWithLoRA.reset_loralora_alora_bc                 C   s&  t |tsJ t |tsJ | | d| j|< | jd jd }|\}}}|\}}	}
||jd   krC|jd   krC|jd ksFJ  J | |}| |}| |}| 	|	}| jd |ddd|jd d|jd f j
|dd | jd |ddd|jd d|jd f j
|dd | jdkr| |}| |
}| jd |ddd|jd d|jd f j
|dd | jd |ddd|jd d|jd f j
|dd | jd |ddd|jd d|jd f j
|dd | jd |ddd|jd d|jd f j
|dd dS )!Overwrites lora tensors at index.r   r   Nr   Tnon_blocking)r   listr   r   rz   r{   r   r   r   r   copy_r   r!   r   r   )r$   r   r   r   num_experts	w1_lora_ar   	w3_lora_a	w1_lora_br   	w3_lora_bslliced_w1_lora_aslliced_w1_lora_bsliced_w2_lora_asliced_w2_lora_bslliced_w3_lora_aslliced_w3_lora_br'   r'   r(   set_lora	  sf   







""


""""
zFusedMoEWithLoRA.set_lorac                 O      | j j|i |S N)r   r   r$   r`   ra   r'   r'   r(   r   E     zFusedMoEWithLoRA.forwardc                 O   r   r   )r   &maybe_all_reduce_tensor_model_parallelr   r'   r'   r(   r   H  r   z7FusedMoEWithLoRA.maybe_all_reduce_tensor_model_parallelc                 C      | j jS r   )r   _shared_expertsr$   r'   r'   r(   r   K     z FusedMoEWithLoRA._shared_expertsc                 C   r   r   )r   rS   r   r'   r'   r(   rS   O  r   zFusedMoEWithLoRA.quant_methodc                 C   r   r   )r   is_internal_routerr   r'   r'   r(   r   S  r   z#FusedMoEWithLoRA.is_internal_routersource_layerpacked_modules_listc                 C      t |to
t|dkS )=Returns True if the layer can be replaced by this LoRA layer.r   r   r
   lenclsr   r   r   r   r'   r'   r(   can_replace_layerW  s   z"FusedMoEWithLoRA.can_replace_layerr   )%__name__
__module____qualname__r
   r   dictstrr   r7   rY   r"   r   r   r   r   r   r   Tensorr   r   r   r   r   r   r   r   r   propertyr   rS   boolr   classmethodnnModuler   __classcell__r'   r'   r%   r(   r   ,   s    *	
5 \
!!
0
<

r   c                       s   e Zd Z fddZdd Z	d dedededB d	dfd
dZde	j
fddZdede	j
ee	j
 B de	j
ee	j
 B fddZedd Zedd Zedd Zedd Ze	d dejdedededB d	ef
ddZ  ZS )!FusedMoE3DWithLoRAc                    s   t  | d| _d S r   )r   r   r!   r#   r%   r'   r(   r   f  s   
zFusedMoE3DWithLoRA.__init__c                    r   )Nc                 3   s:    | ]}t jjjjjd   jf jjdV  qdS )r   r   Nr   r   r   r'   r(   r   k  s    

z<FusedMoE3DWithLoRA._create_lora_b_weights.<locals>.<genexpr>r   r   r   r'   r   r(   r   j  r   z)FusedMoE3DWithLoRA._create_lora_b_weightsNrC   r   r   r   c                 C   sf   t |tsJ |jd | _|j| _|j| _tjdg|d  tj	| j
d| _| || | || dS )r   r   r   r   N)r   r   architectures_base_modelrC   r   rv   r   r   r   r   r   r   r   )r$   rC   r   r   r'   r'   r(   r     s   z&FusedMoE3DWithLoRA.create_lora_weightsr   c           
      C   sF  | j dkr|S | jj}| j| }| jd | }| jdkr_|d d d d dd d f }|d d dd dd d f }|d d ||d d f }|d d ||d d f }tj||gddddS |jd d }	|d d d |	d d f }|d d |	d d d f }|d d ||d d f }|d d ||d d f }tj	||gddS )Nr   GptOssForCausalLMr   )dim)
r   r   rM   r   r
  r   stackflattenr{   cat)
r$   r   r   r   r   r   r   sliced_w1_lora_bsliced_w3_lora_b
slice_sizer'   r'   r(   r     s&   


zFusedMoE3DWithLoRA._slice_w13_br   r   r   c                 C   sp  t |tsJ t |tsJ t|t|  krdks J  J | | d| j|< |\}}|\}}| |}| |}	| |}
| |}| j	d |ddd|j
d d|j
d f j|dd | jd |ddd|
j
d d|
j
d f j|
dd | jd |ddd|	j
d d|	j
d f j|	dd | jd |ddd|j
d d|j
d f j|dd dS )r   r   r   r   NTr   )r   r   r   r   r   r   r   r   r   rz   r{   r   r   r   r   )r$   r   r   r   r   r   r   r   sliced_w13_lora_asliced_w13_lora_br   r   r'   r'   r(   r     s>   $





""""
zFusedMoE3DWithLoRA.set_lorac                 C   s   | j d jd S 
        Full size
        r   r-   )rz   r{   r   r'   r'   r(   w13_input_size  s   z!FusedMoE3DWithLoRA.w13_input_sizec                 C      | j d jd | j S )r  r   rm   )r   r{   r   r   r'   r'   r(   w13_output_size     z"FusedMoE3DWithLoRA.w13_output_sizec                 C   r  r  )r   r{   r   r   r'   r'   r(   w2_input_size  r  z FusedMoE3DWithLoRA.w2_input_sizec                 C   r   )r  )r   rE   r   r'   r'   r(   w2_output_size  s   z!FusedMoE3DWithLoRA.w2_output_sizer   r   c                 C   r   )r   r   r   r   r'   r'   r(   r     s   
z$FusedMoE3DWithLoRA.can_replace_layerr   )r   r   r   r   r   r   r   r   r   r   r  r   r   r   r  r  r  r  r  r  r  r  r  r   r  r'   r'   r%   r(   r  e  sT    !

&



r  )(rN   r   torch.nnr  transformersr   vllmr   vllm.config.lorar   vllm.distributed.parallel_stater   r   vllm.distributed.utilsr   vllm.lora.layers.baser   vllm.lora.ops.triton_ops.utilsr	   $vllm.model_executor.layers.fused_moer
   +vllm.model_executor.layers.fused_moe.configr   5vllm.model_executor.layers.fused_moe.fused_marlin_moer   .vllm.model_executor.layers.fused_moe.fused_moer   =vllm.model_executor.layers.fused_moe.fused_moe_modular_methodr   ?vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moer   3vllm.model_executor.layers.fused_moe.modular_kernelr   5vllm.model_executor.layers.fused_moe.prepare_finalizer   utilsr   r   r   r  r'   r'   r'   r(   <module>   s2       =