o
    -im                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z" ddl#m$Z$m%Z% G dd deZ&G dd de&Z'dS )    N)PretrainedConfig)envs)
LoRAConfig)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)divide)BaseLayerWithLoRA)get_lora_op_configs)FusedMoE)_get_config_dtype_str)MarlinExperts)TritonExperts)FusedMoEModularMethod)UnfusedOAITritonExperts)FusedMoEModularKernel)MoEPrepareAndFinalizeNoEP   )_get_lora_devicetry_get_optimal_moe_lora_configc                       s  e Zd Zdeddf fddZdeeedB f deeedB f fddZd	ed
ededededededefddZ	dd Z
dedefddZdedefddZ	d?dedededB ddfddZdejdejfdd Zd!ejfd"d#Zd$ejdejfd%d&Zd'ejdejfd(d)Zd*efd+d,Zd*ed-ejeej B d.ejeej B fd/d0Zd1d2 Zd3d4 Zed5d6 Zed7d8 Zedefd9d:Ze 	d?d;e!j"ded<ededB def
d=d>Z#  Z$S )@FusedMoEWithLoRA
base_layerreturnNc                    sX   t    || _| jjrJ dt | _t | _t|| _	|j
jr#dnd| _|   d S )Nz5EP support for Fused MoE LoRA is not implemented yet.   r   )super__init__r   use_epr   tp_sizer   tp_rankr   device
moe_configis_act_and_mul_w13_slices_inject_lora_into_fused_moeselfr   	__class__ W/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/lora/layers/fused_moe.pyr   -   s   


zFusedMoEWithLoRA.__init__configc                 C   sZ   i }|  D ]$\}}| r$|drd|dd   }n| }n|}|||< q|S )Nblock_BLOCK_SIZE__)itemsislower
startswithsplitupper)r$   r)   normalized_configkeyvaluenormalized_keyr'   r'   r(   _normalize_keys<   s   


z FusedMoEWithLoRA._normalize_keys	op_prefix	num_lorasrank
num_slicesMlayertop_kconfig_dtypec	              
   C   s   t jr(|j}	|j}
td| d|||	|||
d}td| d|||	|||
d}n(tjt|j	 |j
	 |||||jjjd}|d| dd}|d| dd}| |}| |}||fS )Nfused_moe_lora__shrink)op_type	max_lorasbatchhidden_sizer:   r;   moe_intermediate_size_expand)w1_shapew2_shaper:   r>   dtyper<   block_shape)rB   )r   VLLM_TUNED_CONFIG_FOLDERrE   intermediate_size_per_partitionr	   	functoolspartialr   
w13_weightsize	w2_weightquant_methodmoe_quant_configrK   r7   )r$   r8   r9   r:   r;   r<   r=   r>   r?   rE   intermediate_sizeshrink_configexpand_configget_config_funcr'   r'   r(   _get_lora_moe_configsI   sP   
	






z&FusedMoEWithLoRA._get_lora_moe_configsc                    s   i  j jj   j jj}t }t|j j|j j j}|j	r0t
|jttfs/J n
t
|jttfs:J  fdd} fdd} fdd}|j}|j |j|_|j |j|_|j |j|_tj j|j _d S )Nc                    s    fdd}|S )Nc                     sN   |d d< |d d< |d d< |d d< |d d<  | i |}|S )Nhidden_statestopk_idstopk_weights
expert_mapapply_router_weight_on_inputr'   )argskwargsresult)funcmoe_state_dictr'   r(   wrapper   s   zTFusedMoEWithLoRA._inject_lora_into_fused_moe.<locals>.fwd_decorator.<locals>.wrapperr'   r=   rb   rd   )rc   )rb   r(   fwd_decorator   s   zCFusedMoEWithLoRA._inject_lora_into_fused_moe.<locals>.fwd_decoratorc                        fdd}|S )Nc                     s@  | \}}}d }d }d }d }t |jdddd}	tj}
|d}t||
}jd jd }jd	j	|j
||	d
\}}j|||d jjj	j|\}}}|d< |d< |d< |j	d}|j	d}jj|d|jd |jj|||||||jjd  | i |}|d< |S )NrZ   r\   r[   r]   FrJ   use_fp8_w8a8use_int8_w8a16use_int4_w4a16r   w13r8   r9   r:   r;   r<   r=   r>   r?   BLOCK_SIZE_Msorted_token_ids_loraexpert_ids_loranum_tokens_post_padded_lorar-   )fully_shardedintermediate_cache2)r   rJ   r   VLLM_FUSED_MOE_CHUNK_SIZErQ   minw13_lora_a_stackedshaperY   rC   r!   punica_wrappermoe_lora_align_block_sizer   local_num_expertsadapter_enabledviewadd_lora_fused_moew13_lora_b_stackedrs   )r_   r`   r,   outputinputrZ   r\   curr_topk_idsr]   r?   
CHUNK_SIZE
num_tokensr<   max_lora_rankrV   rW   rp   rq   rr   ra   rb   r=   rc   r$   r>   r'   r(   rd      s~   



zTFusedMoEWithLoRA._inject_lora_into_fused_moe.<locals>.act_decorator.<locals>.wrapperr'   re   rc   r$   r>   rb   r=   r(   act_decorator   s   MzCFusedMoEWithLoRA._inject_lora_into_fused_moe.<locals>.act_decoratorc                    rg   )Nc                     s  d }d }t |jdddd}tj}|d}t||}jd jd }jdj	|d||d	\}	}
d
 }d }d }|
j	d}|
j	d}d }| d }tjjj}jj||jj||||||	|
jdjjr~|j ndd  | i |}|S )NrZ   r\   Frh   r   rl   w2r   rn   rp   rq   rr   r-   rt   T)rs   offset)r   rJ   r   ru   rQ   rv   w2_lora_a_stackedrx   rY   rC   r}   r   r   rE   r   ry   r~   w2_lora_b_stackedr|   rs   r   )r_   r`   rZ   r\   r?   r   r   r<   r   rV   rW   rp   rq   rr   rt   intermediate_cache3shard_size_w2ra   r   r'   r(   rd      sh   


zXFusedMoEWithLoRA._inject_lora_into_fused_moe.<locals>.moe_sum_decorator.<locals>.wrapperr'   re   r   r   r(   moe_sum_decorator   s   <zGFusedMoEWithLoRA._inject_lora_into_fused_moe.<locals>.moe_sum_decorator)r   r>   ensure_moe_quant_config_initrS   rT   r   r   select_gemm_implshared_expertsuse_mxfp4_w4a16
isinstancefused_expertsr   r   r   forward
activationmoe_sumr   )r$   quant_configprepare_finalizem_fused_moe_fnrf   r   r   r   r'   r   r(   r"   ~   sB   




P?z,FusedMoEWithLoRA._inject_lora_into_fused_moerC   lora_configc                    sP   t  fddtjD _tjjj jjj	f j
jdf_d S )Nc                 3   sJ    | ] }t jjjjs jnt jjjjf j	j
d V  qdS rJ   r   N)torchzerosr   r{   rs   r   r   r   rE   
lora_dtyper   .0r,   r   rC   r$   r'   r(   	<genexpr>E  s    
z:FusedMoEWithLoRA._create_lora_a_weights.<locals>.<genexpr>r   )tupleranger!   rw   r   r   r   r{   r   rM   r   r   r   r$   rC   r   r'   r   r(   _create_lora_a_weights@  s   

z'FusedMoEWithLoRA._create_lora_a_weightsc                    f   t  fddtjD _tjjjjsjj	nt
jj	j jf jjdf_d S )Nc                 3   s6    | ]}t jjjjj jf jjd V  qdS r   r   r   r   r{   rM   r   r   r   r   r   r'   r(   r   b  s    
z:FusedMoEWithLoRA._create_lora_b_weights.<locals>.<genexpr>r   r   r   r!   r   r   r   r   r{   rs   rE   r   r   r   r   r   r   r   r'   r   r(   _create_lora_b_weightsa     


z'FusedMoEWithLoRA._create_lora_b_weightsmodel_configc                 C   s  |j | _ |j| _tjdg|d  tj| jd| _| || | 	|| g | _
g | _t|D ]^}t| jjD ]U}| j
| jd | |  | j
| jd | |  | j| jd | |  | j| jd | |  | jdkr| j
| jd | |  | j| jd | |  q6q.dS )Initializes lora matrices.r   r   r   r   N)rC   fully_sharded_lorasrs   r   tensorintr   r|   r   r   lora_a_stackedlora_b_stackedr   r   r{   appendrw   r   r   r   r!   )r$   rC   r   r   lora_id
experts_idr'   r'   r(   create_lora_weights~  sB   
z$FusedMoEWithLoRA.create_lora_weights
w13_lora_ac                 C   sr   | j dks| js
|S |jd }|| j  dksJ | jd jd }| j| }| jd | }|dd||ddf S D
        Applies to FusedMoEWithLoRA and FusedMoE3DWithLoRA
        r   r   r   N)r   rs   rx   rw   r   )r$   r   current_lora_rank
shard_size	start_idxend_idxr'   r'   r(   _slice_w13_a  s   

zFusedMoEWithLoRA._slice_w13_a
w13_lora_bc                 C   sH   | j dkr|S | jj}| j| }| jd | }|d d ||d d f S Nr   r   r   rM   r   )r$   r   r   r   r   r'   r'   r(   _slice_w13_b  s   

zFusedMoEWithLoRA._slice_w13_b	w2_lora_ac                 C   sH   | j dkr|S | jj}| j| }| jd | }|dddd||f S )r   r   Nr   )r$   r   r   r   r   r'   r'   r(   _slice_w2_a  s   

zFusedMoEWithLoRA._slice_w2_a	w2_lora_bc                 C   sV   | j dks| js
|S | jd jd }| j| }| jd | }|dd||ddf S r   )r   rs   r   rx   r   )r$   r   r   r   r   r'   r'   r(   _slice_w2_b  s   
zFusedMoEWithLoRA._slice_w2_bindexc                 C   sV   t | jD ]}d| j| |< d| j| |< qd| jd |< d| jd |< d| j|< dS )z+Resets the lora weights at index back to 0.r   N)r   r!   rw   r   r   r   r|   )r$   r   posr'   r'   r(   
reset_lora  s   zFusedMoEWithLoRA.reset_loralora_alora_bc                 C   s&  t |tsJ t |tsJ | | d| j|< | jd jd }|\}}}|\}}	}
||jd   krC|jd   krC|jd ksFJ  J | |}| |}| |}| 	|	}| jd |ddd|jd d|jd f j
|dd | jd |ddd|jd d|jd f j
|dd | jdkr| |}| |
}| jd |ddd|jd d|jd f j
|dd | jd |ddd|jd d|jd f j
|dd | jd |ddd|jd d|jd f j
|dd | jd |ddd|jd d|jd f j
|dd dS )!Overwrites lora tensors at index.r   r   Nr   Tnon_blocking)r   listr   r|   rw   rx   r   r   r   r   copy_r   r!   r   r   )r$   r   r   r   num_experts	w1_lora_ar   	w3_lora_a	w1_lora_br   	w3_lora_bslliced_w1_lora_aslliced_w1_lora_bsliced_w2_lora_asliced_w2_lora_bslliced_w3_lora_aslliced_w3_lora_br'   r'   r(   set_lora  sf   







""


""""
zFusedMoEWithLoRA.set_lorac                 O      | j j|i |S N)r   r   r$   r_   r`   r'   r'   r(   r   ,     zFusedMoEWithLoRA.forwardc                 O   r   r   )r   &maybe_all_reduce_tensor_model_parallelr   r'   r'   r(   r   /  r   z7FusedMoEWithLoRA.maybe_all_reduce_tensor_model_parallelc                 C      | j jS r   )r   _shared_expertsr$   r'   r'   r(   r   2     z FusedMoEWithLoRA._shared_expertsc                 C   r   r   )r   rS   r   r'   r'   r(   rS   6  r   zFusedMoEWithLoRA.quant_methodc                 C   r   r   )r   is_internal_routerr   r'   r'   r(   r   :  r   z#FusedMoEWithLoRA.is_internal_routersource_layerpacked_modules_listc                 C      t |to
t|dkS )=Returns True if the layer can be replaced by this LoRA layer.r   r   r
   lenclsr   r   r   r   r'   r'   r(   can_replace_layer>  s   z"FusedMoEWithLoRA.can_replace_layerr   )%__name__
__module____qualname__r
   r   dictstrr   r7   rY   r"   r   r   r   r   r   r   Tensorr   r   r   r   r   r   r   r   r   propertyr   rS   boolr   classmethodnnModuler   __classcell__r'   r'   r%   r(   r   ,   s    *	
5 C
!!
0
<

r   c                       s   e Zd Z fddZdd Z	d dedededB d	dfd
dZde	j
fddZdede	j
ee	j
 B de	j
ee	j
 B fddZedd Zedd Zedd Zedd Ze	d dejdedededB d	ef
ddZ  ZS )!FusedMoE3DWithLoRAc                    s   t  | d| _d S r   )r   r   r!   r#   r%   r'   r(   r   M  s   
zFusedMoE3DWithLoRA.__init__c                    r   )Nc                 3   s:    | ]}t jjjjjd   jf jjdV  qdS )r   r   Nr   r   r   r'   r(   r   R  s    

z<FusedMoE3DWithLoRA._create_lora_b_weights.<locals>.<genexpr>r   r   r   r'   r   r(   r   Q  r   z)FusedMoE3DWithLoRA._create_lora_b_weightsNrC   r   r   r   c                 C   sf   t |tsJ |jd | _|j| _|j| _tjdg|d  tj	| j
d| _| || | || dS )r   r   r   r   N)r   r   architectures_base_modelrC   r   rs   r   r   r   r   r|   r   r   )r$   rC   r   r   r'   r'   r(   r   n  s   z&FusedMoE3DWithLoRA.create_lora_weightsr   c           
      C   sF  | j dkr|S | jj}| j| }| jd | }| jdkr_|d d d d dd d f }|d d dd dd d f }|d d ||d d f }|d d ||d d f }tj||gddddS |jd d }	|d d d |	d d f }|d d |	d d d f }|d d ||d d f }|d d ||d d f }tj	||gddS )Nr   GptOssForCausalLMr   )dim)
r   r   rM   r   r  r   stackflattenrx   cat)
r$   r   r   r   r   r   r   sliced_w1_lora_bsliced_w3_lora_b
slice_sizer'   r'   r(   r     s&   


zFusedMoE3DWithLoRA._slice_w13_br   r   r   c                 C   sp  t |tsJ t |tsJ t|t|  krdks J  J | | d| j|< |\}}|\}}| |}| |}	| |}
| |}| j	d |ddd|j
d d|j
d f j|dd | jd |ddd|
j
d d|
j
d f j|
dd | jd |ddd|	j
d d|	j
d f j|	dd | jd |ddd|j
d d|j
d f j|dd dS )r   r   r   r   NTr   )r   r   r   r   r|   r   r   r   r   rw   rx   r   r   r   r   )r$   r   r   r   r   r   r   r   sliced_w13_lora_asliced_w13_lora_br   r   r'   r'   r(   r     s>   $





""""
zFusedMoE3DWithLoRA.set_lorac                 C   s   | j d jd S 
        Full size
        r   r-   )rw   rx   r   r'   r'   r(   w13_input_size  s   z!FusedMoE3DWithLoRA.w13_input_sizec                 C      | j d jd | j S )r  r   rl   )r   rx   r   r   r'   r'   r(   w13_output_size     z"FusedMoE3DWithLoRA.w13_output_sizec                 C   r  r  )r   rx   r   r   r'   r'   r(   w2_input_size  r  z FusedMoE3DWithLoRA.w2_input_sizec                 C   r   )r  )r   rE   r   r'   r'   r(   w2_output_size  s   z!FusedMoE3DWithLoRA.w2_output_sizer   r   c                 C   r   )r   r   r   r   r'   r'   r(   r     s   
z$FusedMoE3DWithLoRA.can_replace_layerr   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r   r   r   r   r   r   r'   r'   r%   r(   r   L  sT    !

&



r   )(rN   r   torch.nnr   transformersr   vllmr   vllm.config.lorar   vllm.distributed.parallel_stater   r   vllm.distributed.utilsr   vllm.lora.layers.baser   vllm.lora.ops.triton_ops.utilsr	   $vllm.model_executor.layers.fused_moer
   +vllm.model_executor.layers.fused_moe.configr   5vllm.model_executor.layers.fused_moe.fused_marlin_moer   .vllm.model_executor.layers.fused_moe.fused_moer   =vllm.model_executor.layers.fused_moe.fused_moe_modular_methodr   ?vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moer   3vllm.model_executor.layers.fused_moe.modular_kernelr   5vllm.model_executor.layers.fused_moe.prepare_finalizer   utilsr   r   r   r   r'   r'   r'   r(   <module>   s2       $