o
    پi;,                     @  s  d dl mZ d dlZd dlmZmZmZmZ d dlZd dl	m
Z
mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ e Zer[d dlZd dlZd dlZd d
lmZmZmZ ee Z!dd Z"dddZ#dddZ$G dd dZ%G dd dej&j'Z(e(gZ)dS )    )annotationsN)AnyIterableOptionalTuple)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)LogitsProcessorOutput)QuantizationConfig)ForwardBatch)import_model_classes)is_npu)Tensormintmutablec                 C  s\   t d}t| dg }t|tr|g}|std|D ]}||v r&||   S qtd| )Nzsgl_mindspore.modelsarchitecturesz$No model architectures are specifiedzUnsupported arch )r   getattr
isinstancestr
ValueError)configmindspore_modelsr   arch r   O/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/mindspore.py_get_arch_from_config   s   
r   xtorch.Tensorc                 C  s8   | d u s
t | tjs| S tjj| }tjj|}|S N)r   torchr   utilsdlpack	to_dlpackmsfrom_dlpack)r   	pt_dlpack	ms_tensorr   r   r   tensor_torch2ms,   s
   r'   'ms.Tensor'c                 C  sB   | d u s
t | tjs| S tjj| }tjj|}tj	
  |S r   )r   r#   r   r    r!   r"   r   r$   	torch_npunpusynchronize)r   	ms_dlpacktorch_tensorr   r   r   tensor_ms2torch6   s   
r.   c                   @  s,   e Zd ZdZdddZdd ZdddZdS )LowerTriangularMaskz
    Provide Infer model attention mask.
    Args:
        dtype (ms dtype): The compute type of Infer model.
        max_model_len (int): The max model length of Infer model.
         c                 C  s   || _ || _d| _|| _| j tjkrdnd}ttjtj	dtj
ddd| | j d| _tjd	|d| _ttjtj	| j| jftjddd| j d| j | _d S )
Ni    g      ?r0   )   r1   )shapedtype   )kr3   )r4   r4   )r3   max_model_lencached_mask_lendecode_mask_coeffr#   bfloat16r   nptriuonesfloat16prefill_maskr   zeros	hard_maskint8decode_mask)selfr3   r7   r9   prefill_mask_coeffr   r   r   __init__J   s2   

zLowerTriangularMask.__init__c                 C  s   |   }|  }tj||f| jd}|jd }d}t|D ]A}||  }	||	7 }|	dkr2q!||  }
|
|	 }	 | j|||	 ||df< 	 |||	 |||
f }|	|
 | jkd q!|S )z
        when query_lens_np = [3], seq_lens_np = [6], decode_mask_coeff = 1
        init attention mask
        0 0 0 0 0 0
        0 0 0 0 0 0
        0 0 0 0 0 0
        r6   r   r4   N)maxitemsumr   r@   r3   r2   ranger9   masked_fill_tril)rD   query_lens_npseq_lens_npmax_seq_lentotal_q_lenattention_maskreq_numcurrent_rowiq_lenseq_lencontext_lenright_tensorr   r   r   create_maskf   s*   
zLowerTriangularMask.create_mask
is_prefillboolposition_idsr(   rM   
np.ndarrayrN   c                 C  s^   |  }|  }|r| j}|S |dkr*|| jkr"t| jd|}|S | ||}|S | j}|S )Nr4   r   )rG   r?   r8   r   index_selectrC   rY   rA   )rD   rZ   r\   rM   rN   max_query_lenrO   rQ   r   r   r   gen_attention_mask   s   
z&LowerTriangularMask.gen_attention_maskN)r0   )rZ   r[   r\   r(   rM   r]   rN   r]   )__name__
__module____qualname____doc__rF   rY   r`   r   r   r   r   r/   B   s
    
.r/   c                      sj   e Zd Z		d#d$ fddZdd Zedd Zd%ddZd&ddZdd Z	d'dd Z
ed!d" Z  ZS )(MindSporeForCausalLMN r   r   quant_configOptional[QuantizationConfig]prefixr   returnNonec                   s   t    || _tjdd td tdt t	  t dvr'tjdd | 
| j}|||d| _t| jj| jj| _g | _g | _d S )Nz)--disable_pass=gather_pre_rms_norm_fusion)graph_kernel_flagsFz*MindSporeForCausalLM tp size %d tp rank %d)r4            z--disable_pass=MatMulAllReduce)r   rg   )superrF   r   r#   set_contextset_kernel_launch_captureloggerinfor   r   get_archmodelr/   param_dtypemax_position_embeddingscausal_mask	key_cachevalue_cache)rD   r   rg   ri   r   	__class__r   r   rF      s$   



zMindSporeForCausalLM.__init__c                 C  s   t |S r   )r   )rD   r   r   r   r   ru      s   zMindSporeForCausalLM.get_archc                 C  s   | j jd dv S )Nr   DeepseekV3ForCausalLM)r   r   )rD   r   r   r   use_mla   s   zMindSporeForCausalLM.use_mlaweights"Iterable[Tuple[str, torch.Tensor]]c                 C  sB   | j | | j  D ]\}}t|dd }|d ur|| qd S )Nquant_method)rv   load_weightscells_and_namesr   process_weights_after_loading)rD   r   _cellr   r   r   r   r      s   
z!MindSporeForCausalLM.load_weightsforward_batchr   c                   s    fdd}j rjs|jdd tjS jr)jr)tjtjfS |jdd |jdd tjtjfS )Nc                   s\   t jjD ]%}|r j|}n j|}t|}|jdkr&t	|d}| 
| qd S )N   rm   )rJ   r   num_hidden_layerstoken_to_kv_poolget_key_bufferget_value_bufferr'   ndimr   	unsqueezeappend)
cache_listis_key_cacherT   cachecache_msr   rD   r   r   prepare_cache   s   
z7MindSporeForCausalLM.get_kvcache.<locals>.prepare_cacheT)r   F)r   rz   r   r{   )rD   r   r   r   r   r   get_kvcache   s   
z MindSporeForCausalLM.get_kvcachec                 C  sl  | j r	| |}n| |\}}|j }|o|j  dk}|j 	 }|j
d ur4|j
 	 }n
tj|jgtjd}|jj}	t|jj|jd |j f d d d d |	f |	 tj}
i }t|tj|d< tj|tjd|d< t||d< tj|tjd|d< | j||d || |d< t|jtj|d< ||d	< ||d
< | j s||d< |
|d< |S )Nr   r6   	input_idsbatch_valid_lengthr\   
q_seq_lensrQ   out_cache_locrZ   rz   r{   block_tables)r   r   forward_mode	is_extendextend_prefix_lensrI   rH   seq_lenscpunumpyextend_seq_lensr;   r=   
batch_sizeint32r   	page_sizer'   req_to_token_poolreq_to_tokenreq_pool_indicesrG   tor#   r   ry   r`   
contiguousr   )rD   r   	positionsr   rz   r{   rZ   r   r   r   r   model_inputsr   r   r   prepare_inputs   sT   

	

z#MindSporeForCausalLM.prepare_inputsr   r   r   r(   c                 C  s>   |  |||}| j ||}| jdi |}tt|d}|S )N)next_token_logitsr   )r   rv   r	   r.   )rD   r   r   r   r   logitslogits_resultr   r   r   forward   s
   zMindSporeForCausalLM.forwardc                 C  sB   zt |}t|dd }|d u rW d S ||W S  ty    Y d S w )N$get_model_config_for_expert_location)r   r   	Exception)clsr   arch_clsmethodr   r   r   r   1  s   
z9MindSporeForCausalLM.get_model_config_for_expert_location)Nrf   )r   r   rg   rh   ri   r   rj   rk   )r   r   )r   r   )r   r   r   r   r   r   rj   r(   )ra   rb   rc   rF   ru   propertyr   r   r   r   r   classmethodr   __classcell__r   r   r|   r   re      s    



1re   )r   r   )r   r(   )*
__future__r   loggingtypingr   r   r   r   r   sglang.srt.distributedr   r   "sglang.srt.layers.logits_processorr	   *sglang.srt.layers.quantization.base_configr
   ,sglang.srt.model_executor.forward_batch_infor   sglang.srt.models.registryr   sglang.srt.utilsr   _is_npu	mindsporer#   r   r;   r)   r   r   r   	getLoggerra   rs   r   r'   r.   r/   nnModulere   
EntryClassr   r   r   r   <module>   s0   



g 
