o
    
۾i%                     @   sJ  d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZ eeZh dZdedefddZdejdejfddZdejdededeejejf fddZdejdejdejdejdejdejdejfddZ	d<d ejj d!ejd"ejd#ejdB fd$d%Z!d&d' Z"	d<d!ejd"ejd#ejdB dejfd(d)Z#	d<d!ejd"ejd#ejdB dejfd*d+Z$	d<d ejj d!ejd"ejd#ejdB dejf
d,d-Z%ed-e#e$d. d/ed0ed1ej&defd2d3Z'd ejj d4eddfd5d6Z(	d<d ejj d!ejd"ejd#ejdB fd7d8Z)ded9ejf fd:d;Z*dS )=z!Utility methods for model layers.    )CallableN)_custom_ops)envs)rocm_aiter_ops)init_logger)CpuArchEnumcurrent_platform)get_cu_count)direct_register_custom_op>   gaterouterexpert_gaterouter_gateshared_expert_gateprefixreturnc                 C   s   | sdS |  ddd tv S )NF.   )rsplitMOE_LAYER_ROUTER_GATE_SUFFIXES)r    r   T/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/layers/utils.pyis_layer_moe_router_gate   s   r   wc                 C   sV   | j }|d }| dd |d f }| d|d d f }tj||fdd}||}|S )Nr   .   dim)shapetorchstackreshape)r   r   Nfirstsecondstacked
w_shuffledr   r   r   shuffle_weight"   s   
r'   tokens
vocab_sizenum_seqsc                 C   sT   t j||d ft j| jd}|d| t |  |d d d |f }|dk}||fS )Nr   )dtypedevicer   )r   zeroslongr,   scatter_add_	ones_like)r(   r)   r*   
bin_countsmaskr   r   r   get_token_bin_counts_and_mask7   s   r3   logitsprompt_tokens_tensoroutput_tokens_tensorpresence_penaltiesfrequency_penaltiesrepetition_penaltiesc                 C   sp   | j \}}t|||\}}	t|||\}
}ddlm} || |	|| | |jdd|
 8 } | |jdd| 8 } | S )a  
    Applies penalties in place to the logits tensor
    logits : The input logits tensor of shape [num_seqs, vocab_size]
    prompt_tokens_tensor: A tensor containing the prompt tokens. The prompts
        are padded to the maximum prompt length within the batch using
        `vocab_size` as the padding value. The value `vocab_size` is used
        for padding because it does not correspond to any valid token ID
        in the vocabulary.
    output_tokens_tensor: The output tokens tensor.
    presence_penalties: The presence penalties of shape (num_seqs, )
    frequency_penalties: The frequency penalties of shape (num_seqs, )
    repetition_penalties: The repetition penalties of shape (num_seqs, )
    r   )apply_repetition_penaltiesr   r   )r   r3   vllm._custom_opsr:   	unsqueeze)r4   r5   r6   r7   r8   r9   r*   r)   _prompt_maskoutput_bin_countsoutput_maskr:   r   r   r   apply_penaltiesH   s   
rA   layerxweightbiasc                 C      t jj|||S Nr   nn
functionallinearrB   rC   rD   rE   r   r   r   default_unquantized_gemmq      rM   c                 C   s   t  rt s|tjtjfvrdS | dkr|dkrdS |dkr$|dkpC|dko+|dkpC|dko3|dkpC|dko;|dkpC|dkoC|dkS )	NFi      i   i@  i      i  )r   is_triton_gemm_enabledr   is_fp8_fnuzr   float16bfloat16)nmkr+   r   r   r   use_aiter_triton_gemmz   s"   rX   c                 C   s  ddl m}m} |  | d }|jd }|jd }t }t|||| jr2ddl	m
}	 |	| ||S d|d  > }
|d d d |d d d  }t|
d d	}|| }||k}tjo| o| jtjtjfv od
|  koqdkn  o|d dko|dko|d dko|o|  }|r| d| d}t||||}|jg | jd d |jd R  S tjo| o| jtjtjfv o|d dko|  }|durtjj| ||S | d| d}|dkrd|  k rd	krn nt }t||||}|jg | jd d |jd R  S |d	 dkr<|dkr<|dkr<|d u r<t||d	}|jg | jd d |jd R  S tjj| ||S )Nr   )on_gfx9	on_gfx950r   r   )gemm_a16w16@   rO         
   rP      Ti    )vllm.platforms.rocmrY   rZ   numelsizer   r	   rX   r+   aiter.ops.triton.gemm_a16w16r[   
bit_lengthminr   VLLM_ROCM_USE_SKINNY_GEMMr   rS   rT   is_contiguousr!   ops
wvSplitKrcrI   rJ   rK   wvSplitKLLMM1)rC   rD   rE   rY   rZ   rU   rV   rW   cu_countr[   N_p2	rndup_cusGrpsShrBCuNeededfits_wvsplitkrcuse_skinny_reduce_countingx_viewout
use_skinnyr   r   r   rocm_unquantized_gemm_impl   sj   

 

$
$$,$rw   c                 C   s$   |  g | jd d |jd R S )Nr   r   )	new_emptyr   rC   rD   rE   r   r   r   rocm_unquantized_gemm_fake   s   $rz   c                 C   rF   rG   )r   ri   vllmrocm_unquantized_gemmrL   r   r   r   r|      rN   r|   )op_nameop_func	fake_implrU   rW   r+   c                 C   s4   t jj o|t jt jfv o|d dko| d dkS )N    r   r]   )r   _C_cpu_is_amx_tile_supportedrT   int8)rU   rW   r+   r   r   r   check_cpu_sgl_kernel   s   

r   remove_weightc              
      sB  | j jrtjjj| _d S | j  \}}| j j}t	j
rRt|||rRtjj| j t| dd d ur8| jtj nd   fdd| _|rPtjjtddd| _ d S tjrt tjkrz#| j }t| dfdd| _|r}tjjtddd| _ W d S  ty } ztd	|  W Y d }~nd }~ww d
d | _d S )NrE   c                    s$   t jj| |d ur dS d dS )NT)r   ri   r   weight_packed_linearry   )bias_f32packed_weightr   r   <lambda>  s
    z/dispatch_cpu_unquantized_gemm.<locals>.<lambda>r   F)requires_gradr   c                    s   t  | |S rG   )ri   	onednn_mmry   )handlerr   r   r     s    zEFailed to create oneDNN linear, fallback to torch linear. Exception: c                 S   s   t jj| ||S rG   rH   ry   r   r   r   r     s    )rD   is_metar   rI   rJ   rK   
cpu_linearrc   r+   r   VLLM_CPU_SGL_KERNELr   ri   r   convert_weight_packedgetattrrE   tofloat32	Parameterempty_supports_onednnr   get_cpu_architecturer   POWERPCcreate_onednn_mmtRuntimeErrorloggerwarning_once)rB   r   r"   Kr+   origin_weighter   )r   r   r   r   dispatch_cpu_unquantized_gemm   sB   r   c                 C   s   |  |||S rG   )r   rL   r   r   r   cpu_unquantized_gemm!  s   r   .c                   C   s   t  rtS t  rtS tS rG   )r   is_rocmr|   is_cpur   rM   r   r   r   r   dispatch_unquantized_gemm*  s
   r   rG   )+__doc__collections.abcr   r   r{   r   ri   r   vllm._aiter_opsr   vllm.loggerr   vllm.platformsr   r   vllm.utils.platform_utilsr	   vllm.utils.torch_utilsr
   __name__r   r   strboolr   Tensorr'   inttupler3   rA   rI   ModulerM   rX   rw   rz   r|   r+   r   r   r   r   r   r   r   r   <module>   s   	

-
	
E


		
3
	