o
    -i"                     @   s0  d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZ eeZdejdejfddZdejdededeejejf fddZdejdejdejdejdejdejdejfddZ	d8dejjdejdejdejdB fd d!Zd"d# Z	d8dejdejdejdB dejfd$d%Z	d8dejdejdejdB dejfd&d'Z 	d8dejjdejdejdejdB dejf
d(d)Z!ed)ee d* d+ed,ed-ej"de#fd.d/Z$dejjd0e#ddfd1d2Z%	d8dejjdejdejdejdB fd3d4Z&ded5ejf fd6d7Z'dS )9z!Utility methods for model layers.    )CallableN)_custom_ops)envs)rocm_aiter_ops)init_logger)CpuArchEnumcurrent_platform)get_cu_count)direct_register_custom_opwreturnc                 C   sV   | j }|d }| dd |d f }| d|d d f }tj||fdd}||}|S )N.   dim)shapetorchstackreshape)r   r   Nfirstsecondstacked
w_shuffled r   ]/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/layers/utils.pyshuffle_weight   s   
r   tokens
vocab_sizenum_seqsc                 C   sT   t j||d ft j| jd}|d| t |  |d d d |f }|dk}||fS )N   )dtypedevicer   )r   zeroslongr"   scatter_add_	ones_like)r   r   r   
bin_countsmaskr   r   r   get_token_bin_counts_and_mask)   s   r)   logitsprompt_tokens_tensoroutput_tokens_tensorpresence_penaltiesfrequency_penaltiesrepetition_penaltiesc                 C   sp   | j \}}t|||\}}	t|||\}
}ddlm} || |	|| | |jdd|
 8 } | |jdd| 8 } | S )a  
    Applies penalties in place to the logits tensor
    logits : The input logits tensor of shape [num_seqs, vocab_size]
    prompt_tokens_tensor: A tensor containing the prompt tokens. The prompts
        are padded to the maximum prompt length within the batch using
        `vocab_size` as the padding value. The value `vocab_size` is used
        for padding because it does not correspond to any valid token ID
        in the vocabulary.
    output_tokens_tensor: The output tokens tensor.
    presence_penalties: The presence penalties of shape (num_seqs, )
    frequency_penalties: The frequency penalties of shape (num_seqs, )
    repetition_penalties: The repetition penalties of shape (num_seqs, )
    r   )apply_repetition_penaltiesr    r   )r   r)   vllm._custom_opsr0   	unsqueeze)r*   r+   r,   r-   r.   r/   r   r   _prompt_maskoutput_bin_countsoutput_maskr0   r   r   r   apply_penalties:   s   
r7   layerxweightbiasc                 C      t jj|||S Nr   nn
functionallinearr8   r9   r:   r;   r   r   r   default_unquantized_gemmc      rC   c                 C   s   t  rt s|tjtjfvrdS | dkr|dkrdS |dkr$|dkpC|dko+|dkpC|dko3|dkpC|dko;|dkpC|dkoC|dkS )	NFi      i   i@  i      i  )r   is_triton_gemm_enabledr   is_fp8_fnuzr   float16bfloat16)nmkr!   r   r   r   use_aiter_triton_gemml   s"   rN   c                 C   s>  ddl m}m} |  | d }|jd }|jd }dd l}tjoN| oN| j	t
jt
jfv oN|dkoN|dkoN|dkoN||d ||d  t k oN|  }	|	rwt }
| d| d}t|||
|}|jg | jd d |jd R  S t|||| j	rddlm} || ||S tjo| o| j	t
jt
jfv o|d	 dko|  }|d
urt
jj| ||S | d| d}|d	krd|  k rdkrn nt }
t|||
|}|jg | jd d |jd R  S |d dkr|dkr|dkr|d u rt||d}|jg | jd d |jd R  S t
jj| ||S )Nr   )on_gfx9	on_gfx950r   r       rF   rE   )gemm_a16w16   T   i    )vllm.platforms.rocmrO   rP   numelsizer   mathr   VLLM_ROCM_USE_SKINNY_GEMMr!   r   rI   rJ   ceilr	   is_contiguousr   ops
wvSplitKrcrN   aiter.ops.triton.gemm_a16w16rR   r?   r@   rA   wvSplitKLLMM1)r9   r:   r;   rO   rP   rK   rL   rM   rX   use_skinny_reduce_countingcu_countx_viewoutrR   
use_skinnyr   r   r   rocm_unquantized_gemm_impl   s^   

 $
 $,$rf   c                 C   s$   |  g | jd d |jd R S )Nr   r   )	new_emptyr   r9   r:   r;   r   r   r   rocm_unquantized_gemm_fake   s   $ri   c                 C   r<   r=   )r   r\   vllmrocm_unquantized_gemmrB   r   r   r   rk      rD   rk   )op_nameop_func	fake_implrK   rM   r!   c                 C   s4   t jj o|t jt jfv o|d dko| d dkS )N    r   rQ   )r   _C_cpu_is_amx_tile_supportedrJ   int8)rK   rM   r!   r   r   r   check_cpu_sgl_kernel   s   

rt   remove_weightc              
      sB  | j jrtjjj| _d S | j  \}}| j j}t	j
rRt|||rRtjj| j t| dd d ur8| jtj nd   fdd| _|rPtjjtddd| _ d S tjrt tjkrz#| j }t| dfdd| _|r}tjjtddd| _ W d S  ty } ztd	|  W Y d }~nd }~ww d
d | _d S )Nr;   c                    s$   t jj| |d ur dS d dS )NT)r   r\   rp   weight_packed_linearrh   )bias_f32packed_weightr   r   <lambda>   s
    z/dispatch_cpu_unquantized_gemm.<locals>.<lambda>r   F)requires_gradro   c                    s   t  | |S r=   )r\   	onednn_mmrh   )handlerr   r   ry      s    zEFailed to create oneDNN linear, fallback to torch linear. Exception: c                 S   s   t jj| ||S r=   r>   rh   r   r   r   ry     s    )r:   is_metar   r?   r@   rA   
cpu_linearrW   r!   r   VLLM_CPU_SGL_KERNELrt   r\   rp   convert_weight_packedgetattrr;   tofloat32	Parameterempty_supports_onednnr   get_cpu_architecturer   POWERPCcreate_onednn_mmtRuntimeErrorloggerwarning_once)r8   ru   r   Kr!   origin_weighter   )rw   r|   rx   r   dispatch_cpu_unquantized_gemm   sB   r   c                 C   s   |  |||S r=   )r~   rB   r   r   r   cpu_unquantized_gemm  s   r   .c                   C   s   t  rtS t  rtS tS r=   )r   is_rocmrk   is_cpur   rC   r   r   r   r   dispatch_unquantized_gemm  s
   r   r=   )(__doc__collections.abcr   r   rj   r   r\   r   vllm._aiter_opsr   vllm.loggerr   vllm.platformsr   r   vllm.utils.platform_utilsr	   vllm.utils.torch_utilsr
   __name__r   Tensorr   inttupler)   r7   r?   ModulerC   rN   rf   ri   rk   r!   boolrt   r   r   r   r   r   r   r   <module>   s   

-
	
:


		
3
	