o
    TiS;                  
   @   s   d dl mZ ddlmZ z
d dlZd dlZW n ey( Z zW Y dZ[ndZ[ww G dd deZG dd dZ	G d	d
 d
Z
G dd deZdS )    )IntEnum   )NPUOpBuilderNc                   @   s    e Zd ZdZdZdZdZdZdS )ActivationFuncTyper   r            N)__name__
__module____qualname__UNKNOWNGELUReLU
GATED_GELU
GATED_SILU r   r   Z/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/ops/op_builder/npu/inference.pyr      s    r   c                   @   sZ   e Zd ZdZdZdZdZdZdZdZ	dZ
dZdZedddZedd Zed	d
 ZdS )InferenceContextN*   r   r   c                 C   s
   | t _d S Nr   _num_tokens)initial_tokensr   r   r   reset_tokens&   s   
zInferenceContext.reset_tokensc                   C      t jS r   r   r   r   r   r   current_tokens*      zInferenceContext.current_tokensc                   C   r   r   )r   
_workspacer   r   r   r   GetWorkSpace.   r   zInferenceContext.GetWorkSpace)r   )r	   r
   r   r   _seed_curr_offset_stream_free_memory_sizer   #_attention_unfused_workspace_offset_workSpaceSizeworkSpaceSize	kv_cachesstaticmethodr   r   r   r   r   r   r   r      s"    
r   c                   @   s  e Zd Zedd Zedd Zedd Zedd Zed	d
 Zedd Z	edd Z
edd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zedd  Zed!d" Zed#d$ Zed%d& Zed'd( Zed)d* Zed+d, Zd-S ).NPUInferencec                 C   s    t jjj| | jd g|||dS )N)eps)torchnn
functional
layer_normshape)inputsgammabetaepsilonr   r   r   r.   5   s    zNPUInference.layer_normc
                 C   sR   t jj| | jd f|||}
|	r| n|}t |
|}|r#||7 }||
g}|S )Nr   )r+   r,   r-   r.   r/   tmatmul)r0   weightq_scalebiasr1   r2   r*   add_biasq_int8	transposeinp_normtmpoutputr   r   r   	_qkv_gemm9   s   zNPUInference._qkv_gemmc
           
      C      t | |||||||||	
S r   r(   r?   
r0   r6   r7   r8   r1   r2   r*   r9   r:   r;   r   r   r   qkv_gemm_fp16C      zNPUInference.qkv_gemm_fp16c
           
      C   r@   r   rA   rB   r   r   r   qkv_gemm_bf16G   rD   zNPUInference.qkv_gemm_bf16c
           
      C   r@   r   rA   rB   r   r   r   qkv_gemm_fp32K   rD   zNPUInference.qkv_gemm_fp32c                 C   s  | j \}}}| dd |f |||d}| d|||||   f |||d}| d||||   d f }|dkr|	rtd|d}td|d| }|d}dt|
| }t||}| }| }|	d|d|d j
ddd}|	d|d|d j
ddd}|dd |f |d|d f }}|dd |f |d|d f }}t|||}tj||gdd}t|||}tj||gdd}|||d }||||ddd }||||ddd }|||fS )	N.r)   r   npur         ?r   dim)r/   reshaper+   arangetopowoutersincosviewrepeat_interleave	torch_npunpu_rotary_mulcat
contiguousr;   )valsr8   
hidden_dim
seq_length
seq_offsetheadsnum_kv
rotary_dimrotate_halfrotate_every_two
rope_thetabsz_qkvseq_idinv_freqrP   rQ   q_posq_passk_posk_passr>   k_cachev_cacher   r   r   _bias_add_transform_0213O   s0   (
""
z%NPUInference._bias_add_transform_0213c                 C   s  |   \}}}||d|dkr|n|   }|| }|dk}tjs*dd t|D t_|r8t| d d gtj|< t }t }|rDdn|d }tj| d |||||dkrV|n|||||d\}}}|stj| \}}|d urt	j
||gdd}t	j
||gdd}||gtj|< |jd }t|  dkrtd|nd}|| | }tj||dd||d	 |dd||d	 |d
d d | |dddddd }|||fS )Nr   r   r   c                 S   s   g | ]}d d gqS r   r   ).0rc   r   r   r   
<listcomp>{   s    z1NPUInference._softmax_context.<locals>.<listcomp>)rX   r8   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rI   rH   r)   BSHi   )psepadding_mask
atten_maskscalepre_tockensnext_tockens	keep_probinner_precise)sizer   r&   ranger   r   r   r(   ro   r+   rV   r/   lenmaxrT   npu_fusion_attentionr;   rK   rW   bool)query_key_value	attn_maskr^   r_   r`   r\   r]   norm_factortriangular_maskinglocal_attentionwindow_size
no_maskinglayer_id
num_layersalibira   rb   seq_lenre   rY   is_promtsoft_len	workspacer[   rd   rf   rm   rn   layer_scalealphar>   r   r   r   _softmax_contextq   sd   


zNPUInference._softmax_contextc                 C   (   t | |||||||||	|
|||||S r   r(   r   r   r   r^   r_   r`   r\   r]   r   r   r   r   r   r   r   r   ra   r   r   r   softmax_context_fp16      
z!NPUInference.softmax_context_fp16c                 C   r   r   r   r   r   r   r   softmax_context_bf16   r   z!NPUInference.softmax_context_bf16c                 C   r   r   r   r   r   r   r   softmax_context_fp32   r   z!NPUInference.softmax_context_fp32c                 C   s    |r
t | | S t | |S r   )r+   r5   r4   inputr6   async_opr7   r:   transposed_moder   r   r   _vector_matmul   s   zNPUInference._vector_matmulc                 C      t | |||||S r   r(   r   r   r   r   r   vector_matmul_fp16      zNPUInference.vector_matmul_fp16c                 C   r   r   r   r   r   r   r   vector_matmul_bf16   r   zNPUInference.vector_matmul_bf16c                 C   r   r   r   r   r   r   r   vector_matmul_fp32   r   zNPUInference.vector_matmul_fp32c                 C   s   |
rt jj| | | | jd f|||}nt jj| | jd f|||}|r+| n|}t ||}|tjkrBt jj	|| }n|tj
krQt jj|| }ntd|t || }||fS )Nr)   z!Unsupported ActivationFuncType {})r+   r,   r-   r.   r/   r4   r5   r   r   gelur   relu	Exceptionformat)r   residual
input_biasweight_interm
weight_outr8   r1   r2   r*   pre_layer_normmlp_after_attninterm_scale	out_scaledtypemlp_act_func_typer;   residual_addr=   r>   r   r   r   	_mlp_gemm   s   

zNPUInference._mlp_gemmc                 C   r   r   r(   r   r   r   r   r   r   r8   r1   r2   r*   r   r   r   r   r   r   r;   r   r   r   mlp_gemm_fp16      
zNPUInference.mlp_gemm_fp16c                 C   r   r   r   r   r   r   r   mlp_gemm_bf16   r   zNPUInference.mlp_gemm_bf16c                 C   r   r   r   r   r   r   r   mlp_gemm_fp32   r   zNPUInference.mlp_gemm_fp32c	                 C   s   |r(|r|  |   |   |   | |    }	n'|  |    |   }	n|r0||  7 }|   |   |  |   |  }	| j}
||	|
 d S r   )floatr   set_rM   )hidden_stater   attention_outputattention_bias
final_biasmp_sizer   r9   r   r=   input_dtyper   r   r   _residual_add_bias   s   $zNPUInference._residual_add_biasc	           	      C      t | ||||||||	S r   r(   r   	r   r   r   r   r   r   r   r9   r   r   r   r   residual_add_bias_fp16     z#NPUInference.residual_add_bias_fp16c	           	      C   r   r   r   r   r   r   r   residual_add_bias_bf16  r   z#NPUInference.residual_add_bias_bf16c	           	      C   r   r   r   r   r   r   r   residual_add_bias_fp32  r   z#NPUInference.residual_add_bias_fp32N)r	   r
   r   r'   r.   r?   rC   rE   rF   ro   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r(   3   sZ    

	



!
9













r(   c                       sF   e Zd ZdZdZ fddZdd Zdd Zd	d
 ZdddZ	  Z
S )InferenceBuilderDS_BUILD_TRANSFORMER_INFERENCEtransformer_inferencec                    s   t  j| jd d S )N)name)super__init__NAMEself	__class__r   r   r   &  s   zInferenceBuilder.__init__c                 C   s   d| j  dS )Nz$deepspeed.ops.transformer.inference._op)r   r   r   r   r   absolute_name)  s   zInferenceBuilder.absolute_namec                 C      g S r   r   r   r   r   r   sources,     zInferenceBuilder.sourcesc                 C   r   r   r   r   r   r   r   include_paths/  r   zInferenceBuilder.include_pathsTc                 C   s   t S r   )r(   )r   verboser   r   r   load2  r   zInferenceBuilder.load)T)r	   r
   r   	BUILD_VARr   r   r   r   r   r   __classcell__r   r   r   r   r   "  s    r   )enumr   builderr   r+   rT   ImportErrorer   r   r(   r   r   r   r   r   <module>   s    p