o
    Ti@                     @   s   d dl Z d dlZd dlmZ d dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZmZmZmZ d dlmZmZmZ dZG dd dejZ								dd
dZ	 ejdejdejdejdejdejf
ddZdddZdS )    N)get_accelerator)comm)LinearOpVectorMatMulOpSoftmaxContextOp	QKVGemmOp)softmaxscore_4d_matmulcontext_4d_matmulg     c                       sZ   e Zd ZdZd fdd	ZeejfddZd	d
 Z											dddZ
  ZS )TritonSelfAttentionr   N   Fc                    s  t t|   || _| jj}| jjtjkrtjn| jj}|tjks%J dtj| j_	tjd t_t
  }	|jdks>J d| jjrad | _d | _d | _d | _d | _d | _d | _d | _d | _d | _nR| jj| jj d }
tjtj| jj|
||	ddd| _tjtj|
||	ddd| _| jj| jj }tjtj|| jj||	ddd| _tjtj| jj||	ddd| _| jj| jj | _| jj| jj | _| jj| jj | _|| _ d| _!t
 " rd| _!|| _#|| _$t%t&'|| _(t&)| jj| jj | _*|j+st&)| j*| _*| jj,du r|  j*t&)| jj	d 9  _*| jj-o | jj	d	k}t.|| _/t0|| _1t2|| _3t4|| _5|j| _|j|j | _6| jj7rMd| j* | j* nd
| _8| jj9| _9|rot:d  ;d| jj<| j6| jj| j9| j8 d S d S )Nz!triton supports fp16 data_type_fpr   z-mp_size has to be 1 with triton attention yet   dtypedeviceF)requires_gradTr         ?z4running triton autotune for regular attention kernel   )=superr   __init__configr   torchint8half
num_layerslayer_idr   current_device_namemp_sizeset_empty_paramsattn_qwattn_qbattn_kwattn_kbattn_vwattn_vb	attn_qkvw	attn_qkvbattn_owattn_obhidden_sizenn	Parameteremptyheads!num_attention_heads_per_partitionhidden_size_per_partitionhidden_size_per_attention_headmp_group	use_flashis_triton_supportedq_scalesq_groupsintmathlog2merge_countsqrtnorm_factoruse_mupscale_attn_by_inverse_layer_idxtriton_autotuner   qkv_funcr   score_context_funcr   linear_funcr   vector_matmul_func	head_sizescale_attentionscaletriangular_maskingprint_triton_autotunemax_out_tokens)selfr   r1   r4   r5   r9   qkv_merging	data_typedata_type_fpr   qkv_size_per_partitionout_size_per_partitionr>   	__class__ h/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/ops/transformer/inference/triton/attention.pyr      s   







 
zTritonSelfAttention.__init__c                    s   ddl m}m}m}	  fddtd|  |j d |jD }
|  |
D ]}tjd|d| f|dd}|||||}|	||| q%|	  d S )	Nr   )
Fp16Matmulr	   r
   c                    s   g | ]} | qS rR   rR   ).0i
min_seqlenrR   rS   
<listcomp>w   s    z8TritonSelfAttention._triton_autotune.<locals>.<listcomp>r   r   cudar   )
5deepspeed.ops.transformer.inference.triton.matmul_extrT   r	   r
   range_cache_stride_read_autotune_tabler   randn_update_autotune_table)rX   
max_seqlenrC   r)   rF   rE   r   rT   r	   r
   seqlenNqkvoutputrR   rW   rS   rH   n   s   
z$TritonSelfAttention._triton_autotunec                 C   s   t |tr	|d }|d u }|rtd}| j||jtjkr'd| |jt n|| j	| j
jr3d| j nd|| j
jtj||||d}	|	\}
}}|
||fS )Nr   r   r   )query_key_value	attn_maskr-   r;   
no_maskingr   r   alibi	is_prompt	token_idxposition_ids)
isinstancelistr   r,   r@   r   int64to	minus_infr.   r   rD   r;   r   r   r   )rJ   qkv_out
input_mask
layer_pastri   rj   rk   rl   rh   attn_key_valuecontext_layer	key_layervalue_layerrR   rR   rS   ds_compute_attention   s0   



z(TritonSelfAttention.ds_compute_attentionTc                 K   s  | j js| j|| j| j| jd ud| jtjd}|}n| j|| j| jd ur'| jn|
|	|
d}|d }|rk|d u rkt	||| j
||| j| jd| jd	}|d d d d | jd| j f |d d d d d| j d f }}n(|d|d jd dk}|d	d }|d
d }| j|||||||d\}}}| j|| jd}|d }| j jr| jd urtj| jddkrtj|| jd |||||fS )NF)inputweightbiasadd_biasdo_flash_attn	num_headsr   )rz   r{   r|   gammabetar   )	rd   rs   rE   rt   ri   rC   use_triton_flashuse_cuda_flash
triangularr   first_tokenr   rk   rl   )rr   rs   rt   ri   rj   toke_idxrl   )rz   r{   )group)r   pre_layer_normrA   r%   r&   r.   r   r   r?   _triton_attentionrE   rC   r2   rF   r)   getshapery   rB   r'   mlp_after_attnr1   distget_world_size
all_reduce)rJ   rz   rs   	head_maskrt   get_presentencoder_hidden_statesencoder_attention_masktriangularutput_attentionsnorm_wnorm_bri   use_triton_attentionkwargsrr   rd   rv   rw   rx   rj   rk   rl   re   inp_normrR   rR   rS   forward   sZ   F	$zTritonSelfAttention.forward)NNr   r   F)
NNFNNFNNNT)__name__
__module____qualname__r   r   staticmethodr   float16rH   ry   r   __classcell__rR   rR   rP   rS   r      s$    Tr   Fc
                 C   s~   t | tr	| d } |d u sJ d|r$t| ||||| o|d ud}
|
S t| |||}
|r2t|
}
nt|
|}
t|
| |}
|
S )Nr   z%layer_past not supported in alibi yet)causaladd_mask)rm   rn   _triton_packed_flashr	   r   r
   )rd   rs   rt   ri   rE   rC   r   r   r   use_ds_attentionre   rR   rR   rS   r      s$   



r   ADD_MASK	IS_CAUSALBLOCK_MBLOCK_DMODELBLOCK_Nc           5   	   C   s  t d}t d}|| }|| }|| ||  }|| }|| }|| t d| }t d|}t d|}| | |d d d f |  |d d d f  }| | | |d d d f |  |d d d f  }| d|  | |d d d f |  |d d d f  } ||	 |d d d f  }!||! }"t j|gt jdtd }#t j|gt jd}$t j||gt jd}%|d }&t j||d d d f |k dd}'|'|& t j}'d}(|r||d |  n|| })t	|(|)|D ]}*t j||*|  |*| d d d f |k dd}+t j| |*|  |*| d d d f |k dd},t j||gt jd}-|r.t |"}.|"|7 }"|-|.t j }-|rLt 
||d d d f  |*|d d d f  k|-td	}-|-t j|'t |+t jd
7 }-|-t 
|*| d d d f |k dt7 }-t |#t |-d}/t j|#|/ }0t j|-|/d d d f  }1|$d |0 }2|%|2d d d f 9 }%|%t |1t j|,t j7 }%|$|0 t |1d }$|/}#q|%|$d d d f  }%||
 ||  }3||3 |d d d f | |d d d f   }4t j|4|%t j|d d d f |k d d S )Nr   r   r   )r   infg/ldG?g        )maskotherz-inf)	out_dtype)r   )tl
program_idarangezerosfloat32floatloadrp   r   r\   wheredottransrq   maximummaxr7   exp2sumstore)5QKVr   r   r   sm_scaleOut	stride_qz	stride_qn	stride_qm	stride_mz	stride_oz	stride_onZHN_CTXP_SEQr)   r   r   r   start_moff_hzbatchheadq_offsetk_offsetv_offsetoffs_moffs_noffs_dq_ptrsk_ptrsv_ptrsoff_mask	mask_ptrsm_il_iaccqk_scaleqlohistart_nkvqkmask_valm_i_newalphap	acc_scaleo_offsetout_ptrsrR   rR   rS   _flash_packed_kernel  s^   

,04 ,,
6& ,,r   Tc                 C   s  | j d d | }| j d d }d}|dkrdnd}	tj| j d | j d |f| jtjd}
|d u r8td}d	}t| j d || j d | df}|dkrOd
nd}d
}d}t| | |||||
| d| d| d|rq|dnd|
d|
d| j d || j d ||||	|||d |
S )Nr   r      @       r   r   )r   r   F   r   )r   r   r   	num_warps
num_stages)	r   r   r,   r   r   tritoncdivr   stride)rd   rC   r   r   r   r   r-   r)   r   r   ogridr   r   r   rR   rR   rS   r   e  sH   &
"r   )FFFF)FT)r7   r   torch.nnr*   r   triton.languagelanguager   deepspeed.acceleratorr   	deepspeedr   r   .deepspeed.ops.transformer.inference.op_bindingr   r   r   r   *deepspeed.ops.transformer.inference.tritonr   r	   r
   rq   Moduler   r   jit	constexprr   r   rR   rR   rR   rS   <module>   s>    N
!^