o
    Tig;                     @   sz   d dl Z d dlZd dlmZ d dlmZ d dlmZ ddl	m
Z
mZmZmZmZ dZG dd dejZG d	d
 d
eZdS )    N)comm)get_accelerator   )LinearOpVectorMatMulOpSoftmaxContextOp	QKVGemmOp	SoftmaxOpg     c                       sR   e Zd ZdZg Zd fdd	Zdd Zdd	 Z			
			
			dddZ  Z	S )DeepSpeedSelfAttentionr   Nr   c                    s  t t|   || _| jj}| jjtjkrtjn| jj}tj| j_	tjd t_t
  }| jjrOd | _d | _d | _d | _d | _d | _d | _d | _d | _d | _nm|jdk r^| jj| jj d n| jj| jjd  | jj | jj| jj  }	tjtj| jj|	||ddd| _tjtj|	||ddd| _| jj| jj }
tjtj|
| jj||ddd| _tjtj| jj||ddd| _| jj| jj | _| jj| jj | _| jj| jj | _ | jj| jj | _!|| _"|| _#|| _$t%t&'|| _(t&)| jj| jj | _*|j+st&)| j*| _*| jj,du r|  j*t&)| jj	d 9  _*t-|| _.t/|| _0t1|| _2t3|| _4t5tj6dkrUtj| j d | jj||dtj| j d ||dgt_6d S d S )	Nr   r         )dtypedeviceF)requires_gradT)7superr
   __init__configr   torchint8half
num_layerslayer_idr   current_device_nameset_empty_paramsattn_qwattn_qbattn_kwattn_kbattn_vwattn_vb	attn_qkvw	attn_qkvbattn_owattn_obnum_kvhidden_sizemp_sizeheadsnn	Parameterempty!num_attention_heads_per_partitionnum_kv_partitionhidden_size_per_partitionhidden_size_per_attention_headmp_groupq_scalesq_groupsintmathlog2merge_countsqrtnorm_factoruse_mupscale_attn_by_inverse_layer_idxr   qkv_funcr   score_context_funcr   linear_funcr   vector_matmul_funclen_qkv_buffers)selfr   r/   r0   r1   r5   	data_typedata_type_fpr   qkv_size_per_partitionout_size_per_partition	__class__ d/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/ops/transformer/inference/ds_attention.pyr      s   

*





zDeepSpeedSelfAttention.__init__c                 C   s   t |ts
t |tr|d }|d u p|du }|rtd}| j||jtjkr0d| |jt	 n|| j
| j| jjr>d| j nd|| jjtj||||d}	|	\}
}}|
||fS )Nr   Fr   g      ?)query_key_value	attn_maskr'   r$   r7   
no_maskingr   r   alibi	is_prompt	token_idxposition_ids)
isinstancelisttupler   r*   r;   r   int64to	minus_infr+   r,   r   scale_attentionr7   r   r
   r   )r@   qkv_out
input_mask
layer_pastrL   rM   rN   rO   rK   attn_key_valuecontext_layer	key_layervalue_layerrG   rG   rH   compute_attention\   s2   


z(DeepSpeedSelfAttention.compute_attentionc                 C   s   t jd }| j|d | jd d f< | j|| jd| j d d f< | j|d| j d d d f< | jd urUt jd }| j|d | j< | j|| jd| j < | j|d| j d < t jS )Nr   r   r   )	r
   r?   r   r-   r   r   r   r   r   )r@   qvkwqvkbrG   rG   rH   
_merge_qkvw   s   


z!DeepSpeedSelfAttention._merge_qkvFc              	   K   s  | j d u r|  \| _| _n| j | _| j| _| jjs.| j|| j| j| jd ud| jt	j
d}n| j|| j| j|	|
d}|d|d jd dk}|dd }|dd }| j|||||||d	\}}}| j|| jd
}|d }| jjr| jd urtj| jddkrtj|| jd |||||fS )NF)inputweightbiasadd_biasdo_flash_attn	num_headsr   )rb   rc   rd   gammabetafirst_tokenr   r   rN   rO   )rW   rX   rY   rL   rM   rN   rO   )rb   rc   )group)r    ra   
_attn_qkvw
_attn_qkvbr!   r   pre_layer_normr<   r+   r
   r   r:   getshaper^   r=   r"   mlp_after_attnr/   distget_world_size
all_reduce)r@   rb   rX   	head_maskrY   get_presentencoder_hidden_statesencoder_attention_maskoutput_attentionsnorm_wnorm_brL   kwargsrW   rM   rN   rO   r[   r\   r]   outputinp_normrG   rG   rH   forward   sF   
$zDeepSpeedSelfAttention.forward)NNr   r   )	NNFNNFNNN)
__name__
__module____qualname__r   r?   r   r^   ra   r   __classcell__rG   rG   rE   rH   r
      s     Hr
   c                       s6   e Zd Z fddZdd Zd
ddZdd	 Z  ZS )BloomSelfAttentionc                    s&   t t| j|i | t| j| _d S N)r   r   r   r	   r   softmax_func)r@   argsr}   rE   rG   rH   r      s   zBloomSelfAttention.__init__c                 C   s:   | dddd }| d d | jf }|j|  S )Nr   r   r   r   )permute
contiguoussizer-   view)r@   xnew_x_layer_shaperG   rG   rH   _transpose_for_context   s
   z)BloomSelfAttention._transpose_for_contextTc           	      C   sn   |  d }| | |}}|| dkst| d| || }tj|||d}|r5tdd |D S |S )a  Split a tensor along its last dimension.

        Args:
            tensor: ([`torch.tensor`], *required*):
                input tensor to split
            num_partitions ([`int`], *required*):
                number of partitions to split the tensor
            contiguous_split_chunks ([`bool`], *optional*, default=`False`)::
                If True, make each chunk contiguous in memory.
        r   r   z is not divisible by dimc                 s   s    | ]}|  V  qd S r   )r   ).0chunkrG   rG   rH   	<genexpr>   s    zBBloomSelfAttention._split_tensor_along_last_dim.<locals>.<genexpr>)r   r   
ValueErrorr   splitrR   )	r@   tensornum_partitionscontiguous_split_chunkslast_dim	numeratordenominatorlast_dim_sizetensor_listrG   rG   rH   _split_tensor_along_last_dim   s   z/BloomSelfAttention._split_tensor_along_last_dimc                 C   s  t |ts
t |tr|d }|d u }|rtd}|}	|t  }| j| j	 }
|	
 d d | j	d|
 f }|	j| }	| |	d\}}}|
d|
d|
d|
df}|dd|d |d  |d d}|dd|d |d  |d ddd}|dd|d |d  |d d}|d ur|\}}tj|||fdd}tj|||fdd}||f}t||}||d |d |d d}t rt | j	 nd}| jjtjkrtjn| jj}|jtjkr| }| jjrd| }| j|||t || jjo|jd dkddddd| j | j   |d	
}|j|j }t!||}||
d| j	 | j	|
d|jd }| "|}|d }|d }|||fS )
Nr   r   rk   r   r   r   r   F)
attn_scoresrJ   rL   
triangular	recomputelocal_attentionwindow_sizeasync_oplayer_scalehead_offset)#rP   rQ   rR   r   r*   rT   r   r   r-   r+   r   r   r   	transposereshapecattype_asmatmulrs   is_initializedget_rankr   r   r   float16boollonginvert_maskr   rU   triangular_maskingrq   r7   bmmr   )r@   rW   rX   rY   rL   rM   rN   rO   rK   mixed_x_layerhead_dimnew_tensor_shapequery_layerr\   r]   output_sizepast_key
past_valuepresentsmatmul_resultattention_scoresoffsettarget_dtypeattention_probsattention_probs_reshapedr[   rG   rG   rH   r^      sl   

$& &


z$BloomSelfAttention.compute_attention)T)r   r   r   r   r   r   r^   r   rG   rG   rE   rH   r      s
    
r   )r3   r   torch.nnr(   	deepspeedr   rs   deepspeed.acceleratorr   
op_bindingr   r   r   r   r	   rU   Moduler
   r   rG   rG   rG   rH   <module>   s    )