o
    Ti                     @   s   d dl mZ d dlZd dlmZ d dlmZmZ dej	de
dejdej	fd	d
Zdd Z			ddeej deej deej fddZddej	fddZdS )    )commN)Optional)get_shard_sizeget_shard_size_listattention_mask	num_headsdtypereturnc                 C   s  ddl }| j\}}d||| }tjdd||d     | jtjd}tjdd| | jtj	d}t
||}	||krztjdd|d| d     | jtjd}
t||| }tjddd|  d| jtj	d}tj|	t
|
|gdd}	| jddd |  dddddf }|	d	 | }t rt|t }tt|t dt  }|||d|}|dd||| ddddf }||| d||S ||| d||S )
a  
    Link to paper: https://arxiv.org/abs/2108.12409 Alibi tensor is not causal as the original paper mentions, it
    relies on a translation invariance of softmax for quick implementation: with l being a tensor, and a fixed value
    `softmax(l+a) = softmax(l)`. Based on
    https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742
    TODO @thomasw21 this doesn't work as nicely due to the masking strategy, and so masking varies slightly.

    Args:
    Returns tensor shaped (batch_size * num_heads, 1, max_seq_len)
        attention_mask (`torch.Tensor`):
            Token-wise attention mask, this should be of shape (batch_size, max_seq_len).
        num_heads (`int`, *required*):
            number of heads
        dtype (`torch.dtype`, *optional*, default=`torch.bfloat16`):
            dtype of the output tensor
    r   N      )devicer      )dim).N)mathshapefloorlog2torchtensorr   float32arangeint32powmincatcumsumdistis_initializedr   get_world_sizesumr   get_rankviewreshapeto)r   r   r   r   
batch_size
seq_lengthclosest_power_of_2basepowersslopes
extra_basenum_remaining_headsextra_powersarange_tensoralibinum_heads_per_rankoffset r2   _/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/module_inject/auto_tp_model_utils.pybuild_bloom_alibi_tensor   s4   
  &$r4   c                 C   sl   |  ||}| js4t r4t| jt }tt| jt dt	  }|||| d |d |f }|S )Nr   )
get_alibi_mask_origtrainingr   r   r   n_headr   r    r   r!   )selfr   seq_length_with_pastmaskr0   r1   r2   r2   r3   get_alibi_mask@   s    r;   prefix_masksequence_idc           	      C   s   | j |||||d\}}t r<t| jjt }tt| jjt dt	  }|d d ||| d d d d f }||fS )N)r   r<   r=   r   )
_attn_bias_origr   r   r   confign_headsr   r    r   r!   )	r8   r   r   r   r<   r=   	attn_biasr0   r1   r2   r2   r3   build_mpt_atten_bias_tensorJ   s   
"$rB      c                 C   sV   |  ||||}t r)t|t  }t | }|||| ddddf }|S )a  
    Link to paper: https://arxiv.org/abs/2108.12409 - Alibi tensor is not causal as the original paper mentions, it
    relies on a translation invariance of softmax for quick implementation. This implementation has been copied from
    the alibi implementation of MPT source code that led to slightly different results than the Bloom alibi:
    https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L292
    N)build_mpt_alibi_tensor_origr   r   intr   r!   )r8   r   sequence_lengthalibi_bias_maxr   r/   r0   r1   r2   r2   r3   build_mpt_alibi_tensor\   s   rH   )NNN)rC   N)	deepspeedr   r   r   typingr    deepspeed.module_inject.tp_shardr   r   TensorrE   r   r4   r;   
ByteTensor
LongTensorrB   rH   r2   r2   r2   r3   <module>   s"    4
