o
    Ti%                     @   s   d dl Z d dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d
Zdadd ZG dd deZG dd dejZdS )    N)Function)version)log_dist)get_accelerator)WorkspaceOp)SoftmaxContextOp)LinearOp)PadTransformOpg     c                  C   sR   zdd l } W n ty   tdw t| jtdk r!tdddlma d S )Nr   z9Please install triton 2.0+ or `pip install deepspeed[sd]`z2.0   )triton_flash_attn)tritonImportErrorpkg_versionparse__version__
triton_opsr   )r    r   k/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/ops/transformer/inference/diffusers_attention.pyload_triton_flash_attn   s   r   c                   @   s$   e Zd Zedd Zedd ZdS )#DeepSpeedDiffusersAttentionFunctionc                    sP   
fdd fdd} 	fdd}||||}|S )Nc                    s0   |  dddd} |  d d  f }| j| S )Nr      r
      )permutesizereshape)xnew_x_layer_shape)hidden_size_per_partitionr   r   _transpose_for_context+   s
   
zKDeepSpeedDiffusersAttentionFunction.forward.<locals>._transpose_for_contextc                    sH   | j d   }|  d d  |f }| j| } | dddd} |  S )Nr   r   r
   r   )shaper   r   r   
contiguous)r   attention_head_sizenew_x_shape)!num_attention_heads_per_partitionr   r   _transpose_for_scores1   s
   
zJDeepSpeedDiffusersAttentionFunction.forward.<locals>._transpose_for_scoresc              	      s  j tjtjfv r| j tjkr|  } | jd j }|dk}d d  }|ri|d u ri
| d ur6nd u|jd}|d |d |d || jd d dk} |d d d d d d d |f }nYd}|d urt| }t|}	t|}
nt| }|jddd	\}}	}
|	 }|		 }	|
	 }
||	|
j|\}}	}
t||	
dd| jdd	} t||
}
|	djd}|S )
Nr       r
   Fr   r   r   r   )dim)dtypetorchhalffloat16float32r!   headsmatmulchunkr"   	transposesoftmax)inputcontext
input_mask	head_sizedo_flash_attnscaleqkv_outcontext_layerquerykeyvalueqkvattention_scoresoutput)r   attn_kwattn_obattn_ow	attn_qkvb	attn_qkvwattn_qwattn_vwconfigdo_out_biaslinear_funcnorm_factorpad_transform_func
rope_thetatriton_flash_attn_kernelr   r   selfAttention_fp8   s<   
& zEDeepSpeedDiffusersAttentionFunction.forward.<locals>.selfAttention_fpr   )ctxr3   r4   r5   rH   rE   rF   rA   rG   rD   r%   rK   r   rC   rB   rI   score_context_funcrJ   rL   rN   rM   r&   rO   r@   r   )r   rA   rB   rC   rD   rE   rF   rG   rH   rI   r   rJ   rK   r%   rL   rM   rN   r   forward%   s
   (!z+DeepSpeedDiffusersAttentionFunction.forwardc                 C   s   t d)NzYou are running with DeepSpeed Inference mode.                             Please switch to Training mode for running backward!)RuntimeError)rP   grad_outputgrad_output1grad_output2grad_output3r   r   r   backward]   s   z,DeepSpeedDiffusersAttentionFunction.backwardN)__name__
__module____qualname__staticmethodrR   rX   r   r   r   r   r   #   s
    
7r   c                       s6   e Zd ZdZdZ fddZdd Zd
dd	Z  ZS )DeepSpeedDiffusersAttentiona0  Initialize the DeepSpeed Transformer Layer.
        Arguments:
            layer_id: The layer index starting from 0, e.g. if model has 24 transformer layers,
                layer_id will be 0,1,2...23 when each layer object is instantiated
            config: An object of DeepSpeedInferenceConfig
    r   c                    sh  t t|   || _tj| j_t jd7  _|jrt  nd}| jj| jj	 d }| jj
}| jj
tjkr8tjn| jj
}tjdkrLtd| jj dg tjtj| jj|||ddd| _tjtj| jj| jj||ddd| _tjtj| jj| jj||ddd| _tjtj| jj| jj||ddd| _tjtj|||ddd| _| jj| jj	 }tjtj|| jj||ddd| _tjtj| jj||ddd| _d	| _td u rt  t | _| jj| jj	 | _| jj| jj	 | _ | jj| jj | _!t"#t"#| jj| jj | _$| jj%d	u r|  j$t"#| jjd 9  _$t&| j| _'t(| j| _)t*| j| _+t,| j| _-d S )
Nr
   cpur   zDeepSpeed-Attention config: r   )r)   deviceF)requires_gradT).superr]   __init__rH   layer_idbigscience_bloomr   current_device_namehidden_sizemp_sizer)   r*   int8r+   r   __dict__nn	ParameteremptyrE   rA   rG   rF   rD   rC   rB   rI   r   r   rN   r.   r%   r   hidden_size_per_attention_headmathsqrtrK   scale_attn_by_inverse_layer_idxr   	workspacer   rQ   r   rJ   r	   rL   )selfrH   r_   qkv_size_per_partition	data_typedata_type_fpout_size_per_partition	__class__r   r   rb   l   s   


z$DeepSpeedDiffusersAttention.__init__c                 C   sP   | j jdkr&| j| j j| j j|d |d tj| j jdd| j j| j j	
 d S d S )Nr   r
   F)
rH   rc   rq   allocate_workspacerf   r.   r]   rg   max_out_tokensmin_out_tokens)rr   r   r   r   r   ry      s   z.DeepSpeedDiffusersAttention.allocate_workspaceNc                 C   sf   |  |  t|||| j| j| j| j| j| j	| j
| j| j| j| j| j| j| j| j| j| jj}|S )N)ry   r   r   applyrH   rE   rF   rA   rG   rD   r%   rK   r   rC   rB   rI   rQ   rJ   rL   rN   rM   )rr   r3   r4   r5   r@   r   r   r   rR      s   z#DeepSpeedDiffusersAttention.forward)NN)	rY   rZ   r[   __doc__rc   rb   ry   rR   __classcell__r   r   rw   r   r]   c   s    Er]   )rn   r*   torch.autogradr   torch.nnrj   	packagingr   r   deepspeed.utils.loggingr   deepspeed.acceleratorr   8deepspeed.ops.transformer.inference.op_binding.workspacer   >deepspeed.ops.transformer.inference.op_binding.softmax_contextr   .deepspeed.ops.transformer.inference.op_bindingr   <deepspeed.ops.transformer.inference.op_binding.pad_transformr	   	minus_infr   r   r   Moduler]   r   r   r   r   <module>   s    @