o
    TixP                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
 dadaG dd dZG dd	 d	eZG d
d deZG dd dejZdS )    N)nn)Function)get_accelerator)TransformerBuilderStochasticTransformerBuilderc                   @   s   e Zd Zdd ZdS )TransformerConfigc	           	      C   s:   d| _ || _|| _|| _|| _|| _|| _|| _|| _d S )N)	layer_id
batch_sizehidden_sizeintermediate_sizeheadsattn_dropout_ratiohidden_dropout_rationum_hidden_layersinitializer_range)	selfr
   r   r   r   r   r   r   r    r   Y/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/ops/transformer/transformer.py__init__   s   
zTransformerConfig.__init__N)__name__
__module____qualname__r   r   r   r   r   r      s    r   c                       sb   e Zd ZdZ																				d fdd	Zedd	 Zed
d Z  ZS )DeepSpeedTransformerConfiga
  Initialize the DeepSpeed Transformer Config.

        Arguments:
            batch_size: The maximum batch size used for running the kernel on each GPU

            hidden_size: The hidden size of the transformer layer

            intermediate_size: The intermediate size of the feed-forward part of transformer layer

            heads: The number of heads in the self-attention of the transformer layer

            attn_dropout_ratio: The ratio of dropout for the attention's output

            hidden_dropout_ratio: The ratio of dropout for the transformer's output

            num_hidden_layers: The number of transformer layers

            initializer_range: BERT model's initializer range for initializing parameter data

            local_rank: Optional: The rank of GPU running the transformer kernel, it is not required
                to use if the model already set the current device, otherwise need to set it
                so that the transformer kernel can work on the right device

            seed: The random seed for the dropout layers

            fp16: Enable half-precision computation

            pre_layer_norm: Select between Pre-LN or Post-LN transformer architecture

            normalize_invertible: Optional: Enable invertible LayerNorm execution (dropping the input activation),
                default is False

            gelu_checkpoint: Optional: Enable checkpointing of Gelu activation output to save memory,
                default is False

            adjust_init_range: Optional: Set as True (default) if the model adjusts the weight initial values of
                its self-attention output and layer output, False keeps the initializer_range no change.
                See the adjustment below:
                    output_std = self.config.initializer_range / math.sqrt(2.0 * num_layers)

            attn_dropout_checkpoint: Optional: Enable checkpointing of attention dropout to save memory,
                default is False

            stochastic_mode:  Enable for high performance, please note that this flag has some level of
                non-determinism and can produce different results on different runs.  However, we have seen
                that by enabling it, the pretraining tasks such as BERT are not affected and can obtain
                a high accuracy level. On the other hand, for the downstream tasks, such as fine-tuning, we recommend
                to turn it off in order to be able to reproduce the same result through the regular kernel execution.

            return_tuple: Enable if using the return_tuple interface style for sending out the forward results.

            training: Enable for training rather than inference.
    r   -q=FTc              
      s   t t| |||dkr|nd| ||||| || _|| _|
| _|| _|| _|| _|| _	d| _
|	| _|| _d| _|| _|| _|| _d S )Nr      FT)superr   r   fp16pre_layer_norm
local_rankseednormalize_invertiblegelu_checkpointadjust_init_range	test_gemmlayer_norm_epstrainingis_grad_enabledattn_dropout_checkpointstochastic_modereturn_tuple)r   r
   r   r   r   r   r   r   r   r%   r   r    r   r   r!   r"   r#   r(   r)   r*   r&   	__class__r   r   r   Y   s*   
z#DeepSpeedTransformerConfig.__init__c                 C   s&   t  }| D ]	\}}||j|< q|S N)r   items__dict__)clsjson_objectconfigkeyvaluer   r   r   	from_dict   s   z$DeepSpeedTransformerConfig.from_dictc                 C   sF   t |ddd}| }W d    n1 sw   Y  | t|S )Nrzutf-16)encoding)openreadr5   jsonloads)r0   	json_filereadertextr   r   r   from_json_file   s   
z)DeepSpeedTransformerConfig.from_json_file)r   r   r   r   r   r   r   r   r   r   r   FTFFTFFFT)	r   r   r   __doc__r   classmethodr5   r?   __classcell__r   r   r+   r   r   "   s6    7(
r   c                   @   s$   e Zd Zedd Zedd ZdS )DeepSpeedTransformerFunctionc           (         sz  |j rtnt}|jr|jn|j}| }|d d dkr]t|tj	|d d|d d  |d f|j
|jdfd}t|tj|d |jd |jd d|d d  f|j
|jdd fd}||j|||| |	|
||||||||jos|j|j|j|j|j\}}}}}}}}}}} }!}"}#}$}%}&d ur%dD ]}'||'|f fd	d
	 qdD ]}'||'|f fdd
	 q |ffdd
	 |	|ffdd
	 |
|ffdd
	 ||ffdd
	 ||ffdd
	 ||ffdd
	 ||ffdd
	 ||ffdd
	 ||ffdd
	 ||ffdd
	 |jr|jr|jrG|jrG| ||| |	|
||||||| n| ||||| |	|
||||||| || _|jse|jsh|| _|| _|| _|jsu|| _|| _|js|| _|$| _|&| _|| _ |js|| _!|| _"| | _#|!| _$|"| _%|#| _&|%| _'|d d dkrt(|dd|d }|j)r|fS |S )N      r      devicedtypei   )rF   c                    N    | | d |d  d  |dkrdgS |dkr#dgS dgS )Nr   rD   Q_WK_WV_Wappendsizexir   attn_owgradsr   r   <lambda>       ,
z6DeepSpeedTransformerFunction.forward.<locals>.<lambda>c                    rK   )Nr   rD   Q_BK_BV_BrO   rR   rU   r   r   rX      rY   c                         | dgS )NO_WrP   rS   r   rW   r   r   rX          c                    r]   )NO_Br_   r`   ra   r   r   rX      rb   c                    r]   )NN2_Wr_   r`   ra   r   r   rX      rb   c                    r]   )NN2_Br_   r`   ra   r   r   rX      rb   c                    r]   )Nint_Wr_   r`   ra   r   r   rX      rb   c                    r]   )Nint_Br_   r`   ra   r   r   rX      rb   c                    r]   )Nout_Wr_   r`   ra   r   r   rX      rb   c                    r]   )Nout_Br_   r`   ra   r   r   rX      rb   c                    r]   )Nnorm_Wr_   r`   ra   r   r   rX      rb   c                    r]   )Nnorm_Br_   r`   ra   r   r   rX      rb   )*r)   "stochastic_transformer_cuda_moduletransformer_cuda_moduler   forward_fp16forward_fp32rQ   torchcatrandnrH   rI   onesshaper	   r&   r'   r   r(   r!   r"   register_hooksave_for_backwardr2   inp_normqkv_tfsoft_inpctx_bufB
attn_o_inpadd_resattn_layer_norm_meanlayer_norm_meanff1_inpgelu_inpff2_inpattn_prob_dropout_maskattn_output_dropout_masklayer_output_dropout_maskattn_layer_norm_varlayer_norm_varnarrowr*   )(ctxinput
input_maskr   rW   r	   	attn_qkvw	attn_qkvbrV   attn_obattn_nwattn_nbinter_winter_boutput_woutput_bnorm_wnorm_br2   cuda_moduleforward_funcinp_sizeoutputrw   rx   ry   rz   r{   r|   r   r   r   r   r   r   r   r}   r   r~   rT   r   rU   r   forward   s   $ 

z$DeepSpeedTransformerFunction.forwardc           "      C   s  |j d }| }|d d dkr,t|tj|d|d d  |d f|j|jdfd}| jjs2J | jj	rK| jj
rK| j\}}}}}}	}
}}}}}}n| j\}}}}}}}}	}
}}}}}}| jjrctnt}| jjrl|jn|j}|g | jj|| jj	r| jj
r| jn|| jj	s| jj
s| jn|| j| j| jjr| jn| j| j| jj
r| jn| j| j| jjr| jn| j| j| j| j| j | j!| j"| j#| j$| jj	r| jj
r| jn|||||||	|
||||||R  \}}}}}}}}}}}} }!d | _d | _d | _d | _d | _d | _d | _d | _d | _d | _d | _"d | _$d | _d | _d | _ d | _!d | _#|d d dkrPt%|dd|d }|d d d d ||||||||||| |!d fS )Nr   rD   rE   rF   rG   )&rt   rQ   rp   rq   zerosrH   rI   r2   r&   r   r!   saved_tensorsr)   rl   rm   r   backward_fp16backward_fp32r	   rw   rx   ry   r(   rz   r{   r   r|   r"   r   r   r   r   r   r   r}   r   r~   r   )"r   grad_outputbszgrad_output_shaper   r   r   rV   r   r   r   r   r   r   r   r   r   r   r   r   backward_func
grad_inputgrad_attn_qkvwgrad_attn_qkvbgrad_attn_owgrad_attn_obgrad_attn_nwgrad_attn_nbgrad_inter_wgrad_inter_bgrad_output_wgrad_output_bgrad_norm_wgrad_norm_br   r   r   backward   s   



z%DeepSpeedTransformerFunction.backwardN)r   r   r   staticmethodr   r   r   r   r   r   rC      s
    
YrC   c                       sJ   e Zd ZdZdZd fdd	ZdddZ								dd	d
Z  ZS )DeepSpeedTransformerLayera  Initialize the DeepSpeed Transformer Layer.

        Static variable:
            layer_id: The layer-index counter starting from 0 and incrementing by 1 every time a layer object is instantiated,
            e.g. if a model has 24 transformer layers, layer_id goes from 0 to 23.
        Arguments:
            config: An object of DeepSpeedTransformerConfig

            initial_weights: Optional: Only used for unit test

            initial_biases: Optional: Only used for unit test
    r   Nc           	         s$  t t|   || _tj| j_tjd t_td| jj | jjdkr*t 	| jj |d u r|d u rt
t| jjd | jj| _t
t| jjd | _t
t| jj| jj| _t
t| jj| _t
t| jj| _t
t| jj| _t
t| jj| jj| _t
t| jj| _t
t| jj| jj| _t
t| jj| _t
t| jj| _t
t| jj| _| | jj n`|d j}|d j}|d j}t
t|||f| _t
t| jjd | _| jj   |d | _|d | _|d | _|d | _|d | _|d | _|d | _|d | _|d	 | _|d	 | _t!d u r=| jj"s=t# $ a!t%d u rL| jj"rLt& $ a%| jj"rSt%nt!}| jj'r]|j(n|j)}|| jj| jj*| jj| jj+| jj| jj,| jj-| jj.| jj/| jj0| jj1| jj2| jj3| jj4| jj" d S )
NrD   z DeepSpeed Transformer config is r   rJ   rF   r            )5r   r   r   r2   r	   printr/   r   r   
set_devicer   	Parameterrp   Tensorr   r   r   rV   r   r   r   r   r   r   r   r   r   r   init_transformer_weightsr#   datarq   zero_rm   r)   r   loadrl   r   r   create_transformer_layer_fp16create_transformer_layer_fp32r
   r   r   r   r%   r    r   r$   r(   r!   r"   )	r   r2   initial_weightsinitial_biasesqkvr   create_layer_funcr+   r   r   r   7  sb   
 














z"DeepSpeedTransformerLayer.__init__Fc                 C   s   | j j}| j j}|r| j jdkrtd | j jtd|  }| jjj	d| j jd | j
j  | jjj	d|d | jj  | jjd | jj  | jjj	d| j jd | jj  | jjj	d|d | jj  | jjd | jj  d S )Nr   z0Accounting for accumulation on the residual pathg       @g        )meanstdg      ?)r2   r   r   r   r   mathsqrtr   r   normal_r   r   rV   r   r   fill_r   r   r   r   r   r   r   )r   r#   
num_layers
output_stdr   r   r   r   y  s"   z2DeepSpeedTransformerLayer.init_transformer_weightsc
           
      C   s`   t  | j_| j| j_t||| |	| jj| j| j| j	| j
| j| j| j| j| j| j| j| j| jS r-   )rp   r'   r2   r&   rC   applyr	   r   r   rV   r   r   r   r   r   r   r   r   r   )
r   hidden_statesattention_mask	head_masklayer_head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsrW   r   r   r   r     s   

z!DeepSpeedTransformerLayer.forward)NN)F)NNNNNNFN)	r   r   r   r@   r	   r   r   r   rB   r   r   r+   r   r   (  s    
Br   )r:   r   rp   r   torch.autogradr   deepspeed.acceleratorr   deepspeed.ops.op_builderr   r   rm   rl   r   r   rC   Moduler   r   r   r   r   <module>   s   m 