o
    Ti                     @   sB   d dl Z d dlZd dlmZmZ G dd dZG dd deZdS )    N)ActivationFuncTypeNormTypec                   @   s   e Zd Zdd ZdS )TransformerConfigc                 C   s"   d| _ || _|| _|| _|| _d S )N)layer_idhidden_sizeintermediate_sizeheadsnum_hidden_layers)selfr   r   r	   r
    r   ^/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/ops/transformer/inference/config.py__init__   s
   
zTransformerConfig.__init__N)__name__
__module____qualname__r   r   r   r   r   r      s    r   c                $       s   e Zd ZdZdddddddejdejddddddddddej	dddddddddddddd	df$ fd
d	Z
edd Zedd Z  ZS )DeepSpeedInferenceConfigaW  Initialize the DeepSpeed Transformer Config.
        Arguments:
            hidden_size: The hidden size of the transformer layer
            intermediate_size: The intermediate size of the feed-forward part of transformer layer
            heads: The number of heads in the self-attention of the transformer layer
            num_hidden_layers: The number of transformer layers
            layer_norm_eps: The epsilon value for the layer norm
            local_rank: Optional: The rank of GPU running the transformer kernel, it is not required
                to use if the model already set the current device, otherwise need to set it
                so that the transformer kernel can work on the right device
            mp_size (optional): This argument is mainly used to create the parameters on the kernel side
                using model-parallel architecture. If the client model already takes care of this, there is no
                need to pass this argument.
            pre_layer_norm: Select between Pre-LN or Post-LN transformer architecture
            stochastic_mode:  Enable for high performance, please note that this flag has some level of
                non-determinism and can produce different results on different runs.  However, we have seen
                that by enabling it, the pretraining tasks such as BERT are not affected and can obtain
                a high accuracy level. On the other hand, for the downstream tasks, such as fine-tuning, we recommend
                to turn it off in order to be able to reproduce the same result through the regular kernel execution.

            scale_attention: If true, both q and k are scaled by 1/sqrt(attention_heads) before attention computation.
            return_tuple: if True, returns the transformer output as a tuple, otherwise returns as a tensor
            bigscience_bloom: This flag is added temporarily for supporting the BLOOM-176B model architecture.
            use_triton: This flag is to enable triton kernels in inference or not.
            invert_mask: If True, the attention mask is inverted when passed to attention block.
    r   g-q=   TF   i   i'  c%           %         s   t t| ||dkr|nd| || || _|	| _|
| _|| _|| _|| _|| _	|| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _| | _|!| _|"| _ |#| _!|$| _"d S )Nr      )#superr   r   dtypepre_layer_norm	norm_type
local_rankstochastic_modeepsilonmp_sizescale_attentiontriangular_maskinglocal_attentionwindow_size
rotary_dimrotate_halfrotate_every_tworeturn_tuplemlp_after_attnmlp_act_func_typetraining_mp_sizebigscience_bloommax_out_tokensmin_out_tokensscale_attn_by_inverse_layer_idxenable_qkv_quantizationuse_mupreturn_single_tupleset_empty_paramstransposed_mode
use_tritontriton_autotunenum_kv
rope_thetainvert_mask)%r   r   r   r	   r
   layer_norm_epsr   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r-   r.   r,   r/   r0   r1   r2   r3   r4   r5   r6   	__class__r   r   r   1   sL   %
z!DeepSpeedInferenceConfig.__init__c                 C   s&   t  }| D ]	\}}||j|< q|S )N)r   items__dict__)clsjson_objectconfigkeyvaluer   r   r   	from_dictz   s   z"DeepSpeedInferenceConfig.from_dictc                 C   sF   t |ddd}| }W d    n1 sw   Y  | t|S )Nrzutf-8)encoding)openreadrA   jsonloads)r<   	json_filereadertextr   r   r   from_json_file   s   
z'DeepSpeedInferenceConfig.from_json_file)r   r   r   __doc__torchfloat16r   	LayerNormr   GELUr   classmethodrA   rK   __classcell__r   r   r8   r   r      sV    I
r   )rF   rM   deepspeed.utils.typesr   r   r   r   r   r   r   r   <module>   s
   
