""" T5 model configuration"""
from typing import Mapping

from transformers.configuration_utils import PretrainedConfig
from transformers.onnx import OnnxSeq2SeqConfigWithPast

from modelscope.utils.logger import get_logger

logger = get_logger()


class T5Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`T5Model`] or a [`TFT5Model`]. It is used to
    instantiate a T5 model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the T5
    [t5-small](https://huggingface.co/t5-small) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Arguments:
        vocab_size (`int`, *optional*, defaults to 32128):
            Vocabulary size of the T5 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`T5Model`] or [`TFT5Model`].
        d_model (`int`, *optional*, defaults to 512):
            Size of the encoder layers and the pooler layer.
        d_kv (`int`, *optional*, defaults to 64):
            Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model //
            num_heads`.
        d_ff (`int`, *optional*, defaults to 2048):
            Size of the intermediate feed forward layer in each `T5Block`.
        num_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the Transformer encoder.
        num_decoder_layers (`int`, *optional*):
            Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not set.
        num_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        relative_attention_num_buckets (`int`, *optional*, defaults to 32):
            The number of buckets to use for each attention layer.
        relative_attention_max_distance (`int`, *optional*, defaults to 128):
            The maximum distance of the longer sequences for the bucket separation.
        dropout_rate (`float`, *optional*, defaults to 0.1):
            The ratio for all dropout layers.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        initializer_factor (`float`, *optional*, defaults to 1):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
        feed_forward_proj (`string`, *optional*, defaults to `"relu"`):
            Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`. T5v1.1 uses the
            `"gated-gelu"` feed forward projection. Original T5 uses `"relu"`.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
    t5past_key_valuesd_model	num_heads
num_layers)hidden_sizenum_attention_headsnum_hidden_layers}     @         N          皙?ư>      ?reluTr      c                    s   || _ || _|| _|| _|| _|d ur|n| j| _|| _|| _|	| _|
| _	|| _
|| _|| _|| _| jd}|d | _|d dk| _t|dkrO|d dksUt|dkr]td| d|d	krdd
| _t jd|||d| d S )N-r   gatedr      z`feed_forward_proj`: z is not a valid activation function of the dense layer.Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. 'gated-gelu' or 'relu'z
gated-gelugelu_new)pad_token_ideos_token_idis_encoder_decoder )
vocab_sizer	   d_kvd_ffr   num_decoder_layersr
   relative_attention_num_bucketsrelative_attention_max_distancedropout_ratelayer_norm_epsiloninitializer_factorfeed_forward_proj	use_cachesplitdense_act_fnis_gated_actlen
ValueErrorsuper__init__)selfr%   r	   r&   r'   r   r(   r
   r)   r*   r+   r,   r-   r.   r#   r/   r!   r"   kwargsact_info	__class__r$   Z/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/T5/configuration.pyr6   N   sD   
$

zT5Config.__init__)r   r   r   r   r   Nr   r   r   r   r   r   r   TTr   r   )	__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferenceattribute_mapr6   __classcell__r$   r$   r:   r<   r      s4    +r   c                   @   s@   e Zd Zedeeeeef f fddZedefddZdS )T5OnnxConfigreturnc                 C   sx   ddddddd}| j r"d|d d< ddi|d	< dd
d|d< nddd|d	< ddd|d< | j r:| j|dd |S )Nbatchencoder_sequence)r   r   )	input_idsattention_maskz past_encoder_sequence + sequencerJ   r   r   decoder_input_idsz past_decoder_sequence + sequencedecoder_attention_maskdecoder_sequenceinputs)	direction)use_pastfill_with_past_key_values_)r7   common_inputsr$   r$   r<   rN      s2   


zT5OnnxConfig.inputsc                 C   s   dS )N   r$   )r7   r$   r$   r<   default_onnx_opset   s   zT5OnnxConfig.default_onnx_opsetN)	r=   r>   r?   propertyr   strintrN   rT   r$   r$   r$   r<   rE      s
     "rE   N)r@   typingr    transformers.configuration_utilsr   transformers.onnxr   modelscope.utils.loggerr   loggerr   rE   r$   r$   r$   r<   <module>   s   n