o
    ei                      @   s@   d Z ddlmZ ddlmZ eeZG dd deZdgZ	dS )zUDOP model configuration   )PreTrainedConfig)loggingc                       s|   e Zd ZdZdZdgZddddZdd	d
ddddddddiddiddigddddddddd	ddddddf fdd 	Z  ZS )!
UdopConfiga  
    This is the configuration class to store the configuration of a [`UdopForConditionalGeneration`]. It is used to
    instantiate a UDOP model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the UDOP
    [microsoft/udop-large](https://huggingface.co/microsoft/udop-large) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Arguments:
        vocab_size (`int`, *optional*, defaults to 33201):
            Vocabulary size of the UDOP model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`UdopForConditionalGeneration`].
        d_model (`int`, *optional*, defaults to 1024):
            Size of the encoder layers and the pooler layer.
        d_kv (`int`, *optional*, defaults to 64):
            Size of the key, query, value projections per attention head. The `inner_dim` of the projection layer will
            be defined as `num_heads * d_kv`.
        d_ff (`int`, *optional*, defaults to 4096):
            Size of the intermediate feed forward layer in each `UdopBlock`.
        num_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder and decoder.
        num_decoder_layers (`int`, *optional*):
            Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not set.
        num_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder and decoder.
        relative_attention_num_buckets (`int`, *optional*, defaults to 32):
            The number of buckets to use for each attention layer.
        relative_attention_max_distance (`int`, *optional*, defaults to 128):
            The maximum distance of the longer sequences for the bucket separation.
        relative_bias_args (`list[dict]`, *optional*, defaults to `[{'type': '1d'}, {'type': 'horizontal'}, {'type': 'vertical'}]`):
            A list of dictionaries containing the arguments for the relative bias layers.
        dropout_rate (`float`, *optional*, defaults to 0.1):
            The ratio for all dropout layers.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
        feed_forward_proj (`string`, *optional*, defaults to `"relu"`):
            Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`. Udopv1.1 uses the
            `"gated-gelu"` feed forward projection. Original Udop uses `"relu"`.
        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
            Whether the model should behave as an encoder/decoder or not.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        pad_token_id (`int`, *optional*, defaults to 0):
            The id of the padding token in the vocabulary.
        eos_token_id (`int`, *optional*, defaults to 1):
            The id of the end-of-sequence token in the vocabulary.
        max_2d_position_embeddings (`int`, *optional*, defaults to 1024):
            The maximum absolute position embeddings for relative position encoding.
        image_size (`int`, *optional*, defaults to 224):
            The size of the input images.
        patch_size (`int`, *optional*, defaults to 16):
            The patch size used by the vision encoder.
        num_channels (`int`, *optional*, defaults to 3):
            The number of channels in the input images.
        is_decoder (`bool`, *optional*, defaults to `False`):
            Whether to only use the decoder in an encoder-decoder architecture, otherwise it has no effect on
            decoder-only or encoder-only architectures.
        add_cross_attention (`bool`, *optional*, defaults to `False`):
            Whether cross-attention layers should be added to the model.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
    udoppast_key_valuesd_model	num_heads
num_layers)hidden_sizenum_attention_headsnum_hidden_layersi  i   @   i      N          type1d
horizontalverticalg?gư>g      ?reluT          r   Fc                    s"  || _ || _|| _|| _|| _|| _|| _|d ur|n| j| _|| _|| _	|	| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _t|
tsRtd|
| _| jd}|d | _|d dk| _t|dkrs|d dksyt|dkrtd| d	d
| _t  j!dd|i| d S )Nz6`relative_bias_args` should be a list of dictionaries.-r   gatedr      z`feed_forward_proj`: z is not a valid activation function of the dense layer.Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. 'gated-gelu' or 'relu'Tis_encoder_decoder )"
is_decoderadd_cross_attention
vocab_sizer   d_kvd_ffr	   num_decoder_layersr   relative_attention_num_bucketsrelative_attention_max_distancedropout_ratelayer_norm_epsiloninitializer_factorfeed_forward_proj	use_cachepad_token_ideos_token_idmax_2d_position_embeddings
image_size
patch_sizenum_channels
isinstancelist	TypeErrorrelative_bias_argssplitdense_act_fnis_gated_actlen
ValueErrortie_word_embeddingssuper__init__)selfr"   r   r#   r$   r	   r%   r   r&   r'   r6   r(   r)   r*   r+   r   r,   r-   r.   r/   r0   r1   r2   r    r!   r<   kwargsact_info	__class__r   i/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/udop/configuration_udop.pyr>   _   sF   

$
zUdopConfig.__init__)	__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferenceattribute_mapr>   __classcell__r   r   rB   rD   r      s>    Cr   N)
rH   configuration_utilsr   utilsr   
get_loggerrE   loggerr   __all__r   r   r   rD   <module>   s   
 
