o
    ߥi@                     @   sH   d Z ddlZddlmZ ddlmZ e ZddiZG dd deZ	dS )	z OFA model configuration    N)PretrainedConfig)loggingz
ofa-mediumz8https://huggingface.co/ofa-base/resolve/main/config.jsonc                       s   e Zd ZdZdZdgZdddZ					
				
																																																		d! fdd 	Z  ZS )"	OFAConfiga*$  
    This is the configuration class to store the configuration of a [`~OFAModel`]. It is used to instantiate an OFA
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the OFA [ofa-base](https://huggingface.co/ofa-base)
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 50265):
            Vocabulary size of the OFA model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`~OFAModel`] or [`~TFOFAModel`].
        d_model (`int`, *optional*, defaults to 1024):
            Dimension of the layers and the pooler layer.
        encoder_layers (`int`, *optional*, defaults to 12):
            Number of encoder layers.
        decoder_layers (`int`, *optional*, defaults to 12):
            Number of decoder layers.
        encoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimension of the "intermediate" (often named feed-forward) layer in decoder.
        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimension of the "intermediate" (often named feed-forward) layer in decoder.
        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for activations inside the fully connected layer.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for classifier.
        max_position_embeddings (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        init_std (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        encoder_layerdrop: (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
            for more details.
        decoder_layerdrop: (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
            for more details.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
            Whether or not the model is used as an encoder/decoder.
        scale_embedding (`bool`, *optional*, defaults to `False`):
            Whether or not to scale the embedding. If True, embedding = Math.qrt(d_model) * embedding
        pad_token_id (`int`, *optional*, defaults to `1`):
            The id of the _padding_ token.
        bos_token_id (`int`, *optional*, defaults to `0`):
            The id of the _beginning-of-stream_ token.
        decoder_start_token_id (`int`, *optional*, defaults to `0`):
            If an encoder-decoder model starts decoding with a different token than _bos_, the id of that token.
        eos_token_id (`int`, *optional*, defaults to `2`):
            The id of the _end-of-stream_ token.
        forced_eos_token_id (`int`, *optional*, defaults to `2`):
            The id of the token to force as the last generated token when `max_length` is reached.
        encoder_normalize_before (`bool`, *optional*, defaults to `True`):
            Whether or not to use layer normalization before the self attention and fc layer within encoder layer
        decoder_normalize_before (`bool`, *optional*, defaults to `True`):
            Whether or not to use layer normalization before the self attention and fc layer within decoder layer
        normformer (`bool`, *optional*, defaults to `True`):
            Whether or not to use layer normalization between the self attention layers and fc layer within
            encoder&decoder layer
        encoder_drop_path_rate (`float`, *optional*, defaults to `0.0`):
            The drop path rate using in the encoder. see more about drop path [drop path](https://arxiv.org/abs/1605.07648)
        decoder_drop_path_rate (`float`, *optional*, defaults to `0.0`):
            The drop path rate using in the decoder. see more about drop path [drop path](https://arxiv.org/abs/1605.07648)
        layernorm_embedding (`bool`, *optional*, defaults to `True`):
            Whether or not to use layer normalization for text input embedding in encoder and decoder.
        patch_layernorm_embedding (`bool`, *optional*, defaults to `True`):
            Whether or not to use layer normalization for image patch input embedding in encoder and decoder.
        entangle_position_embedding (`bool`, *optional*, defaults to `False`):
            Whether or not to entangle position embedding to input embedding.
        resnet_type (`str`, *optional*, defaults to `"resnet101"`):
            The image encoder's type in OFA, only works when use_ofasys=False. `"resnet18"`, `"resnet34"`,
            `"resnet50"`, `"resnet101"` and `"resnet152"` are supported.
        resnet_model_path (`str`, *optional*, defaults to `None`):
            The path where can load resnet model. If None, will use random initialized weights.
        resnet_drop_path_rate
            The drop path rate using in resnet for image encoding, see more about drop path
            [drop path](https://arxiv.org/abs/1605.07648)
        token_bucket_size (`int`, *optional*, defaults to `256`):
            The number of token buckets to use for each attention layer.
        image_bucket_size (`int`, *optional*, defaults to `42`):
            The number of image buckets to use for each attention layer.
        add_type_embedding (`bool`, *optional*, defaults to `True`):
            Whether or not to add type embedding to the input while encoding. So far, type means the type of modality,
            and only Text&Image modalities is supported, `0`=Text, `1`=Image
        share_decoder_input_output_embed (`bool`, *optional*, defaults to `True`):
            Whether or not to share the input embedding table as the weights the output projection in decoder. If False,
            using a new linear projection.
        attn_scale_factor (`float`, *optional*, defaults to `2.0`):
            The position embedding scaling factor. If it works,
            position_embedding = position_embedding * float(d_model / num_attention_heads * attn_scale_factor)**-0.5
        code_layernorm_embedding (`bool`, *optional*, defaults to `True`):
            Whether or not to user layer normalization for code generation
        code_image_size (`int`, *optional*, defaults to `128`):
            Image size of generated images. Also used in calculating the image's position id for attention bias.
        interpolate_position (`bool`, *optional*, defaults to `False`):
            Deprecated now, will be deleted in next version.
        orig_patch_image_size (`int`, *optional*, defaults to `224`):
            Deprecated now, will be deleted in next version.
        share_attn_bias (`bool`, *optional*, defaults to `False`):
            Whether or not to share attn_bias cross transformer layers
        use_image_feature (`bool`, *optional*, defaults to `True`):
            Whether or not the model have image modality.
        disable_entangle (`bool`, *optional*, defaults to `False`):
            Whether or not to disable the entangle relative configs.
        use_ofasys (`bool`, *optional*, defaults to `False`):
            Whether or not the model is come from OFA-Sys. If True, the model structure will be some differences from OFA
        vit_type (`str`, *optional*, defaults to `"vit_base"`):
            The image encoder's type in OFA-Sys, only works when use_ofasys=True. `"vit_base"`, `"vit_large"`,
            `"vit_large_336"` and `"vit_huge"` are supported.
        vit_drop_path_rate
            The drop path rate using the image encoder vit. see more about drop path
            [drop path](https://arxiv.org/abs/1605.07648)
    ofapast_key_valuesencoder_attention_headsd_model)num_attention_headshidden_sizeA                      Tgelu   皙?{Gz?F   r      	resnet101N   *          @      vit_base      ?c:           ;   	      s  || _ || _|| _|| _|| _|| _|| _|| _|| _|| _	|| _
|| _|| _|| _|	| _|
| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _| | _|!| _|"| _|#| _|$| _|%| _|&| _ |'| _!|(| _"|)| _#|*| _$|+| _%|,| _&|-| _'|.| _(|/| _)|0| _*|1| _+|2| _,|3| _-|4| _.|5| _/|6| _0|7| _1|8| _2|9| _3t4 j5d||||||d|: | j6d u r|:7ddr| j8| _6t9:d| j8 d d S d S d S )N)pad_token_idbos_token_ideos_token_idis_encoder_decoderdecoder_start_token_idforced_eos_token_idforce_bos_token_to_be_generatedFz:Please make sure the config includes `forced_bos_token_id=zT` in future versions. The config can simply be saved and uploaded again to be fixed. );
vocab_sizemax_position_embeddingsr   encoder_ffn_dimencoder_layersr   decoder_ffn_dimdecoder_layersdecoder_attention_headsdropoutattention_dropoutactivation_dropoutactivation_functioninit_stdencoder_layerdropdecoder_layerdropclassifier_dropout	use_cachenum_hidden_layersscale_embeddingencoder_normalize_beforedecoder_normalize_before
normformerencoder_drop_path_ratedecoder_drop_path_ratelayernorm_embeddingpatch_layernorm_embeddingresnet_typeresnet_model_pathresnet_drop_path_ratetoken_bucket_sizeimage_bucket_sizeadd_type_embedding share_decoder_input_output_embedattn_scale_factorcode_layernorm_embeddingcode_image_sizeentangle_position_embeddinginterpolate_positionorig_patch_image_sizeshare_attn_biasuse_image_featuredisable_entangle
use_ofasysvit_typevit_drop_path_rateuse_gamma_featuregammaexclude_mlptemperature_init_valueremove_decoder_type_embeddingmlp_dimsuper__init__forced_bos_token_idgetr    warningswarn);selfr'   r(   r*   r)   r   r,   r+   r-   r3   r4   r6   r"   r1   r   r.   r/   r0   r2   r5   r8   r   r    r#   r!   r$   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   kwargs	__class__r&   g/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/ofa/configuration_ofa.pyrZ      s   ;zOFAConfig.__init__)9r   r   r   r   r   r   r   r   r   r   TTr   r   r   r   r   r   r   Fr   r   r   r   r   TTTr   r   TTr   Nr   r   r   TTr   Tr   FFr   FTFFr   r   Fr   TNFr   )	__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferenceattribute_maprZ   __classcell__r&   r&   ra   rc   r      s     r   )
rg   r]   transformersr   transformers.utilsr   
get_loggerlogger!OFA_PRETRAINED_CONFIG_ARCHIVE_MAPr   r&   r&   r&   rc   <module>   s   