o
    	۷iPH                     @   s   d Z ddlmZ ddlmZ eeZG dd deZG dd deZ	G dd	 d	eZ
G d
d deZG dd deZg dZdS )zBlt model configuration   )PretrainedConfig)loggingc                       F   e Zd ZdZdZ									
									d fdd	Z  ZS )BltLocalEncoderConfigzB
    Configuration class for the Blt Local Encoder component.
    blt_local_encoder  F            N   h㈵>         `      Asilu   {Gz?c                       || _ || _|| _|| _|| _|| _|p|| _|| | _|p%td| d | _	|| _
|	| _|
| _|| _|| _|| _|| _|| _|dd  t jdi |ddi d S N   r   tie_word_embeddingsF 
vocab_sizecross_attn_all_layerscross_attn_khidden_size_globalhidden_sizenum_attention_headsnum_key_value_headshead_dimintintermediate_sizenum_hidden_layersrms_norm_epsdropoutmax_position_embeddings
rope_thetarope_scaling
hidden_actinitializer_rangepopsuper__init__selfr   r   r   r   r   r   r    r$   r%   r&   r'   r(   r)   r*   r#   r+   kwargs	__class__r   _/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/blt/configuration_blt.pyr.      &   

zBltLocalEncoderConfig.__init__)r   Fr   r	   r
   r   Nr   r   r   r   r   Nr   r   r   __name__
__module____qualname____doc__
model_typer.   __classcell__r   r   r2   r4   r      (    r   c                       r   )BltLocalDecoderConfigzB
    Configuration class for the Blt Local Decoder component.
    blt_local_decoderr   Tr   r	   r
   r   N	   r   r   r   r   r   r   r   c                    r   r   r   r/   r2   r   r4   r.   Q   r5   zBltLocalDecoderConfig.__init__)r   Tr   r	   r
   r   Nr@   r   r   r   r   Nr   r   r   r6   r   r   r2   r4   r>   J   r=   r>   c                       s>   e Zd ZdZdZ									
				d fdd	Z  ZS )BltGlobalTransformerConfigzG
    Configuration class for the Blt Global Transformer component.
    blt_global_transformerr	   r   N   r   r      r   r      r   c                    s   || _ || _|p	|| _|| | _|ptd| d | _|| _|| _|| _|| _	|| _
|	| _|
| _|| _|dd  t jdi |ddi d S r   )r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   )r0   r   r   r    r$   r%   r&   r'   r(   r)   r*   r#   r+   r1   r2   r   r4   r.      s   

z#BltGlobalTransformerConfig.__init__)r	   r   NrC   r   r   rD   r   Nr   rE   r   r6   r   r   r2   r4   rA   |   s     rA   c                       s>   e Zd ZdZdZ									
				d fdd	Z  ZS )BltPatcherConfiga	  
    Configuration class for the Blt Patcher/Entropy model component.

    Args:
            vocab_size (`int`, *optional*, defaults to 260):
                Vocabulary size of the Blt patcher model. Defines the number of different tokens that can be represented by the
                `inputs_ids` passed when calling the patcher model.
            hidden_size (`int`, *optional*, defaults to 768):
                Dimension of the hidden representations.
            num_hidden_layers (`int`, *optional*, defaults to 14):
                Number of hidden layers in the Transformer decoder.
            num_attention_heads (`int`, *optional*, defaults to 12):
                Number of attention heads for each attention layer in the Transformer decoder.
            num_key_value_heads (`int`, *optional*):
                This is the number of key_value heads that should be used to implement Grouped Query Attention. If
                `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
                `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
                converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
                by meanpooling all the original heads within that group. For more details, check out [this
                paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
                `num_attention_heads`.
            max_position_embeddings (`int`, *optional*, defaults to 8192):
                The maximum sequence length that this model might ever be used with.
            rms_norm_eps (`float`, *optional*, defaults to 1e-05):
                The epsilon used by the rms normalization layers.
            dropout (`float`, *optional*, defaults to 0.0):
                The dropout ratio for the attention probabilities.
            rope_theta (`float`, *optional*, defaults to 10000.0):
                The base period of the RoPE embeddings.
            intermediate_size (`int`, *optional*, defaults to 2048):
                Dimension of the MLP representations.
            rope_scaling (`dict`, *optional*):
                Dictionary containing the RoPE scaling configuration.
            initializer_range (`float`, *optional*, defaults to 0.02):
                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    blt_patcherr            N    r   r        @r	   r   c                    s   || _ || _|| _|| _|| | _|d ur|n|| _|| _|| _|| _|	| _	d| _
|
p3td| j d | _|| _|| _|dd  t jdi |ddi d S )Nr   r   r   r   Fr   )r   r   r$   r   r!   r    r'   r%   r&   r(   r*   r"   r#   r)   r+   r,   r-   r.   )r0   r   r   r$   r   r    r'   r%   r&   r(   r#   r)   r+   r1   r2   r   r4   r.      s    
zBltPatcherConfig.__init__)r   rH   rI   rJ   NrK   r   r   rL   r	   Nr   r6   r   r   r2   r4   rF      s     %rF   c                       sb   e Zd ZdZdZdgZeeee	dZ
							
														d fdd	Z  ZS )	BltConfigas  
    This is the configuration class to store the configuration of a [`BltModel`]. It is used to instantiate a
    Blt model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
            vocab_size (`int`, *optional*, defaults to 260):
                Vocabulary size of the Blt model. Defines the number of different tokens that can be represented by the
                `inputs_ids` passed when calling [`BltModel`].
            max_position_embeddings (`int`, *optional*, defaults to 4096):
                The maximum sequence length that this model might ever be used with.
            patch_in_forward (`bool`, *optional*, defaults to `True`):
                Whether to perform patching during the forward pass.
            patch_size (`int`, *optional*, defaults to 4):
                Size of the patches used in the patching mechanism.
            patching_mode (`str`, *optional*, defaults to `"entropy"`):
                The mode used for patching, such as entropy-based patching.
            patching_threshold (`float`, *optional*, defaults to 1.34):
                Threshold value used for determining when to apply patches.
            patching_batch_size (`int`, *optional*, defaults to 1):
                Batch size used during the patching process.
            max_patch_length (`int`, *optional*):
                Maximum length of patches that can be generated.
            cross_attn_k (`int`, *optional*, defaults to 2):
                Number of cross-attention heads used in the model.
            encoder_hash_byte_group_size (`list`, *optional*):
                List of byte group sizes used in the encoder hash function.
            encoder_hash_byte_group_vocab (`int`, *optional*, defaults to 500002):
                Vocabulary size for the encoder hash byte groups.
            encoder_hash_byte_group_nb_functions (`int`, *optional*, defaults to 1):
                Number of hash functions used in the encoder byte grouping.
            patcher_config (`BltPatcherConfig`, *optional*):
                Configuration for the patcher component of the model.
            encoder_config (`BltLocalEncoderConfig`, *optional*):
                Configuration for the local encoder component of the model.
            decoder_config (`BltLocalDecoderConfig`, *optional*):
                Configuration for the local decoder component of the model.
            global_config (`BltGlobalTransformerConfig`, *optional*):
                Configuration for the global transformer component of the model.
            tie_word_embeddings (`bool`, *optional*, defaults to `False`):
                Whether to tie weight embeddings.
            initializer_range (`float`, *optional*, defaults to 0.02):
                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
            rope_theta (`float`, *optional*, defaults to 500000.0):
                The base period of the RoPE embeddings.
            rope_scaling (`dict`, *optional*):
                Dictionary containing the RoPE scaling configuration.

    ```python
    >>> from transformers import BltModel, BltConfig

    >>> # Initializing a Blt configuration
    >>> configuration = BltConfig()

    >>> # Initializing a model from the configuration
    >>> model = BltModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```

    Checkpoint: [facebook/blt](https://huggingface.co/facebook/blt)
    bltpast_key_values)patcher_configencoder_configdecoder_configglobal_configr   rD   T   entropy   ]?r   Nr   " Fr   r   c                    sF  || _ || _|| _|| _|| _|| _|| _|| _|| _|| _	|| _
|dd| _|dd| _|d| _|dd| _|	| _|
pDg d| _|| _|| _|d u r\t|d	| _td
 nt|trp|d| tdi || _nt|trx|| _|d u rt|d	| _td nt|tr|d| tdi || _nt|tr|| _|d u rt|d	| _td nt|tr|d| tdi || _nt|tr|| _|d u rt|d	| _ td nt|tr|d| tdi || _ nt|tr|| _ | jj!| j }|| j j!kr|nd | j _"|#dd  t$ j%dd|i| d S )Npatching_devicecudarealtime_patchingTpatching_threshold_addmonotonicityF)r   rT            r   )r+   z8patcher_config is None, using default Blt patcher configr+   z8encoder_config is None, using default Blt encoder configz8decoder_config is None, using default Blt decoder configz6global_config is None, using default Blt global configr   r   )&r   r'   r+   r(   r)   patch_in_forward
patch_sizepatching_modepatching_thresholdpatching_batch_sizemax_patch_lengthgetrX   rZ   r[   r\   r   encoder_hash_byte_group_sizeencoder_hash_byte_group_vocab$encoder_hash_byte_group_nb_functionsrF   rP   loggerinfo
isinstancedict
setdefaultr   rQ   r>   rR   rA   rS   r   encoder_cross_output_sizer,   r-   r.   )r0   r   r'   r`   ra   rb   rc   rd   re   r   rg   rh   ri   rP   rQ   rR   rS   r   r+   r(   r)   r1   ro   r2   r   r4   r.   >  sp   







zBltConfig.__init__)r   rD   TrT   rU   rV   r   Nr   NrW   r   NNNNFr   r   N)r7   r8   r9   r:   r;   keys_to_ignore_at_inferencerF   r   r>   rA   sub_configsr.   r<   r   r   r2   r4   rM      s<    B	rM   )rM   rF   r   r>   rA   N)r:   configuration_utilsr   utilsr   
get_loggerr7   rj   r   r>   rA   rF   rM   __all__r   r   r   r4   <module>   s   
22*L 0