o
    ºiPH  ã                   @   s‚   d Z ddlmZ ddlmZ e e¡ZG dd„ deƒZG dd„ deƒZ	G dd	„ d	eƒZ
G d
d„ deƒZG dd„ deƒZg d¢ZdS )zBlt model configurationé   )ÚPretrainedConfig)Úloggingc                       óF   e Zd ZdZdZ									
									d‡ fdd„	Z‡  ZS )ÚBltLocalEncoderConfigzB
    Configuration class for the Blt Local Encoder component.
    Úblt_local_encoderé  Fé   é   é   é   Né   çñhãˆµøä>ç        é `  ç    €„AÚsilué   ç{®Gáz”?c                    ó¨   || _ || _|| _|| _|| _|| _|p|| _|| | _|p%td| d ƒ| _	|| _
|	| _|
| _|| _|| _|| _|| _|| _| dd ¡ tƒ jdi |¤ddi¤Ž d S ©Né   r   Útie_word_embeddingsF© ©Ú
vocab_sizeÚcross_attn_all_layersÚcross_attn_kÚhidden_size_globalÚhidden_sizeÚnum_attention_headsÚnum_key_value_headsÚhead_dimÚintÚintermediate_sizeÚnum_hidden_layersÚrms_norm_epsÚdropoutÚmax_position_embeddingsÚ
rope_thetaÚrope_scalingÚ
hidden_actÚinitializer_rangeÚpopÚsuperÚ__init__©Úselfr   r   r   r   r   r   r    r$   r%   r&   r'   r(   r)   r*   r#   r+   Úkwargs©Ú	__class__r   úf/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/blt/configuration_blt.pyr.      ó&   

zBltLocalEncoderConfig.__init__)r   Fr   r	   r
   r   Nr   r   r   r   r   Nr   r   r   ©Ú__name__Ú
__module__Ú__qualname__Ú__doc__Ú
model_typer.   Ú__classcell__r   r   r2   r4   r      ó(    ïr   c                       r   )ÚBltLocalDecoderConfigzB
    Configuration class for the Blt Local Decoder component.
    Úblt_local_decoderr   Tr   r	   r
   r   Né	   r   r   r   r   r   r   r   c                    r   r   r   r/   r2   r   r4   r.   Q   r5   zBltLocalDecoderConfig.__init__)r   Tr   r	   r
   r   Nr@   r   r   r   r   Nr   r   r   r6   r   r   r2   r4   r>   J   r=   r>   c                       s>   e Zd ZdZdZ									
				d‡ fdd„	Z‡  ZS )ÚBltGlobalTransformerConfigzG
    Configuration class for the Blt Global Transformer component.
    Úblt_global_transformerr	   r   Né   r   r   é   r   r   é   r   c                    s   || _ || _|p	|| _|| | _|ptd| d ƒ| _|| _|| _|| _|| _	|| _
|	| _|
| _|| _| dd ¡ tƒ jdi |¤ddi¤Ž d S r   )r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   )r0   r   r   r    r$   r%   r&   r'   r(   r)   r*   r#   r+   r1   r2   r   r4   r.   ƒ   s   

z#BltGlobalTransformerConfig.__init__)r	   r   NrC   r   r   rD   r   Nr   rE   r   r6   r   r   r2   r4   rA   |   s     órA   c                       s>   e Zd ZdZdZ									
				d‡ fdd„	Z‡  ZS )ÚBltPatcherConfiga¿	  
    Configuration class for the Blt Patcher/Entropy model component.

    Args:
            vocab_size (`int`, *optional*, defaults to 260):
                Vocabulary size of the Blt patcher model. Defines the number of different tokens that can be represented by the
                `inputs_ids` passed when calling the patcher model.
            hidden_size (`int`, *optional*, defaults to 768):
                Dimension of the hidden representations.
            num_hidden_layers (`int`, *optional*, defaults to 14):
                Number of hidden layers in the Transformer decoder.
            num_attention_heads (`int`, *optional*, defaults to 12):
                Number of attention heads for each attention layer in the Transformer decoder.
            num_key_value_heads (`int`, *optional*):
                This is the number of key_value heads that should be used to implement Grouped Query Attention. If
                `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
                `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
                converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
                by meanpooling all the original heads within that group. For more details, check out [this
                paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
                `num_attention_heads`.
            max_position_embeddings (`int`, *optional*, defaults to 8192):
                The maximum sequence length that this model might ever be used with.
            rms_norm_eps (`float`, *optional*, defaults to 1e-05):
                The epsilon used by the rms normalization layers.
            dropout (`float`, *optional*, defaults to 0.0):
                The dropout ratio for the attention probabilities.
            rope_theta (`float`, *optional*, defaults to 10000.0):
                The base period of the RoPE embeddings.
            intermediate_size (`int`, *optional*, defaults to 2048):
                Dimension of the MLP representations.
            rope_scaling (`dict`, *optional*):
                Dictionary containing the RoPE scaling configuration.
            initializer_range (`float`, *optional*, defaults to 0.02):
                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    Úblt_patcherr   é   é   é   Né    r   r   ç     ˆÃ@r	   r   c                    s    || _ || _|| _|| _|| | _|d ur|n|| _|| _|| _|| _|	| _	d| _
|
p3td| j d ƒ| _|| _|| _| dd ¡ tƒ jdi |¤ddi¤Ž d S )Nr   r   r   r   Fr   )r   r   r$   r   r!   r    r'   r%   r&   r(   r*   r"   r#   r)   r+   r,   r-   r.   )r0   r   r   r$   r   r    r'   r%   r&   r(   r#   r)   r+   r1   r2   r   r4   r.   Î   s    
zBltPatcherConfig.__init__)r   rH   rI   rJ   NrK   r   r   rL   r	   Nr   r6   r   r   r2   r4   rF   ¦   s     %órF   c                       sb   e Zd ZdZdZdgZeeee	dœZ
							
														d‡ fdd„	Z‡  ZS )Ú	BltConfigas  
    This is the configuration class to store the configuration of a [`BltModel`]. It is used to instantiate a
    Blt model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
            vocab_size (`int`, *optional*, defaults to 260):
                Vocabulary size of the Blt model. Defines the number of different tokens that can be represented by the
                `inputs_ids` passed when calling [`BltModel`].
            max_position_embeddings (`int`, *optional*, defaults to 4096):
                The maximum sequence length that this model might ever be used with.
            patch_in_forward (`bool`, *optional*, defaults to `True`):
                Whether to perform patching during the forward pass.
            patch_size (`int`, *optional*, defaults to 4):
                Size of the patches used in the patching mechanism.
            patching_mode (`str`, *optional*, defaults to `"entropy"`):
                The mode used for patching, such as entropy-based patching.
            patching_threshold (`float`, *optional*, defaults to 1.34):
                Threshold value used for determining when to apply patches.
            patching_batch_size (`int`, *optional*, defaults to 1):
                Batch size used during the patching process.
            max_patch_length (`int`, *optional*):
                Maximum length of patches that can be generated.
            cross_attn_k (`int`, *optional*, defaults to 2):
                Number of cross-attention heads used in the model.
            encoder_hash_byte_group_size (`list`, *optional*):
                List of byte group sizes used in the encoder hash function.
            encoder_hash_byte_group_vocab (`int`, *optional*, defaults to 500002):
                Vocabulary size for the encoder hash byte groups.
            encoder_hash_byte_group_nb_functions (`int`, *optional*, defaults to 1):
                Number of hash functions used in the encoder byte grouping.
            patcher_config (`BltPatcherConfig`, *optional*):
                Configuration for the patcher component of the model.
            encoder_config (`BltLocalEncoderConfig`, *optional*):
                Configuration for the local encoder component of the model.
            decoder_config (`BltLocalDecoderConfig`, *optional*):
                Configuration for the local decoder component of the model.
            global_config (`BltGlobalTransformerConfig`, *optional*):
                Configuration for the global transformer component of the model.
            tie_word_embeddings (`bool`, *optional*, defaults to `False`):
                Whether to tie weight embeddings.
            initializer_range (`float`, *optional*, defaults to 0.02):
                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
            rope_theta (`float`, *optional*, defaults to 500000.0):
                The base period of the RoPE embeddings.
            rope_scaling (`dict`, *optional*):
                Dictionary containing the RoPE scaling configuration.

    ```python
    >>> from transformers import BltModel, BltConfig

    >>> # Initializing a Blt configuration
    >>> configuration = BltConfig()

    >>> # Initializing a model from the configuration
    >>> model = BltModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```

    Checkpoint: [facebook/blt](https://huggingface.co/facebook/blt)
    ÚbltÚpast_key_values)Úpatcher_configÚencoder_configÚdecoder_configÚglobal_configr   rD   Té   Úentropyç   €ø]õ?r   Nr   é"¡ Fr   r   c                    sF  || _ || _|| _|| _|| _|| _|| _|| _|| _|| _	|| _
| dd¡| _| dd¡| _| d¡| _| dd¡| _|	| _|
pDg d¢| _|| _|| _|d u r\t|d	| _t d
¡ nt|tƒrp| d|¡ tdi |¤Ž| _nt|tƒrx|| _|d u rˆt|d	| _t d¡ nt|tƒrœ| d|¡ tdi |¤Ž| _nt|tƒr¤|| _|d u r´t|d	| _t d¡ nt|tƒrÈ| d|¡ tdi |¤Ž| _nt|tƒrÐ|| _|d u ràt|d	| _ t d¡ nt|tƒrô| d|¡ tdi |¤Ž| _ nt|tƒrü|| _ | jj!| j }|| j j!kr|nd | j _"| #dd ¡ t$ƒ j%dd|i|¤Ž d S )NÚpatching_deviceÚcudaÚrealtime_patchingTÚpatching_threshold_addÚmonotonicityF)r   rT   é   é   é   r   )r+   z8patcher_config is None, using default Blt patcher configr+   z8encoder_config is None, using default Blt encoder configz8decoder_config is None, using default Blt decoder configz6global_config is None, using default Blt global configr   r   )&r   r'   r+   r(   r)   Úpatch_in_forwardÚ
patch_sizeÚpatching_modeÚpatching_thresholdÚpatching_batch_sizeÚmax_patch_lengthÚgetrX   rZ   r[   r\   r   Úencoder_hash_byte_group_sizeÚencoder_hash_byte_group_vocabÚ$encoder_hash_byte_group_nb_functionsrF   rP   ÚloggerÚinfoÚ
isinstanceÚdictÚ
setdefaultr   rQ   r>   rR   rA   rS   r   Úencoder_cross_output_sizer,   r-   r.   )r0   r   r'   r`   ra   rb   rc   rd   re   r   rg   rh   ri   rP   rQ   rR   rS   r   r+   r(   r)   r1   ro   r2   r   r4   r.   >  sp   







ÿzBltConfig.__init__)r   rD   TrT   rU   rV   r   Nr   NrW   r   NNNNFr   r   N)r7   r8   r9   r:   r;   Úkeys_to_ignore_at_inferencerF   r   r>   rA   Úsub_configsr.   r<   r   r   r2   r4   rM   ò   s<    Bü	ërM   )rM   rF   r   r>   rA   N)r:   Úconfiguration_utilsr   Úutilsr   Ú
get_loggerr7   rj   r   r>   rA   rF   rM   Ú__all__r   r   r   r4   Ú<module>   s   
22*L 0