o
    eiM                     @   s   d Z ddlmZ ddlmZ ddlmZ eeZ	G dd deZ
G dd deZG d	d
 d
eZG dd deZG dd deZg dZdS )zBlt model configuration   )PreTrainedConfig)RopeParameters)loggingc                        s   e Zd ZdZdZdZ								
					
			d#ded
B ded
B ded
B ded
B ded
B ded
B ded
B ded
B ded
B ded
B ded
B de	e
ee	f B d
B ded
B ded
B d ed
B f fd!d"Z  ZS )$BltLocalEncoderConfigzB
    Configuration class for the Blt Local Encoder component.
    blt_local_encoder    A  F            N   h㈵>         `  silu   {Gz?
vocab_sizecross_attn_all_layerscross_attn_khidden_size_globalhidden_sizenum_attention_headsnum_key_value_headsnum_hidden_layersrms_norm_epsdropoutmax_position_embeddingsrope_parameters
hidden_actintermediate_sizeinitializer_rangec                    s   || _ || _|| _|| _|| _|| _|p|| _|| | _|p%td| d | _	|| _
|	| _|
| _|| _|| _|| _|| _|dd  t jdi |ddi d S )N   r   tie_word_embeddingsF )r   r   r   r   r   r   r   head_dimintr!   r   r   r   r   r    r"   r   popsuper__init__)selfr   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   kwargs	__class__r%   g/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/blt/configuration_blt.pyr*       s$   

zBltLocalEncoderConfig.__init__)r   Fr	   r
   r   r   Nr   r   r   r   Nr   r   r   __name__
__module____qualname____doc__
model_typedefault_thetar'   boolfloatr   dictstrr*   __classcell__r%   r%   r-   r/   r      sd    	
r   c                (       s   e Zd ZdZdZdZ								
					
				
	
	
	d(ded
B ded
B ded
B ded
B ded
B ded
B ded
B ded
B ded
B ded
B ded
B de	e
ee	f B d
B ded
B d ed
B d!ed
B d"ed
B d#ed
B d$ed
B d%ed
B f& fd&d'Z  ZS ))BltLocalDecoderConfigzB
    Configuration class for the Blt Local Decoder component.
    blt_local_decoderr   r   Tr	   r
   r   r   N	   r   r   r   r   r   r   Fr   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   pad_token_idbos_token_ideos_token_idr$   c                    s   || _ || _|| _|| _|| _|| _|p|| _|| | _|p%td| d | _	|| _
|	| _|
| _|| _|| _|| _|| _|| _|| _d| _|| _t jdi | d S Nr#   r   Fr%   )r   r   r   r   r   r   r   r&   r'   r!   r   r   r   r   r    r"   r?   r@   rA   r$   r   r)   r*   )r+   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r?   r@   rA   r$   r,   r-   r%   r/   r*   Q   s*   

zBltLocalDecoderConfig.__init__)r   Tr	   r
   r   r   Nr>   r   r   r   Nr   r   r   NNNFr0   r%   r%   r-   r/   r<   I   s|    	
r<   c                       s   e Zd ZdZdZdZ								
					ddedB dedB dedB dedB dedB dedB dedB dee	e
ef B dB de
dB dedB dedB dedB f fddZ  ZS )BltGlobalTransformerConfigzG
    Configuration class for the Blt Global Transformer component.
    blt_global_transformerr   r
   r   N   r   r      r      r   Fr   r   r   r   r   r   r   r   r    r!   r"   r$   c                    s|   || _ || _|p	|| _|| | _|
ptd| d | _|| _|| _|| _|| _	|	| _
|| _d| _|| _t jdi | d S rB   )r   r   r   r&   r'   r!   r   r   r   r   r    r"   r$   r   r)   r*   )r+   r   r   r   r   r   r   r   r   r    r!   r"   r$   r,   r-   r%   r/   r*      s   

z#BltGlobalTransformerConfig.__init__)r
   r   NrE   r   r   rF   Nr   rG   r   F)r1   r2   r3   r4   r5   r6   r'   r8   r   r9   r:   r7   r*   r;   r%   r%   r-   r/   rC      sR    	
rC   c                       s   e Zd ZdZdZ									
				ddedB dedB dedB dedB dedB dedB dedB dedB dedB deee	ef B dB dedB de
dB f fddZ  ZS )BltPatcherConfiga	  
    Configuration class for the Blt Patcher/Entropy model component.

    Args:
        vocab_size (`int`, *optional*, defaults to 260):
            Vocabulary size of the Blt patcher model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling the patcher model.
        hidden_size (`int`, *optional*, defaults to 768):
            Dimension of the hidden representations.
        num_hidden_layers (`int`, *optional*, defaults to 14):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimension of the MLP representations.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    blt_patcherr            N    r   r   r
   r   Fr   r   r   r   r   r   r   r   r!   r   r"   r$   c                    s   || _ || _|| _|| _|| | _|d ur|n|| _|| _|| _|| _d| _	|	p0t
d| j d | _|| _|
| _d| _t jdi | d S )Nr   r#   r   Fr%   )r   r   r   r   r&   r   r   r   r   r    r'   r!   r"   r   r$   r)   r*   )r+   r   r   r   r   r   r   r   r   r!   r   r"   r$   r,   r-   r%   r/   r*      s   
zBltPatcherConfig.__init__)r   rJ   rK   rL   NrM   r   r   r
   Nr   F)r1   r2   r3   r4   r5   r'   r8   r   r9   r:   r7   r*   r;   r%   r%   r-   r/   rH      sP    %	
rH   c                0       s(  e Zd ZdZdZdgZdZeee	e
dZ						
																	d*dedB dedB dedB dedB dedB dedB dedB dedB dedB dedB dedB dedB dedB dedB d edB d!edB d"edB d#edB d$edB d%edB d&edB d'eeeef B dB f, fd(d)Z  ZS )+	BltConfiga<  
    This is the configuration class to store the configuration of a [`BltModel`]. It is used to instantiate a
    Blt model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 260):
            Vocabulary size of the Blt model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`BltModel`].
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used with.
        patch_in_forward (`bool`, *optional*, defaults to `True`):
            Whether to perform patching during the forward pass.
        patch_size (`int`, *optional*, defaults to 4):
            Size of the patches used in the patching mechanism.
        patching_mode (`str`, *optional*, defaults to `"entropy"`):
            The mode used for patching, such as entropy-based patching.
        patching_threshold (`float`, *optional*, defaults to 1.34):
            Threshold value used for determining when to apply patches.
        patching_batch_size (`int`, *optional*, defaults to 1):
            Batch size used during the patching process.
        max_patch_length (`int`, *optional*):
            Maximum length of patches that can be generated.
        cross_attn_k (`int`, *optional*, defaults to 2):
            Number of cross-attention heads used in the model.
        encoder_hash_byte_group_size (`list`, *optional*):
            List of byte group sizes used in the encoder hash function.
        encoder_hash_byte_group_vocab (`int`, *optional*, defaults to 500002):
            Vocabulary size for the encoder hash byte groups.
        encoder_hash_byte_group_nb_functions (`int`, *optional*, defaults to 1):
            Number of hash functions used in the encoder byte grouping.
        patcher_config (`BltPatcherConfig`, *optional*):
            Configuration for the patcher component of the model.
        encoder_config (`BltLocalEncoderConfig`, *optional*):
            Configuration for the local encoder component of the model.
        decoder_config (`BltLocalDecoderConfig`, *optional*):
            Configuration for the local decoder component of the model.
        global_config (`BltGlobalTransformerConfig`, *optional*):
            Configuration for the global transformer component of the model.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.

    ```python
    >>> from transformers import BltModel, BltConfig

    >>> # Initializing a Blt configuration
    >>> configuration = BltConfig()

    >>> # Initializing a model from the configuration
    >>> model = BltModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```

    Checkpoint: [facebook/blt](https://huggingface.co/facebook/blt)
    bltpast_key_valuesr   )patcher_configencoder_configdecoder_configglobal_configr   rF   T   entropy   ]?r   Nr	   " Fr   r   r   patch_in_forward
patch_sizepatching_modepatching_thresholdpatching_batch_sizemax_patch_lengthr   encoder_hash_byte_group_sizeencoder_hash_byte_group_vocab$encoder_hash_byte_group_nb_functionsrQ   rR   rS   rT   r$   r?   r@   rA   r"   r   c                    sH  || _ || _|| _|| _|| _|| _|| _|| _|| _|	dd| _
|	dd| _|	d| _|	dd| _|	| _|
p>g d| _|| _|| _|d u rVt|d	| _td
 nt|trj|d| tdi || _nt|trr|| _|d u rt|d	| _td nt|tr|d| tdi || _nt|tr|| _|d u rt|d	| _td nt|tr|d| tdi || _nt|tr|| _|d u rt|d	| _td nt|tr|d| tdi || _nt|tr|| _| jj| j }|| jjkr|nd | j_ || _!|| _"|| _#|| _$|| _%t& j'di | d S )Npatching_devicecudarealtime_patchingTpatching_threshold_addmonotonicityF)r   rU            r#   )r"   z8patcher_config is None, using default Blt patcher configr"   z8encoder_config is None, using default Blt encoder configz8decoder_config is None, using default Blt decoder configz6global_config is None, using default Blt global configr%   )(r   r   r"   rY   rZ   r[   r\   r]   r^   getrb   rd   re   rf   r   r_   r`   ra   rH   rQ   loggerinfo
isinstancer9   
setdefaultr   rR   r<   rS   rC   rT   r   encoder_cross_output_sizer?   r@   rA   r$   r   r)   r*   )r+   r   r   rY   rZ   r[   r\   r]   r^   r   r_   r`   ra   rQ   rR   rS   rT   r$   r?   r@   rA   r"   r   r,   ro   r-   r%   r/   r*   @  st   







zBltConfig.__init__)r   rF   TrU   rV   rW   r   Nr	   NrX   r   NNNNFNNNr   N)r1   r2   r3   r4   r5   keys_to_ignore_at_inferencer6   rH   r   r<   rC   sub_configsr'   r7   r:   r8   r9   r   r*   r;   r%   r%   r-   r/   rN      s    B		
rN   )rN   rH   r   r<   rC   N)r4   configuration_utilsr   modeling_rope_utilsr   utilsr   
get_loggerr1   rk   r   r<   rC   rH   rN   __all__r%   r%   r%   r/   <module>   s   
17)J 5