o
    eiV                     @   sn   d dl mZmZ d dlmZ d dlmZ eeZ	G dd deZ
G dd deZG dd	 d	eZg d
ZdS )   )PreTrainedConfiglayer_type_validation)RopeParameters)loggingc                *       s
  e Zd ZdZddddddddZdZdZ				
																d.dedB dedB dedB dedB dedB dedB dedB d edB d!edB d"e	dB d#edB d$e	dB d%e	dB d&edB d'edB d(e
dB d)e	dB d*e	dB d+eeeef B dB f& fd,d-Z  ZS )/Llama4VisionConfigaB  
    This is the configuration class to store the configuration of a [`Llama4VisionModel`]. It is used to instantiate a
    Llama4 vision model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Llama4 109B.

    e.g. [meta-llama/Llama-4-Scout-17B-16E](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        num_hidden_layers (`int`, *optional*, defaults to 34):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input image.
        intermediate_size (`int`, *optional*, defaults to 5632):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
        vision_output_dim (`int`, *optional*, defaults to 7680):
            Dimensionality of the vision model output. Includes output of transformer
            encoder with intermediate layers and global transformer encoder.
        image_size (`int`, *optional*, defaults to 448):
            The size (resolution) of each image *tile*.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
            Controls which vision tokens are kept from the backbone. `"default"` drops the CLS token and `"full"` keeps all tokens.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        pixel_shuffle_ratio (`float`, *optional*, defaults to 0.5):
            Pixel-shuffle ratio for downsampling patch tokens. Smaller values produce fewer tokens (more downsampling).
        projector_input_dim (`int`, *optional*, defaults to 4096):
            Width of the vision adapter MLP before pixel shuffle. Larger value increases capacity and compute.
        projector_output_dim (`int`, *optional*, defaults to 4096):
            Output width of the vision adapter. Larger value yields higher-dimensional image features.
        multi_modal_projector_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the multi-modal projector layers.
        projector_dropout (`float`, *optional*, defaults to 0.0):
            Dropout rate inside the vision adapter MLP. Higher value adds more regularization.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout rate on vision attention probabilities. Higher value adds more regularization.
        rope_parameters (`RopeParameters`, *optional*):
            RoPE Parameters
    colwiserowwisecolwise_gather_output)zmodel.layers.*.self_attn.q_projzmodel.layers.*.self_attn.k_projzmodel.layers.*.self_attn.v_projzmodel.layers.*.self_attn.o_projzvision_adapter.mlp.fc1zvision_adapter.mlp.fc2zpatch_embedding.linearllama4_vision_modelvision_config   gelu"      r              h㈵>default{Gz?      ?   F        Nhidden_size
hidden_actnum_hidden_layersnum_attention_headsnum_channelsintermediate_sizevision_output_dim
image_size
patch_sizenorm_epsvision_feature_select_strategyinitializer_rangepixel_shuffle_ratioprojector_input_dimprojector_output_dimmulti_modal_projector_biasprojector_dropoutattention_dropoutrope_parametersc                    s   || _ || _|| _|| _|| _|| _|| _|	| _|
| _|| _	|| _
|| _|| _|| _|| _|| _|| _|| _|| _t jdi | d S )N )r   r   r   r   r   r!   r    r"   r#   r   r%   r&   r'   r(   r)   r*   r+   r$   r,   super__init__)selfr   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   kwargs	__class__r-   m/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/llama4/configuration_llama4.pyr/   [   s(   zLlama4VisionConfig.__init__)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   Fr   r   N)__name__
__module____qualname____doc__base_model_tp_plan
model_typebase_config_keyintstrfloatboolr   dictr/   __classcell__r-   r-   r2   r4   r      s    6		
r   c                       s   e Zd ZdZdZdgZdZdddddddddddddZddddd	d	dddd
d
Z																													 					!d%d"e	e
ee	f B dB f fd#d$Z  ZS )&Llama4TextConfiga)  
    This is the configuration class to store the configuration of a [`Llama4TextModel`]. It is used to instantiate a
    Llama4 text model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Llama4 109B.

    e.g. [meta-llama/Llama-4-Scout-17B-16E](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 202048):
            Vocabulary size of the Llama4 text model. Defines the maximum number of different tokens that can be represented
            by the `inputs_ids` passed when calling [`Llama4TextModel`].
        hidden_size (`int`, *optional*, defaults to 5120):
            Dimensionality of the embeddings and hidden states.
        intermediate_size (`int`, *optional*, defaults to 8192):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
        intermediate_size_mlp (`int`, *optional*, defaults to 16384):
            Intermediate size of dense MLP layers. Larger value increases FFN capacity and compute.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 40):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If not
            specified, will default to `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 128):
            Per-head attention dimension. Larger value increases head width and compute.
        hidden_act (`str` or `Callable`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the encoder and pooler.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions.
        pad_token_id (`int`, *optional*, defaults to 128004):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the beginning of sentence token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the end of sentence token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout rate on vision attention probabilities. Higher value adds more regularization.
        num_experts_per_tok (`int`, *optional*, defaults to 1):
            Top-k experts routed per token. Higher value uses more experts per token and more compute.
        num_local_experts (`int`, *optional*, defaults to 16):
            Number of experts in each MoE layer. Higher value increases capacity and routing choices.
        moe_layers (`list[int]`, *optional*):
            List of layer indices that use MoE. Overrides `interleave_moe_layer_step` when set.
        interleave_moe_layer_step (`int`, *optional*, defaults to 1):
            Spacing between MoE layers when `moe_layers` is `None`. Larger value means fewer MoE layers.
        use_qk_norm (`bool`, *optional*, defaults to `True`):
            Whether to L2-normalize queries/keys on RoPE layers. Can stabilize attention when enabled.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether to return router logits (and auxiliary loss) in outputs.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            Weight for the router auxiliary loss. Higher value makes routing loss contribute more to total loss.
        router_jitter_noise (`float`, *optional*, defaults to 0.0):
            Amount of noise added to router logits during training. Higher value increases exploration.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        no_rope_layers (`list[int]`, *optional*):
            List with at least the same length as the number of layers in the model.
            A `1` at an index position indicates that the corresponding layer will use RoPE,
            while a `0` indicates that it's a NoPE layer.
        no_rope_layer_interval (`int`, *optional*, defaults to 4):
            If `no_rope_layers` is `None`, it will be created using a NoPE layer every
            `no_rope_layer_interval` layers.
        attention_chunk_size (`int`, *optional*, defaults to 8192):
            Chunk size for the attention computation. Smaller value enforces more local attention and lowers memory.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        attn_temperature_tuning (`bool`, *optional*, defaults to `True`):
            Whether to dynamically scale the attention temperature for each query token based on sequence length.
            Recommended for long sequences (e.g., >32k tokens) to maintain stable output results.
        floor_scale (`int`, *optional*, defaults to 8192):
            Base scale (in tokens) for attention temperature tuning. Larger value delays scaling to longer positions.
        attn_scale (`float`, *optional*, defaults to 0.1):
            Strength of attention temperature tuning. Larger value increases scaling at long positions.

    Example:
    llama4_textpast_key_valuesg    Ar   r   packed_rowwise)layers.*.self_attn.q_projlayers.*.self_attn.k_projlayers.*.self_attn.v_projlayers.*.self_attn.o_projz-layers.*.feed_forward.shared_expert.gate_projz+layers.*.feed_forward.shared_expert.up_projz-layers.*.feed_forward.shared_expert.down_proj*layers.*.feed_forward.experts.gate_up_proj'layers.*.feed_forward.experts.down_projlayers.*.feed_forward.gate_projlayers.*.feed_forward.up_projlayers.*.feed_forward.down_projgrouped_gemm	ep_router)
rF   rG   rH   rI   rJ   rK   rL   rM   rN   zlayers.*.feed_forward.router@         @  0   (         silu   r   r   TN      Fr   r   MbP?   皙?r,   c#           %         sr  || _ || _|| _|| _| | _|"| _|!| _|| _|
| _|| _	|| _
|| _|| _|| _d| _|d u r3|}|| _|	| _|| _|| _|| _|| _|d urK|n| j	| j | _|| _|| _|| _|| _|| _|| _|g krjd } fddt| jD }$|rz|n|$| _|| _|d ur|n	tt|d ||| _ || _!|| _"|d u rdd | jD | _"t#| j"| j || _$t% j&di |# d S )NFc                    s    g | ]}t |d    dkqS )r[       )r<   ).0	layer_idxno_rope_layer_intervalr-   r4   
<listcomp>P  s    z-Llama4TextConfig.__init__.<locals>.<listcomp>r[   c                 S   s   g | ]}|rd ndqS )chunked_attentionfull_attentionr-   )ra   no_roper-   r-   r4   re   f  s    r-   )'tie_word_embeddingspad_token_idbos_token_ideos_token_idattn_temperature_tuning
attn_scalefloor_scale
vocab_sizemax_position_embeddingsr   r   intermediate_size_mlpr   r   attention_biasnum_key_value_headsr   r%   rms_norm_eps	use_cacher+   head_dimuse_qk_normnum_experts_per_toknum_local_expertsoutput_router_logitsrouter_aux_loss_coefrouter_jitter_noiserangeno_rope_layersinterleave_moe_layer_steplist
moe_layersattention_chunk_sizelayer_typesr   r,   r.   r/   )%r0   rp   r   r   rr   r   r   rt   rw   r   rq   r%   ru   rv   rj   rk   rl   ri   r+   ry   rz   r   r   rx   r{   r|   r}   r,   r   rd   r   r   rm   ro   rn   r1   default_no_rope_layersr2   rc   r4   r/     sp   &
zLlama4TextConfig.__init__)"rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r   r   TNr[   r\   Fr   r[   r   Nr[   TFr]   r   NNr^   rS   NTrS   r_   )r5   r6   r7   r8   r:   keys_to_ignore_at_inferencedefault_thetar9   base_model_ep_planr   r@   r=   r/   rA   r-   r-   r2   r4   rB      s    [rB   c                       sP   e Zd ZdZdZddddZeedZdd	iZ		
	
				d fdd	Z
  ZS )Llama4Configa  
    This is the configuration class to store the configuration of a [`Llama4Model`]. It is used to instantiate an
    Llama4 model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Llama4 109B.

    e.g. [meta-llama/Llama-4-Scout-17B-16E](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.


    Args:
        vision_config (`Llama4VisionConfig`, *optional*):
            The Llama4 Vision config.
        text_config (`Llama4TextConfig`, *optional*):
            The Llama4 Text config.
        boi_token_index (`int`, *optional*, defaults to 200080):
            The begin-of-image token index to wrap the image prompt.
        eoi_token_index (`int`, *optional*, defaults to 200081):
            The end-of-image token index to wrap the image prompt.
        image_token_index (`int`, *optional*, defaults to 200092):
            The image token index to encode the image prompt.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.

    ```python
    >>> from transformers import Llama4Model, Llama4Config

    >>> # Initializing a Llama4 7B style configuration
    >>> configuration = Llama4Config()

    >>> # Initializing a model from the Llama4 7B style configuration
    >>> model = Llama4Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```llama4image_token_indexboi_token_indexeoi_token_index)image_token_idboi_token_ideoi_token_id)text_configr   zmulti_modal_projector.linear_1colwise_repN   Fc                    s   |d u rt  | _td nt|trt di || _nt|t r$|| _|| _|| _|| _|d u r;t	 | _
td nt|trIt	di || _
nt|t	rQ|| _
|| _t jdi | d S )Nz9vision_config is None, using default llama4 vision configz5text_config is None, using default llama4 text configr-   )r   r   loggerinfo
isinstancer@   r   r   r   rB   r   ri   r.   r/   )r0   r   r   r   r   r   ri   r1   r2   r-   r4   r/     s&   




zLlama4Config.__init__)NNr   r   r   F)r5   r6   r7   r8   r:   attribute_maprB   r   sub_configsr9   r/   rA   r-   r-   r2   r4   r   o  s"    &
r   )r   rB   r   N)configuration_utilsr   r   modeling_rope_utilsr   utilsr   
get_loggerr5   r   r   rB   r   __all__r-   r-   r-   r4   <module>   s   
q fS