o
    i                     @   sx  d dl Z d dlmZ d dlmZ ddlmZ ddlmZm	Z	 e	
eZe r*d dlZdd Z			d1d	ee d
ed dee dedef fddZ			d1d	ee d
ed dee dedef fddZ			d1d	ee d
ed dee dedef fddZ	d2d	ed
ddee dedef fddZ	d2d	ed
ddee dedef fddZ	d2d	ed
ddee dedef fddZeeeeeedZ		d3dedededee d ee f
d!d"Zd2d	ed ee fd#d$Zd2d	ed ee fd%d&Zd2d	ed ee fd'd(Zd2d	ed ee fd)d*Zd2d	ed ee fd+d,Z d2d	ed ee fd-d.Z!eeeee e!dZ"d2d	ed ee fd/d0Z#dS )4    Nwraps)Optional   )PretrainedConfig)is_torch_availableloggingc                    s,   dd dd  t  fdd}|S )ad  
    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).

    Args:
        rope_forward (Callable):
            The forward pass of the RoPE implementation.

    Returns:
        The decorated forward pass.
    c                 S   s   t |d }t| jdr| jj}n| jj}||kr8t| ds-| j| j||d d\| _}| jd| jdd dS | j	
|| _	| jd| j	dd dS )	zbLongrope uses long factor if sequence is larger than original pretraining length, short otherwise.r    original_max_position_embeddingslong_inv_freqseq_leninv_freqF
persistentN)torchmaxhasattrconfigr	   max_position_embeddingsrope_init_fnr
   register_bufferoriginal_inv_freqto)selfposition_idsdevicer   r	   _ r   Y/home/ubuntu/LTX-2/.venv/lib/python3.10/site-packages/transformers/modeling_rope_utils.pylongrope_frequency_update+   s   

z6dynamic_rope_update.<locals>.longrope_frequency_updatec                 S   s   t |d }|| jkr#| j| j||d\}| _| jd|dd || _|| jk rD| j| jkrF| j	|| _| jd| jdd | j| _dS dS dS )a  
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        r   r   r   Fr   N)
r   r   max_seq_len_cachedr   r   attention_scalingr   original_max_seq_lenr   r   )r   r   r   r   r   r   r   r   dynamic_frequency_update>   s   
z5dynamic_rope_update.<locals>.dynamic_frequency_updatec                    sB   d| j v r | ||jd n| j dkr| ||jd | ||S )Ndynamic)r   longrope)	rope_typer   )r   xr   r#   r   rope_forwardr   r   wrapperQ   s
   

z$dynamic_rope_update.<locals>.wrapperr   )r)   r*   r   r(   r   dynamic_rope_update   s
   r+   r   r   ztorch.devicer   returnztorch.Tensorc           	      C   sn   | j }t| dd}t| ddp| j| j }t|| }d}d|tjd|dtjdj|tj	d|   }||fS )	a  
    Computes the inverse frequencies according to the original RoPE implementation
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    partial_rotary_factor      ?head_dimNr      dtyper   r2   )

rope_thetagetattrhidden_sizenum_attention_headsintr   arangeint64r   float)	r   r   r   baser-   r/   dimattention_factorr   r   r   r    _compute_default_rope_parameters\   s   ,r?   c                 C   s*   | j d }t| ||\}}|| }||fS )a  
    Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    factor)rope_scalingr?   )r   r   r   r@   r   r>   r   r   r   '_compute_linear_scaling_rope_parameters   s   
rB   c                 C   s   | j }t| dd}t| d| j| j }t|| }| j}| jd }d}	|du r*|}nt|tj	r?t
|tj||j|jd}nt||}||| | |d  ||d    }d|tjd	|dtjd
j|tjd|   }
|
|	fS )a	  
    Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The default sequence length used to update the dynamic RoPE at
                inference time
            *   rope_scaling (`dict[str, float]`): The standard RoPE scaling parameters, from which `factor`
                will be accessed. The value of `factor` is used to determine the new base frequency, along with the
                current sequence length (seq_len), the maximum positional embeddings (max_position_embeddings), and the
                computed dimensionality (dim) of the rotary embeddings. If seq_len <= max_position_embeddings, this
                factor has no effect. If seq_len <= max_position_embeddings, this factor effectively stretches the
                context window using an exponent derived from `dim`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length, used to update the dynamic RoPE at inference time. If `None` or shorter than
            max_position_embeddings, this value will be overridden by max_position_embeddings.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    r-   r.   r/   r@   Nr2   r   r   r0   r   r1   r3   )r4   r5   r6   r7   r8   r   rA   
isinstancer   Tensormaximumtensorr2   r   r   r9   r:   r   r;   )r   r   r   r<   r-   r/   r=   r   r@   r>   r   r   r   r   _compute_dynamic_ntk_parameters   s$   *

$,rH   c                    s  | j }t| dd}t| d| j| j }t|| }| jd }| jd}| jd}	| jd}
| jdp8| j}dd
d}|du rW|	rS|
rSt|||	|||
 }n||}| jdp^d}| jdpfd	}dd   fdd}dd }|t	
d|dj|t	jd|  }d| }d||  }| jdd}|||||||\}}d	||||d j|t	jd }|d	|  ||  }||fS )ak  
    Computes the inverse frequencies with NTK scaling. Please refer to the
    [original paper](https://huggingface.co/papers/2309.00071)

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
            *   rope_scaling (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
                keys will be accessed:
                *   `attention_factor` (`float`, *optional*): The scaling factor to be applied to the computed cos/sin.
                    If None, the value is inferred from `factor`, `mscale`, and `mscale_all_dim` as avaialble.
                *   `beta_fast` (`float`, *optional*, defaults to 32): Parameter to set the boundary for extrapolation
                    (only) in the linear ramp function.
                *   `beta_slow` (`float`, *optional*, defaults to 1): Parameter to set the boundary for interpolation
                    (only) in the linear ramp function.
                *   `factor` (`float`, *optional*): The scaling factor applied when interpolating the position IDs to
                    extend the possible context length. Additionally, if `attention_factor` is None, the log of this
                    value is used to compute a value for `attention_factor`, possibly in conjunciton with `mscale` and
                    `mscale_all_dim`, if provided.
                *   `mscale` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                    `mscale_all_dim` are provided, `mscale` acts scalar augmenting `log(factor)` when computing the
                    numerator for the inferred value of `attention_factor`. If not provided, `attention_factor` will be
                    calculated based on `factor` only.
                *   `mscale_all_dim` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                    `mscale_all_dim` are provided, `mscale_all_dim` acts scalar augmenting `log(factor)` when computing
                    the denominator for the inferred value of `attention_factor`. If not provided, `attention_factor`
                    will be calculated based on `factor` only.
                *   `original_max_position_embeddings` (`int`, *optional*): The original max position embeddings used
                    during pretraining. If not provided, the function falls back to `max_position_embeddings`.
                *   `truncate` (`bool`, *optional*): Whether to truncate the correction range.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
                will be returned for the first fraction of the head_dim.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    r-   r.   r/   r@   r>   mscalemscale_all_dimr	   r   c                 S   s"   | dkrdS d| t |  d S )Nr   r.   g?)mathlog)scalerI   r   r   r   
get_mscale:  s   z,_compute_yarn_parameters.<locals>.get_mscaleN	beta_fast    	beta_slowc                 S   s*   |t || d t j   dt |  S )zPInverse dimension formula to find the dimension based on the number of rotationsr0   )rK   rL   pi)num_rotationsr=   r<   r   r   r   r   find_correction_dimL  s   *z5_compute_yarn_parameters.<locals>.find_correction_dimc                    sL    | |||} ||||}|rt |}t |}t|dt||d fS )z.Find dimension range bounds based on rotationsr   r   )rK   floorceilr   min)low_rothigh_rotr=   r<   r   truncatelowhighrT   r   r   find_correction_rangeP  s   

z7_compute_yarn_parameters.<locals>.find_correction_rangec                 S   s>   | |kr|d7 }t j|t jd|  ||   }t |dd}|S )NgMbP?r1   r   r   )r   r9   float32clamp)rW   r   r=   linear_func	ramp_funcr   r   r   linear_ramp_factorY  s
   z4_compute_yarn_parameters.<locals>.linear_ramp_factorr   r0   r3   rZ   T)r   )r4   r5   r6   r7   r8   rA   getr   r;   r   r9   r   )r   r   r   r<   r-   r/   r=   r@   r>   rI   rJ   r	   rN   rO   rQ   r^   rc   	pos_freqsinv_freq_extrapolationinv_freq_interpolationrZ   r[   r\   inv_freq_extrapolation_factorr   r   r]   r   _compute_yarn_parameters   s>   8

	"
 
ri   c                 C   s  | j }t| dd}t| d| j| j }t|| }| jd }| jd }| jd}	| jd}
t| dd	 }r=| j| }	n| j}|
d	u rZ|	dkrKd}
nt	d
t
|	t
|  }
|rj||krjtj|tj|d}n	tj|tj|d}tjd|dtj|d | }d|||   }||
fS )a  
    Computes the inverse frequencies with LongRoPE scaling. Please refer to the
    [original implementation](https://github.com/microsoft/LongRoPE)

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
            *   original_max_position_embeddings (`int`, *optional*): The original max position embeddings used during
                pretraining. If not provided, defaults to `max_position_embeddings`.
            *   rope_scaling (`dict[str, float]`): The standard RoPE scaling parameters, from which the following keys
                will be accessed:
                *   `attention_factor` (`float`, *optional*): The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, inferred from
                    the value of `factor`.
                *   `factor` (`float`, *optional*): The scaling factor to apply to the RoPE embeddings. If both
                    `max_position_embeddings` and `original_max_position_embeddings` are provided, this value will be
                    overridden s the ratio between those values.
                *   `long_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                    frequencies if `seq_len` is provided and greater than `original_max_position_embeddings`.
                *   `short_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                    frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
                will be returned for the first fraction of the head_dim.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    r-   r.   r/   long_factorshort_factorr@   r>   r	   Nr   rC   r   r0   )r4   r5   r6   r7   r8   rA   rd   r   rK   sqrtrL   r   rG   r_   r9   r:   r;   )r   r   r   r<   r-   r/   r=   rj   rk   r@   r>   r	   ext_factorsinv_freq_shaper   r   r   r   _compute_longrope_parameterss  s*   /

ro   c                 C   s   t | ||\}}| jd }| jd }| jd }| jd }|| }	|| }
dtj | }t||	k|| |}|| | ||  }d| | | ||  }||
k  ||	k  }t|||}||fS )ap
  
    Computes the inverse frequencies for llama 3.1.

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   rope_scaling (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
                keys will be accessed:
                *   `factor` (`float`, *optional*): The scaling factor applied to the inverse frequencies when 1) the
                    wavelength is greater than `low_freq_wavelen` prior to smoothing, and 2) to all inverse frequencies
                    during smoothing.
                *   `high_freq_factor` (`float`): The scale factor used to compute `high_freq_wavelen` and
                    the value for the denominator of the smoothing factor prior to the `low_freq_factor` shift.
                *   `low_freq_factor` (`float`): The scale factor used to compute `low_freq_wavelen` and
                    the shift applied to the numerator and denominator of the smoothing factor.
                    frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.
                *   `original_max_position_embeddings` (`int`): The original max position embeddings used
                    during pretraining. If not provided, the function falls back to `max_position_embeddings`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    r@   low_freq_factorhigh_freq_factorr	   r0   r   )r?   rA   rK   rR   r   where)r   r   r   r   r>   r@   rp   rq   old_context_lenlow_freq_wavelenhigh_freq_wavelenwaveleninv_freq_llamasmooth_factorsmoothed_inv_freqis_medium_freqr   r   r   _compute_llama3_parameters  s   *



r{   )defaultlinearr$   yarnr%   llama3r&   received_keysrequired_keysoptional_keysignore_keysc                 C   s   d|v r|dh8 }| d |dur||8 }|| }|r&td|  d| |dur1|| | }n|| }|rDtd|  d|  dS dS )zYCompare the received keys in `config.rope_scaling` against the expected and optional keystyper&   Nz9Missing required keys in `rope_scaling` for 'rope_type'='z': z5Unrecognized keys in `rope_scaling` for 'rope_type'=')addKeyErrorloggerwarning)r&   r   r   r   r   missing_keysunused_keysr   r   r   _check_received_keys  s   	

r   c                 C   s@   | j }|d|dd }dh}t| }t||||d d S )Nr&   r   r   )rA   rd   setkeysr   )r   r   rA   r&   r   r   r   r   r   !_validate_default_rope_parameters0  s
   r   c                 C   sx   | j }|d|dd }ddh}t| }t||||d |d }|d u s0t|tr0|dk r:td|  d S d S )Nr&   r   r@   r   r.   8`rope_scaling`'s factor field must be a float >= 1, got 	rA   rd   r   r   r   rD   r;   r   r   )r   r   rA   r&   r   r   r@   r   r   r   (_validate_linear_scaling_rope_parameters8  s   r   c                 C   s   | j }|d|dd }ddh}dh}t| }t|||||d |d }|d u s4t|tr4|dk r>td|  d S d S )Nr&   r   r@   r	   r   r.   r   r   )r   r   rA   r&   r   r   r   r@   r   r   r   )_validate_dynamic_scaling_rope_parametersD  s   r   c              	   C   s  | j }|d|dd }ddh}h d}t| }t|||||d |d }|d u s5t|tr5|dk r=td|  |d}|d urWt|trO|d	k rWtd
|  |d}	|	d urmt|	tsmtd|	  |d}
|
d urt|
tstd|
  |	pd|
pdk rtd|	 d|
 d | j d}|d ur| j	| }||krt
d| d| d| d d S d S t
d d S )Nr&   r   r@   >   rI   rZ   rO   rQ   rJ   r>   r	   r   r.   r   r>   r   L`rope_scaling`'s attention_factor field must be a float greater than 0, got rO   z6`rope_scaling`'s beta_fast field must be a float, got rQ   z6`rope_scaling`'s beta_slow field must be a float, got rP   r   zO`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast=z( (defaults to 32 if None) and beta_slow=z (defaults to 1 if None)r	   zHThe explicitly set RoPE scaling factor (config.rope_scaling['factor'] = z) does not match the ratio implicitly set by other parameters (implicit factor = post-yarn context length / pre-yarn context length = config.max_position_embeddings / config.rope_scaling['original_max_position_embeddings'] = z). Using the explicit factor (z) in YaRN. This may cause unexpected behaviour in model usage, please correct the 'max_position_embeddings' fields in the model config.a~  config.rope_scaling['original_max_position_embeddings'], the pre-yarn context length, is unset. We will **assume** config.max_position_embeddings holds the pre-yarn context length. Some use cases may expect config.max_position_embeddings to hold the post-yarn context length (pre-yarn context length * factor) -- we recommend updating both fields for optimal downstream model usage.)rA   rd   r   r   r   rD   r;   r   r   r   warning_once)r   r   rA   r&   r   r   r   r@   r>   rO   rQ   r	   implicit_factorr   r   r   _validate_yarn_parametersR  sR   	



r   c                 C   s  | j }|d|dd }h d}h d}t| }t|||||d t| dd}t| d| j| j }t|| }	|d	}
t	|
t
sUtd
d |
D rUtd|
  t|
|	d krltd|	d  dt|
  |d}t	|t
stdd |D rtd|  t||	d krtd|	d  dt|  t| drtd d S |d}|d u rtd nt	|tr|dk rtd|  |d}|d urt	|tr|dk rtd|  d S d S d S )Nr&   r   >   r&   rj   rk   >   r@   r>   r	   r   r-   r.   r/   rk   c                 s       | ]
}t |ttfV  qd S NrD   r8   r;   .0r'   r   r   r   	<genexpr>      z0_validate_longrope_parameters.<locals>.<genexpr>zC`rope_scaling`'s short_factor field must be a list of numbers, got r0   z5`rope_scaling`'s short_factor field must have length z, got rj   c                 s   r   r   r   r   r   r   r   r     r   zB`rope_scaling`'s long_factor field must be a list of numbers, got z4`rope_scaling`'s long_factor field must have length r	   aY  This model has set a `original_max_position_embeddings` field, to be used together with `max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling`with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, as it is compatible with most model architectures.r@   z1Missing required keys in `rope_scaling`: 'factor'r   r>   g        r   )rA   rd   r   r   r   r5   r6   r7   r8   rD   listallr   r   lenr   r   r;   )r   r   rA   r&   r   r   r   r-   r/   r=   rk   rj   r@   r>   r   r   r   _validate_longrope_parameters  sH   




r   c           
      C   s6  | j }|d|dd }h d}t| }t||||d |d }|d u s0t|tr0|dk r8td|  |d }|d	 }|d u sIt|tsQtd
|  |d u sZt|tsbtd|  ||krqtd| d|  |d }	|	d u s~t|	t	std|	  |	| j
krtd|	 d| j
  d S d S )Nr&   r   >   r@   r&   rp   rq   r	   r   r@   r.   r   rp   rq   z<`rope_scaling`'s low_freq_factor field must be a float, got z=`rope_scaling`'s high_freq_factor field must be a float, got zc`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=z and low_freq_factor=r	   zP`rope_scaling`'s original_max_position_embeddings field must be an integer, got zg`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got z and max_position_embeddings=)rA   rd   r   r   r   rD   r;   r   r   r8   r   )
r   r   rA   r&   r   r   r@   rp   rq   r	   r   r   r   _validate_llama3_parameters  sL   
r   c                 C   sd   t | dd}|du rdS |d|dd}t|}|dur'|| |d dS td| d dS )	zO
    Validate the RoPE config arguments, given a `PretrainedConfig` object
    rA   Nr&   r   r|   r   zTMissing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='')r5   rd   ROPE_VALIDATION_FUNCTIONSr   r   )r   r   rA   r&   validation_fnr   r   r   rope_config_validation  s   

r   )NNNr   )NN)$rK   	functoolsr   typingr   configuration_utilsr   utilsr   r   
get_logger__name__r   r   r+   r8   tupler;   r?   rB   rH   ri   ro   r{   ROPE_INIT_FUNCTIONSstrr   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s   
?

,

,

E

~

S

E
B2&
