o
    i                     @   s  d Z ddlZddlmZmZmZ ddlZddlm  m	Z
 ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZ ddlmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0m1Z1 e! rddl2m3Z3 G dd deZ4G dd de*Z5G dd de+Z6			d@dej7dej8dej8dej8d eej8d!f d"ee9 d#ee9 d$eej8 d%e:ej8ej8f fd&d'Z;e Z<e;e<d(< G d)d* d*ej7Z=G d+d, d,e(Z>G d-d. d.ej7Z?G d/d0 d0eZ@G d1d2 d2e)ZAG d3d4 d4e1ZB				dAd5eej8e:ej8 df d6eeC d7eeC d8eCd eej8 d%eej8eCf fd9d:ZDG d;d< d<e0ZEG d=d> d>e'ZFg d?ZGdS )BzPyTorch Doge model.    N)CallableOptionalUnion)nn   )ACT2FN)Cache)PretrainedConfig)compile_friendly_flex_attention)GradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)rope_config_validation)AttentionInterfacePreTrainedModel)Unpack)TransformersKwargsis_torch_flex_attn_available)deprecate_kwarg)OutputRecorder   )LlamaForSequenceClassificationLlamaMLPLlamaPreTrainedModelLlamaRMSNormLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward	repeat_kv)MixtralForCausalLMMixtralModel)	BlockMaskc                       s   e Zd ZdZdZdgZi dddddddd	d
d	dddddddddddddddd	ddddddZdgdgfddgdgfdgdgfdZ			 	!	"	#	$	%	&	'	 	(	)	*	)	'	"	'	)	 	'	+	,	'	'	-d0 fd.d/	Z  Z	S )1
DogeConfiga   
    This is the configuration class to store the configuration of a [`DogeModel`]. It is used to instantiate an Doge
    model according to the specified arguments, defining the model architecture like [SmallDoge/Doge-320M](https://huggingface.co/SmallDoge/Doge-320M).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32768):
            Vocabulary size of the Doge2 model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`DogeModel`]
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for each sequence transformation and state transformation module.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings.
            NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value accordingly.
            Doge family of small models use `{ 'rope_type': 'dynamic', 'factor': 4.0, 'original_max_position_embeddings': 2048 }` as the default value.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings.
                    In most scaling types, a `factor` of x will enable the model to handle sequences of length x * original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'.
                    The original max position embeddings used during pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation.
                    If unspecified, it defaults to value recommended by the implementation, using the `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<`original_max_position_embeddings`).
                    Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<`original_max_position_embeddings`).
                    Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention.
            If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used.
            When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by meanpooling all the original heads within that group.
            For more details checkout [this paper](https://huggingface.co/papers/2305.13245).
            If it is not specified, will default to `num_attention_heads`.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If not specified, will default to `None`.
        keep_window_size (`int`, *optional*, defaults to 2048):
            The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
        is_moe (`bool`, *optional*, defaults to `False`):
            Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize.
        num_experts (`int`, *optional*, defaults to 16384):
            Number of routed experts in the model. This is only used when `is_moe=True`.
        num_experts_per_tok (`int`, *optional*, defaults to 64):
            Number of selected experts to route per-token.
        norm_topk_prob (`bool`, *optional*, defaults to `False`):
            Whether to normalize the topk probabilities.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.

    ```python
    >>> from transformers import DogeConfig, DogeModel

    >>> # Initializing a Doge-320M style configuration
    >>> configuration = DogeConfig()

    >>> # Initializing a model from the Doge-320M style configuration
    >>> model = DogeModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```dogepast_key_valueszlayers.*.self_attn.q_projcolwisezlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.dt_projrowwisezlayers.*.self_attn.o_projzlayers.*.input_layernorm.weightsequence_parallelzlayers.*.input_residual.weightz(layers.*.post_attention_layernorm.weightz'layers.*.post_attention_residual.weightznorm.weightzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_projzlayers.*.mlp.router_gatecolwise_repzlayers.*.mlp.down_embedrowwise_repzlayers.*.mlp.up_embed	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnorm                     silu{Gz?ư>TF     @N    @  @   MbP?c                    s   || _ || _|| _|| _|| _|| _|| _|| _|	| _|| _	|| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _| jd ur]d| jv r]| jd | jd< t|  |d u rh|| _t jdd|
i| d S )Ntype	rope_typetie_word_embeddings )
vocab_sizehidden_sizeintermediate_sizenum_hidden_layershidden_dropout
hidden_actinitializer_rangerms_norm_eps	use_cachemax_position_embeddings
rope_thetarope_scalingnum_attention_headsnum_key_value_headsattention_biasattention_dropoutmlp_biassliding_windowkeep_window_sizeis_moenum_expertsnum_experts_per_toknorm_topk_proboutput_router_logitsrouter_aux_loss_coefr   super__init__)selfrB   rC   rD   rE   rF   rG   rH   rI   rJ   r@   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   kwargs	__class__rA   b/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/doge/modular_doge.pyr\      sF   

zDogeConfig.__init__)r1   r2   r3   r4   r5   r6   r7   r8   TFr3   r9   Nr:   NFr5   FNr3   Fr;   r<   FFr=   )
__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planr\   __classcell__rA   rA   r_   ra   r"   7   s    p	


r"   c                   @      e Zd ZdS )DogeRMSNormNrb   rc   rd   rA   rA   rA   ra   rl         rl   c                   @   rk   )DogeRotaryEmbeddingNrm   rA   rA   rA   ra   ro     rn   ro   modulequerykeyvaluer-   r!   scalingsoftcap	head_maskreturnc              
      s   d }	d  t |tr|}	n|  d ur% d d d d d d d |jd f   fdd}
t||||
|	d|dd\}}||j}|dd }||fS )Nc                    s^   d urt |   }  d ur|  | | | |  } d ur-| | | d d  } | S )Nr   )torchtanh)score	batch_idxhead_idxq_idxkv_idxcausal_maskrv   ru   rA   ra   	score_mod*  s   z)flex_attention_forward.<locals>.score_modT)r   
block_mask
enable_gqascale
return_lse   r   )
isinstancer!   shaper
   todtype	transpose
contiguous)rp   rq   rr   rs   r-   rt   ru   rv   r^   r   r   attn_outputattention_weightsrA   r   ra   flex_attention_forward  s*   
&	
r   doge_flex_attentionc                       s   e Zd Zddedee f fddZedddd				dd
ej	de
ej	ej	f deej	 dee deej de
ej	eej	 ee
ej	  f fddZ		dd
ej	dej	dedeej	 fddZ  ZS )DogeAttentionNconfig	layer_idxc                    s(  t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _|j| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tt|j| _tj|j| j |j|jd| _tj|j| j |j|jd| _t| j|jd| _t| j|jd| _d S )Nhead_dimg      ࿩biaseps)r[   r\   r   r   getattrrC   rN   r   rO   num_key_value_groupsrt   rQ   rT   r   LinearrP   q_projk_projv_proj	Parameterry   zerosAdt_projo_projrl   rI   q_normk_normr]   r   r   r_   rA   ra   r\   K  s4   
zDogeAttention.__init__past_key_valuer$   4.58new_nameversionr,   position_embeddingsr-   cache_positionrw   c                 K   s  |j d d }g |d| jR }| | ||dd}	| | ||dd}
| ||dd}|\}}t	|	|
||\}	}
|d ur]|||d}|
|
|| j|\}
}| |dd|j d |j d d}t| jt| dd}| j||| j|d}t|| j}t}| jjdkrt| jj }|| |	|
|f|| jsd	n| j| jd
|\}}|jg |dR   }| |}||fS )Nr   r   )sincosr   r   rx   )r,   	dt_statesrT   r-   eagerr5   )r-   dropoutrt   ) r   r   r   r   viewr   r   r   r   r   updater   r   reshapery   expr   Fsoftplusprepare_dynamic_maskrT   r   r   r   r   _attn_implementationALL_ATTENTION_FUNCTIONStrainingrQ   rt   r   r   )r]   r,   r   r-   r$   r   r^   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   cache_kwargsr   	attn_maskattention_interfacer   attn_weightsrA   rA   ra   forwardi  sN   
 

zDogeAttention.forwardr3   r   rT   c           
   	   C   s  t |jj}|j}|dddddddf dd|jd d}|durZt|tsZ|jt jkrA|j}t 	|t j
d|j|d|}||ddddddd|jd f dk|}|jd |krt j|||jd}t j||ddd	d
j}	|d|	d}||dk|}|S )a8  
        The core idea of DMA is to calculate the dynamic attention mask to mask the tokens that should be masked, so as to form sparse attention.

        Combine `dt_states` with `attention_mask` to generate the final `attn_mask`.

        Args:
            hidden_states (`torch.Tensor`): The input hidden_states, used to determine the minimum value of the current input precision.
            dt_states (`torch.Tensor`): dt_states of shape `(batch_size, num_heads, key_sequence_length)`.
            keep_window_size (`int`): The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
            attention_mask (`torch.Tensor`, *optional*): attention mask of shape `(batch_size, 1, query_sequence_length, key_sequence_length)`.
        Nr   r   r5   )devicer   r   r   r   TF)dimlargestsorted      ?)ry   finfor   minexpandr   r   r!   boolwheretensorr   masked_fill
zeros_liketopkindicesscatter)
r]   r,   r   rT   r-   	min_dtyper   r   active_masktopk_indicesrA   rA   ra   r     s$   2z"DogeAttention.prepare_dynamic_maskNNNN)r3   N)rb   rc   rd   r"   r   intr\   r   ry   Tensortupler   
LongTensorr   r   rj   rA   rA   r_   ra   r   J  s<    <r   c                   @   rk   )DogeMLPNrm   rA   rA   rA   ra   r     rn   r   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )	DogeCDMoEr   c                    s   t    |j| _|j| _t|j | _|j| _t	t
| j| _|j| _|j| _tj| j| j|jd| _tj| j| j|jd| _tj| j| j|jd| _tj| j| jd dd| _t| j| j| _t| j| j| _d S )Nr   r   F)r[   r\   rC   rD   r   rG   act_fnrV   mathfloorsqrtnum_keysrW   top_krX   r   r   rR   	gate_projup_proj	down_projrouter_gate	Embedding
down_embedup_embedr]   r   r_   rA   ra   r\     s   
zDogeCDMoE.__init__r,   rw   c                 K   s  |j \}}}| |d|| d}|j| jdd\\}}\}	}
|d|d }|	d| j |
d }|jg |j d d dR  }|jg |j d d dR  }|j| jdd\}}|d|}tj	|dd}| j
rx||jddd }| |}| |}t|||| dd|| d}| || }t||| dd|||d}| | | || | }|| }||fS )Nr   r   r   rx   T)r   keepdimr   )r   r   r   r   r   	unsqueezer   gatherr   softmaxrX   sumr   r   ry   matmulr   r   r   r   )r]   r,   r^   bszseq_len_router_logitsscores_xscores_y	indices_x	indices_y
all_scoresall_indicesscoresposition_indicesr   routing_weightsr   r   experts_weightsexperts_statesrA   rA   ra   r     s(   

&$ zDogeCDMoE.forward)	rb   rc   rd   r"   r\   ry   r   r   rj   rA   rA   r_   ra   r     s    r   c                       s   e Zd Zddedee f fddZedddd					
	ddej	de
ej	ej	f deej	 deej dee dee deej dee de
ejee
ejejf  f fddZ  ZS )DogeDecoderLayerNr   r   c                    s   t    |j| _t|j|jd| _t||d| _t	
t|j| _t|j|jd| _|js3t|nt|| _t	
t|j| _d S )Nr   )r   r   )r[   r\   rF   rl   rC   rI   input_layernormr   	self_attnr   r   ry   onesinput_residualpost_attention_layernormrU   r   r   mlppost_attention_residualr   r_   rA   ra   r\     s   
zDogeDecoderLayer.__init__r   r$   r   r   Fr,   r   r-   position_idsrJ   r   r^   rw   c              
   K   s   |}	|  |}| jd|||||||d|\}}
tj|| j| jd}| j|	 | }|}	| |}| |}tj|| j| jd}| j	|	 | }|S )N)r,   r   r-   r  r$   rJ   r   )pr   rA   )
r
  r  r   r   rF   r   r  r  r  r  )r]   r,   r   r-   r  r$   rJ   r   r^   residualself_attn_weightsrA   rA   ra   r     s*   




zDogeDecoderLayer.forwardr   )NNNFN)rb   rc   rd   r"   r   r   r\   r   ry   r   r   r   r   r   r   r   FloatTensorr   rj   rA   rA   r_   ra   r	    s6    	
r	  c                   @   s0   e Zd ZdZdZeeddeedZ	dd Z
dS )DogePreTrainedModelFr   )index)r   r,   
attentionsc                 C   sz   t | | t|trt|dr|jj  dS dS t|tr9t|dr+|j	j
d t|dr;|jj
d dS dS dS )zInitialize the weightsr   r  r   r  N)r   _init_weightsr   r   hasattrr   datazero_r	  r  fill_r  )r]   rp   rA   rA   ra   r  A  s   




z!DogePreTrainedModel._init_weightsN)rb   rc   rd   _supports_flash_attn_can_compile_fullgraphr   r   r	  r   _can_record_outputsr  rA   rA   rA   ra   r  8  s    
r  c                   @   rk   )	DogeModelNrm   rA   rA   rA   ra   r!  N  rn   r!  gate_logitsrV   r   r   c                 C   sv  | du s	t | tsdS | d j}| d j}g }g }| D ]h}	|	|}	|	j|dd\\}
}\}}|
d|d }|d| |d }|jg |jdd dR  }|jg |jdd dR  }|j|dd\}}|	d|}t
j|dd}|| || qtj|dd}tj|dd}|du r|d}tj|||d}tj|||d}|d|||jd  }tj|dd}nq|j\}}t| }|ddddddf ||||fd|}|d|  }tj|||d}tj|||d}|d||t| }|ddddddf ||||fd||}tj|| ddtj|dd }t|| }|| S )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `router_gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [2, batch_size * sequence_length, num_keys].
        num_experts:
            Number of experts
        num_keys:
            Number of keys
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   r   r   rx   r   )r   r   r   r   r   r   r   r   r   r   r   r   appendry   catr   	ones_likescatter_add_meanlenr   r   r   r   )r"  rV   r   r   r-   compute_dtypecompute_deviceall_expert_indicesall_routing_weightslayer_gate_logitsr   r   r   r  r  r  r   r  expert_indicesr  tokens_per_expertpadrouter_prob_per_expert
batch_sizesequence_lengthrE   expert_attention_mask router_per_expert_attention_maskoverall_lossrA   rA   ra   load_balancing_loss_funcR  sb    





r7  c                       s   e Zd Z fddZ										ddeej deej deej dee d	eej	 d
eej dee
 deej deeejf dee
 dee defddZ  ZS )DogeForCausalLMc                    s"   t  | t|| _|j| _d S r   )r[   r\   r!  modelrV   r   r_   rA   ra   r\     s   
zDogeForCausalLM.__init__Nr   r*   r-   r  r$   r+   labelsrJ   r   logits_to_keeprY   r^   rw   c              
   K   s   |
dur|
n| j j}
| jd|||||||d|}|j}t|	tr(t|	 dn|	}| |dd|ddf }d}|durJ| j||| j	fi |}d}|
rot
|j| jtt| j| j|}|duro|| j||j 7 }t||||j|j|j|jdS )ah  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, DogeForCausalLM

        >>> model = DogeForCausalLM.from_pretrained("SmallDoge/Doge-320M")
        >>> tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)r*   r-   r  r$   r+   rJ   r   )lossaux_losslogitsr$   r,   r  r   rA   )r   rY   r9  last_hidden_stater   r   slicelm_headloss_functionrB   r7  r   rV   r   r   r   rW   rZ   r   r   r   r$   r,   r  )r]   r*   r-   r  r$   r+   r:  rJ   r   r;  rY   r^   outputsr,   slice_indicesr>  r<  r=  rA   rA   ra   r     sN   %zDogeForCausalLM.forward)
NNNNNNNNr   N)rb   rc   rd   r\   r   ry   r   r   r   r  r   r   r   r   r   r   r   rj   rA   rA   r_   ra   r8    sJ    	
r8  c                   @   rk   )DogeForSequenceClassificationNrm   rA   rA   rA   ra   rE    rn   rE  )r"   r8  r!  r  rE  r   )NNr   N)Hre   r   typingr   r   r   ry   torch.nn.functionalr   
functionalr   activationsr   cache_utilsr   configuration_utilsr	   integrations.flex_attentionr
   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   utils.deprecationr   utils.genericr   llama.modeling_llamar   r   r   r   r   r   r   r   mixtral.modeling_mixtralr   r    !torch.nn.attention.flex_attentionr!   r"   rl   ro   Moduler   floatr   r   r   r   r   r   r	  r  r!  r   r7  r8  rE  __all__rA   rA   rA   ra   <module>   s   (
 W


1~93
jZ