o
    ei                     @   s  d Z ddlZddlmZ ddlmZ ddlZddlm  m	Z
 ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2m3Z3 e%4e5Z6e$ rddl7m8Z8 G dd deZ9G dd de,Z:G dd de-Z;		d@dej<dej=dej=d ej=d!eej=d"f d#e>dB d$e>dB d%e?ej=ej=f fd&d'Z@e ZAe@eAd(< G d)d* d*ej<ZBG d+d, d,e*ZCG d-d. d.ej<ZDG d/d0 d0eZEG d1d2 d2e+ZFG d3d4 d4e3ZG				dAd5ej=e?ej= B dB d6eHdB d7eHdB d8eHd!ej=dB d%ej=eHB fd9d:ZIG d;d< d<e2ZJG d=d> d>e)ZKg d?ZLdS )BzPyTorch Doge model.    N)Callable)Union)nn   )initialization)ACT2FN)Cache)PreTrainedConfig)compile_friendly_flex_attention)GradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)RopeParameters)AttentionInterfacePreTrainedModel)Unpack)TransformersKwargsis_torch_flex_attn_availablelogging)OutputRecorder   )LlamaForSequenceClassificationLlamaMLPLlamaPreTrainedModelLlamaRMSNormLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward	repeat_kv)MixtralForCausalLMMixtralModel)	BlockMaskc                :       s  e Zd ZdZdZdgZddddddddddddZd	gd
gfddgdgfdgdgfdZ																												d;dedB dedB dedB d edB d!e	dB d"e
dB d#e	dB d$edB d%edB d&edB d'edB d(eee
ef B dB d)edB d*edB d+edB d,e	dB d-edB d.edB d/edB d0edB d1edB d2edB d3edB d4edB d5e	dB d6edB d7edB d8edB f8 fd9d:Z  ZS )<
DogeConfiga  
    This is the configuration class to store the configuration of a [`DogeModel`]. It is used to instantiate an Doge
    model according to the specified arguments, defining the model architecture like [SmallDoge/Doge-320M](https://huggingface.co/SmallDoge/Doge-320M).

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32768):
            Vocabulary size of the Doge2 model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`DogeModel`]
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for each sequence transformation and state transformation module.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention.
            If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used.
            When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by meanpooling all the original heads within that group.
            For more details checkout [this paper](https://huggingface.co/papers/2305.13245).
            If it is not specified, will default to `num_attention_heads`.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If not specified, will default to `None`.
        keep_window_size (`int`, *optional*, defaults to 2048):
            The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
        is_moe (`bool`, *optional*, defaults to `False`):
            Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize.
        num_experts (`int`, *optional*, defaults to 16384):
            Number of routed experts in the model. This is only used when `is_moe=True`.
        num_experts_per_tok (`int`, *optional*, defaults to 64):
            Number of selected experts to route per-token.
        norm_topk_prob (`bool`, *optional*, defaults to `False`):
            Whether to normalize the topk probabilities.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*):
            End of stream token id.

    ```python
    >>> from transformers import DogeConfig, DogeModel

    >>> # Initializing a Doge-320M style configuration
    >>> configuration = DogeConfig()

    >>> # Initializing a model from the Doge-320M style configuration
    >>> model = DogeModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```dogepast_key_valuescolwiserowwisecolwise_gather_outputrowwise_split_input)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.dt_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_projzlayers.*.mlp.router_gatezlayers.*.mlp.down_embedzlayers.*.mlp.up_embed	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnorm                     silu{Gz?ư>TFN    @  @   MbP?
vocab_sizehidden_sizeintermediate_sizenum_hidden_layershidden_dropout
hidden_actinitializer_rangerms_norm_eps	use_cachetie_word_embeddingsmax_position_embeddingsrope_parametersnum_attention_headsnum_key_value_headsattention_biasattention_dropoutmlp_biassliding_windowkeep_window_sizeis_moenum_expertsnum_experts_per_toknorm_topk_proboutput_router_logitsrouter_aux_loss_coefpad_token_idbos_token_ideos_token_idc                    s   || _ || _|| _|| _|| _|| _|| _|| _|	| _|| _	|| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|
| _|| _|| _|| _|| _|d u r[|| _t jdi | d S )N )r<   r=   r>   r?   r@   rA   rB   rC   rD   rF   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rE   rU   rV   rW   rG   super__init__)selfr<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   kwargs	__class__rX   c/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/doge/modular_doge.pyrZ      s>    zDogeConfig.__init__)r0   r1   r2   r3   r4   r5   r6   r7   TFr2   Nr8   NFr4   FNr2   Fr9   r:   FFr;   NNN)__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planintfloatstrboolr   dictrZ   __classcell__rX   rX   r]   r_   r"   9   s    W

	
r"   c                   @      e Zd ZdS )DogeRMSNormNr`   ra   rb   rX   rX   rX   r_   ro          ro   c                   @   rn   )DogeRotaryEmbeddingNrp   rX   rX   rX   r_   rr      rq   rr   modulequerykeyvaluer,   r!   scalingsoftcapreturnc              
      s   d }d  t |tr|}n|  d ur% d d d d d d d |jd f   fdd}	t||||	|d|dd\}
}||j}|
dd }
|
|fS )Nc                    s>   d urt |   }  d ur|  | | | |  } | S N)torchtanh)score	batch_idxhead_idxq_idxkv_idxcausal_maskrx   rX   r_   	score_mod	  s
   z)flex_attention_forward.<locals>.score_modT)r   
block_mask
enable_gqascale
return_lse   r   )
isinstancer!   shaper
   todtype	transpose
contiguous)rs   rt   ru   rv   r,   rw   rx   r\   r   r   attn_outputattention_weightsrX   r   r_   flex_attention_forward   s*   

&
r   doge_flex_attentionc                       s   e Zd ZddededB f fddZ			ddejdeejejf dejdB d	e	dB d
ej
dB deejejdB eej dB f fddZ		ddejdejdedejdB fddZ  ZS )DogeAttentionNconfig	layer_idxc                    s(  t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _|j| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tt|j| _tj|j| j |j|jd| _tj|j| j |j|jd| _t| j|jd| _t| j|jd| _d S )Nhead_dimg      ࿩biaseps)rY   rZ   r   r   getattrr=   rH   r   rI   num_key_value_groupsrw   rK   rN   r   LinearrJ   q_projk_projv_proj	Parameterr|   zerosAdt_projo_projro   rC   q_normk_normr[   r   r   r]   rX   r_   rZ   (  s4   
zDogeAttention.__init__r+   position_embeddingsr,   r$   cache_positionry   c                 K   s  |j d d }g |d| jR }| | ||dd}	| | ||dd}
| ||dd}|\}}t	|	|
||\}	}
|d ur]|||d}|
|
|| j|\}
}| |dd|j d |j d d}t| jt| dd}| j||| j|d}t|| j}t| jjt}|| |	|
|f|| jsdn| j| jd	|\}}|jg |dR   }|  |}||fS )
Nr   r   )sincosr   r   rz   )r+   	dt_statesrN   r,   r4   )r,   dropoutrw   )!r   r   r   r   viewr   r   r   r   r   updater   r   reshaper|   expr   Fsoftplusprepare_dynamic_maskrN   r   r   ALL_ATTENTION_FUNCTIONSget_interfacer   _attn_implementationr   trainingrK   rw   r   r   )r[   r+   r   r,   r$   r   r\   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   cache_kwargsr   	attn_maskattention_interfacer   attn_weightsrX   rX   r_   forwardF  sN   	 

zDogeAttention.forwardr2   r   rN   c           
   	   C   s  t |jj}|j}|dddddddf dd|jd d}|durZt|tsZ|jt jkrA|j}t 	|t j
d|j|d|}||ddddddd|jd f dk|}|jd |krt j|||jd}t j||ddd	d
j}	|d|	d}||dk|}|S )a8  
        The core idea of DMA is to calculate the dynamic attention mask to mask the tokens that should be masked, so as to form sparse attention.

        Combine `dt_states` with `attention_mask` to generate the final `attn_mask`.

        Args:
            hidden_states (`torch.Tensor`): The input hidden_states, used to determine the minimum value of the current input precision.
            dt_states (`torch.Tensor`): dt_states of shape `(batch_size, num_heads, key_sequence_length)`.
            keep_window_size (`int`): The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
            attention_mask (`torch.Tensor`, *optional*): attention mask of shape `(batch_size, 1, query_sequence_length, key_sequence_length)`.
        Nr   r   r4   )devicer   r   r   r   TF)dimlargestsortedg      ?)r|   finfor   minexpandr   r   r!   rk   wheretensorr   masked_fill
zeros_liketopkindicesscatter)
r[   r+   r   rN   r,   	min_dtyper   r   active_masktopk_indicesrX   rX   r_   r   ~  s$   2z"DogeAttention.prepare_dynamic_maskr{   )NNN)r2   N)r`   ra   rb   r"   rh   rZ   r|   Tensortupler   
LongTensorr   r   rm   rX   rX   r]   r_   r   '  s:    "
<r   c                   @   rn   )DogeMLPNrp   rX   rX   rX   r_   r     rq   r   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )	DogeCDMoEr   c                    s   t    |j| _|j| _t|j | _|j| _t	t
| j| _|j| _|j| _tj| j| j|jd| _tj| j| j|jd| _tj| j| j|jd| _tj| j| jd dd| _t| j| j| _t| j| j| _d S )Nr   r   F)rY   rZ   r=   r>   r   rA   act_fnrP   mathfloorsqrtnum_keysrQ   top_krR   r   r   rL   	gate_projup_proj	down_projrouter_gate	Embedding
down_embedup_embedr[   r   r]   rX   r_   rZ     s   
zDogeCDMoE.__init__r+   ry   c                 K   s  |j \}}}| |d|| d}|j| jdd\\}}\}	}
|d|d }|	d| j |
d }|jg |j d d dR  }|jg |j d d dR  }|j| jdd\}}|d|}tj	|dd}| j
rx||jddd }| |}| |}t|||| dd|| d}| || }t||| dd|||d}| | | || | }|| }||fS )Nr   r   r   rz   T)r   keepdimr   )r   r   r   r   r   	unsqueezer   gatherr   softmaxrR   sumr   r   r|   matmulr   r   r   r   )r[   r+   r\   bszseq_len_router_logitsscores_xscores_y	indices_x	indices_y
all_scoresall_indicesscoresposition_indicesr   routing_weightsr   r   experts_weightsexperts_statesrX   rX   r_   r     s(   

&$ zDogeCDMoE.forward)	r`   ra   rb   r"   rZ   r|   r   r   rm   rX   rX   r]   r_   r     s    r   c                       s   e Zd ZddededB f fddZ						ddejdeejejf dB d	ejdB d
ej	dB de
dB dedB dej	dB dee deejeejejf dB f fddZ  ZS )DogeDecoderLayerNr   r   c                    s   t    |j| _t|j|jd| _t||d| _t	
t|j| _t|j|jd| _|js3t|nt|| _t	
t|j| _d S )Nr   )r   r   )rY   rZ   r@   ro   r=   rC   input_layernormr   	self_attnr   r   r|   onesinput_residualpost_attention_layernormrO   r   r   mlppost_attention_residualr   r]   rX   r_   rZ     s   
zDogeDecoderLayer.__init__Fr+   r   r,   position_idsr$   rD   r   r\   ry   c              
   K   s   |}	|  |}| jd|||||||d|\}}
tj|| j| jd}| j|	 | }|}	| |}| |}tj|| j| jd}| j	|	 | }|S )N)r+   r   r,   r
  r$   rD   r   )pr   rX   )
r  r  r   r   r@   r   r  r  r  r	  )r[   r+   r   r,   r
  r$   rD   r   r\   residualself_attn_weightsrX   rX   r_   r     s*   




zDogeDecoderLayer.forwardr{   )NNNNFN)r`   ra   rb   r"   rh   rZ   r|   r   r   r   r   rk   r   r   FloatTensorr   rm   rX   rX   r]   r_   r    s6    	
r  c                   @   s8   e Zd ZdZdZeeddeedZ	e
 dd ZdS )DogePreTrainedModelFr   )index)r   r+   
attentionsc                 C   sv   t | | t|trt|drt|j dS dS t|tr7t|dr*t	|j
 t|dr9t	|j dS dS dS )zInitialize the weightsr   r  r	  N)r   _init_weightsr   r   hasattrinitzeros_r   r  ones_r  r	  )r[   rs   rX   rX   r_   r    s   




z!DogePreTrainedModel._init_weightsN)r`   ra   rb   _supports_flash_attn_can_compile_fullgraphr   r   r  r   _can_record_outputsr|   no_gradr  rX   rX   rX   r_   r    s    
r  c                   @   rn   )	DogeModelNrp   rX   rX   rX   r_   r  *  rq   r  gate_logitsrP   r   r   c                 C   sv  | du s	t | tsdS | d j}| d j}g }g }| D ]h}	|	|}	|	j|dd\\}
}\}}|
d|d }|d| |d }|jg |jdd dR  }|jg |jdd dR  }|j|dd\}}|	d|}t
j|dd}|| || qtj|dd}tj|dd}|du r|d}tj|||d}tj|||d}|d|||jd  }tj|dd}nq|j\}}t| }|ddddddf ||||fd|}|d|  }tj|||d}tj|||d}|d||t| }|ddddddf ||||fd||}tj|| ddtj|dd }t|| }|| S )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `router_gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [2, batch_size * sequence_length, num_keys].
        num_experts:
            Number of experts
        num_keys:
            Number of keys
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   r   r   rz   r   )r   r   r   r   r   r   r   r   r   r   r   r   appendr|   catr   	ones_likescatter_add_meanlenr   r   rk   r   )r  rP   r   r   r,   compute_dtypecompute_deviceall_expert_indicesall_routing_weightslayer_gate_logitsr   r   r   r   r   r   r   r   expert_indicesr   tokens_per_expertpadrouter_prob_per_expert
batch_sizesequence_lengthr?   expert_attention_mask router_per_expert_attention_maskoverall_lossrX   rX   r_   load_balancing_loss_func.  sb    





r1  c                       s   e Zd Z fddZ										ddejdB dejdB dejdB dedB d	ejdB d
ejdB de	dB dejdB de
ejB de	dB dee defddZ  ZS )DogeForCausalLMc                    s"   t  | t|| _|j| _d S r{   )rY   rZ   r  modelrP   r   r]   rX   r_   rZ     s   
zDogeForCausalLM.__init__Nr   r)   r,   r
  r$   r*   labelsrD   r   logits_to_keeprS   r\   ry   c              
   K   s   |
dur|
n| j j}
| jd|||||||d|}|j}t|	tr(t|	 dn|	}| |dd|ddf }d}|durJ| j||| j	fi |}d}|
rot
|j| jtt| j| j|}|duro|| j||j 7 }t||||j|j|j|jdS )ah  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, DogeForCausalLM

        >>> model = DogeForCausalLM.from_pretrained("SmallDoge/Doge-320M")
        >>> tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)r)   r,   r
  r$   r*   rD   r   )lossaux_losslogitsr$   r+   r  r   rX   )r   rS   r3  last_hidden_stater   rh   slicelm_headloss_functionr<   r1  r   rP   r   r   r   rQ   rT   r   r   r   r$   r+   r  )r[   r)   r,   r
  r$   r*   r4  rD   r   r5  rS   r\   outputsr+   slice_indicesr8  r6  r7  rX   rX   r_   r     sN   %zDogeForCausalLM.forward)
NNNNNNNNr   N)r`   ra   rb   rZ   r|   r   r   r   r  rk   rh   r   r   r   r   rm   rX   rX   r]   r_   r2    sJ    	
r2  c                   @   rn   )DogeForSequenceClassificationNrp   rX   rX   rX   r_   r?    rq   r?  )r"   r2  r  r  r?  )NN)NNr   N)Mrc   r   collections.abcr   typingr   r|   torch.nn.functionalr   
functionalr    r   r  activationsr   cache_utilsr   configuration_utilsr	   integrations.flex_attentionr
   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.output_capturingr   llama.modeling_llamar   r   r   r   r   r   r   r   mixtral.modeling_mixtralr   r    
get_loggerr`   logger!torch.nn.attention.flex_attentionr!   r"   ro   rr   Moduler   ri   r   r   r   r   r   r   r  r  r  rh   r1  r2  r?  __all__rX   rX   rX   r_   <module>   s   (

 5
	
.}92
jZ