o
    	۷i                     @   s0  d dl Z d dlmZmZmZ d dlZd dlm  mZ	 d dlmZ ddl
mZ ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZmZ ddlmZm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1 e* rd dl2m3Z3 edG dd dej4Z5G dd dej4Z6dd Z7dMddZ8dej9d e:d!ej9fd"d#Z;	$dNd%ej4d&ej9d'ej9d(ej9d)eej9 d*e<d+e<d,e%e' fd-d.Z=			dOd%ej4d&ej9d'ej9d(ej9d)eej9d/f d*ee< d0ee< d1eej9 d!e>ej9ej9f fd2d3Z?e" Z@e?e@d4< G d5d6 d6ej4ZAG d7d8 d8ej4ZBG d9d: d:ej4ZCG d;d< d<eZDe(G d=d> d>e#ZEe(G d?d@ d@eEZF			A	dPdBeej9e>ej9 df dCee: dDee: dEe:d)eej9 d!eej9e:f fdFdGZGe(G dHdI dIeEeZHG dJdK dKeeEZIg dLZJdS )Q    N)CallableOptionalUnion)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)compile_friendly_flex_attention)create_causal_mask!create_sliding_window_causal_mask) GenericForSequenceClassificationGradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)AttentionInterfacePreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torch_flex_attn_available)deprecate_kwarg)OutputRecordercheck_model_inputs   )
DogeConfig)	BlockMaskRMSNormc                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	DogeRMSNormư>c                    s&   t    tt|| _|| _dS )z:
        DogeRMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__ \/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/doge/modeling_doge.pyr&   6   s   

zDogeRMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )N   T)keepdim)	dtypetor(   float32powmeanrsqrtr+   r*   )r,   hidden_statesinput_dtypevariancer1   r1   r2   forward>   s
   zDogeRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler*   shaper+   )r,   r1   r1   r2   
extra_reprE   s   zDogeRMSNorm.extra_repr)r$   )__name__
__module____qualname__r&   r?   rB   __classcell__r1   r1   r/   r2   r#   4   s    r#   c                       sD   e Zd ZU ejed< ddef fddZe e	dd Z
  ZS )	DogeRotaryEmbeddinginv_freqNconfigc                    s   t    t|drt|jtr|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultrH   F)
persistent)r%   r&   hasattr
isinstancerJ   dictgetrK   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrI   r   rope_init_fnattention_scalingregister_bufferrH   original_inv_freq)r,   rI   devicerH   r/   r1   r2   r&   L   s   
zDogeRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	j
|dd+ | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 smw   Y  |j|jd
|	j|jd
fS )Nr   r4   r   mpscpuF)device_typeenabledr3   dim)r6   )rH   floatexpandrA   r7   rZ   rP   rL   strr(   autocast	transposecatcosrW   sinr6   )
r,   xposition_idsinv_freq_expandedposition_ids_expandedr]   freqsembrg   rh   r1   r1   r2   r?   ]   s   0&zDogeRotaryEmbedding.forwardN)rC   rD   rE   r(   Tensor__annotations__r    r&   no_gradr   r?   rF   r1   r1   r/   r2   rG   I   s   
 
rG   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr4   r3   r_   )rA   r(   rf   )ri   x1x2r1   r1   r2   rotate_halfm   s   ru   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezeru   )qkrg   rh   rj   unsqueeze_dimq_embedk_embedr1   r1   r2   apply_rotary_pos_embt   s
   

r|   r<   n_repreturnc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rA   rb   reshape)r<   r}   batchnum_key_value_headsslenhead_dimr1   r1   r2   	repeat_kv   s
   0r           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur3|d d d d d d d |jd f }|
| }
tjj|
dtj	d
|j}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr3   r   r4   )r`   r6   ptrainingr   )r   num_key_value_groupsr(   matmulre   rA   r   
functionalsoftmaxr8   r7   r6   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputr1   r1   r2   eager_attention_forward   s   
&r   r!   softcap	head_maskc              
      s   d }	d  t |tr|}	n|  d ur% d d d d d d d |jd f   fdd}
t||||
|	d|dd\}}||j}|dd }||fS )Nr   c                    s^   d urt |   }  d ur|  | | | |  } d ur-| | | d d  } | S )Nr   )r(   tanh)score	batch_idxhead_idxq_idxkv_idxr   r   r   r1   r2   	score_mod   s   z)flex_attention_forward.<locals>.score_modT)r   
block_mask
enable_gqascale
return_lser   r3   )rP   r!   rA   r   r7   r6   re   r   )r   r   r   r   r   r   r   r   r   r   r   r   attention_weightsr1   r   r2   flex_attention_forward   s*   
&	
r   doge_flex_attentionc                       s   e Zd Zddedee f fddZedddd				dd
ej	de
ej	ej	f deej	 dee deej de
ej	eej	 ee
ej	  f fddZ		dd
ej	dej	dedeej	 fddZ  ZS )DogeAttentionNrI   	layer_idxc                    s(  t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _|j| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tt|j| _tj|j| j |j|jd| _tj|j| j |j|jd| _t| j|jd| _t| j|jd| _d S )Nr   g      ࿩biasr.   )r%   r&   rI   r   getattrr-   num_attention_headsr   r   r   r   attention_dropoutkeep_window_sizer   Linearattention_biasq_projk_projv_projr'   r(   zerosAdt_projo_projr#   rms_norm_epsq_normk_normr,   rI   r   r/   r1   r2   r&      s4   
zDogeAttention.__init__past_key_valuepast_key_values4.58new_nameversionr<   position_embeddingsr   cache_positionr~   c                 K   s  |j d d }g |d| jR }| | ||dd}	| | ||dd}
| ||dd}|\}}t	|	|
||\}	}
|d ur]|||d}|
|
|| j|\}
}| |dd|j d |j d d}t| jt| dd}| j||| j|d}t|| j}t}| jjdkrt| jj }|| |	|
|f|| jsd	n| j| jd
|\}}|jg |dR   }| |}||fS )Nr4   r   r3   )rh   rg   r   r   r   )r<   	dt_statesr   r   eagerr   )r   r   r   ) rA   r   r   r   viewre   r   r   r   r|   updater   r   r   r(   expr   Fsoftplusprepare_dynamic_maskr   r   r   r   rI   _attn_implementationALL_ATTENTION_FUNCTIONSr   r   r   r   r   )r,   r<   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   rg   rh   cache_kwargsr   	attn_maskattention_interfacer   r   r1   r1   r2   r?   	  sN   
 

zDogeAttention.forward   r   r   c           
   	   C   s  t |jj}|j}|dddddddf dd|jd d}|durZt|tsZ|jt jkrA|j}t 	|t j
d|j|d|}||ddddddd|jd f dk|}|jd |krt j|||jd}t j||ddd	d
j}	|d|	d}||dk|}|S )a8  
        The core idea of DMA is to calculate the dynamic attention mask to mask the tokens that should be masked, so as to form sparse attention.

        Combine `dt_states` with `attention_mask` to generate the final `attn_mask`.

        Args:
            hidden_states (`torch.Tensor`): The input hidden_states, used to determine the minimum value of the current input precision.
            dt_states (`torch.Tensor`): dt_states of shape `(batch_size, num_heads, key_sequence_length)`.
            keep_window_size (`int`): The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
            attention_mask (`torch.Tensor`, *optional*): attention mask of shape `(batch_size, 1, query_sequence_length, key_sequence_length)`.
        Nr4   r   r   )rZ   r6   r   r6   rZ   TF)r`   largestsorted      ?)r(   finfor6   minrb   rA   rP   r!   boolwheretensorrZ   masked_fill
zeros_liketopkindicesscatter)
r,   r<   r   r   r   	min_dtyper6   r   active_masktopk_indicesr1   r1   r2   r   B  s$   2z"DogeAttention.prepare_dynamic_maskro   NNN)r   N)rC   rD   rE   r    r   intr&   r   r(   rp   r@   r   
LongTensorr?   r   rF   r1   r1   r/   r2   r      s<    <r   c                       s$   e Zd Z fddZdd Z  ZS )DogeMLPc                    sx   t    || _|j| _|j| _tj| j| j|jd| _tj| j| j|jd| _	tj| j| j|jd| _
t|j | _d S )Nr   )r%   r&   rI   r-   intermediate_sizer   r   mlp_bias	gate_projup_proj	down_projr   
hidden_actact_fnr,   rI   r/   r1   r2   r&   i  s   
zDogeMLP.__init__c                 C   s$   |  | | || | }|S ro   )r   r   r   r   )r,   ri   r   r1   r1   r2   r?   s  s    zDogeMLP.forward)rC   rD   rE   r&   r?   rF   r1   r1   r/   r2   r   h  s    
r   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )	DogeCDMoErI   c                    s   t    |j| _|j| _t|j | _|j| _t	t
| j| _|j| _|j| _tj| j| j|jd| _tj| j| j|jd| _tj| j| j|jd| _tj| j| jd dd| _t| j| j| _t| j| j| _d S )Nr   r3   F)r%   r&   r-   r   r   r   r   num_expertsmathfloorsqrtnum_keysnum_experts_per_toktop_knorm_topk_probr   r   r   r   r   r   router_gate	Embedding
down_embedup_embedr   r/   r1   r2   r&   y  s   
zDogeCDMoE.__init__r<   r~   c                 K   s  |j \}}}| |d|| d}|j| jdd\\}}\}	}
|d|d }|	d| j |
d }|jg |j d d dR  }|jg |j d d dR  }|j| jdd\}}|d|}tj	|dd}| j
rx||jddd }| |}| |}t|||| dd|| d}| || }t||| dd|||d}| | | || | }|| }||fS )Nr3   r4   r_   r   T)r`   r5   r   )rA   r  r   r   r   rv   r  gatherr   r   r  sumr  r  r(   r   r   r   r   r   )r,   r<   r   bszseq_len_router_logitsscores_xscores_y	indices_x	indices_y
all_scoresall_indicesscoresposition_indicesr   routing_weightsr  r  experts_weightsexperts_statesr1   r1   r2   r?     s(   

&$ zDogeCDMoE.forward)	rC   rD   rE   r    r&   r(   rp   r?   rF   r1   r1   r/   r2   r   x  s    r   c                       s   e Zd Zddedee f fddZedddd					
	ddej	de
ej	ej	f deej	 deej dee dee deej dee de
ejee
ejejf  f fddZ  ZS )DogeDecoderLayerNrI   r   c                    s   t    |j| _t|j|jd| _t||d| _t	
t|j| _t|j|jd| _|js3t|nt|| _t	
t|j| _d S )Nr   )rI   r   )r%   r&   hidden_dropoutr#   r-   r   input_layernormr   	self_attnr   r'   r(   r)   input_residualpost_attention_layernormis_moer   r   mlppost_attention_residualr   r/   r1   r2   r&     s   
zDogeDecoderLayer.__init__r   r   r   r   Fr<   r   r   rj   	use_cacher   r   r~   c              
   K   s   |}	|  |}| jd|||||||d|\}}
tj|| j| jd}| j|	 | }|}	| |}| |}tj|| j| jd}| j	|	 | }|S )N)r<   r   r   rj   r   r!  r   r   r1   )
r  r  r   r   r  r   r  r  r  r   )r,   r<   r   r   rj   r   r!  r   r   residualself_attn_weightsr1   r1   r2   r?     s*   




zDogeDecoderLayer.forwardro   )NNNFN)rC   rD   rE   r    r   r   r&   r   r(   rp   r@   r   r   r   r   r   FloatTensorr?   rF   r1   r1   r/   r2   r    s6    	
r  c                       sb   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZdZeeddeed	Z fd
dZ  ZS )DogePreTrainedModelrI   modelTr  r   Fr   )index)r  r<   
attentionsc                    sz   t  | t|trt|dr|jj  dS dS t|tr9t|dr+|j	j
d t|dr;|jj
d dS dS dS )zInitialize the weightsr   r  r   r   N)r%   _init_weightsrP   r   rO   r   datazero_r  r  fill_r   )r,   r   r/   r1   r2   r)    s   




z!DogePreTrainedModel._init_weights)rC   rD   rE   r    rq   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   r   r  r   _can_record_outputsr)  rF   r1   r1   r/   r2   r%    s    
 
r%  c                       s   e Zd Zdef fddZe e							ddeej	 deej
 deej	 dee d	eej d
ee deej	 dee defddZ  ZS )	DogeModelrI   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _t d| _d| _|   d S )Nc                    s   g | ]}t  |qS r1   )r  ).0r   rI   r1   r2   
<listcomp>  s    z&DogeModel.__init__.<locals>.<listcomp>r   r9  F)r%   r&   pad_token_idpadding_idx
vocab_sizer   r  r-   embed_tokens
ModuleListrangenum_hidden_layerslayersr#   r   normrG   
rotary_embgradient_checkpointing	post_initr   r/   r9  r2   r&     s   zDogeModel.__init__N	input_idsr   rj   r   inputs_embedsr!  r   r   r~   c              
   K   s  |d u |d uA rt d|r|d u rt| jd}|d u r!| |}|d u r=|d ur-| nd}	tj|	|	|jd  |jd}|d u rF|	d}| jj
d u rNtnt}
|
| j|||||d}|}| ||}| jd | jj D ]}||f||||||d|}ql| |}t||dS )	Nz:You must specify exactly one of input_ids or inputs_embedsr9  r   r   )rZ   )rI   input_embedsr   r   r   rj   )r   r   rj   r   r!  r   )last_hidden_stater   )
ValueErrorr	   rI   r>  get_seq_lengthr(   arangerA   rZ   rv   sliding_windowr   r   rD  rB  rA  rC  r   )r,   rG  r   rj   r   rH  r!  r   r   past_seen_tokensmask_functionr   r<   r   decoder_layerr1   r1   r2   r?     sT   

	
zDogeModel.forward)NNNNNNN)rC   rD   rE   r    r&   r   r   r   r(   r   rp   r   r$  r   r   r   r   r?   rF   r1   r1   r/   r2   r7    s<    	
r7  r3   gate_logitsr   r   r  c                 C   sv  | du s	t | tsdS | d j}| d j}g }g }| D ]h}	|	|}	|	j|dd\\}
}\}}|
d|d }|d| |d }|jg |jdd dR  }|jg |jdd dR  }|j|dd\}}|	d|}t
j|dd}|| || qtj|dd}tj|dd}|du r|d}tj|||d}tj|||d}|d|||jd  }tj|dd}nq|j\}}t| }|ddddddf ||||fd|}|d|  }tj|||d}tj|||d}|d||t| }|ddddddf ||||fd||}tj|| ddtj|dd }t|| }|| S )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `router_gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [2, batch_size * sequence_length, num_keys].
        num_experts:
            Number of experts
        num_keys:
            Number of keys
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   r4   r_   r   r   )rP   r@   r6   rZ   r7   r   rv   r   rA   r  r   r   appendr(   rf   r   	ones_likescatter_add_r:   lenrb   r   r   r  )rR  r   r   r  r   compute_dtypecompute_deviceall_expert_indicesall_routing_weightslayer_gate_logitsr  r  r  r  r  r  r  r  expert_indicesr  tokens_per_expertpadrouter_prob_per_expert
batch_sizesequence_lengthrA  expert_attention_mask router_per_expert_attention_maskoverall_lossr1   r1   r2   load_balancing_loss_funcV  sb    





re  c                       s   e Zd ZdgZddiZddgdgfiZ fddZee											dd
e	e
j de	e
j de	e
j de	e de	e
j de	e
j de	e de	e
j deee
jf de	e dee defddZ  ZS )DogeForCausalLMzlm_head.weightlm_headcolwise_repr<   logitsc                    sX   t  | t|| _|j| _tj|j|jdd| _|j	| _	|j
| _
|j| _|   d S )NFr   )r%   r&   r7  r&  r=  r   r   r-   rg  router_aux_loss_coefr   r   rF  r   r/   r1   r2   r&     s   
zDogeForCausalLM.__init__Nr   rG  r   rj   r   rH  labelsr!  r   logits_to_keepoutput_router_logitsr   r~   c              
   K   s   |
dur|
n| j j}
| jd|||||||d|}|j}t|	tr(t|	 dn|	}| |dd|ddf }d}|durJ| j||| j	fi |}d}|
rot
|j| jtt| j| j|}|duro|| j||j 7 }t||||j|j|j|jdS )ah  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, DogeForCausalLM

        >>> model = DogeForCausalLM.from_pretrained("SmallDoge/Doge-320M")
        >>> tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)rG  r   rj   r   rH  r!  r   )lossaux_lossri  r   r<   r(  r  r1   )rI   rm  r&  rJ  rP   r   slicerg  loss_functionr=  re  r  r   r   r   r   r   rj  r7   rZ   r   r   r<   r(  )r,   rG  r   rj   r   rH  rk  r!  r   rl  rm  r   outputsr<   slice_indicesri  rn  ro  r1   r1   r2   r?     sN   'zDogeForCausalLM.forward)
NNNNNNNNr   N)rC   rD   rE   _tied_weights_keys_tp_plan_pp_planr&   r   r   r   r(   r   rp   r   r$  r   r   r   r   r   r   r?   rF   r1   r1   r/   r2   rf    sT    	
rf  c                   @   s   e Zd ZdS )DogeForSequenceClassificationN)rC   rD   rE   r1   r1   r1   r2   rw  (  s    rw  )rf  r7  r%  rw  )Nr   )r   r   )NNr3   N)Kr   typingr   r   r   r(   torch.nn.functionalr   r   r   activationsr   cache_utilsr   r	   
generationr
   integrationsr   integrations.flex_attentionr   masking_utilsr   r   modeling_layersr   r   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr   r   configuration_doger    !torch.nn.attention.flex_attentionr!   Moduler#   rG   ru   r|   rp   r   r   ra   r   r@   r   r   r   r   r   r  r%  r7  re  rf  rw  __all__r1   r1   r1   r2   <module>   s   $

 

1~93T
jg