from typing import Callable, Optional, Union

import torch
import torch.nn.functional as F
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...integrations import use_kernel_forward_from_hub
from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import LossKwargs, auto_docstring, can_return_tuple, logging
from .configuration_dots1 import Dots1Config


logger = logging.get_logger(__name__)


@use_kernel_forward_from_hub("RMSNorm")
class Dots1RMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Dots1RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


class Dots1RotaryEmbedding(nn.Module):
    def __init__(self, config: Dots1Config, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed

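# Illustrative only (not part of the original module): a minimal sketch of the shape
# bookkeeping in `apply_rotary_pos_emb`. The helper name `_demo_apply_rotary_pos_emb`
# is hypothetical, and the function is never called at import time.
def _demo_apply_rotary_pos_emb():
    batch, heads, seq_len, head_dim = 2, 4, 5, 8
    q = torch.randn(batch, heads, seq_len, head_dim)
    k = torch.randn(batch, heads, seq_len, head_dim)
    # cos/sin come from Dots1RotaryEmbedding with shape [batch, seq_len, head_dim];
    # random values stand in here because only the broadcasting is being demonstrated.
    cos = torch.randn(batch, seq_len, head_dim)
    sin = torch.randn(batch, seq_len, head_dim)
    # unsqueeze_dim=1 makes cos/sin [batch, 1, seq_len, head_dim], broadcasting over heads.
    q_embed, k_embed = apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1)
    assert q_embed.shape == q.shape and k_embed.shape == k.shape
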
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights

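# Illustrative only (not part of the original module): grouped-query attention shares each
# key/value head across `n_rep` query heads, so `repeat_kv` tiles the KV heads before the
# matmul in `eager_attention_forward`. `_demo_repeat_kv` is a hypothetical helper name,
# never called at import time.
def _demo_repeat_kv():
    kv = torch.randn(1, 2, 5, 16)  # (batch, num_key_value_heads, seq_len, head_dim)
    expanded = repeat_kv(kv, n_rep=4)  # 2 KV heads serving 8 query heads
    assert expanded.shape == (1, 8, 5, 16)
    # Same result as repeat_interleave on the head dimension, as the docstring states:
    assert torch.equal(expanded, torch.repeat_interleave(kv, repeats=4, dim=1))
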
&r   c                       s   e Zd ZdZdedef fddZ		ddejde	ejejf d	e
ej d
e
e de
ej dee de	eje
ej e
e	ej  f fddZ  ZS )Dots1Attentionz=Multi-headed attention from 'Attention Is All You Need' paperrD   	layer_idxc                    s  t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _t| j|jd| _t| j|jd| _|j| dkr|j| _d S d | _d S )Nr}   g      Tbiasr)   sliding_attention)r    r!   rD   r   getattrr(   num_attention_headsr}   r{   r   r   attention_dropout	is_causalr   Linearattention_biasq_projk_projv_projo_projr   rms_norm_epsq_normk_normlayer_typessliding_windowr'   rD   r   r*   r,   r-   r!      s.   
$zDots1Attention.__init__Nr7   position_embeddingsr   past_key_valuecache_positionr   rx   c                 K   s4  |j d d }g |d| jR }| | ||dd}	| | ||dd}
| ||dd}|\}}t	|	|
||\}	}
|d ur]|||d}|
|
|| j|\}
}t}| jjdkrkt| jj }|| |	|
||f| jswdn| j| j| jd|\}}|jg |dR   }| |}||fS )Nr/   r   r.   )rd   rc   r   eagerr   )r   r   r   )r<   r}   r   r   viewra   r   r   r   rv   updater   r   rD   _attn_implementationr   r   r   r   r   ry   r   r   )r'   r7   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   rc   rd   cache_kwargsattention_interfacer   r   r,   r,   r-   r:      s:   		

zDots1Attention.forwardNN)r?   r@   rA   __doc__r   intr!   r#   Tensorr;   r   r   
LongTensorr   r   r:   rB   r,   r,   r*   r-   r      s(    r   c                       s&   e Zd Zd fdd	Zdd Z  ZS )Dots1MLPNc                    s   t    || _|d u r|jn|| _|d u r|jn|| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _d S NFr   )r    r!   rD   r(   intermediate_sizer   r   	gate_projup_proj	down_projr   
hidden_actact_fn)r'   rD   r(   r   r*   r,   r-   r!      s   
zDots1MLP.__init__c                 C   s$   |  | | || | }|S rk   )r   r   r   r   )r'   re   r   r,   r,   r-   r:     s    zDots1MLP.forwardr   )r?   r@   rA   r!   r:   rB   r,   r,   r*   r-   r      s    r   c                       sD   e Zd ZdZ fddZdejdejdejfddZd	d
 Z  Z	S )Dots1MoEz:
    A mixed expert module containing shared experts.
    c                    sT   t     | _t fddt jD | _t | _	t
  j j d| _d S )Nc                    s   g | ]	}t   jd qS ))r   )r   moe_intermediate_size).0_rD   r,   r-   
<listcomp>  s    z%Dots1MoE.__init__.<locals>.<listcomp>)rD   r   )r    r!   rD   r   
ModuleListrangen_routed_expertsexpertsDots1TopkRoutergater   r   n_shared_expertsshared_expertsr'   rD   r*   r   r-   r!     s   

zDots1MoE.__init__r7   topk_indicestopk_weightsc                 C   s   t j||jd}t jjj|t| jd}|ddd}t	t| jD ]4}| j| }|| }t 
|\}	}
|	 dkrV||	|
f }||	 }||}||d }|d|	| q"||jS )z
        CALL FOR CONTRIBUTION! I don't have time to optimise this right now, but expert weights need to be fused
        to not have to do a loop here (deepseek has 256 experts soooo yeah).
        """
        final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype)
        expert_mask = torch.nn.functional.one_hot(topk_indices, num_classes=len(self.experts))
        expert_mask = expert_mask.permute(2, 0, 1)

        for expert_idx in range(len(self.experts)):
            expert = self.experts[expert_idx]
            mask = expert_mask[expert_idx]
            token_indices, weight_indices = torch.where(mask)

            if token_indices.numel() > 0:
                expert_weights = topk_weights[token_indices, weight_indices]
                expert_input = hidden_states[token_indices]
                expert_output = expert(expert_input)
                weighted_output = expert_output * expert_weights.unsqueeze(-1)
                final_hidden_states.index_add_(0, token_indices, weighted_output)

        return final_hidden_states.type(hidden_states.dtype)

    def forward(self, hidden_states):
        residuals = hidden_states
        orig_shape = hidden_states.shape
        topk_indices, topk_weights = self.gate(hidden_states)
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
        hidden_states = self.moe(hidden_states, topk_indices, topk_weights).view(*orig_shape)
        hidden_states = hidden_states + self.shared_experts(residuals)
        return hidden_states

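# Illustrative only (not part of the original module): a standalone sketch of the
# token-to-expert dispatch used in `Dots1MoE.moe`. The one-hot/permute/where dance
# recovers, for each expert, which tokens routed to it and through which top-k slot.
# `_demo_moe_dispatch` is a hypothetical helper name, never called at import time.
def _demo_moe_dispatch():
    num_experts = 4
    topk_indices = torch.tensor([[0, 2], [1, 2], [0, 3]])  # (num_tokens=3, top_k=2)
    # one_hot -> (num_tokens, top_k, num_experts); permute -> (num_experts, num_tokens, top_k)
    expert_mask = torch.nn.functional.one_hot(topk_indices, num_classes=num_experts).permute(2, 0, 1)
    # For expert 2: tokens 0 and 1 selected it, both via their second top-k slot.
    token_indices, weight_indices = torch.where(expert_mask[2])
    assert token_indices.tolist() == [0, 1] and weight_indices.tolist() == [1, 1]
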
class Dots1TopkRouter(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.top_k = config.num_experts_per_tok
        self.n_routed_experts = config.n_routed_experts
        self.routed_scaling_factor = config.routed_scaling_factor
        self.n_group = config.n_group
        self.topk_group = config.topk_group
        self.norm_topk_prob = config.norm_topk_prob

        self.weight = nn.Parameter(torch.empty((self.n_routed_experts, config.hidden_size)))
        self.register_buffer("e_score_correction_bias", torch.zeros(self.n_routed_experts))

    @torch.no_grad()
    def get_topk_indices(self, scores):
        scores_for_choice = scores.view(-1, self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0)
        group_scores = (
            scores_for_choice.view(-1, self.n_group, self.n_routed_experts // self.n_group)
            .topk(2, dim=-1)[0]
            .sum(dim=-1)
        )
        group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1]
        group_mask = torch.zeros_like(group_scores)
        group_mask.scatter_(1, group_idx, 1)
        score_mask = (
            group_mask.unsqueeze(-1)
            .expand(-1, self.n_group, self.n_routed_experts // self.n_group)
            .reshape(-1, self.n_routed_experts)
        )
        scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0)
        topk_indices = torch.topk(scores_for_choice, k=self.top_k, dim=-1, sorted=False)[1]
        return topk_indices

    def forward(self, hidden_states):
        hidden_states = hidden_states.view(-1, self.config.hidden_size)
        router_logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32))
        scores = router_logits.sigmoid()
        topk_indices = self.get_topk_indices(scores)
        topk_weights = scores.gather(1, topk_indices)
        if self.norm_topk_prob:
            denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20
            topk_weights /= denominator
        topk_weights = topk_weights * self.routed_scaling_factor
        return topk_indices, topk_weights

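# Illustrative only (not part of the original module): a minimal numeric sketch of the
# group-limited routing in `get_topk_indices`. Experts are split into `n_group` groups,
# each group is scored by the sum of its top-2 expert scores, only the best `topk_group`
# groups stay eligible, and the final `top_k` experts are picked among them.
# `_demo_group_limited_topk` is a hypothetical helper name, never called at import time.
def _demo_group_limited_topk():
    n_group, topk_group, top_k = 2, 1, 2
    scores = torch.tensor([[0.9, 0.1, 0.2, 0.1, 0.3, 0.4, 0.2, 0.1]])  # 8 experts, 2 groups of 4
    group_scores = scores.view(-1, n_group, 4).topk(2, dim=-1)[0].sum(dim=-1)  # [[1.1, 0.7]]
    group_idx = torch.topk(group_scores, k=topk_group, dim=-1)[1]  # group 0 wins
    group_mask = torch.zeros_like(group_scores).scatter_(1, group_idx, 1)
    score_mask = group_mask.unsqueeze(-1).expand(-1, n_group, 4).reshape(-1, 8)
    masked = scores.masked_fill(~score_mask.bool(), 0.0)  # group 1 is zeroed out
    topk_indices = torch.topk(masked, k=top_k, dim=-1)[1]
    assert sorted(topk_indices[0].tolist()) == [0, 2]  # both picks come from group 0
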
class Dots1DecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Dots1Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = Dots1Attention(config=config, layer_idx=layer_idx)

        if layer_idx >= config.first_k_dense_replace:
            self.mlp = Dots1MoE(config)
        else:
            self.mlp = Dots1MLP(config)

        self.input_layernorm = Dots1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Dots1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.attention_type = config.layer_types[layer_idx]

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs

@auto_docstring
class Dots1PreTrainedModel(PreTrainedModel):
    config_class = Dots1Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Dots1DecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn_3 = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, Dots1RMSNorm):
            module.weight.data.fill_(1.0)
        elif isinstance(module, Dots1TopkRouter):
            module.weight.data.normal_(mean=0.0, std=std)


@auto_docstring
class Dots1Model(Dots1PreTrainedModel):
    def __init__(self, config: Dots1Config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [Dots1DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = Dots1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = Dots1RotaryEmbedding(config=config)
        self.gradient_checkpointing = False
        self.has_sliding_layers = "sliding_attention" in self.config.layer_types

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if not isinstance(past_key_values, (type(None), Cache)):
            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # It may already have been prepared by e.g. `generate`
        if not isinstance(causal_mask_mapping := attention_mask, dict):
            # Prepare mask arguments
            mask_kwargs = {
                "config": self.config,
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "position_ids": position_ids,
            }
            # Create the masks
            causal_mask_mapping = {
                "full_attention": create_causal_mask(**mask_kwargs),
            }
            # The sliding window alternating layers are not always activated depending on the config
            if self.has_sliding_layers:
                causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **flash_attn_kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...


@auto_docstring
class Dots1ForCausalLM(Dots1PreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config):
        super().__init__(config)
        self.model = Dots1Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> CausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Dots1ForCausalLM

        >>> model = Dots1ForCausalLM.from_pretrained("rednote-hilab/dots1.llm1.inst")
        >>> tokenizer = AutoTokenizer.from_pretrained("rednote-hilab/dots1.llm1.inst")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["Dots1PreTrainedModel", "Dots1Model", "Dots1ForCausalLM"]