o
    eiW|                     @   s  d dl mZ d dlmZ d dlZd dlm  mZ d dlmZ ddl	m
Z ddlmZ ddlmZmZ dd	lmZ dd
lmZmZmZmZ ddlmZ ddlmZ ddlmZmZ ddlm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1 ddl2m3Z3 edG dd dej4Z5G dd dej4Z6G dd dej4Z7dd Z8eddHd d!Z9d"ej:d#e;d$ej:fd%d&Z<	'dId(ej4d)ej:d*ej:d+ej:d,ej:dB d-e=d.e=d/e&e( fd0d1Z>ee9G d2d3 d3ej4Z?eG d4d5 d5ej4Z@G d6d7 d7ej4ZAG d8d9 d9ej4ZBG d:d; d;eZCe)G d<d= d=e$ZDe)G d>d? d?eDZE		@	dJdAej:eFej: B dB dBe;dB d,ej:dB d$ej:e;B fdCdDZGe)G dEdF dFeDeZHg dGZIdS )K    )Callable)OptionalN)nn   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_experts_implementationuse_kernel_forward_from_hubuse_kernel_func_from_hubuse_kernelized_func)create_causal_mask)GradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_grouped_mm_available)maybe_autocastmerge_with_config_defaults)OutputRecordercapture_outputs   )OlmoeConfigRMSNormc                       s>   e Zd Zdd fddZdejdejfddZd	d
 Z  ZS )OlmoeRMSNormh㈵>returnNc                    s&   t    tt|| _|| _dS )z;
        OlmoeRMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__ f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/olmoe/modeling_olmoe.pyr'   2   s   

zOlmoeRMSNorm.__init__hidden_statesc                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )N   T)keepdim)	dtypetor)   float32powmeanrsqrtr,   r+   )r-   r4   input_dtypevariancer2   r2   r3   forward:   s
   zOlmoeRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler+   shaper,   )r-   r2   r2   r3   
extra_reprA   s   zOlmoeRMSNorm.extra_repr)r$   )r%   N)	__name__
__module____qualname__r'   r)   Tensorr@   rC   __classcell__r2   r2   r0   r3   r#   0   s    r#   c                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )OlmoeRotaryEmbeddinginv_freqNconfigc                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultrJ   F)
persistentoriginal_inv_freq)r&   r'   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrK   rope_parametersrL   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r-   rK   devicerope_init_fnrJ   r0   r2   r3   r'   H   s   


zOlmoeRotaryEmbedding.__init__rX   ztorch.deviceseq_lenr%   ztorch.Tensorc                 C   sZ   | j d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNg      ?r   r5   r8   )rX   r8   )	rS   getattrr.   num_attention_headsr)   arangeint64r9   float)rK   rX   rZ   basedimattention_factorrJ   r2   r2   r3   rT   X   s   
&z4OlmoeRotaryEmbedding.compute_default_rope_parametersc           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	|dd+ | |  
dd}tj||fdd	}| | j }| | j }	W d    n1 slw   Y  |j|jd
|	j|jd
fS )Nr   r6   r    mpscpuF)device_typeenabledr5   rd   r]   )rJ   rb   expandrB   r9   rX   
isinstancetypestrr   	transposer)   catcosrU   sinr8   )
r-   xposition_idsinv_freq_expandedposition_ids_expandedrh   freqsembrq   rr   r2   r2   r3   r@   v   s   0&zOlmoeRotaryEmbedding.forwardN)NNN)rD   rE   rF   r)   rG   __annotations__r!   r'   staticmethodr   intrA   rb   rT   no_gradr   r@   rH   r2   r2   r0   r3   rI   E   s&   
 

rI   c                       $   e Zd Z fddZdd Z  ZS )OlmoeMLPc                    sr   t    || _|j| _|j| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _d S NFbias)r&   r'   rK   r.   intermediate_sizer   Linear	gate_projup_proj	down_projr   
hidden_actact_fnr-   rK   r0   r2   r3   r'      s   
zOlmoeMLP.__init__c                 C   s$   |  | | || | }|S ry   )r   r   r   r   )r-   rs   r   r2   r2   r3   r@      s    zOlmoeMLP.forwardrD   rE   rF   r'   r@   rH   r2   r2   r0   r3   r      s    
r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr6   r5   rj   )rB   r)   rp   )rs   x1x2r2   r2   r3   rotate_half   s   r   rotary_pos_embc                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkrq   rr   unsqueeze_dimq_embedk_embedr2   r2   r3   apply_rotary_pos_emb   s
   

r   r4   n_repr%   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r    N)rB   rk   reshape)r4   r   batchnum_key_value_headsslenr\   r2   r2   r3   	repeat_kv   s
   0r           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur |
| }
tjj|
dtjd	|j
}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr5   r   r6   )rd   r8   )ptrainingr    )r   num_key_value_groupsr)   matmulro   r   
functionalsoftmaxr:   r9   r8   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsattn_outputr2   r2   r3   eager_attention_forward   s   
r   c                       s   e Zd ZdZddededB f fddZ		ddejde	ejejf d	ejdB d
e
dB dejdB dee de	ejejdB e	ej dB f fddZ  ZS )OlmoeAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNrK   	layer_idxc                    s   t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _t|j|jd| _t|j|j |j |jd| _d S )Nr\   g      Tr   r/   )r&   r'   rK   r   r^   r.   r_   r\   r   r   r   attention_dropout	is_causalr   r   attention_biasq_projk_projv_projo_projr#   rms_norm_epsq_normk_normr-   rK   r   r0   r2   r3   r'      s0   
zOlmoeAttention.__init__r4   position_embeddingsr   past_key_valuescache_positionr   r%   c                 K   s  |j d d }g |d| jR }| | |}	| | |}
| |}| jjd urP|	j	| jj | jjd |
j	| jj | jjd |j	| jj | jjd |	j
| dd}	|
j
| dd}
|j
| dd}|\}}t|	|
||\}	}
|d ur|||d}||
|| j|\}
}t| jjt}|| |	|
||f| jsdn| j| jt| jdd d|\}}|jg |dR   }| |}||fS )	Nr6   )minmaxr    r5   )rr   rq   r   r   sliding_window)r   r   r   )rB   r\   r   r   r   r   r   rK   clip_qkvclamp_viewro   r   updater   r   get_interface_attn_implementationr   r   r   r   r^   r   r   r   )r-   r4   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   rq   rr   cache_kwargsattention_interfacer   r   r2   r2   r3   r@      sH   	
	

zOlmoeAttention.forwardry   )NN)rD   rE   rF   __doc__r!   r|   r'   r)   rG   rA   r   
LongTensorr   r   r@   rH   r2   r2   r0   r3   r      s(     r   c                       sH   e Zd ZdZdef fddZdejdejdejdejfd	d
Z  Z	S )OlmoeExpertsz2Collection of expert weights stored as 3D tensors.rK   c                    sn   t    |j| _|j| _|j| _t	t
| jd| j | j| _t	t
| j| j| j| _t|j | _d S )Nr5   )r&   r'   num_local_expertsnum_expertsr.   
hidden_dimr   intermediate_dimr   r(   r)   emptygate_up_projr   r   r   r   r   r0   r2   r3   r'   4  s   
 zOlmoeExperts.__init__r4   top_k_indextop_k_weightsr%   c                 C   s  t |}t  % t jjj|| jd}|ddd}t |j	ddd
 }W d    n1 s1w   Y  |D ]O}|d }|| jkrDq8t || \}}	||	 }
tj|
| j| jddd\}}| || }tj|| j| }|||	|d f  }|d|	||j q8|S )N)num_classesr5   r    r   )r6   rj   r6   )r)   
zeros_liker}   r   r   one_hotr   permutegreatersumnonzerowherelinearr   chunkr   r   
index_add_r9   r8   )r-   r4   r   r   final_hidden_statesexpert_mask
expert_hit
expert_idx	top_k_pos	token_idxcurrent_stategateupcurrent_hidden_statesr2   r2   r3   r@   =  s$   


"zOlmoeExperts.forward)
rD   rE   rF   r   r!   r'   r)   rG   r@   rH   r2   r2   r0   r3   r   0  s    	r   c                       r~   )OlmoeTopKRouterc                    sF   t    |j| _|j| _|j| _|j| _t	t
| j| j| _d S ry   )r&   r'   num_experts_per_toktop_kr   norm_topk_probr.   r   r   r(   r)   zerosr+   r   r0   r2   r3   r'   Y  s   
zOlmoeTopKRouter.__init__c                 C   sz   | d| j}t|| j}tjjj|tj	dd}tj
|| jdd\}}| jr0||jddd }||j}|}|||fS )Nr6   )r8   rd   rj   T)rd   r7   )r   r   Fr   r+   r)   r   r   r   rb   topkr   r   r   r9   r8   )r-   r4   router_logitsrouter_top_valuerouter_indicesrouter_scoresr2   r2   r3   r@   a  s   
zOlmoeTopKRouter.forwardr   r2   r2   r0   r3   r   X  s    r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )OlmoeSparseMoeBlockc                    s"   t    t|| _t|| _d S ry   )r&   r'   r   r   r   expertsr   r0   r2   r3   r'   n  s   

zOlmoeSparseMoeBlock.__init__r4   r%   c           	      C   sD   |j \}}}|d|}| |\}}}| ||||||}|S )Nr6   )rB   r   r   r   r   )	r-   r4   
batch_sizesequence_lengthr   _r   r   r   r2   r2   r3   r@   s  s   zOlmoeSparseMoeBlock.forward)rD   rE   rF   r'   r)   rG   r@   rH   r2   r2   r0   r3   r   m  s    r   c                       s   e Zd Zdedef fddZ						ddejdejdB d	ejdB d
e	dB de
dB dejdB deejejf dB dee dejfddZ  ZS )OlmoeDecoderLayerrK   r   c                    sR   t    |j| _t||d| _t|| _t|j|jd| _	t|j|jd| _
d S )N)rK   r   r   )r&   r'   r.   r   	self_attnr   mlpr#   r   input_layernormpost_attention_layernormr   r0   r2   r3   r'   ~  s   

zOlmoeDecoderLayer.__init__NFr4   r   rt   r   	use_cacher   r   r   r%   c              
   K   s^   |}	|  |}| jd|||||||d|\}}
|	| }|}	| |}| |}|	| }|S )N)r4   r   rt   r   r  r   r   r2   )r  r  r  r  )r-   r4   r   rt   r   r  r   r   r   residualr  r2   r2   r3   r@     s&   




zOlmoeDecoderLayer.forward)NNNFNN)rD   rE   rF   r!   r|   r'   r)   rG   r   r   boolrA   r   r   r@   rH   r2   r2   r0   r3   r  }  s6    	
r  c                   @   s`   e Zd ZU eed< dZdZdgZdgZdZ	dZ
eeddeedZe ZdZe d	d
 ZdS )OlmoePreTrainedModelrK   modelTr  r   r   )index)r   r4   
attentionsc                 C   sn   t | | t|tr#tj|jd| jjd tj|j	d| jjd d S t|t
r5tj|jd| jjd d S d S )Nr   )r<   std)r   _init_weightsrl   r   initnormal_r   rK   initializer_ranger   r   r+   )r-   r   r2   r2   r3   r    s   

z"OlmoePreTrainedModel._init_weightsN)rD   rE   rF   r!   rz   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpar   r   r  r   _can_record_outputsr   _can_compile_fullgraph_supports_attention_backendr)   r}   r  r2   r2   r2   r3   r
    s"   
 
r
  c                       s   e Zd Zdef fddZeee							ddej	dB dej
dB dej	dB dedB d	ejdB d
edB dej	dB dee defddZ  ZS )
OlmoeModelrK   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _t d| _d| _|   d S )Nc                    s   g | ]}t  |qS r2   )r  ).0r   rK   r2   r3   
<listcomp>      z'OlmoeModel.__init__.<locals>.<listcomp>r   r  F)r&   r'   pad_token_idpadding_idx
vocab_sizer   	Embeddingr.   embed_tokens
ModuleListrangenum_hidden_layerslayersr#   r   normrI   
rotary_embgradient_checkpointing	post_initr   r0   r  r3   r'     s   zOlmoeModel.__init__N	input_idsr   rt   r   inputs_embedsr  r   r   r%   c              
   K   s   |d u |d uA rt d|r|d u rt| jd}|d u r!| |}|d u r=|d ur-| nd}	tj|	|	|jd  |jd}|d u rF|	d}t
| j|||||d}
|}| ||}| jd | jj D ]}||f||
||||d|}qb| |}t||dS )	Nz:You must specify exactly one of input_ids or inputs_embedsr  r   r    )rX   )rK   r/  r   r   r   rt   )r   r   rt   r   r  r   )last_hidden_stater   )
ValueErrorr	   rK   r%  get_seq_lengthr)   r`   rB   rX   r   r   r+  r)  r(  r*  r   )r-   r.  r   rt   r   r/  r  r   r   past_seen_tokenscausal_maskr4   r   decoder_layerr2   r2   r3   r@     sR   

	
zOlmoeModel.forward)NNNNNNN)rD   rE   rF   r!   r'   r   r   r   r)   r   rG   r   FloatTensorr	  r   r   r   r@   rH   r2   r2   r0   r3   r    s>    	
r  r5   gate_logitsr   c                    s  | du s	t | tsdS t | tr#| d j tj fdd| D dd}tjjj|dd}tj||dd\}}tjj	||}|du rStj
| dd}	tj
|dd}
ng|j\}}|jd ||  }|dddddddf |||||fd|| }tj| | ddtj|dd }	|ddddddf ||||fd| }tj|| ddtj|dd }
t|	|
d }|| S )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   c                    s   g | ]}|  qS r2   )r9   )r  
layer_gatecompute_devicer2   r3   r  9  r   z,load_balancing_loss_func.<locals>.<listcomp>rj   r6   )rl   rA   rX   r)   rp   r   r   r   r   r   r<   rb   rB   rk   r   r9   r   r   )r7  r   r   r   concatenated_gate_logitsrouting_weightsr  selected_expertsr   tokens_per_expertrouter_prob_per_expertr   r   r(  expert_attention_mask router_per_expert_attention_maskoverall_lossr2   r9  r3   load_balancing_loss_func  s>   



rC  c                       s   e Zd ZddiZddiZddgdgfiZ fddZee																			
dde	j
d	B de	jd	B de	j
d	B ded	B de	jd	B de	j
d	B ded	B ded	B de	j
d	B dee	jB dee defddZ  ZS )OlmoeForCausalLMzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr4   logitsc                    sX   t  | t|| _|j| _tj|j|jdd| _|j	| _	|j
| _
|j| _|   d S r   )r&   r'   r  r  r#  r   r   r.   rE  router_aux_loss_coefr   r   r-  r   r0   r2   r3   r'   o  s   
zOlmoeForCausalLM.__init__Nr   r.  r   rt   r   r/  labelsr  output_router_logitsr   logits_to_keepr   r%   c                 K   s   |dur|n| j j}| jd||||||||	d|}|j}t|
tr)t|
 dn|
}| |dd|ddf }d}|durK| j||| j	fi |}d}|rht
|j| j| j|}|durh|| j||j 7 }t||||j|j|j|jdS )u  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, OlmoeForCausalLM

        >>> model = OlmoeForCausalLM.from_pretrained("allenai/OLMoE-1B-7B-0924")
        >>> tokenizer = AutoTokenizer.from_pretrained("allenai/OLMoE-1B-7B-0924")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        'Hey, are you conscious? Can you talk to me?\nI’m not sure if you’re conscious of this, but I’m'
        ```
        N)r.  r   rt   r   r/  r  rJ  r   )lossaux_lossrG  r   r4   r  r   r2   )rK   rJ  r  r0  rl   r|   slicerE  loss_functionr#  rC  r   r   r   rH  r9   rX   r   r   r4   r  )r-   r.  r   rt   r   r/  rI  r  rJ  r   rK  r   outputsr4   slice_indicesrG  rL  rM  r2   r2   r3   r@   {  sN   )	zOlmoeForCausalLM.forward)
NNNNNNNNNr   )rD   rE   rF   _tied_weights_keys_tp_plan_pp_planr'   r   r   r)   r   rG   r   r6  r	  r|   r   r   r   r@   rH   r2   r2   r0   r3   rD  i  sT    	
rD  )rD  r  r
  )r    )r   )Nr5   N)Jcollections.abcr   typingr   r)   torch.nn.functionalr   r   r    r   r  activationsr   cache_utilsr   r	   
generationr
   integrationsr   r   r   r   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   r   configuration_olmoer!   Moduler#   rI   r   r   r   rG   r|   r   rb   r   r   r   r   r   r  r
  r  rA   rC  rD  __all__r2   r2   r2   r3   <module>   s   A
S'+S
Ri