o
    ei~                     @   s  d dl mZ d dlmZ d dlZd dlmZ d dlmZ ddl	m
Z ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZmZ ddlmZm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m*Z*m+Z+m,Z, ddl-m.Z. ddl/m0Z0 edG dd dej1Z2G dd dej1Z3G dd dej1Z4G dd dej1Z5G d d! d!ej1Z6d"d# Z7ed$dGd%d&Z8d'ej9d(e:d)ej9fd*d+Z;	,dHd-ej1d.ej9d/ej9d0ej9d1ej9dB d2e<d3e<d4e%e' fd5d6Z=ee8G d7d8 d8ej1Z>G d9d: d:eZ?e(G d;d< d<e#Z@e(G d=d> d>e@ZA		?	dId@ej9eBej9 B dB dAe:dB d1ej9dB d)ej9e:B fdBdCZCe(G dDdE dEe@eZDg dFZEdS )J    )Callable)OptionalN)nn)
functional   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hubuse_kernel_func_from_hubuse_kernelized_func)create_causal_mask)GradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstring)can_return_tuplemaybe_autocastmerge_with_config_defaults)capture_outputs   )GraniteMoeConfigRMSNormc                       sF   e Zd Zddeddf fddZdejdejfdd	Zd
d Z  Z	S )GraniteMoeRMSNormư>epsreturnNc                    s&   t    tt|| _|| _dS )z@
        GraniteMoeRMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizer#   	__class__ p/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/granitemoe/modeling_granitemoe.pyr&   0   s   

zGraniteMoeRMSNorm.__init__hidden_statesc                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )N   T)keepdim)	dtypetor(   float32powmeanrsqrtr+   r*   )r,   r2   input_dtypevariancer0   r0   r1   forward8   s
   zGraniteMoeRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler*   shaper+   )r,   r0   r0   r1   
extra_repr?   s   zGraniteMoeRMSNorm.extra_repr)r"   )
__name__
__module____qualname__floatr&   r(   Tensorr>   rA   __classcell__r0   r0   r.   r1   r!   .   s    r!   c                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )GraniteMoeRotaryEmbeddinginv_freqNconfigc                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultrI   F)
persistentoriginal_inv_freq)r%   r&   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrJ   rope_parametersrK   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r,   rJ   devicerope_init_fnrI   r.   r0   r1   r&   F   s   


z"GraniteMoeRotaryEmbedding.__init__rW   ztorch.deviceseq_lenr$   ztorch.Tensorc                 C   sZ   | j d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNg      ?r   r3   r6   )rW   r6   )	rR   getattrr-   num_attention_headsr(   arangeint64r7   rE   )rJ   rW   rY   basedimattention_factorrI   r0   r0   r1   rS   V   s   
&z9GraniteMoeRotaryEmbedding.compute_default_rope_parametersc           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	|dd+ | |  
dd}tj||fdd	}| | j }| | j }	W d    n1 slw   Y  |j|jd
|	j|jd
fS )Nr   r4   r   mpscpuF)device_typeenabledr3   rb   r\   )rI   rE   expandr@   r7   rW   
isinstancetypestrr   	transposer(   catcosrT   sinr6   )
r,   xposition_idsinv_freq_expandedposition_ids_expandedrf   freqsembro   rp   r0   r0   r1   r>   t   s   0&z!GraniteMoeRotaryEmbedding.forward)N)NNN)rB   rC   rD   r(   rF   __annotations__r   r&   staticmethodr   intr?   rE   rS   no_gradr   r>   rG   r0   r0   r.   r1   rH   C   s&   
 

rH   c                       s6   e Zd Zdedededdf fddZdd	 Z  ZS )
GraniteMoeParallelExpertsnum_experts
input_sizeoutput_sizer$   Nc                    s6   t    tt|||| _|| _|| _|| _	dS )a  
        Initialize the GraniteMoeParallelExperts module.
        The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with
        many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and
        [ScatterMoE](https://github.com/shawntan/scattermoe), as well as the
        [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
        used in vllm.

        Args:
            num_experts (int):
                Number of experts.
            input_size (int):
                Size of the input.
            output_size (int):
                Size of the output.
        N)
r%   r&   r   r'   r(   emptyr*   r|   r}   r~   )r,   r|   r}   r~   r.   r0   r1   r&      s
   

z"GraniteMoeParallelExperts.__init__c                 C   sP   |j |dd}g }t| jD ]}|t|| | j|  qtj|dd}|S )a  
        Forward pass of the GraniteMoeParallelExperts module.

        Args:
            inputs (Tensor):
                Input tensor.
            expert_size:
                Expert size information.

        Returns:
            Tensor: Output tensor.
        r   rh   )	splitranger|   appendFlinearr*   r(   rn   )r,   inputsexpert_size
input_listoutput_listiresultsr0   r0   r1   r>      s   z!GraniteMoeParallelExperts.forwardrB   rC   rD   ry   r&   r>   rG   r0   r0   r.   r1   r{      s    r{   c                       s2   e Zd Zdededef fddZdd Z  ZS )GraniteMoeTopKGatingr}   r|   top_kc                    s2   t    || _|| _|| _tj||dd| _dS )a  
        Initialize the top-k gating mechanism.

        Args:
            input_size (`int`):
                Size of the input.
            num_experts (`int`):
                Number of experts.
            top_k (`int`):
                Number of top experts to select.
        FbiasN)r%   r&   r|   r}   r   r   Linearlayer)r,   r}   r|   r   r.   r0   r1   r&      s
   
zGraniteMoeTopKGating.__init__c                 C   s   |  | }|j| jdd\}}tj|dd|}tj|d| j	g|j
|jd}|d|d}| d}| }| }	|	d\}
}|j| jdd}| }|| }|||||fS )Nr   rh   r   r6   rW   trunc)rounding_mode)r   rE   topkr   r(   softmaxtype_aszerossizer|   r6   rW   scatterlongsumtolistflattensortdiv)r,   r2   logitstop_k_logitstop_k_indicestop_k_gatesr   gatesr   top_k_experts_index_sorted_expertsbatch_indexbatch_gatesr0   r0   r1   r>      s   zGraniteMoeTopKGating.forwardr   r0   r0   r.   r1   r      s    r   c                       s.   e Zd ZdZdef fddZdd Z  ZS )GraniteMoeMoEz
    A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.

    Args:
        config:
            Configuration object with model hyperparameters.
    rJ   c                    sl   t    |j| _|j| _t|j | _t|j	| j| jd | _
t|j	| j| j| _t| j|j	|jd| _d S )Nr3   )r}   r|   r   )r%   r&   r-   r}   intermediate_sizer   
hidden_act
activationr{   num_local_expertsinput_linearoutput_linearr   num_experts_per_tokrouterr,   rJ   r.   r0   r1   r&      s   
zGraniteMoeMoE.__init__c                 C   s   |  \}}}|d|}| |\}}}}}|| }	| |	|}
|
jddd}| |d |d  }
| |
|}||d d d f  }tj|| | j	f|j
|jd}|d||}|||| j	}|S )Nr4   r3   rh   r   r   r   )r   reshaper   r   chunkr   r   r(   r   r}   r6   rW   	index_addview)r,   layer_inputbszlengthemb_sizer   r   r   r   expert_inputsr2   chunked_hidden_statesexpert_outputsr   layer_outputr0   r0   r1   r>      s   zGraniteMoeMoE.forward)rB   rC   rD   __doc__r   r&   r>   rG   r0   r0   r.   r1   r      s    r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr4   r3   rh   )r@   r(   rn   )rq   x1x2r0   r0   r1   rotate_half  s   r   rotary_pos_embc                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkro   rp   unsqueeze_dimq_embedk_embedr0   r0   r1   apply_rotary_pos_emb  s
   

r   r2   n_repr$   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r@   ri   r   )r2   r   batchnum_key_value_headsslenr[   r0   r0   r1   	repeat_kv.  s
   0r           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur |
| }
tjj|
dtjd	|j
}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr3   r   r4   )rb   r6   )ptrainingr   )r   num_key_value_groupsr(   matmulrm   r   r   r   r8   r7   r6   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsattn_outputr0   r0   r1   eager_attention_forward:  s   
r   c                       s   e Zd ZdZdedef fddZ				ddejde	ejejf dB d	ejdB d
e
dB dejdB dee de	ejejf fddZ  ZS )GraniteMoeAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrJ   	layer_idxc                    s   t    || _|| _t|d|j|j | _|j|j | _	|j
| _|j| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _d S )Nr[   Tr   )r%   r&   rJ   r   r]   r-   r^   r[   r   r   attention_multiplierr   attention_dropout	is_causalr   r   attention_biasq_projk_projv_projo_projr,   rJ   r   r.   r0   r1   r&   W  s(   
zGraniteMoeAttention.__init__Nr2   position_embeddingsr   past_key_valuescache_positionr   r$   c                 K   s  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
| jjt}|| |	|
||f| jskdn| j| jd|\}}|jg |dR   }| |}||fS )Nr4   r   r3   )rp   ro   r   r   )r   r   )r@   r[   r   r   rm   r   r   r   updater   r   get_interfacerJ   _attn_implementationr   r   r   r   r   r   r   )r,   r2   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   ro   rp   cache_kwargsattention_interfacer   r   r0   r0   r1   r>   n  s8   	

zGraniteMoeAttention.forwardNNNN)rB   rC   rD   r   r   ry   r&   r(   rF   r?   r	   
LongTensorr   r   r>   rG   r0   r0   r.   r1   r   S  s,    r   c                       sv   e Zd Zdedef fddZ				ddejdejdB dedB d	ej	dB d
e
ejejf dB dejfddZ  ZS )GraniteMoeDecoderLayerrJ   r   c                    sZ   t    |j| _t||d| _t|j|jd| _t|j|jd| _t	|| _
|j| _d S )N)rJ   r   r#   )r%   r&   r-   r   	self_attnr!   rms_norm_epsinput_layernormpost_attention_layernormr   block_sparse_moeresidual_multiplierr   r.   r0   r1   r&     s   

zGraniteMoeDecoderLayer.__init__Nr2   r   r   r   r   r$   c           	      K   sf   |}|  |}| jd|||||d|\}}||| j  }|}| |}| |}||| j  }|S )N)r2   r   r   r   r   r0   )r  r   r  r  r  )	r,   r2   r   r   r   r   r   residualr   r0   r0   r1   r>     s"   	



zGraniteMoeDecoderLayer.forwardr   )rB   rC   rD   r   ry   r&   r(   rF   r	   r   r?   r>   rG   r0   r0   r.   r1   r     s&    r   c                       s`   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZdZeedZe  fdd	Z  ZS )
GraniteMoePreTrainedModelrJ   modelTr   r   F)r2   
attentionsc                    s4   t  | t|trtj|jd| jjd d S d S )Nr   )r:   std)	r%   _init_weightsrj   r{   initnormal_r*   rJ   initializer_range)r,   r   r.   r0   r1   r
    s   
z'GraniteMoePreTrainedModel._init_weights)rB   rC   rD   r   rw   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   r   _can_record_outputsr(   rz   r
  rG   r0   r0   r.   r1   r    s    
 r  c                       s   e Zd Zdef fddZeee							ddej	dB dej
dB dej	dB dedB d	ejdB d
edB dej	dB dee defddZ  ZS )GraniteMoeModelrJ   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _t d| _d| _ j| _|   d S )Nc                    s   g | ]}t  |qS r0   )r   ).0r   rJ   r0   r1   
<listcomp>      z,GraniteMoeModel.__init__.<locals>.<listcomp>r   r  F)r%   r&   pad_token_idpadding_idx
vocab_sizer   	Embeddingr-   embed_tokens
ModuleListr   num_hidden_layerslayersr!   r   normrH   
rotary_embgradient_checkpointingembedding_multiplier	post_initr   r.   r  r1   r&     s   zGraniteMoeModel.__init__N	input_idsr   rr   r   inputs_embeds	use_cacher   r   r$   c              
   K   s  |d u |d uA rt d|r|d u rt| jd}|d u r!| |}|d u r=|d ur-| nd}	tj|	|	|jd  |jd}|d u rF|	d}t
| j|||||d}
|| j }|}| ||}| jd | jj D ]}||f||
||||d|}qg| |}t||dS )	Nz:You must specify exactly one of input_ids or inputs_embedsr  r   r   )rW   )rJ   r+  r   r   r   rr   )r   r   rr   r   r,  r   )last_hidden_stater   )
ValueErrorr
   rJ   r!  get_seq_lengthr(   r_   r@   rW   r   r   r(  r&  r$  r#  r%  r   )r,   r*  r   rr   r   r+  r,  r   r   past_seen_tokenscausal_maskr2   r   decoder_layerr0   r0   r1   r>     sT   



zGraniteMoeModel.forward)NNNNNNN)rB   rC   rD   r   r&   r   r   r   r(   r   rF   r	   FloatTensorboolr   r   r   r>   rG   r0   r0   r.   r1   r    s>    	
r  r3   gate_logitsr|   c                    s  | du s	t | tsdS t | tr#| d j tj fdd| D dd}tjjj|dd}tj||dd\}}tjj	||}|du rStj
| dd}	tj
|dd}
ng|j\}}|jd ||  }|dddddddf |||||fd|| }tj| | ddtj|dd }	|ddddddf ||||fd| }tj|| ddtj|dd }
t|	|
d }|| S )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   c                    s   g | ]}|  qS r0   )r7   )r  
layer_gatecompute_devicer0   r1   r  M  r  z,load_balancing_loss_func.<locals>.<listcomp>rh   r4   )rj   r?   rW   r(   rn   r   r   r   r   one_hotr:   rE   r@   ri   r   r7   r   r   )r5  r|   r   r   concatenated_gate_logitsrouting_weightsr   selected_expertsexpert_masktokens_per_expertrouter_prob_per_expert
batch_sizesequence_lengthr#  expert_attention_mask router_per_expert_attention_maskoverall_lossr0   r7  r1   load_balancing_loss_func+  s>   



rE  c                       s   e Zd ZddiZddiZddgdgfiZdef fdd	Zee		
	
	
	
	
	
	
	
	dde
jd
B de
jd
B de
jd
B ded
B de
jd
B de
jd
B ded
B de
jd
B dee
jB deeB fddZ  ZS )GraniteMoeForCausalLMzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr2   r   rJ   c                    s`   t  | t|| _|j| _tj|j|jdd| _|j	| _	|j
| _|j| _|j| _|   d S )NFr   )r%   r&   r  r  r  r   r   r-   rG  router_aux_loss_coefr   r|   r   logits_scalingr)  r   r.   r0   r1   r&     s   
zGraniteMoeForCausalLM.__init__Nr   r*  r   rr   r   r+  labelsoutput_router_logitsr   logits_to_keepr$   c
              	   K   s   |dur|n| j j}| jd||||||d|
}|j}t|	tr't|	 dn|	}| |dd|ddf }|| j j }d}|durQ| j	||fd| j j
i|
}d}|rnt|j| j| j|}|durn|| j||j 7 }t||||j|j|j|jdS )al  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, GraniteMoeForCausalLM

        >>> model = GraniteMoeForCausalLM.from_pretrained("ibm/PowerMoE-3b")
        >>> tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)r*  r   rr   r   r+  r   r  )lossaux_lossr   r   r2   r  router_logitsr0   )rJ   rL  r  r-  rj   ry   slicerG  rJ  loss_functionr  rE  rP  r|   r   rI  r7   rW   r   r   r2   r  )r,   r*  r   rr   r   r+  rK  rL  r   rM  r   outputsr2   slice_indicesr   rN  rO  r0   r0   r1   r>     sZ   &zGraniteMoeForCausalLM.forward)	NNNNNNNNr   )rB   rC   rD   _tied_weights_keys_tp_plan_pp_planr   r&   r   r   r(   r   rF   r	   r3  r4  ry   r?   r   r>   rG   r0   r0   r.   r1   rF  }  sJ    	
rF  )rF  r  r  )r   )r   )Nr3   N)Fcollections.abcr   typingr   r(   r   torch.nnr   r    r   r  activationsr   cache_utilsr	   r
   
generationr   integrationsr   r   r   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   utils.genericr   r   r   utils.output_capturingr   configuration_granitemoer   Moduler!   rH   r{   r   r   r   r   rF   ry   r   rE   r   r   r   r  r  r?   rE  rF  __all__r0   r0   r0   r1   <module>   s   A-1+
F%U
Rj