o
    i;                     @   sp  d dl mZmZmZ d dlZd dlm  mZ d dlmZ ddl	m
Z
 ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZ ddlmZmZ ddlmZm Z m!Z! ddl"m#Z# ddl$m%Z% e  r|d dl&m'Z' ddl(m)Z) e!*e+Z,			d@deej-e.ej- df dee/ deej- deej-e/f fddZ0G dd dej1Z2G dd dej1Z3dd Z4dAd d!Z5G d"d# d#ej1Z6G d$d% d%ej1Z7G d&d' d'ej1Z8d(ej-d)e/dej-fd*d+Z9G d,d- d-ej1Z:	.dBd/ej1d0ej-d1ej-d2ej-deej- d3e;d4e;fd5d6Z<G d7d8 d8eZ=eG d9d: d:eZ>eG d;d< d<e>Z?G d=d> d>e>eZ@g d?ZAdS )C    )CallableOptionalUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutputWithPastMoeCausalLMOutputWithPastMoeModelOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)auto_docstringis_torch_flex_attn_availablelogging)deprecate_kwarg   )GraniteMoeConfig)	BlockMask)make_flex_block_causal_mask   gate_logitsnum_expertsattention_maskreturnc                    s  | du s	t | tsdS t | tr#| d j tj fdd| D dd}tjjj|dd}tj||dd\}}tjj	||}|du rStj
| dd}	tj
|dd}
nm|j\}}|jd ||  }|dddddddf |||||fd|| }tj| | ddtj|dd }	|ddddddf ||||jd fd|jd  }tj|| ddtj|dd }
|jjdur|jjnd}|jd t| }t|	dd|||jd  f |
d }|| S )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   c                    s   g | ]}|  qS  )to).0
layer_gatecompute_devicer!   o/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/granitemoe/modeling_granitemoe.py
<listcomp>O       z,load_balancing_loss_func.<locals>.<listcomp>dimr   )
isinstancetupledevicetorchcatr   
functionalsoftmaxtopkone_hotmeanfloatshapeexpandreshaper"   sumindexint	unsqueeze)r   r   top_kr   concatenated_gate_logitsrouting_weights_selected_expertsexpert_masktokens_per_expertrouter_prob_per_expert
batch_sizesequence_lengthnum_hidden_layersexpert_attention_mask router_per_expert_attention_maskdevice_indexrankoverall_lossr!   r%   r'   load_balancing_loss_func-   sF   



&rO   c                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	GraniteMoeRMSNormư>c                    s&   t    tt|| _|| _dS )z@
        GraniteMoeRMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parameterr0   onesweightvariance_epsilon)selfhidden_sizeeps	__class__r!   r'   rS      s   

zGraniteMoeRMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )Nr   r,   T)keepdim)	dtyper"   r0   float32powr6   rsqrtrW   rV   )rX   hidden_statesinput_dtypevariancer!   r!   r'   forward   s
   zGraniteMoeRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)r.   rV   r8   rW   )rX   r!   r!   r'   
extra_repr   s   zGraniteMoeRMSNorm.extra_repr)rQ   )__name__
__module____qualname__rS   re   rf   __classcell__r!   r!   r[   r'   rP      s    rP   c                       sD   e Zd ZU ejed< ddef fddZe e	dd Z
  ZS )	GraniteMoeRotaryEmbeddinginv_freqNconfigc                    s   t    t|drt|jtr|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultrl   F)
persistent)rR   rS   hasattrr-   rn   dictgetro   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrm   r   rope_init_fnattention_scalingregister_bufferrl   original_inv_freq)rX   rm   r/   rl   r[   r!   r'   rS      s   
z"GraniteMoeRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	j
|dd+ | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 smw   Y  |j|jd
|	j|jd
fS )Nr   r,   r   mpscpuF)device_typeenabledr   r*   )r^   )rl   r7   r9   r8   r"   r/   r-   rp   strr0   autocast	transposer1   cosrz   sinr^   )
rX   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   r!   r!   r'   re      s   0&z!GraniteMoeRotaryEmbedding.forwardN)rg   rh   ri   r0   Tensor__annotations__r   rS   no_gradr   re   rj   r!   r!   r[   r'   rk      s   
 
rk   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr,   r   r*   )r8   r0   r1   )r   x1x2r!   r!   r'   rotate_half   s   r   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )r>   r   )qkr   r   r   unsqueeze_dimq_embedk_embedr!   r!   r'   apply_rotary_pos_emb   s
   

r   c                       s6   e Zd Zdedededdf fddZdd	 Z  ZS )
GraniteMoeParallelExpertsr   
input_sizeoutput_sizer    Nc                    s6   t    tt|||| _|| _|| _|| _	dS )a  
        Initialize the GraniteMoeParallelExperts module.
        The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with
        many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and
        [ScatterMoE](https://github.com/shawntan/scattermoe), as well as the
        [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
        used in vllm.

        Args:
            num_experts (int):
                Number of experts.
            input_size (int):
                Size of the input.
            output_size (int):
                Size of the output.
        N)
rR   rS   r   rT   r0   emptyrV   r   r   r   )rX   r   r   r   r[   r!   r'   rS      s
   

z"GraniteMoeParallelExperts.__init__c                 C   sP   |j |dd}g }t| jD ]}|t|| | j|  qtj|dd}|S )a  
        Forward pass of the GraniteMoeParallelExperts module.

        Args:
            inputs (Tensor):
                Input tensor.
            expert_size:
                Expert size information.

        Returns:
            Tensor: Output tensor.
        r   r*   )	splitranger   appendFlinearrV   r0   r1   )rX   inputsexpert_size
input_listoutput_listiresultsr!   r!   r'   re      s   z!GraniteMoeParallelExperts.forwardrg   rh   ri   r=   rS   re   rj   r!   r!   r[   r'   r      s    r   c                       s2   e Zd Zdededef fddZdd Z  ZS )GraniteMoeTopKGatingr   r   r?   c                    s2   t    || _|| _|| _tj||dd| _dS )a  
        Initialize the top-k gating mechanism.
        Args:
            input_size (`int`):
                Size of the input.
            num_experts (`int`):
                Number of experts.
            top_k (`int`):
                Number of top experts to select.
        FbiasN)rR   rS   r   r   r?   r   Linearlayer)rX   r   r   r?   r[   r!   r'   rS     s
   
zGraniteMoeTopKGating.__init__c                 C   s   |  | }|j| jdd\}}tj|dd|}tj|d| j	g|j
|jd}|d|d}| d}| }| }	|	d\}
}|j| jdd}| }|| }|||||fS )Nr   r*   r   r^   r/   trunc)rounding_mode)r   r7   r4   r?   r0   r3   type_aszerossizer   r^   r/   scatterlongr;   tolistflattensortdiv)rX   rb   logitstop_k_logitstop_k_indicestop_k_gatesr   gatesr   top_k_expertsrB   index_sorted_expertsbatch_indexbatch_gatesr!   r!   r'   re   $  s   zGraniteMoeTopKGating.forwardr   r!   r!   r[   r'   r     s    r   c                       s.   e Zd ZdZdef fddZdd Z  ZS )GraniteMoeMoEz
    A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.

    Args:
        config:
            Configuration object with model hyperparameters.
    rm   c                    sl   t    |j| _|j| _t|j | _t|j	| j| jd | _
t|j	| j| j| _t| j|j	|jd| _d S )Nr   )r   r   r?   )rR   rS   rY   r   intermediate_sizer   
hidden_act
activationr   num_local_expertsinput_linearoutput_linearr   num_experts_per_tokrouterrX   rm   r[   r!   r'   rS   I  s   
zGraniteMoeMoE.__init__c                 C   s   |  \}}}|d|}| |\}}}}}	|| }
| |
|}|jddd}| |d |d  }| ||}||dddf  }tj|| | j	f|j
|jd}|d||}|||| j	}||	fS )a  
        Forward pass of the mixture of experts layer.

        Args:
            layer_input (Tensor):
                Input tensor.

        Returns:
            Tensor:
                Output tensor.
            Tensor:
                Router logits.
        r,   r   r*   r   r   Nr   )r   r:   r   r   chunkr   r   r0   r   r   r^   r/   	index_addview)rX   layer_inputbszlengthemb_sizerB   r   r   r   router_logitsexpert_inputsrb   chunked_hidden_statesexpert_outputsr   layer_outputr!   r!   r'   re   X  s   zGraniteMoeMoE.forward)rg   rh   ri   __doc__r   rS   re   rj   r!   r!   r[   r'   r   @  s    r   rb   n_repc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r8   r9   r:   )rb   r   batchnum_key_value_headsslenhead_dimr!   r!   r'   	repeat_kvy  s
   0r   c                       s   e Zd ZdZddedee f fddZeddd	d
						dde	j
dee	j
 dee	j dee dedee	j deee	j
e	j
f  dee	j
ee	j
 eee	j
  f fddZ  ZS )GraniteMoeAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNrm   	layer_idxc                    s   t    || _|| _|d u rtd| jj d |j| _|j	| _	|j
| _| j	| j | _|j| _| j| j | _d| _|j| _| j| j | j	krUtd| j	 d| j dtj| j	| j| j |jd| _tj| j	| j| j |jd| _tj| j	| j| j |jd| _tj| j	| j	|jd| _d S )NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.Tz?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).r   )rR   rS   rm   r   loggerwarning_oncer\   rg   attention_dropoutrY   num_attention_heads	num_headsr   r   num_key_value_groups	is_causalattention_multiplierscaling
ValueErrorr   r   attention_biasq_projk_projv_projo_projrX   rm   r   r[   r!   r'   rS     s2   

zGraniteMoeAttention.__init__past_key_valuepast_key_values4.58new_nameversionFrb   r   r   	use_cachecache_positionposition_embeddingsr    c                 K   sD  |  \}	}
}| |}| |}| |}||	|
| j| jdd}||	|
| j| jdd}||	|
| j| jdd}|d urF|nd\}}|d urWt	||||\}}|d url|||d}|
||| j|\}}t}| jjdkrzt| jj }|| ||||f| jsdn| j| jd|\}}||	|
d}| |}||fS )	Nr   r   )NN)r   r   r   eager        )dropoutr   r,   )r   r   r   r   r   r   r   r   r   r   updater   eager_attention_forwardrm   _attn_implementationr   trainingr   r   r   )rX   rb   r   r   r   r   r   r   kwargsr   q_lenrB   query_states
key_statesvalue_statesr   r   cache_kwargsattention_interfaceattn_outputattn_weightsr!   r!   r'   re     s>   




zGraniteMoeAttention.forwardr   )NNNFNN)rg   rh   ri   r   r   r   r=   rS   r   r0   r   
LongTensorr   boolr.   re   rj   r!   r!   r[   r'   r     s6     
r   r   modulequerykeyvaluer   r   c                 K   s   t || j}t || j}	t||dd| }
|d ur3|d d d d d d d |jd f }|
| }
tjj|
dtj	d
|j}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr   r   r,   )r+   r^   )pr  r   )r   r   r0   matmulr   r8   r   r2   r3   r_   r"   r^   r   r  
contiguous)r  r  r  r  r   r   r   r  r  r  r
  causal_maskr	  r!   r!   r'   r     s   
&r   c                       s   e Zd Zdedef fddZedddd							
	
			
		ddejde	ej de	ej
 de	e de	e de	e de	ej
 de	e de	eejejf  deeje	eejejf  f fddZ  ZS )GraniteMoeDecoderLayerrm   r   c                    sd   t    |j| _t||d| _|jdkrt|| _t|j|j	d| _
t|j|j	d| _|j| _d S )N)rm   r   r   rZ   )rR   rS   rY   r   	self_attnr   r   block_sparse_moerP   rms_norm_epsinput_layernormpost_attention_layernormresidual_multiplierr   r[   r!   r'   rS     s   


zGraniteMoeDecoderLayer.__init__r   r   r   r   NFrb   r   r   output_attentionsr   r   output_router_logitsr   r    c
                 K   s   |}|  |}| jd||||||||	d|
\}}||| j  }|}| |}| |\}}||| j  }|f}|r@||f7 }|rG||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        )rb   r   r   r   r  r   r   r   Nr!   )r  r  r  r  r  )rX   rb   r   r   r   r  r   r   r  r   r  residualself_attn_weightsr   outputsr!   r!   r'   re     s2   '
	



zGraniteMoeDecoderLayer.forward)NNNFFNFN)rg   rh   ri   r   r=   rS   r   r0   r   r   r  r   r  r.   FloatTensorre   rj   r!   r!   r[   r'   r    s@    	
r  c                       sF   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZ fddZ  ZS )	GraniteMoePreTrainedModelrm   modelTr  r   Fc                    s4   t  | t|tr|jjjd| jjd d S d S )Nr   )r6   std)	rR   _init_weightsr-   r   rV   datanormal_rm   initializer_range)rX   r  r[   r!   r'   r'  ^  s   
z'GraniteMoePreTrainedModel._init_weights)rg   rh   ri   r   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraphr'  rj   r!   r!   r[   r'   r$  R  s   
 r$  c                       s   e Zd Zdef fddZe											ddeej deej	 deej dee
eeej f  d	eej d
ee dee dee dee dee deej de
eef fddZ	dde
ej	df dej	dej	dedef
ddZedej	dededejdej	defddZ  ZS ) GraniteMoeModelrm   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _d| _ j| _ j| _ j| _| j| j | _ j| _ j| _ j| _| jdkr]t nd | _|   d S )Nc                    s   g | ]}t  |qS r!   )r  )r#   r   rm   r!   r'   r(   m  r)   z,GraniteMoeModel.__init__.<locals>.<listcomp>r  Frope)rR   rS   pad_token_idpadding_idx
vocab_sizer   	EmbeddingrY   embed_tokens
ModuleListr   rI   layersrP   r  normgradient_checkpointingembedding_multiplierr   r   r   rv   
rope_thetaposition_embedding_typerk   
rotary_emb	post_initr   r[   r3  r'   rS   f  s$   zGraniteMoeModel.__init__N	input_idsr   r   r   inputs_embedsr   r  output_hidden_statesr  return_dictr   r    c                 K   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|
d ur$|
n| j j}
|d u |d uA r4td| jrC| jrC|rCt	d d}|d u rL| 
|}|| j }|r]|d u r]t| j d}|d u ry|d uri| nd}tj|||jd  |jd}|d u r|d}| |||||}|}d }| jd ur| ||}|rdnd }|rdnd }|	rdnd }| jD ],}|r||f7 }|||||||||	|d		}|d }|r||d f7 }|	r||d
 f7 }q| |}|r||f7 }|
stdd ||||fD S t|||||dS )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr3  r   r   r/   r!   )r   r   r   r  r   r   r  r   r,   c                 s   s    | ]	}|d ur|V  qd S r   r!   )r#   vr!   r!   r'   	<genexpr>  s    z*GraniteMoeModel.forward.<locals>.<genexpr>)last_hidden_stater   rb   
attentionsr   )rm   r  rE  r   use_return_dictr   r=  r  r   r   r9  r>  r	   get_seq_lengthr0   aranger8   r/   r>   _update_causal_maskrA  r;  r<  r.   r   )rX   rC  r   r   r   rD  r   r  rE  r  rF  r   r  past_seen_tokensr  rb   r   all_hidden_statesall_self_attnsall_router_logitsdecoder_layerlayer_outputsr!   r!   r'   re     s   









zGraniteMoeModel.forwardFr   input_tensorc                 C   s:  | j jdkr|d ur|dk r|S d S | j jdkr&t|tjr$t|}|S |d ur.| nd}|d ur7|jnd}| j jdkrO|sO|sOt	j
|||| jdrOd S |j}|jd }	|r^| }
nt|tjri|jd	 n||	 d }
| j||	|
|||jd d
}| j jdkr|d ur|jjdv r|st|j}t	||}|S )Nflash_attention_2r   flex_attentionr   Fsdpa)rD  past_key_values_lengthis_trainingr   r,   )rH   target_lengthr^   r   rG   )cudaxpunpu)rm   r   anyr-   r0   r   r   rM  is_compileabler   _ignore_causal_mask_sdpar  r^   r8   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr/   rp   finfomin_unmask_unattended)rX   r   rV  r   r   r  rP  using_compilable_cacher^   rH   r\  r  	min_dtyper!   r!   r'   rO    sT   




z#GraniteMoeModel._update_causal_maskrH   r\  r^   rG   c                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )
fill_valuer^   r/   r   )diagonalrG  r,   r   )r+   r0   re  rf  fullr/   triurN  r:   r9   cloner8   r"   masked_fill)r   rH   r\  r^   r   rG   r  r  ri  mask_lengthpadding_maskr!   r!   r'   rd  /  s,    $
6  zEGraniteMoeModel._prepare_4d_causal_attention_mask_with_cache_position)NNNNNNNNNNN)F)rg   rh   ri   r   rS   r   r   r0   r  r   r   r   listr#  r  r.   r   re   rO  staticmethodr=   r^   rd  rj   r!   r!   r[   r'   r2  d  s    	

q
Dr2  c                        s   e Zd ZdgZdef fddZe													ddeej	 deej
 d	eej	 d
eeeeej f  deej deej	 dee dee dee dee dee deej	 deeej
f deeef fddZ  ZS )GraniteMoeForCausalLMzlm_head.weightrm   c                    sX   t  | t|| _|j| _tj|j|jdd| _|j	| _	|j
| _|j| _|   d S )NFr   )rR   rS   r2  r%  r7  r   r   rY   lm_headrouter_aux_loss_coefr   r   r   rB  r   r[   r!   r'   rS   k  s   
zGraniteMoeForCausalLM.__init__Nr   rC  r   r   r   rD  labelsr   r  rE  r  rF  r   logits_to_keepr    c                 K   s  |dur|n| j j}|
dur|
n| j j}
|	dur|	n| j j}	|dur$|n| j j}| jd||||||||	|
||d|}|d }t|trKt| dn|}| 	|dd|ddf }|| j j
 }d}|dury| }| j||fd| j ji|}d}|
rt|r|jn|d | j| j|}|dur|| j||j 7 }|s|f|dd  }|
r|f| }|dur|f| S |S t||||j|j|j|jdS )	al  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, GraniteMoeForCausalLM

        >>> model = GraniteMoeForCausalLM.from_pretrained("ibm/PowerMoE-3b")
        >>> tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)rC  r   r   r   rD  r   r  rE  r  rF  r   r   r7  r,   r   )lossaux_lossr   r   rb   rK  r   r!   )rm   r  r  rE  rL  r%  r-   r=   slicerv  logits_scalingr7   loss_functionr7  rO   r   r   r   rw  r"   r/   r   r   rb   rK  )rX   rC  r   r   r   rD  rx  r   r  rE  r  rF  r   ry  r  r"  rb   slice_indicesr   rz  r{  outputr!   r!   r'   re   x  sx   (
zGraniteMoeForCausalLM.forward)NNNNNNNNNNNNr   )rg   rh   ri   _tied_weights_keysr   rS   r   r   r0   r  r   r   r   rs  r#  r  r=   r.   r   re   rj   r!   r!   r[   r'   ru  h  s\    	

ru  )ru  r2  r$  )Nr   N)Nr   )r   )Btypingr   r   r   r0   torch.nn.functionalr   r2   r   activationsr   cache_utilsr   r	   
generationr
   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   utilsr   r   r   utils.deprecationr   configuration_granitemoer   !torch.nn.attention.flex_attentionr   integrations.flex_attentionr   
get_loggerrg   r   r   r.   r=   rO   ModulerP   rk   r   r   r   r   r   r   r   r7   r   r  r$  r2  ru  __all__r!   r!   r!   r'   <module>   s   

W%
.09^
Y  