o
    i                     @   s  d dl mZmZmZmZ d dlZd dlm  mZ	 d dlmZ ddl
mZ ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( e# rd dl)m*Z* ddl+m,Z, e$-e.Z/G dd deddZ0G dd dej1Z2G dd dej1Z3G dd dej1Z4G dd dej1Z5G d d! d!ej1Z6d"d# Z7dGd$d%Z8d&ej9d'e:d(ej9fd)d*Z;	+dHd,ej1d-ej9d.ej9d/ej9d0eej9 d1e<d2e<fd3d4Z=G d5d6 d6ej1Z>G d7d8 d8eZ?e"G d9d: d:eZ@G d;d< d<ej1ZAe"G d=d> d>e@ZB		?	dId@eej9eCej9 df dAee: d0eej9 d(eej9e:f fdBdCZDG dDdE dEe@eZEg dFZFdS )J    )CallableOptional	TypedDictUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutputWithPastMoeCausalLMOutputWithPastMoeModelOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringis_torch_flex_attn_availablelogging)deprecate_kwarg   )GraniteMoeSharedConfig)	BlockMask)make_flex_block_causal_maskc                   @   s@   e Zd ZU dZejed< ejed< eed< eed< ejed< dS )GraniteFlashAttentionKwargsa  
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    Attributes:
        cu_seq_lens_q (`torch.LongTensor`)
            Gets cumulative sequence length for query state.
        cu_seq_lens_k (`torch.LongTensor`)
            Gets cumulative sequence length for key state.
        max_length_q (`int`):
            Maximum sequence length for query state.
        max_length_k (`int`):
            Maximum sequence length for key state.
        seq_idx (`torch.IntTensor):
            Index of each packed sequence.
    cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kseq_idxN)	__name__
__module____qualname____doc__torch
LongTensor__annotations__int	IntTensor r-   r-   r/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/granitemoeshared/modeling_granitemoeshared.pyr   3   s   
 

r   F)totalc                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	GraniteMoeSharedMLPz~
    MLP layer for shared experts

    Args:
        config:
            Configuration object with model hyperparameters.
    configc                    sZ   t    |j| _|j| _t|j | _tj	| j| jd dd| _
tj	| j| jdd| _d S )N   Fbias)super__init__hidden_size
input_sizeshared_intermediate_sizer   
hidden_act
activationr   Linearinput_linearoutput_linearselfr1   	__class__r-   r.   r6   U   s   
zGraniteMoeSharedMLP.__init__hidden_statesreturnc                 C   s<   |  |}|jddd}| |d |d  }| |}|S )Nr2   dimr   r   )r=   chunkr;   r>   )r@   rC   chunked_hidden_statesr-   r-   r.   forward^   s
   

zGraniteMoeSharedMLP.forward)
r$   r%   r&   r'   r   r6   r(   TensorrJ   __classcell__r-   r-   rA   r.   r0   L   s    	r0   c                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	GraniteMoeSharedRMSNormư>c                    s&   t    tt|| _|| _dS )zF
        GraniteMoeSharedRMSNorm is equivalent to T5LayerNorm
        N)r5   r6   r   	Parameterr(   onesweightvariance_epsilon)r@   r7   epsrA   r-   r.   r6   g   s   

z GraniteMoeSharedRMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )Nr2   rE   T)keepdim)	dtypetor(   float32powmeanrsqrtrR   rQ   )r@   rC   input_dtypevariancer-   r-   r.   rJ   o   s
   zGraniteMoeSharedRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tuplerQ   shaperR   )r@   r-   r-   r.   
extra_reprv   s   z"GraniteMoeSharedRMSNorm.extra_repr)rN   )r$   r%   r&   r6   rJ   r_   rL   r-   r-   rA   r.   rM   f   s    rM   c                       s6   e Zd Zdedededdf fddZdd	 Z  ZS )
GraniteMoeSharedParallelExpertsnum_expertsr8   output_sizerD   Nc                    s6   t    tt|||| _|| _|| _|| _	dS )a  
        Initialize the GraniteMoeSharedParallelExperts module.
        The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with
        many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and
        [ScatterMoE](https://github.com/shawntan/scattermoe), as well as the
        [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
        used in vllm.

        Args:
            num_experts (int):
                Number of experts.
            input_size (int):
                Size of the input.
            output_size (int):
                Size of the output.
        N)
r5   r6   r   rO   r(   emptyrQ   ra   r8   rb   )r@   ra   r8   rb   rA   r-   r.   r6   {   s
   

z(GraniteMoeSharedParallelExperts.__init__c                 C   sP   |j |dd}g }t| jD ]}|t|| | j|  qtj|dd}|S )a  
        Forward pass of the GraniteMoeSharedParallelExperts module.

        Args:
            inputs (Tensor):
                Input tensor.
            expert_size:
                Expert size information.

        Returns:
            Tensor: Output tensor.
        r   rF   )	splitrangera   appendFlinearrQ   r(   cat)r@   inputsexpert_size
input_listoutput_listiresultsr-   r-   r.   rJ      s   z'GraniteMoeSharedParallelExperts.forwardr$   r%   r&   r+   r6   rJ   rL   r-   r-   rA   r.   r`   z   s    r`   c                       s2   e Zd Zdededef fddZdd Z  ZS )GraniteMoeSharedTopKGatingr8   ra   top_kc                    s2   t    || _|| _|| _tj||dd| _dS )a  
        Initialize the top-k gating mechanism.
        Args:
            input_size (`int`):
                Size of the input.
            num_experts (`int`):
                Number of experts.
            top_k (`int`):
                Number of top experts to select.
        Fr3   N)r5   r6   ra   r8   rr   r   r<   layer)r@   r8   ra   rr   rA   r-   r.   r6      s
   
z#GraniteMoeSharedTopKGating.__init__c                 C   s   |  | }|j| jdd\}}tj|dd|}tj|d| j	g|j
|jd}|d|d}| d}| }| }	|	d\}
}|j| jdd}| }|| }|||||fS )Nr   rF   r   rU   devicetrunc)rounding_mode)rs   floattopkrr   r(   softmaxtype_aszerossizera   rU   ru   scatterlongsumtolistflattensortdiv)r@   rC   logitstop_k_logitstop_k_indicestop_k_gatesr|   gatesrk   top_k_experts_index_sorted_expertsbatch_indexbatch_gatesr-   r-   r.   rJ      s   z"GraniteMoeSharedTopKGating.forwardrp   r-   r-   rA   r.   rq      s    rq   c                       s.   e Zd ZdZdef fddZdd Z  ZS )GraniteMoeSharedMoEz
    A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.

    Args:
        config:
            Configuration object with model hyperparameters.
    r1   c                    sl   t    |j| _|j| _t|j | _t|j	| j| jd | _
t|j	| j| j| _t| j|j	|jd| _d S )Nr2   )r8   ra   rr   )r5   r6   r7   r8   intermediate_sizer   r:   r;   r`   num_local_expertsr=   r>   rq   num_experts_per_tokrouterr?   rA   r-   r.   r6      s   
zGraniteMoeSharedMoE.__init__c                 C   s   |  \}}}|d|}| |\}}}}}	|| }
| |
|}|jddd}| |d |d  }| ||}||dddf  }tj|| | j	f|j
|jd}|d||}|||| j	}||	fS )a  
        Forward pass of the mixture of experts layer.

        Args:
            layer_input (Tensor):
                Input tensor.

        Returns:
            Tensor:
                Output tensor.
            Tensor:
                Router logits.
        rE   r2   rF   r   r   Nrt   )r}   reshaper   r=   rH   r;   r>   r(   r|   r8   rU   ru   	index_addview)r@   layer_inputbszlengthemb_sizer   r   r   rk   router_logitsexpert_inputsrC   rI   expert_outputsr|   layer_outputr-   r-   r.   rJ      s   zGraniteMoeSharedMoE.forward)r$   r%   r&   r'   r   r6   rJ   rL   r-   r-   rA   r.   r      s    r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..NrE   r2   rF   )r^   r(   ri   )xx1x2r-   r-   r.   rotate_half  s   r   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkcossinposition_idsunsqueeze_dimq_embedk_embedr-   r-   r.   apply_rotary_pos_emb  s
   

r   rC   n_reprD   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r^   expandr   )rC   r   batchnum_key_value_headsslenhead_dimr-   r-   r.   	repeat_kv5  s
   0r           modulequerykeyvalueattention_maskscalingdropoutc                 K   s   t || j}t || j}	t||dd| }
|d ur3|d d d d d d d |jd f }|
| }
tjj|
dtj	d
|j}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr2   r   rE   )rG   rU   )ptrainingr   )r   num_key_value_groupsr(   matmul	transposer^   r   
functionalrz   rW   rV   rU   r   r   
contiguous)r   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputr-   r-   r.   eager_attention_forwardA  s   
&r   c                       s   e Zd ZdZddedee f fddZeddd	d
						dde	j
dee	j
 dee	j dee dedee	j deee	j
e	j
f  dee	j
ee	j
 eee	j
  f fddZ  ZS )GraniteMoeSharedAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNr1   	layer_idxc                    s   t    || _|| _|d u rtd| jj d |j| _|j	| _	|j
| _| j	| j | _|j| _| j| j | _d| _|j| _| j| j | j	krUtd| j	 d| j dtj| j	| j| j |jd| _tj| j	| j| j |jd| _tj| j	| j| j |jd| _tj| j	| j	|jd| _d S )NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.Tz?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).r3   )r5   r6   r1   r   loggerwarning_oncerB   r$   attention_dropoutr7   num_attention_heads	num_headsr   r   r   	is_causalattention_multiplierr   
ValueErrorr   r<   attention_biasq_projk_projv_projo_projr@   r1   r   rA   r-   r.   r6   a  s2   

z"GraniteMoeSharedAttention.__init__past_key_valuepast_key_values4.58new_nameversionFrC   r   r   	use_cachecache_positionposition_embeddingsrD   c                 K   sD  |  \}	}
}| |}| |}| |}||	|
| j| jdd}||	|
| j| jdd}||	|
| j| jdd}|d urF|nd\}}|d urWt	||||\}}|d url|||d}|
||| j|\}}t}| jjdkrzt| jj }|| ||||f| jsdn| j| jd|\}}||	|
d}| |}||fS )	Nr   r2   )NN)r   r   r   eagerr   )r   r   rE   )r}   r   r   r   r   r   r   r   r   r   updater   r   r1   _attn_implementationr   r   r   r   r   )r@   rC   r   r   r   r   r   r   r   r   q_lenr   query_statesr   r   r   r   cache_kwargsattention_interfacer   r   r-   r-   r.   rJ     s>   




z!GraniteMoeSharedAttention.forwardN)NNNFNN)r$   r%   r&   r'   r   r   r+   r6   r   r(   rK   r)   r	   boolr]   rJ   rL   r-   r-   rA   r.   r   ^  s6     
r   c                       s   e Zd Zdedef fddZedddd							
	
			
		ddejde	ej de	ej
 de	e de	e de	e de	ej
 de	e de	eejejf  dee deeje	eejejf  f fddZ  ZS )GraniteMoeSharedDecoderLayerr1   r   c                    s   t    |j| _t||d| _|jdkrt|| _t|j|j	d| _
t|j|j	d| _|j| _|jdkr:d | _d S t|| _d S )N)r1   r   r   rS   )r5   r6   r7   r   	self_attnr   r   block_sparse_moerM   rms_norm_epsinput_layernormpost_attention_layernormresidual_multiplierr9   r0   
shared_mlpr   rA   r-   r.   r6     s   


"z%GraniteMoeSharedDecoderLayer.__init__r   r   r   r   NFrC   r   r   output_attentionsr   r   output_router_logitsr   r   rD   c
                 K   s   |}|  |}| jd||||||||	d|
\}}||| j  }|}| |}| |\}}| jdu r7|}n|| | }~||| j  }|f}|rP||f7 }|rW||f7 }|S )a1  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs. Can be used to provide `GraniteFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
        )rC   r   r   r   r   r   r   r   Nr-   )r   r   r   r   r   r   )r@   rC   r   r   r   r   r   r   r   r   r   residualself_attn_weightsmoe_hidden_statesr   outputsr-   r-   r.   rJ     s:   '
	




z$GraniteMoeSharedDecoderLayer.forward)NNNFFNFN)r$   r%   r&   r   r+   r6   r   r(   rK   r   r)   r	   r   r]   r   r   FloatTensorrJ   rL   r-   r-   rA   r.   r     sD    	
r   c                       sF   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZ fddZ  ZS )	GraniteMoeSharedPreTrainedModelr1   modelTr   r   Fc                    s4   t  | t|tr|jjjd| jjd d S d S )Nr   )rY   std)	r5   _init_weights
isinstancer`   rQ   datanormal_r1   initializer_range)r@   r   rA   r-   r.   r   "  s   
z-GraniteMoeSharedPreTrainedModel._init_weights)r$   r%   r&   r   r*   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraphr   rL   r-   r-   rA   r.   r     s   
 r   c                       sD   e Zd ZU ejed< ddef fddZe e	dd Z
  ZS )	GraniteMoeSharedRotaryEmbeddinginv_freqNr1   c                    s   t    t|drt|jtr|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultr  F)
persistent)r5   r6   hasattrr  r  dictgetr  max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr1   r   rope_init_fnattention_scalingregister_bufferr  original_inv_freq)r@   r1   ru   r  rA   r-   r.   r6   +  s   
z(GraniteMoeSharedRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	j
|dd+ | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 smw   Y  |j|jd
|	j|jd
fS )Nr   rE   r   mpscpuF)device_typeenabledr2   rF   )rU   )r  rx   r   r^   rV   ru   r  r  strr(   autocastr   ri   r   r  r   rU   )
r@   r   r   inv_freq_expandedposition_ids_expandedr  freqsembr   r   r-   r-   r.   rJ   <  s   0&z'GraniteMoeSharedRotaryEmbedding.forwardr   )r$   r%   r&   r(   rK   r*   r   r6   no_gradr   rJ   rL   r-   r-   rA   r.   r  (  s   
 
r  c                       s   e Zd Zdef fddZe											ddeej deej	 deej dee
eeej f  d	eej d
ee dee dee dee dee deej de
eef fddZ	dde
ej	df dej	dej	dedef
ddZedej	dededejdej	defddZ  ZS ) GraniteMoeSharedModelr1   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _d| _ j| _ j| _ j| _| j| j | _ j| _ j| _ j| _| jdkr]t nd | _|   d S )Nc                    s   g | ]}t  |qS r-   )r   ).0r   r1   r-   r.   
<listcomp>U      z2GraniteMoeSharedModel.__init__.<locals>.<listcomp>r   Frope)r5   r6   pad_token_idpadding_idx
vocab_sizer   	Embeddingr7   embed_tokens
ModuleListre   num_hidden_layerslayersrM   r   normgradient_checkpointingembedding_multiplierr   r   r   r  
rope_thetaposition_embedding_typer  
rotary_emb	post_initr?   rA   r*  r.   r6   N  s$   zGraniteMoeSharedModel.__init__N	input_idsr   r   r   inputs_embedsr   r   output_hidden_statesr   return_dictr   rD   c                 K   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|
d ur$|
n| j j}
|d u |d uA r4td| jrC| jrC|rCt	d d}|d u rL| 
|}|| j }|r]|d u r]t| j d}|d u ry|d uri| nd}tj|||jd  |jd}|d u r|d}| |||||}|}d }| jd ur| ||}|rdnd }|rdnd }|	rdnd }| jD ],}|r||f7 }|||||||||	|d		}|d }|r||d f7 }|	r||d
 f7 }q| |}|r||f7 }|
stdd ||||fD S t|||||dS )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr*  r   r   ru   r-   )r   r   r   r   r   r   r   r   rE   c                 s   s    | ]	}|d ur|V  qd S r   r-   )r)  vr-   r-   r.   	<genexpr>  s    z0GraniteMoeSharedModel.forward.<locals>.<genexpr>)last_hidden_stater   rC   
attentionsr   )r1   r   r?  r   use_return_dictr   r7  r   r   r   r2  r8  r
   get_seq_lengthr(   aranger^   ru   r   _update_causal_maskr;  r5  r6  r]   r   )r@   r=  r   r   r   r>  r   r   r?  r   r@  r   r   past_seen_tokensr   rC   r   all_hidden_statesall_self_attnsall_router_logitsdecoder_layerlayer_outputsr-   r-   r.   rJ   g  s   









zGraniteMoeSharedModel.forwardFr   input_tensorc                 C   s:  | j jdkr|d ur|dk r|S d S | j jdkr&t|tjr$t|}|S |d ur.| nd}|d ur7|jnd}| j jdkrO|sO|sOt	j
|||| jdrOd S |j}|jd }	|r^| }
nt|tjri|jd	 n||	 d }
| j||	|
|||jd d
}| j jdkr|d ur|jjdv r|st|j}t	||}|S )Nflash_attention_2r   flex_attentionr   Fsdpa)r>  past_key_values_lengthis_trainingr   rE   )sequence_lengthtarget_lengthrU   r   
batch_size)cudaxpunpu)r1   r   anyr  r(   rK   r   rG  is_compileabler   _ignore_causal_mask_sdpar   rU   r^   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionru   r  finfomin_unmask_unattended)r@   r   rP  r   r   r   rJ  using_compilable_cacherU   rV  rW  r   	min_dtyper-   r-   r.   rI    sT   




z)GraniteMoeSharedModel._update_causal_maskrV  rW  rU   rX  c                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )
fill_valuerU   ru   r   )diagonalrA  rE   r   )rG   r(   ra  rb  fullru   triurH  r   r   cloner^   rV   masked_fill)r   rV  rW  rU   r   rX  r   r   re  mask_lengthpadding_maskr-   r-   r.   r`    s,    $
6  zKGraniteMoeSharedModel._prepare_4d_causal_attention_mask_with_cache_position)NNNNNNNNNNN)F)r$   r%   r&   r   r6   r   r   r(   r)   rK   r   r	   listr   r   r]   r   rJ   rI  staticmethodr+   rU   r`  rL   r-   r-   rA   r.   r(  L  s    	

p
Dr(  r2   gate_logitsra   c                    s  | du s	t | tsdS t | tr#| d j tj fdd| D dd}tjjj|dd}tj||dd\}}tjj	||}|du rStj
| dd}	tj
|dd}
nm|j\}}|jd ||  }|dddddddf |||||fd|| }tj| | ddtj|dd }	|ddddddf ||||jd fd|jd  }tj|| ddtj|dd }
|jjdur|jjnd}|jd t| }t|	dd|||jd  f |
d }|| S )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   c                    s   g | ]}|  qS r-   )rV   )r)  
layer_gatecompute_devicer-   r.   r+  p  r,  z,load_balancing_loss_func.<locals>.<listcomp>rF   rE   r   )r  r]   ru   r(   ri   r   r   rz   ry   one_hotrY   rx   r^   r   r   rV   r   indexr+   r   )rq  ra   rr   r   concatenated_gate_logitsrouting_weightsr   selected_expertsexpert_masktokens_per_expertrouter_prob_per_expertrX  rV  r4  expert_attention_mask router_per_expert_attention_maskdevice_indexrankoverall_lossr-   rs  r.   load_balancing_loss_funcN  sF   



&r  c                        s   e Zd ZdgZdef fddZe													ddeej	 deej
 d	eej	 d
eeeeej f  deej deej	 dee dee dee dee dee deej	 deeej
f deeef fddZ  ZS )GraniteMoeSharedForCausalLMzlm_head.weightr1   c                    sX   t  | t|| _|j| _tj|j|jdd| _|j	| _	|j
| _|j| _|   d S )NFr3   )r5   r6   r(  r   r0  r   r<   r7   lm_headrouter_aux_loss_coefr   ra   r   r<  r?   rA   r-   r.   r6     s   
z$GraniteMoeSharedForCausalLM.__init__Nr   r=  r   r   r   r>  labelsr   r   r?  r   r@  r   logits_to_keeprD   c                 K   s  |dur|n| j j}|
dur|
n| j j}
|	dur|	n| j j}	|dur$|n| j j}| jd||||||||	|
||d|}|d }t|trKt| dn|}| 	|dd|ddf }|| j j
 }d}|dury| }| j||fd| j ji|}d}|
rt|r|jn|d | j| j|}|dur|| j||j 7 }|s|f|dd  }|
r|f| }|dur|f| S |S t||||j|j|j|jdS )	ax  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, GraniteMoeSharedForCausalLM

        >>> model = GraniteMoeSharedForCausalLM.from_pretrained("ibm/PowerMoE-3b")
        >>> tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)r=  r   r   r   r>  r   r   r?  r   r@  r   r   r0  rE   r   )lossaux_lossr   r   rC   rE  r   r-   )r1   r   r   r?  rF  r   r  r+   slicer  logits_scalingrx   loss_functionr0  r  r   ra   r   r  rV   ru   r   r   rC   rE  )r@   r=  r   r   r   r>  r  r   r   r?  r   r@  r   r  r   r   rC   slice_indicesr   r  r  outputr-   r-   r.   rJ     sx   (
z#GraniteMoeSharedForCausalLM.forward)NNNNNNNNNNNNr   )r$   r%   r&   _tied_weights_keysr   r6   r   r   r(   r)   rK   r   r	   ro  r   r   r+   r]   r   rJ   rL   r-   r-   rA   r.   r    s\    	

r  )r  r(  r   )Nr   )r   )Nr2   N)Gtypingr   r   r   r   r(   torch.nn.functionalr   r   rg   activationsr   cache_utilsr	   r
   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.deprecationr   configuration_granitemoesharedr   !torch.nn.attention.flex_attentionr   integrations.flex_attentionr   
get_loggerr$   r   r   Moduler0   rM   r`   rq   r   r   r   rK   r+   r   rx   r   r   r   r   r  r(  r]   r  r  __all__r-   r-   r-   r.   <module>   s   
-0<

Wa$  
V