o
    ei`                     @   s  d dl mZ d dlmZmZ d dlZd dlmZ d dlmZ	 ddl
mZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZmZ ddlm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z) ddl*m+Z+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1 G dd deddZ2G dd dej3Z4edG dd dej3Z5G d d! d!ej3Z6G d"d# d#ej3Z7G d$d% d%ej3Z8d&d' Z9ed(dMd)d*Z:d+ej;d,e<d-ej;fd.d/Z=	0dNd1ej3d2ej;d3ej;d4ej;d5ej;dB d6e>d7e>d8e&e( fd9d:Z?ee:G d;d< d<ej3Z@G d=d> d>eZAe)G d?d@ d@e$ZBG dAdB dBej3ZCe)G dCdD dDeBZD		E	dOdFej;eEej; B dB dGe<dB d5ej;dB d-ej;e<B fdHdIZFe)G dJdK dKeBeZGg dLZHdS )P    )Callable)Optional	TypedDictN)nn)
functional   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hubuse_kernel_func_from_hubuse_kernelized_func)create_causal_mask)GradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstring)can_return_tuplemaybe_autocastmerge_with_config_defaults)capture_outputs   )GraniteMoeSharedConfigc                   @   s@   e Zd ZU dZejed< ejed< eed< eed< ejed< dS )GraniteFlashAttentionKwargsaT  
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    cu_seq_lens_q (`torch.LongTensor`):
        Gets cumulative sequence length for query state.
    cu_seq_lens_k (`torch.LongTensor`):
        Gets cumulative sequence length for key state.
    max_length_q (`int`):
        Maximum sequence length for query state.
    max_length_k (`int`):
        Maximum sequence length for key state.
    seq_idx (`torch.IntTensor):
        Index of each packed sequence.
    cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kseq_idxN)	__name__
__module____qualname____doc__torch
LongTensor__annotations__int	IntTensor r0   r0   |/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/granitemoeshared/modeling_granitemoeshared.pyr!   -   s   
 

r!   F)totalc                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	GraniteMoeSharedMLPz~
    MLP layer for shared experts

    Args:
        config:
            Configuration object with model hyperparameters.
    configc                    sZ   t    |j| _|j| _t|j | _tj	| j| jd dd| _
tj	| j| jdd| _d S )N   Fbias)super__init__hidden_size
input_sizeshared_intermediate_sizer	   
hidden_act
activationr   Linearinput_linearoutput_linearselfr4   	__class__r0   r1   r9   N   s   
zGraniteMoeSharedMLP.__init__hidden_statesreturnc                 C   s<   |  |}|jddd}| |d |d  }| |}|S )Nr5   dimr   r   )r@   chunkr>   rA   )rC   rF   chunked_hidden_statesr0   r0   r1   forwardW   s
   

zGraniteMoeSharedMLP.forward)
r'   r(   r)   r*   r    r9   r+   TensorrM   __classcell__r0   r0   rD   r1   r3   E   s    	r3   RMSNormc                       sF   e Zd Zddeddf fddZdejdejfdd	Zd
d Z  Z	S )GraniteMoeSharedRMSNormư>epsrG   Nc                    s&   t    tt|| _|| _dS )zF
        GraniteMoeSharedRMSNorm is equivalent to T5LayerNorm
        N)r8   r9   r   	Parameterr+   onesweightvariance_epsilon)rC   r:   rS   rD   r0   r1   r9   a   s   

z GraniteMoeSharedRMSNorm.__init__rF   c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )Nr5   rH   T)keepdim)	dtypetor+   float32powmeanrsqrtrW   rV   )rC   rF   input_dtypevariancer0   r0   r1   rM   i   s
   zGraniteMoeSharedRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tuplerV   shaperW   )rC   r0   r0   r1   
extra_reprp   s   z"GraniteMoeSharedRMSNorm.extra_repr)rR   )
r'   r(   r)   floatr9   r+   rN   rM   rc   rO   r0   r0   rD   r1   rQ   _   s    rQ   c                       s6   e Zd Zdedededdf fddZdd	 Z  ZS )
GraniteMoeSharedParallelExpertsnum_expertsr;   output_sizerG   Nc                    s6   t    tt|||| _|| _|| _|| _	dS )a  
        Initialize the GraniteMoeSharedParallelExperts module.
        The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with
        many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and
        [ScatterMoE](https://github.com/shawntan/scattermoe), as well as the
        [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
        used in vllm.

        Args:
            num_experts (int):
                Number of experts.
            input_size (int):
                Size of the input.
            output_size (int):
                Size of the output.
        N)
r8   r9   r   rT   r+   emptyrV   rf   r;   rg   )rC   rf   r;   rg   rD   r0   r1   r9   u   s
   

z(GraniteMoeSharedParallelExperts.__init__c                 C   sP   |j |dd}g }t| jD ]}|t|| | j|  qtj|dd}|S )a  
        Forward pass of the GraniteMoeSharedParallelExperts module.

        Args:
            inputs (Tensor):
                Input tensor.
            expert_size:
                Expert size information.

        Returns:
            Tensor: Output tensor.
        r   rI   )	splitrangerf   appendFlinearrV   r+   cat)rC   inputsexpert_size
input_listoutput_listiresultsr0   r0   r1   rM      s   z'GraniteMoeSharedParallelExperts.forwardr'   r(   r)   r.   r9   rM   rO   r0   r0   rD   r1   re   t   s    re   c                       s2   e Zd Zdededef fddZdd Z  ZS )GraniteMoeSharedTopKGatingr;   rf   top_kc                    s2   t    || _|| _|| _tj||dd| _dS )a  
        Initialize the top-k gating mechanism.

        Args:
            input_size (`int`):
                Size of the input.
            num_experts (`int`):
                Number of experts.
            top_k (`int`):
                Number of top experts to select.
        Fr6   N)r8   r9   rf   r;   rw   r   r?   layer)rC   r;   rf   rw   rD   r0   r1   r9      s
   
z#GraniteMoeSharedTopKGating.__init__c                 C   s   |  | }|j| jdd\}}tj|dd|}tj|d| j	g|j
|jd}|d|d}| d}| }| }	|	d\}
}|j| jdd}| }|| }|||||fS )Nr   rI   r   rY   devicetrunc)rounding_mode)rx   rd   topkrw   r+   softmaxtype_aszerossizerf   rY   rz   scatterlongsumtolistflattensortdiv)rC   rF   logitstop_k_logitstop_k_indicestop_k_gatesr   gatesrp   top_k_experts_index_sorted_expertsbatch_indexbatch_gatesr0   r0   r1   rM      s   z"GraniteMoeSharedTopKGating.forwardru   r0   r0   rD   r1   rv      s    rv   c                       s.   e Zd ZdZdef fddZdd Z  ZS )GraniteMoeSharedMoEz
    A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.

    Args:
        config:
            Configuration object with model hyperparameters.
    r4   c                    sl   t    |j| _|j| _t|j | _t|j	| j| jd | _
t|j	| j| j| _t| j|j	|jd| _d S )Nr5   )r;   rf   rw   )r8   r9   r:   r;   intermediate_sizer	   r=   r>   re   num_local_expertsr@   rA   rv   num_experts_per_tokrouterrB   rD   r0   r1   r9      s   
zGraniteMoeSharedMoE.__init__c                 C   s   |  \}}}|d|}| |\}}}}}|| }	| |	|}
|
jddd}| |d |d  }
| |
|}||d d d f  }tj|| | j	f|j
|jd}|d||}|||| j	}|S )NrH   r5   rI   r   r   ry   )r   reshaper   r@   rK   r>   rA   r+   r   r;   rY   rz   	index_addview)rC   layer_inputbszlengthemb_sizer   r   r   rp   expert_inputsrF   rL   expert_outputsr   layer_outputr0   r0   r1   rM      s   zGraniteMoeSharedMoE.forward)r'   r(   r)   r*   r    r9   rM   rO   r0   r0   rD   r1   r      s    r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..NrH   r5   rI   )rb   r+   rn   )xx1x2r0   r0   r1   rotate_half  s   r   rotary_pos_embc                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkcossinunsqueeze_dimq_embedk_embedr0   r0   r1   apply_rotary_pos_emb  s
   

r   rF   n_reprG   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rb   expandr   )rF   r   batchnum_key_value_headsslenhead_dimr0   r0   r1   	repeat_kv"  s
   0r           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur |
| }
tjj|
dtjd	|j
}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr5   r   rH   )rJ   rY   )ptrainingr   )r   num_key_value_groupsr+   matmul	transposer   r   r~   r[   rZ   rY   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsattn_outputr0   r0   r1   eager_attention_forward.  s   
r   c                       s   e Zd ZdZdedef fddZ				ddejde	ejejf dB d	ejdB d
e
dB dejdB dee de	ejejf fddZ  ZS )GraniteMoeSharedAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr4   	layer_idxc                    s   t    || _|| _t|d|j|j | _|j|j | _	|j
| _|j| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _d S )Nr   Tr6   )r8   r9   r4   r   getattrr:   num_attention_headsr   r   r   attention_multiplierr   attention_dropout	is_causalr   r?   attention_biasq_projk_projv_projo_projrC   r4   r   rD   r0   r1   r9   K  s(   
z"GraniteMoeSharedAttention.__init__NrF   position_embeddingsr   past_key_valuescache_positionr   rG   c                 K   s  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
| jjt}|| |	|
||f| jskdn| j| jd|\}}|jg |dR   }| |}||fS )NrH   r   r5   )r   r   r   r   )r   r   )rb   r   r   r   r   r   r   r   updater   r   get_interfacer4   _attn_implementationr   r   r   r   r   r   r   )rC   rF   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   r0   r0   r1   rM   b  s8   	

z!GraniteMoeSharedAttention.forward)NNNN)r'   r(   r)   r*   r    r.   r9   r+   rN   ra   r
   r,   r   r   rM   rO   r0   r0   rD   r1   r   G  s,    r   c                       s   e Zd Zdedef fddZ							ddejdejdB d	ejdB d
e	dB de
dB de
dB dejdB deejejf dB dee deejeejejf dB f fddZ  ZS )GraniteMoeSharedDecoderLayerr4   r   c                    sx   t    |j| _t||d| _t|j|jd| _t|j|jd| _t	|| _
|j| _|jdkr5d | _d S t|| _d S )N)r4   r   rS   r   )r8   r9   r:   r   	self_attnrQ   rms_norm_epsinput_layernormpost_attention_layernormr   block_sparse_moeresidual_multiplierr<   r3   
shared_mlpr   rD   r0   r1   r9     s   

"z%GraniteMoeSharedDecoderLayer.__init__NFrF   r   position_idsr   output_attentions	use_cacher   r   r   rG   c	                 K   s   |}
|  |}| jd||||||||d|	\}}|
|| j  }|}
| |}| |}| jd u r5|}n|| | }|
|| j  }|S )N)rF   r   r   r   r   r   r   r   r0   )r   r   r   r   r   r   )rC   rF   r   r   r   r   r   r   r   r   residualr   moe_hidden_statesr0   r0   r1   rM     s.   
	



z$GraniteMoeSharedDecoderLayer.forward)NNNFFNN)r'   r(   r)   r    r.   r9   r+   rN   r,   r
   boolra   r   r!   FloatTensorrM   rO   r0   r0   rD   r1   r     s<    	
r   c                       s`   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZdZeedZe  fdd	Z  ZS )
GraniteMoeSharedPreTrainedModelr4   modelTr   r   F)rF   
attentionsc                    s4   t  | t|trtj|jd| jjd d S d S )Nr   )r]   std)	r8   _init_weights
isinstancere   initnormal_rV   r4   initializer_range)rC   r   rD   r0   r1   r     s   
z-GraniteMoeSharedPreTrainedModel._init_weights)r'   r(   r)   r    r-   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   r   _can_record_outputsr+   no_gradr   rO   r0   r0   rD   r1   r     s    
 r   c                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )GraniteMoeSharedRotaryEmbeddinginv_freqNr4   c                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultr  F)
persistentoriginal_inv_freq)r8   r9   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr4   rope_parametersr	  compute_default_rope_parametersr   attention_scalingregister_bufferclone)rC   r4   rz   rope_init_fnr  rD   r0   r1   r9     s   


z(GraniteMoeSharedRotaryEmbedding.__init__rz   ztorch.deviceseq_lenrG   ztorch.Tensorc                 C   sZ   | j d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetar   Ng      ?r   r5   rY   )rz   rY   )	r  r   r:   r   r+   arangeint64rZ   rd   )r4   rz   r  baserJ   attention_factorr  r0   r0   r1   r    s   
&z?GraniteMoeSharedRotaryEmbedding.compute_default_rope_parametersc           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	|dd+ | |  
dd}tj||fdd	}| | j }| | j }	W d    n1 slw   Y  |j|jd
|	j|jd
fS )Nr   rH   r   mpscpuF)device_typeenabledr5   rI   r  )r  rd   r   rb   rZ   rz   r   typestrr   r   r+   rn   r   r  r   rY   )
rC   r   r   inv_freq_expandedposition_ids_expandedr  freqsembr   r   r0   r0   r1   rM     s   0&z'GraniteMoeSharedRotaryEmbedding.forward)N)NNN)r'   r(   r)   r+   rN   r-   r    r9   staticmethodr   r.   ra   rd   r  r  r   rM   rO   r0   r0   rD   r1   r    s&   
 

r  c                       s   e Zd Zdef fddZeee							ddej	dB dej
dB dej	dB dedB d	ejdB d
edB dej	dB dee defddZ  ZS )GraniteMoeSharedModelr4   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _t d| _d| _ j| _|   d S )Nc                    s   g | ]}t  |qS r0   )r   ).0r   r4   r0   r1   
<listcomp>%      z2GraniteMoeSharedModel.__init__.<locals>.<listcomp>r   r*  F)r8   r9   pad_token_idpadding_idx
vocab_sizer   	Embeddingr:   embed_tokens
ModuleListrj   num_hidden_layerslayersrQ   r   normr  
rotary_embgradient_checkpointingembedding_multiplier	post_initrB   rD   r*  r1   r9     s   zGraniteMoeSharedModel.__init__N	input_idsr   r   r   inputs_embedsr   r   r   rG   c              
   K   s  |d u |d uA rt d|r|d u rt| jd}|d u r!| |}|d u r=|d ur-| nd}	tj|	|	|jd  |jd}|d u rF|	d}t
| j|||||d}
|| j }|}| ||}| jd | jj D ]}||f||
||||d|}qg| |}t||dS )	Nz:You must specify exactly one of input_ids or inputs_embedsr*  r   r   )rz   )r4   r;  r   r   r   r   )r   r   r   r   r   r   )last_hidden_stater   )
ValueErrorr   r4   r1  get_seq_lengthr+   r  rb   rz   r   r   r8  r6  r4  r3  r5  r   )rC   r:  r   r   r   r;  r   r   r   past_seen_tokenscausal_maskrF   r   decoder_layerr0   r0   r1   rM   /  sT   



zGraniteMoeSharedModel.forward)NNNNNNN)r'   r(   r)   r    r9   r   r   r   r+   r,   rN   r
   r   r   r   r   r   rM   rO   r0   r0   rD   r1   r(    s>    	
r(  r5   gate_logitsrf   c                    s  | du s	t | tsdS t | tr#| d j tj fdd| D dd}tjjj|dd}tj||dd\}}tjj	||}|du rStj
| dd}	tj
|dd}
ng|j\}}|jd ||  }|dddddddf |||||fd|| }tj| | ddtj|dd }	|ddddddf ||||fd| }tj|| ddtj|dd }
t|	|
d }|| S )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   c                    s   g | ]}|  qS r0   )rZ   )r)  
layer_gatecompute_devicer0   r1   r+    r,  z,load_balancing_loss_func.<locals>.<listcomp>rI   rH   )r   ra   rz   r+   rn   r   r   r~   r}   one_hotr]   rd   rb   r   r   rZ   r   r   )rB  rf   rw   r   concatenated_gate_logitsrouting_weightsr   selected_expertsexpert_masktokens_per_expertrouter_prob_per_expert
batch_sizesequence_lengthr3  expert_attention_mask router_per_expert_attention_maskoverall_lossr0   rD  r1   load_balancing_loss_funcp  s>   



rR  c                       s   e Zd ZddiZddiZddgdgfiZdef fdd	Zee		
	
	
	
	
	
	
	
	dde
jd
B de
jd
B de
jd
B ded
B de
jd
B de
jd
B ded
B de
jd
B dee
jB deeB fddZ  ZS )GraniteMoeSharedForCausalLMzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputrF   r   r4   c                    s`   t  | t|| _|j| _tj|j|jdd| _|j	| _	|j
| _|j| _|j| _|   d S )NFr6   )r8   r9   r(  r   r/  r   r?   r:   rT  router_aux_loss_coefr   rf   r   logits_scalingr9  rB   rD   r0   r1   r9     s   
z$GraniteMoeSharedForCausalLM.__init__Nr   r:  r   r   r   r;  labelsoutput_router_logitsr   logits_to_keeprG   c
              	   K   s   |dur|n| j j}| jd||||||d|
}|j}t|	tr't|	 dn|	}| |dd|ddf }|| j j }d}|durQ| j	||fd| j j
i|
}d}|rnt|j| j| j|}|durn|| j||j 7 }t||||j|j|j|jdS )ax  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, GraniteMoeSharedForCausalLM

        >>> model = GraniteMoeSharedForCausalLM.from_pretrained("ibm/PowerMoE-3b")
        >>> tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)r:  r   r   r   r;  r   r/  )lossaux_lossr   r   rF   r   router_logitsr0   )r4   rY  r   r<  r   r.   slicerT  rW  loss_functionr/  rR  r]  rf   r   rV  rZ   rz   r   r   rF   r   )rC   r:  r   r   r   r;  rX  rY  r   rZ  r   outputsrF   slice_indicesr   r[  r\  r0   r0   r1   rM     sZ   &z#GraniteMoeSharedForCausalLM.forward)	NNNNNNNNr   )r'   r(   r)   _tied_weights_keys_tp_plan_pp_planr    r9   r   r   r+   r,   rN   r
   r   r   r.   ra   r   rM   rO   r0   r0   rD   r1   rS    sJ    	
rS  )rS  r(  r   )r   )r   )Nr5   N)Icollections.abcr   typingr   r   r+   r   torch.nnr   rl    r   r   activationsr	   cache_utilsr
   r   
generationr   integrationsr   r   r   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   utils.genericr   r   r   utils.output_capturingr   configuration_granitemoesharedr    r!   Moduler3   rQ   re   rv   r   r   r   rN   r.   r   rd   r   r   r   r   r  r(  ra   rR  rS  __all__r0   r0   r0   r1   <module>   s   -1/
F5AU
Rj