o
    i                     @   s   d dl mZmZ d dlZd dlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ d
dlmZmZmZmZ ddlmZ eeZG dd deddZG dd dejZG dd deZG dd deZG dd deZG dd deZg dZ dS )    )Optional	TypedDictN)nn   )ACT2FN)Cache)Unpack)logging)deprecate_kwarg   )GraniteMoeDecoderLayerGraniteMoeForCausalLMGraniteMoeModelGraniteMoePreTrainedModel   )GraniteMoeSharedConfigc                   @   s@   e Zd ZU dZejed< ejed< eed< eed< ejed< dS )GraniteFlashAttentionKwargsa  
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    Attributes:
        cu_seq_lens_q (`torch.LongTensor`)
            Gets cumulative sequence length for query state.
        cu_seq_lens_k (`torch.LongTensor`)
            Gets cumulative sequence length for key state.
        max_length_q (`int`):
            Maximum sequence length for query state.
        max_length_k (`int`):
            Maximum sequence length for key state.
        seq_idx (`torch.IntTensor):
            Index of each packed sequence.
    cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kseq_idxN)	__name__
__module____qualname____doc__torch
LongTensor__annotations__int	IntTensor r!   r!   q/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/granitemoeshared/modular_granitemoeshared.pyr   &   s   
 

r   F)totalc                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	GraniteMoeSharedMLPz~
    MLP layer for shared experts

    Args:
        config:
            Configuration object with model hyperparameters.
    configc                    sZ   t    |j| _|j| _t|j | _tj	| j| jd dd| _
tj	| j| jdd| _d S )Nr   F)bias)super__init__hidden_size
input_sizeshared_intermediate_sizer   
hidden_act
activationr   Linearinput_linearoutput_linearselfr%   	__class__r!   r"   r(   H   s   
zGraniteMoeSharedMLP.__init__hidden_statesreturnc                 C   s<   |  |}|jddd}| |d |d  }| |}|S )Nr   )dimr   r   )r/   chunkr-   r0   )r2   r5   chunked_hidden_statesr!   r!   r"   forwardQ   s
   

zGraniteMoeSharedMLP.forward)
r   r   r   r   r   r(   r   Tensorr;   __classcell__r!   r!   r3   r"   r$   ?   s    	r$   c                       s   e Zd Zdedef fddZedddd							
	
			
		ddejde	ej de	ej
 de	e de	e de	e de	ej
 de	e de	eejejf  dee deeje	eejejf  f fddZ  ZS )GraniteMoeSharedDecoderLayerr%   	layer_idxc                    s0   t  || |jdkrd | _d S t|| _d S )Nr   )r'   r(   r+   r$   
shared_mlp)r2   r%   r?   r3   r!   r"   r(   Z   s   "z%GraniteMoeSharedDecoderLayer.__init__past_key_valuepast_key_valuesz4.58)new_nameversionNFr5   attention_maskposition_idsoutput_attentions	use_cachecache_positionoutput_router_logitsposition_embeddingskwargsr6   c
                 K   s   |}|  |}| jd||||||||	d|
\}}||| j  }|}| |}| |\}}| jdu r7|}n|| | }~||| j  }|f}|rP||f7 }|rW||f7 }|S )a1  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs. Can be used to provide `GraniteFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
        )r5   rE   rF   rB   rG   rH   rI   rK   Nr!   )input_layernorm	self_attnresidual_multiplierpost_attention_layernormblock_sparse_moer@   )r2   r5   rE   rF   rB   rG   rH   rI   rJ   rK   rL   residualself_attn_weightsmoe_hidden_statesrouter_logitsoutputsr!   r!   r"   r;   ^   s:   '
	




z$GraniteMoeSharedDecoderLayer.forward)NNNFFNFN)r   r   r   r   r   r(   r
   r   r<   r   r   r   booltupler   r   FloatTensorr;   r=   r!   r!   r3   r"   r>   Y   sD    	
r>   c                   @   s   e Zd ZU eed< dgZdS )GraniteMoeSharedPreTrainedModelr%   r>   N)r   r   r   r   r   _no_split_modulesr!   r!   r!   r"   rZ      s   
 
rZ   c                       s"   e Zd Zdef fddZ  ZS )GraniteMoeSharedModelr%   c                    s0   t    t fddt jD | _d S )Nc                    s   g | ]}t  |qS r!   )r>   ).0r?   r%   r!   r"   
<listcomp>   s    z2GraniteMoeSharedModel.__init__.<locals>.<listcomp>)r'   r(   r   
ModuleListrangenum_hidden_layerslayersr1   r3   r^   r"   r(      s   
zGraniteMoeSharedModel.__init__)r   r   r   r   r(   r=   r!   r!   r3   r"   r\      s    r\   c                       s(   e Zd ZdgZdef fddZ  ZS )GraniteMoeSharedForCausalLMzlm_head.weightr%   c                    s"   t  | t|| _|   d S )N)r'   r(   r\   model	post_initr1   r3   r!   r"   r(      s   
z$GraniteMoeSharedForCausalLM.__init__)r   r   r   _tied_weights_keysr   r(   r=   r!   r!   r3   r"   rd      s    rd   )rd   r\   rZ   )!typingr   r   r   r   activationsr   cache_utilsr   processing_utilsr   utilsr	   utils.deprecationr
   granitemoe.modeling_granitemoer   r   r   r   configuration_granitemoesharedr   
get_loggerr   loggerr   Moduler$   r>   rZ   r\   rd   __all__r!   r!   r!   r"   <module>   s$   
X
