o
    eiK                     @   s   d dl mZ d dlZd dlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ d	d
lmZmZmZmZ ddlmZ eeZG dd deddZG dd dejZG dd deZG dd deZG dd deZG dd deZg dZdS )    )	TypedDictN)nn   )ACT2FN)Cache)Unpack)logging   )GraniteMoeDecoderLayerGraniteMoeForCausalLMGraniteMoeModelGraniteMoePreTrainedModel   )GraniteMoeSharedConfigc                   @   s@   e Zd ZU dZejed< ejed< eed< eed< ejed< dS )GraniteFlashAttentionKwargsaT  
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    cu_seq_lens_q (`torch.LongTensor`):
        Gets cumulative sequence length for query state.
    cu_seq_lens_k (`torch.LongTensor`):
        Gets cumulative sequence length for key state.
    max_length_q (`int`):
        Maximum sequence length for query state.
    max_length_k (`int`):
        Maximum sequence length for key state.
    seq_idx (`torch.IntTensor):
        Index of each packed sequence.
    cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kseq_idxN)	__name__
__module____qualname____doc__torch
LongTensor__annotations__int	IntTensor r   r   {/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/granitemoeshared/modular_granitemoeshared.pyr   $   s   
 

r   F)totalc                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	GraniteMoeSharedMLPz~
    MLP layer for shared experts

    Args:
        config:
            Configuration object with model hyperparameters.
    configc                    sZ   t    |j| _|j| _t|j | _tj	| j| jd dd| _
tj	| j| jdd| _d S )Nr	   F)bias)super__init__hidden_size
input_sizeshared_intermediate_sizer   
hidden_act
activationr   Linearinput_linearoutput_linearselfr#   	__class__r   r    r&   E   s   
zGraniteMoeSharedMLP.__init__hidden_statesreturnc                 C   s<   |  |}|jddd}| |d |d  }| |}|S )Nr	   )dimr   r   )r-   chunkr+   r.   )r0   r3   chunked_hidden_statesr   r   r    forwardN   s
   

zGraniteMoeSharedMLP.forward)
r   r   r   r   r   r&   r   Tensorr9   __classcell__r   r   r1   r    r"   <   s    	r"   c                       s   e Zd Zdedef fddZ							ddejdejdB d	ejdB d
e	dB de
dB de
dB dejdB deejejf dB dee deejeejejf dB f fddZ  ZS )GraniteMoeSharedDecoderLayerr#   	layer_idxc                    s0   t  || |jdkrd | _d S t|| _d S )Nr   )r%   r&   r)   r"   
shared_mlp)r0   r#   r=   r1   r   r    r&   W   s   "z%GraniteMoeSharedDecoderLayer.__init__NFr3   attention_maskposition_idspast_key_valuesoutput_attentions	use_cachecache_positionposition_embeddingskwargsr4   c	                 K   s   |}
|  |}| jd||||||||d|	\}}|
|| j  }|}
| |}| |}| jd u r5|}n|| | }|
|| j  }|S )N)r3   r?   r@   rA   rB   rC   rD   rE   r   )input_layernorm	self_attnresidual_multiplierpost_attention_layernormblock_sparse_moer>   )r0   r3   r?   r@   rA   rB   rC   rD   rE   rF   residual_moe_hidden_statesr   r   r    r9   [   s.   
	



z$GraniteMoeSharedDecoderLayer.forward)NNNFFNN)r   r   r   r   r   r&   r   r:   r   r   booltupler   r   FloatTensorr9   r;   r   r   r1   r    r<   V   s<    	
r<   c                   @   s   e Zd ZU eed< dgZdS )GraniteMoeSharedPreTrainedModelr#   r<   N)r   r   r   r   r   _no_split_modulesr   r   r   r    rR      s   
 
rR   c                       s"   e Zd Zdef fddZ  ZS )GraniteMoeSharedModelr#   c                    s0   t    t fddt jD | _d S )Nc                    s   g | ]}t  |qS r   )r<   ).0r=   r#   r   r    
<listcomp>   s    z2GraniteMoeSharedModel.__init__.<locals>.<listcomp>)r%   r&   r   
ModuleListrangenum_hidden_layerslayersr/   r1   rV   r    r&      s   
zGraniteMoeSharedModel.__init__)r   r   r   r   r&   r;   r   r   r1   r    rT      s    rT   c                       s*   e Zd ZddiZdef fddZ  ZS )GraniteMoeSharedForCausalLMzlm_head.weightzmodel.embed_tokens.weightr#   c                    s"   t  | t|| _|   d S )N)r%   r&   rT   model	post_initr/   r1   r   r    r&      s   
z$GraniteMoeSharedForCausalLM.__init__)r   r   r   _tied_weights_keysr   r&   r;   r   r   r1   r    r\      s    r\   )r\   rT   rR   )typingr   r   r   activationsr   cache_utilsr   processing_utilsr   utilsr   granitemoe.modeling_granitemoer
   r   r   r   configuration_granitemoesharedr   
get_loggerr   loggerr   Moduler"   r<   rR   rT   r\   __all__r   r   r   r    <module>   s"   
/
