o
    wi                     @   s   d dl mZ d dlZd dlmZ ddlmZ ddlmZ ddlm	Z	 dd	l
mZmZmZmZ d
dlmZ e	eZG dd dejZG dd deZG dd deZG dd deZG dd deZg dZdS )    )OptionalN)nn   )ACT2FN)Cache)logging   )GraniteMoeDecoderLayerGraniteMoeForCausalLMGraniteMoeModelGraniteMoePreTrainedModel   )GraniteMoeSharedConfigc                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	GraniteMoeSharedMLPz~
    MLP layer for shared experts

    Args:
        config:
            Configuration object with model hyperparameters.
    configc                    sZ   t    |j| _|j| _t|j | _tj	| j| jd dd| _
tj	| j| jdd| _d S )Nr   F)bias)super__init__hidden_size
input_sizeshared_intermediate_sizer   
hidden_act
activationr   Linearinput_linearoutput_linearselfr   	__class__ z/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/granitemoeshared/modular_granitemoeshared.pyr   -   s   
zGraniteMoeSharedMLP.__init__hidden_statesreturnc                 C   s<   |  |}|jddd}| |d |d  }| |}|S )Nr   )dimr   r   )r   chunkr   r   )r   r"   chunked_hidden_statesr    r    r!   forward6   s
   

zGraniteMoeSharedMLP.forward)
__name__
__module____qualname____doc__r   r   torchTensorr(   __classcell__r    r    r   r!   r   $   s    	r   c                       s   e Zd Zdedef fddZ								ddejdeej d	eej	 d
ee
 dee dee deej	 dee deeejejf  deejeeejejf  f fddZ  ZS )GraniteMoeSharedDecoderLayerr   	layer_idxc                    s0   t  || |jdkrd | _d S t|| _d S )Nr   )r   r   r   r   
shared_mlp)r   r   r1   r   r    r!   r   ?   s   "z%GraniteMoeSharedDecoderLayer.__init__NFr"   attention_maskposition_idspast_key_valueoutput_attentions	use_cachecache_positionoutput_router_logitsposition_embeddingsr#   c
                 K   s   |}|  |}| jd||||||||	d|
\}}}||| j  }|}| |}| |\}}| jdu r8|}n|| | }~||| j  }|f}|rQ||f7 }|rX||f7 }|r_||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        )r"   r3   r4   r5   r6   r7   r8   r:   Nr    )input_layernorm	self_attnresidual_multiplierpost_attention_layernormblock_sparse_moer2   )r   r"   r3   r4   r5   r6   r7   r8   r9   r:   kwargsresidualself_attn_weightspresent_key_valuemoe_hidden_statesrouter_logitsoutputsr    r    r!   r(   C   s>   &
	




z$GraniteMoeSharedDecoderLayer.forward)NNNFFNFN)r)   r*   r+   r   intr   r-   r.   r   
LongTensorr   booltupleFloatTensorr(   r/   r    r    r   r!   r0   >   s>    	
r0   c                   @   s   e Zd ZeZdgZdS )GraniteMoeSharedPreTrainedModelr0   N)r)   r*   r+   r   config_class_no_split_modulesr    r    r    r!   rL      s    
rL   c                       s"   e Zd Zdef fddZ  ZS )GraniteMoeSharedModelr   c                    s0   t    t fddt jD | _d S )Nc                    s   g | ]}t  |qS r    )r0   ).0r1   r   r    r!   
<listcomp>   s    z2GraniteMoeSharedModel.__init__.<locals>.<listcomp>)r   r   r   
ModuleListrangenum_hidden_layerslayersr   r   rQ   r!   r      s   
zGraniteMoeSharedModel.__init__)r)   r*   r+   r   r   r/   r    r    r   r!   rO      s    rO   c                       s(   e Zd ZdgZdef fddZ  ZS )GraniteMoeSharedForCausalLMzlm_head.weightr   c                    s"   t  | t|| _|   d S )N)r   r   rO   model	post_initr   r   r    r!   r      s   
z$GraniteMoeSharedForCausalLM.__init__)r)   r*   r+   _tied_weights_keysr   r   r/   r    r    r   r!   rW      s    rW   )rW   rO   rL   )typingr   r-   r   activationsr   cache_utilsr   utilsr   granitemoe.modeling_granitemoer	   r
   r   r   configuration_granitemoesharedr   
get_loggerr)   loggerModuler   r0   rL   rO   rW   __all__r    r    r    r!   <module>   s   
Z
