o
    wiC                     @   s6  d dl mZmZ d dlZd dlmZ ddlmZ ddlmZm	Z	 ddl
mZmZmZ dd	lmZ dd
lmZmZmZ ddlmZmZmZmZmZmZ ddlmZ eeZG dd deZ G dd deZ!G dd deZ"G dd deZ#G dd deZ$G dd deZ%G dd deZ&G dd deZ'g dZ(dS )    )OptionalUnionN)nn   )Cache)BaseModelOutputWithPastMoeModelOutputWithPast)auto_docstringcan_return_tuplelogging   )BambaConfig)
BambaMixerBambaRMSNormGated HybridMambaAttentionDynamicCache)GraniteMoeSharedAttentionGraniteMoeSharedDecoderLayerGraniteMoeSharedForCausalLMGraniteMoeSharedMLPGraniteMoeSharedModelGraniteMoeSharedPreTrainedModel   )GraniteMoeHybridConfigc                       &   e Zd Zdedef fddZ  ZS )GraniteMoeHybridAttentionconfig	layer_idxc                       t  || d S Nsuper__init__selfr   r   	__class__ z/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/granitemoehybrid/modular_granitemoehybrid.pyr!   )      z"GraniteMoeHybridAttention.__init____name__
__module____qualname__r   intr!   __classcell__r&   r&   r$   r'   r   (       r   c                       r   )GraniteMoeHybridMambaLayerr   r   c                    s   t  t|| d S r   )r    r!   r   r"   r$   r&   r'   r!   .   s   z#GraniteMoeHybridMambaLayer.__init__r)   r&   r&   r$   r'   r0   -   r/   r0   c                       s   e Zd Zd fdd	Z  ZS )GraniteMoeHybridRMSNormGatedư>c                    r   r   r   )r#   hidden_sizeepsr$   r&   r'   r!   3   r(   z%GraniteMoeHybridRMSNormGated.__init__)r2   )r*   r+   r,   r!   r.   r&   r&   r$   r'   r1   2   s    r1   c                       s"   e Zd Zdef fddZ  ZS )GraniteMoeHybridMLPr   c                    s   t  | d S r   r   r#   r   r$   r&   r'   r!   8   s   zGraniteMoeHybridMLP.__init__)r*   r+   r,   r   r!   r.   r&   r&   r$   r'   r5   7   s    r5   c                       s   e Zd Zdedef fddZ							ddejdeej d	ee	 d
ee
 dee
 deej dee
 deeejejf  deejeeejejf  f fddZ  ZS )GraniteMoeHybridDecoderLayerr   r   c                    sn   t  || t|| _d | _d | _|j| dkr t||| _nt||| _|j| | _	t
|dddk| _d S )Nmambanum_local_expertsr   )r    r!   r5   
shared_mlp	self_attnr8   layers_block_typer0   r   
layer_typegetattrhas_expertsr"   r$   r&   r'   r!   =   s   
z%GraniteMoeHybridDecoderLayer.__init__NFhidden_statesattention_maskpast_key_valueoutput_attentions	use_cachecache_positionoutput_router_logitsposition_embeddingsreturnc	              
   K   s   |}
|  |}| jdur| j||||d}d}n| jd|||||||d|	\}}}|
|| j  }|}
| |}| jrK| |\}}|| | }n| |}d}|
|| j  }|f}|rc||f7 }|rj||f7 }|rq||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        N)r@   rE   cache_paramsrA   )r@   rA   rB   rC   rD   rE   rG   r&   )input_layernormr8   r;   residual_multiplierpost_attention_layernormr?   block_sparse_moer:   )r#   r@   rA   rB   rC   rD   rE   rF   rG   kwargsresidualself_attn_weights_moe_hidden_statesrouter_logitsoutputsr&   r&   r'   forwardM   sL   %






z$GraniteMoeHybridDecoderLayer.forward)NNFFNFN)r*   r+   r,   r   r-   r!   torchTensorr   r   bool
LongTensortupleFloatTensorrU   r.   r&   r&   r$   r'   r7   <   s8    	r7   c                       s*   e Zd ZeZdgZdZ fddZ  ZS )GraniteMoeHybridPreTrainedModelr7   Tc                    s   t    t|tjr%|jjjd| jj	d |j
d ur#|j
j  d S d S t|trH|jjd ttd|jd |j_|jjd d S t|trV|jjd d S d S )Ng        )meanstdg      ?r   )r    _init_weights
isinstancer   Conv1dweightdatanormal_r   initializer_rangebiaszero_r0   dt_biasfill_rV   logarange	num_headsA_logDr1   )r#   moduler$   r&   r'   r_      s   



z-GraniteMoeHybridPreTrainedModel._init_weights)	r*   r+   r,   r   config_class_no_split_modules_is_statefulr_   r.   r&   r&   r$   r'   r\      s
    r\   c                       s   e Zd Zdef fddZee											ddejde	ej
 de	ej de	eeeej f  d	e	ej d
e	e de	e de	e de	e de	e de	ej deeef fddZdd Z  ZS )GraniteMoeHybridModelr   c                    s0   t    t fddt jD | _d S )Nc                    s   g | ]}t  |qS r&   )r7   ).0r   r   r&   r'   
<listcomp>   s    z2GraniteMoeHybridModel.__init__.<locals>.<listcomp>)r    r!   r   
ModuleListrangenum_hidden_layerslayersr6   r$   ru   r'   r!      s   
zGraniteMoeHybridModel.__init__N	input_idsrA   position_idspast_key_valuesinputs_embedsrD   rC   output_hidden_statesrF   return_dictrE   rH   c                 C   sT  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|
d ur$|
n| j j}
|d u |d uA r4td| jrC| jrC|rCt	d d}|d u rL| 
|}|| j }|r\|d u r\t	d |d u rx|d urh| nd}tj|||jd  |jd}|d u r|d}| |||||}| ||}|}d }| jd ur| ||}|rdnd }|rdnd }|	rdnd }d }| jD ]L}|jd	kr|n|}|r||f7 }||||||||	|d
}|d }|r||rdnd }|r|d d ur||d f7 }|	r|d d ur||d f7 }q| |}|r||f7 }|r|jsd|_|r|nd }t|||||dS )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzGraniteMoeHybrid requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. Because one was not provided, no cache will be returned.r   r   devicer&   r8   )rA   rB   rC   rD   rE   rF   rG   r   T)last_hidden_stater}   r@   
attentionsrS   )r   rC   r   rD   use_return_dict
ValueErrorgradient_checkpointingtrainingloggerwarning_onceembed_tokensembedding_multiplierget_seq_lengthrV   rk   shaper   	unsqueeze_update_causal_mask_update_mamba_mask
rotary_embrz   r=   normhas_previous_stater   )r#   r{   rA   r|   r}   r~   rD   rC   r   rF   r   rE   past_seen_tokenscausal_mask
mamba_maskr@   rG   all_hidden_statesall_self_attnsall_router_logitsnext_decoder_cachedecoder_layer
layer_masklayer_outputs
next_cacher&   r&   r'   rU      s   








zGraniteMoeHybridModel.forwardc                 C   s.   |}|d dks|durt |dkrd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr   )rV   all)r#   rA   rE   r   r&   r&   r'   r   =  s   "z(GraniteMoeHybridModel._update_mamba_mask)NNNNNNNNNNN)r*   r+   r,   r   r!   r
   r	   rV   rY   r   rW   r   r   listr[   rX   rZ   r   rU   r   r.   r&   r&   r$   r'   rs      sR    	

yrs   c                       sL   e Zd ZdgZdef fddZ						dddZd	efd
dZ  Z	S )GraniteMoeHybridForCausalLMzlm_head.weightr   c                    s"   t  | t|| _|   d S r   )r    r!   rs   model	post_initr6   r$   r&   r'   r!   L  s   
z$GraniteMoeHybridForCausalLM.__init__NTc                 K   s  |d u }	|	s5|d us|d |j d kr"|d d |j d  d f }n!|j d |j d kr4|d d |f }nt| j|j d | j| jd}|d url|d u rl| dd }||dkd |	sl|d d |j d  d f }|d urw|	rwd|i}
nd| i}
|
	|||||d |
S )Nr   r   r   r   r~   r{   )r|   r}   rD   rA   rE   )
r   r   r   dtyper   longcumsummasked_fill_
contiguousupdate)r#   r{   r}   rA   r~   rE   r|   rD   rN   empty_past_kvmodel_inputsr&   r&   r'   prepare_inputs_for_generationR  s8   
	z9GraniteMoeHybridForCausalLM.prepare_inputs_for_generationrH   c                 C   s   dS )aG  
        Function overwritten as this class uses its own `HybridMambaAttentionDynamicCache`
        and do not need to initialize the Cache in advance in order to save memory
        (because no back and forth `to_legacy_cache` and `from_legacy_cache` will be performed
        for `HybridMambaAttentionDynamicCache`).
        Fr&   )r#   r&   r&   r'   _supports_default_dynamic_cache  s   z;GraniteMoeHybridForCausalLM._supports_default_dynamic_cache)NNNNNT)
r*   r+   r,   _tied_weights_keysr   r!   r   rX   r   r.   r&   r&   r$   r'   r   I  s    	
9r   )r   rs   r\   ))typingr   r   rV   r   cache_utilsr   modeling_outputsr   r   utilsr	   r
   r   bamba.configuration_bambar   bamba.modeling_bambar   r   r   *granitemoeshared.modeling_granitemoesharedr   r   r   r   r   r   configuration_granitemoehybridr   
get_loggerr*   r   r   r0   r1   r5   r7   r\   rs   r   __all__r&   r&   r&   r'   <module>   s*    
k L