o
    i-B                     @   sR  d dl mZmZ d dlZd dlmZ ddlmZ ddlmZm	Z	 ddl
mZ ddlmZmZmZ dd	lmZ d
dlmZ d
dlmZmZmZ d
dlmZmZmZmZmZmZmZ ddl m!Z! e"e#Z$G dd deZ%G dd deZ&G dd deZ'G dd deZ(G dd deZ)G dd deZ*G dd deZ+G dd deZ,g d Z-dS )!    )OptionalUnionN)nn   )Cache)BaseModelOutputWithPastMoeModelOutputWithPast)Unpack)auto_docstringcan_return_tuplelogging)deprecate_kwarg   )BambaConfig)
BambaMixerBambaRMSNormGated HybridMambaAttentionDynamicCache)GraniteFlashAttentionKwargsGraniteMoeSharedAttentionGraniteMoeSharedDecoderLayerGraniteMoeSharedForCausalLMGraniteMoeSharedMLPGraniteMoeSharedModelGraniteMoeSharedPreTrainedModel   )GraniteMoeHybridConfigc                       &   e Zd Zdedef fddZ  ZS )GraniteMoeHybridAttentionconfig	layer_idxc                       t  || d S Nsuper__init__selfr   r   	__class__ z/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/granitemoehybrid/modular_granitemoehybrid.pyr$   ,      z"GraniteMoeHybridAttention.__init____name__
__module____qualname__r   intr$   __classcell__r)   r)   r'   r*   r   +       r   c                       r   )GraniteMoeHybridMambaLayerr   r   c                    s   t  t|| d S r!   )r#   r$   r   r%   r'   r)   r*   r$   1   s   z#GraniteMoeHybridMambaLayer.__init__r,   r)   r)   r'   r*   r3   0   r2   r3   c                       s   e Zd Zd fdd	Z  ZS )GraniteMoeHybridRMSNormGatedư>c                    r    r!   r"   )r&   hidden_sizeepsr'   r)   r*   r$   6   r+   z%GraniteMoeHybridRMSNormGated.__init__)r5   )r-   r.   r/   r$   r1   r)   r)   r'   r*   r4   5   s    r4   c                       s"   e Zd Zdef fddZ  ZS )GraniteMoeHybridMLPr   c                    s   t  | d S r!   r"   r&   r   r'   r)   r*   r$   ;   s   zGraniteMoeHybridMLP.__init__)r-   r.   r/   r   r$   r1   r)   r)   r'   r*   r8   :   s    r8   c                       s   e Zd Zdedef fddZedddd					
	
			
		ddejde	ej de	e
 de	e de	e de	ej de	e de	eejejf  dee deeje	eejejf  f fddZ  ZS )GraniteMoeHybridDecoderLayerr   r   c                    sn   t  || t|| _d | _d | _|j| dkr t||| _nt||| _|j| | _	t
|dddk| _d S )Nmambanum_local_expertsr   )r#   r$   r8   
shared_mlp	self_attnr;   layers_block_typer3   r   
layer_typegetattrhas_expertsr%   r'   r)   r*   r$   @   s   
z%GraniteMoeHybridDecoderLayer.__init__past_key_valuepast_key_valuesz4.58)new_nameversionNFhidden_statesattention_maskoutput_attentions	use_cachecache_positionoutput_router_logitsposition_embeddingskwargsreturnc	              
   K   s   |}
|  |}| jdur| jd||||d|	}d}n| jd|||||||d|	\}}|
|| j  }|}
| |}| jrN| |\}}|| | }n| |}d}|
|| j  }|f}|rf||f7 }|rm||f7 }|S )a0  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs.Can be used to provide `GraniteFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
        N)rG   rK   cache_paramsrH   )rG   rH   rD   rI   rJ   rK   rM   r)   )input_layernormr;   r>   residual_multiplierpost_attention_layernormrB   block_sparse_moer=   )r&   rG   rH   rD   rI   rJ   rK   rL   rM   rN   residualself_attn_weightsmoe_hidden_statesrouter_logitsoutputsr)   r)   r*   forwardP   sL   &






z$GraniteMoeHybridDecoderLayer.forward)NNFFNFN)r-   r.   r/   r   r0   r$   r   torchTensorr   r   bool
LongTensortupler	   r   FloatTensorrZ   r1   r)   r)   r'   r*   r:   ?   s>    	
r:   c                       s0   e Zd ZU eed< dgZdZ fddZ  ZS )GraniteMoeHybridPreTrainedModelr   r:   Tc                    sr   t  | t|tr)|jjd tt	d|j
d |j_|jjd d S t|tr7|jjd d S d S )Ng      ?r   )r#   _init_weights
isinstancer3   dt_biasdatafill_r[   logarange	num_headsA_logDr4   weight)r&   moduler'   r)   r*   rb      s   

z-GraniteMoeHybridPreTrainedModel._init_weights)	r-   r.   r/   r   __annotations___no_split_modules_is_statefulrb   r1   r)   r)   r'   r*   ra      s
   
 ra   c                       s   e Zd Zdef fddZee											ddeej	 deej
 deej	 deeeeej f  d	eej d
ee dee dee dee dee deej	 dee deeef fddZdd Z  ZS )GraniteMoeHybridModelr   c                    s0   t    t fddt jD | _d S )Nc                    s   g | ]}t  |qS r)   )r:   ).0r   r   r)   r*   
<listcomp>   s    z2GraniteMoeHybridModel.__init__.<locals>.<listcomp>)r#   r$   r   
ModuleListrangenum_hidden_layerslayersr9   r'   rs   r*   r$      s   
zGraniteMoeHybridModel.__init__N	input_idsrH   position_idsrD   inputs_embedsrJ   rI   output_hidden_statesrL   return_dictrK   rN   rO   c                 K   s2  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|
d ur$|
n| j j}
|d u |d uA r4td| jrC| jrC|rCt	d d}|d u rL| 
|}|| j }|r\|d u r\t	d |d u rx|d urh| nd}tj|||jd  |jd}|d u r|d}| |||||}| ||}|}d }| jd ur| ||}|rdnd }|rdnd }|	rdnd }| jD ]D}|jd	kr|n|}|r||f7 }||f||||||	|d
|}|d }|r|d d ur||d f7 }|	r|d d ur||d f7 }q| |}|r||f7 }|r|jsd|_t|||||dS )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzGraniteMoeHybrid requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. Because one was not provided, no cache will be returned.r   r   devicer)   r;   )rH   rD   rI   rJ   rK   rL   rM   T)last_hidden_staterD   rG   
attentionsrX   )r   rI   r|   rJ   use_return_dict
ValueErrorgradient_checkpointingtrainingloggerwarning_onceembed_tokensembedding_multiplierget_seq_lengthr[   rh   shaper   	unsqueeze_update_causal_mask_update_mamba_mask
rotary_embrx   r@   normhas_previous_stater   )r&   ry   rH   rz   rD   r{   rJ   rI   r|   rL   r}   rK   rN   past_seen_tokenscausal_mask
mamba_maskrG   rM   all_hidden_statesall_self_attnsall_router_logitsdecoder_layer
layer_masklayer_outputsr)   r)   r*   rZ      s   






	

zGraniteMoeHybridModel.forwardc                 C   s.   |}|d dks|durt |dkrd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr   )r[   all)r&   rH   rK   r   r)   r)   r*   r   6  s   "z(GraniteMoeHybridModel._update_mamba_mask)NNNNNNNNNNN)r-   r.   r/   r   r$   r   r
   r   r[   r^   r\   r   r   listr`   r]   r	   r   r_   r   rZ   r   r1   r)   r)   r'   r*   rq      sV    	

urq   c                       s>   e Zd ZdgZdef fddZ						d	ddZ  ZS )
GraniteMoeHybridForCausalLMzlm_head.weightr   c                    s"   t  | t|| _|   d S r!   )r#   r$   rq   model	post_initr9   r'   r)   r*   r$   E  s   
z$GraniteMoeHybridForCausalLM.__init__NTc                 K   s:  |d u }	|	s5|d us|d |j d kr"|d d |j d  d f }n#|j d |j d kr4|d d |f }n|rEt| j|j d | j| jd}|d urn|d u rn| dd }||dkd |	sn|d d |j d  d f }|d ury|	ryd|i}
nd| i}
|
	|||||d |
 D ]\}}||
vr||
|< q|
S )Nr   r   r   r~   r{   ry   )rz   rD   rJ   rH   rK   )r   r   r   dtyper   longcumsummasked_fill_
contiguousupdateitems)r&   ry   rD   rH   r{   rK   rz   rJ   rN   empty_past_kvmodel_inputskeyvaluer)   r)   r*   prepare_inputs_for_generationK  sB   
z9GraniteMoeHybridForCausalLM.prepare_inputs_for_generation)NNNNNT)r-   r.   r/   _tied_weights_keysr   r$   r   r1   r)   r)   r'   r*   r   B  s    	r   )r   rq   ra   ).typingr   r   r[   r   cache_utilsr   modeling_outputsr   r   processing_utilsr	   utilsr
   r   r   utils.deprecationr   bamba.configuration_bambar   bamba.modeling_bambar   r   r   *granitemoeshared.modeling_granitemoesharedr   r   r   r   r   r   r   configuration_granitemoehybridr   
get_loggerr-   r   r   r3   r4   r8   r:   ra   rq   r   __all__r)   r)   r)   r*   <module>   s.   $	
j I