o
    eiJ7                     @   s  d dl mZ d dlZd dlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z. e/e0Z1G dd de$Z2G dd deZ3G dd deZ4G dd de'Z5G dd de!Z6G dd  d e)Z7G d!d" d"e%Z8G d#d$ d$e*Z9G d%d& d&e(Z:G d'd( d(e&Z;g d)Z<dS )*    )CallableN)nn   )initialization)Cache)create_causal_mask)BaseModelOutputWithPastMoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringlogging)merge_with_config_defaults)capture_outputs   )BambaConfig)
BambaMixerBambaRMSNormGated HybridMambaAttentionDynamicCache)Gemma2RotaryEmbedding)
GraniteFlashAttentionKwargsGraniteMoeSharedAttentionGraniteMoeSharedDecoderLayerGraniteMoeSharedForCausalLMGraniteMoeSharedMLPGraniteMoeSharedModelGraniteMoeSharedMoEGraniteMoeSharedPreTrainedModelapply_rotary_pos_embeager_attention_forward   )GraniteMoeHybridConfigc                       s   e Zd Zdedef fddZ			ddejdejdB dedB d	ej	dB d
e
ejejf dB dee de
ejejf fddZ  ZS )GraniteMoeHybridAttentionconfig	layer_idxc                       t  || d S Nsuper__init__selfr$   r%   	__class__ {/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/granitemoehybrid/modular_granitemoehybrid.pyr*   3      z"GraniteMoeHybridAttention.__init__Nhidden_statesattention_maskpast_key_valuescache_positionposition_embeddingskwargsreturnc                 K   s  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|d urF|\}}t|	|
||\}	}
|d urYd|i}||
|| j	|\}
}t
| jjt}|| |	|
||f| jsmdn| j| jd|\}}|jg |dR   }| |}||fS )Nr!   r   r5   g        )dropoutscaling)shapehead_dimq_projview	transposek_projv_projr   updater%   r
   get_interfacer$   _attn_implementationr    trainingattention_dropoutr;   reshape
contiguouso_proj)r,   r2   r3   r4   r5   r6   r7   input_shapehidden_shapequery_states
key_statesvalue_statescossincache_kwargsattention_interfaceattn_outputattn_weightsr/   r/   r0   forward6   s:   	

z!GraniteMoeHybridAttention.forward)NNN)__name__
__module____qualname__r"   intr*   torchTensorr   
LongTensortupler   r   rV   __classcell__r/   r/   r-   r0   r#   2   s(    r#   c                       s&   e Zd Zdedef fddZ  ZS )GraniteMoeHybridMambaLayerr$   r%   c                    s   t  t|| d S r'   )r)   r*   r   r+   r-   r/   r0   r*   c   s   z#GraniteMoeHybridMambaLayer.__init__)rW   rX   rY   r"   rZ   r*   r_   r/   r/   r-   r0   r`   b   s    r`   c                       s   e Zd Zd fdd	Z  ZS )GraniteMoeHybridRMSNormGatedư>c                    r&   r'   r(   )r,   hidden_sizeepsr-   r/   r0   r*   h   r1   z%GraniteMoeHybridRMSNormGated.__init__)rb   )rW   rX   rY   r*   r_   r/   r/   r-   r0   ra   g   s    ra   c                       s"   e Zd Zdef fddZ  ZS )GraniteMoeHybridMLPr$   c                    s   t  | d S r'   r(   r,   r$   r-   r/   r0   r*   m   s   zGraniteMoeHybridMLP.__init__)rW   rX   rY   r"   r*   r_   r/   r/   r-   r0   re   l   s    re   c                   @      e Zd ZdS )GraniteMoeHybridRotaryEmbeddingNrW   rX   rY   r/   r/   r/   r0   rh   q       rh   c                   @   rg   )GraniteMoeHybridMoENri   r/   r/   r/   r0   rk   u   rj   rk   c                       s   e Zd Zdedef fddZe					ddejdejdB d	e	dB d
e
dB dejdB deejejf dB dee deejeejejf dB f fddZ  ZS )GraniteMoeHybridDecoderLayerr$   r%   c                    s   t  || t|| _d | _d | _|j| dkr t||| _nt||| _|j| | _	|j
dkr5t|nd | _t|dddk| _d S )Nmambar   num_local_experts)r)   r*   re   
shared_mlp	self_attnrm   layers_block_typer`   r#   
layer_typern   rk   block_sparse_moegetattrhas_expertsr+   r-   r/   r0   r*   z   s   
z%GraniteMoeHybridDecoderLayer.__init__NFr2   r3   r4   	use_cacher5   r6   r7   r8   c              	   K   s   |}|  |}| jd ur| jd||||d|}n| jd||||||d|\}}	||| j  }|}| |}| jrI| |}
|
| | }n| |}||| j  }|S )N)r2   r5   cache_paramsr3   )r2   r3   r4   rv   r5   r6   r/   )input_layernormrm   rp   residual_multiplierpost_attention_layernormru   rs   ro   )r,   r2   r3   r4   rv   r5   r6   r7   residual_moe_hidden_statesr/   r/   r0   rV      s<   






z$GraniteMoeHybridDecoderLayer.forward)NNFNN)rW   rX   rY   r"   rZ   r*   r   r[   r\   r   boolr]   r^   r   r   FloatTensorrV   r_   r/   r/   r-   r0   rl   y   s2    	rl   c                       s8   e Zd ZU eed< dgZdZe  fddZ	  Z
S )GraniteMoeHybridPreTrainedModelr$   rl   Tc              
      sr   t  | t|tr*t|j t|jt	
t	d|jd  t|j d S t|tr7t|j d S d S )Nr!   )r)   _init_weights
isinstancer`   initones_dt_biascopy_A_logr[   logarange	num_headsDra   weight)r,   moduler-   r/   r0   r      s   
"
z-GraniteMoeHybridPreTrainedModel._init_weights)rW   rX   rY   r"   __annotations___no_split_modules_is_statefulr[   no_gradr   r_   r/   r/   r-   r0   r      s   
 r   c                       s   e Zd Zdef fddZeee							ddej	dB dej
dB dej	dB dedB d	ejdB d
edB dej	dB dee deeB fddZdd Z  ZS )GraniteMoeHybridModelr$   c                    sV   t    t fddt jD | _ j| _ jdkr&t	 | _
d S d | _
d S )Nc                    s   g | ]}t  |qS r/   )rl   ).0r%   r$   r/   r0   
<listcomp>   s    z2GraniteMoeHybridModel.__init__.<locals>.<listcomp>rope)r)   r*   r   
ModuleListrangenum_hidden_layerslayersembedding_multiplierposition_embedding_typerh   
rotary_embrf   r-   r   r0   r*      s   "zGraniteMoeHybridModel.__init__N	input_idsr3   position_idsr4   inputs_embedsrv   r5   r7   r8   c              	   K   s  |d u |d uA rt d|d u r| |}|| j }|d u r6|d ur&| nd}	tj|	|	|jd  |jd}|d u r?|d}t	| j
||||}
| ||}|}d }| jd ur]| ||}| jD ]}|jdkri|n|
}||f|||||d|}q`| |}|r|jsd|_t||dS )	Nz:You must specify exactly one of input_ids or inputs_embedsr   r!   devicerm   )r3   r4   rv   r5   r6   T)last_hidden_stater4   )
ValueErrorembed_tokensr   get_seq_lengthr[   r   r<   r   	unsqueezer   r$   _update_mamba_maskr   r   rr   normhas_previous_stater	   )r,   r   r3   r   r4   r   rv   r5   r7   past_seen_tokenscausal_mask
mamba_maskr2   r6   decoder_layer
layer_maskr/   r/   r0   rV      sX   





	
zGraniteMoeHybridModel.forwardc                 C   s.   |}|d dks|durt |dkrd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr!   )r[   all)r,   r3   r5   r   r/   r/   r0   r     s   "z(GraniteMoeHybridModel._update_mamba_mask)NNNNNNN)rW   rX   rY   r"   r*   r   r   r   r[   r]   r\   r   r   r~   r   r   r^   r   rV   r   r_   r/   r/   r-   r0   r      s@    	
Br   c                       sR   e Zd ZddiZdef fddZ fddZ								
d fdd	Z  ZS )GraniteMoeHybridForCausalLMzlm_head.weightzmodel.embed_tokens.weightr$   c                    s"   t  | t|| _|   d S r'   )r)   r*   r   model	post_initrf   r-   r/   r0   r*   )  s   
z$GraniteMoeHybridForCausalLM.__init__c                    s   t  jdi |S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, GraniteMoeHybridForCausalLM

        >>> model = GraniteMoeHybridForCausalLM.from_pretrained("ibm-granite/granite-4.0-h-tiny")
        >>> tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granite-4.0-h-tiny")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```Nr/   )r)   rV   )r,   super_kwargsr-   r/   r0   rV   /  s   z#GraniteMoeHybridForCausalLM.forwardNTFc	              
      sP   |d u r|rt | j|jd | j| jd}t j|f|||||||d|	}
|
S )Nr   r   )r4   r3   r   r5   r   rv   is_first_iteration)r   r$   r<   dtyper   r)   prepare_inputs_for_generation)r,   r   r4   r3   r   r5   r   rv   r   r7   model_inputsr-   r/   r0   r   H  s$   	z9GraniteMoeHybridForCausalLM.prepare_inputs_for_generation)NNNNNTF)	rW   rX   rY   _tied_weights_keysr"   r*   rV   r   r_   r/   r/   r-   r0   r   &  s    r   )r   r   r   )=collections.abcr   r[   r    r   r   cache_utilsr   masking_utilsr   modeling_outputsr   r	   modeling_utilsr
   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   bamba.configuration_bambar   bamba.modeling_bambar   r   r   gemma2.modeling_gemma2r   *granitemoeshared.modeling_granitemoesharedr   r   r   r   r   r   r   r   r   r    configuration_granitemoehybridr"   
get_loggerrW   loggerr#   r`   ra   re   rh   rk   rl   r   r   r   __all__r/   r/   r/   r0   <module>   s:   0
0CZD