o
    ei                    @   sH  d dl mZ d dlmZmZmZ d dlZd dlmZ d dlm	Z
 d dlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZ ddlmZm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z-m.Z.m/Z/ ddl0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6 e/7e8Z9dd Z:edd^ddZ;dej<de=dej<fd d!Z>	"d_d#ej?d$ej<d%ej<d&ej<d'ej<dB d(e@d)e@d*e)e+ fd+d,ZAee;G d-d. d.ej?ZBG d/d0 d0ZCd1ej<d2e=fd3d4ZDd5d6 ZEd7d8 ZFd9d: ZGG d;d< d<ej?ZHG d=d> d>ejj?ZIG d?d@ d@ej?ZJG dAdB dBej?ZKG dCdD dDej?ZLG dEdF dFej?ZMG dGdH dHej?ZNG dIdJ dJedKdLZOedMG dNdO dOej?ZPG dPdQ dQeZQe,G dRdS dSe'ZRe,G dTdU dUeRZS		V	d`dWej<eTej< B dB dXe=dB d'ej<dB dej<e=B fdYdZZUe,G d[d\ d\eReZVg d]ZWdS )a    )Callable)AnyOptional	TypedDictN)nn)
functional)ACT2FN   )initialization)Cache)GenerationMixin)use_kernel_forward_from_hubuse_kernel_func_from_hubuse_kernelized_func)lazy_load_kernel)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastMoeCausalLMOutputWithPastMoeModelOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torchdynamo_compilinglogging)maybe_autocastmerge_with_config_defaults)capture_outputs   )GraniteMoeHybridConfigc                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..N   dim)shapetorchcat)xx1x2 r/   |/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/granitemoehybrid/modeling_granitemoehybrid.pyrotate_half2   s   r1   rotary_pos_embc                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer1   )qkcossinunsqueeze_dimq_embedk_embedr/   r/   r0   apply_rotary_pos_emb9   s
   

r;   hidden_statesn_repreturnc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r#   N)r)   expandreshape)r<   r=   batchnum_key_value_headsslenhead_dimr/   r/   r0   	repeat_kvS   s
   0rE           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur |
| }
tjj|
dtjd	|j
}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr&   r	   r%   )r(   dtype)ptrainingr#   )rE   num_key_value_groupsr*   matmul	transposer   r   softmaxfloat32torO   rM   rQ   
contiguous)rG   rH   rI   rJ   rK   rL   rM   rN   
key_statesvalue_statesattn_weightsattn_outputr/   r/   r0   eager_attention_forward_   s   
r]   c                       s   e Zd ZdZdedef fddZ			ddejdejdB d	e	dB d
ej
dB deejejf dB dee deejejf fddZ  ZS )GraniteMoeHybridAttentionz=Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                    s   t    || _|| _t|d|j|j | _|j|j | _	|j
| _|j| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _d S )NrD   Tbias)super__init__r_   r`   getattrhidden_sizenum_attention_headsrD   rB   rR   attention_multiplierrL   attention_dropout	is_causalr   Linearattention_biasq_projk_projv_projo_projselfr_   r`   	__class__r/   r0   rd   |   s(   
z"GraniteMoeHybridAttention.__init__Nr<   rK   past_key_valuescache_positionposition_embeddingsrN   r>   c                 K   s  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|d urF|\}}t|	|
||\}	}
|d urYd|i}||
|| j	|\}
}t
| jjt}|| |	|
||f| jsmdn| j| jd|\}}|jg |dR   }| |}||fS )Nr%   r#   r&   rv   rF   )rM   rL   )r)   rD   rm   viewrT   rn   ro   r;   updater`   r   get_interfacer_   _attn_implementationr]   rQ   ri   rL   r@   rX   rp   )rr   r<   rK   ru   rv   rw   rN   input_shapehidden_shapequery_statesrY   rZ   r6   r7   cache_kwargsattention_interfacer\   r[   r/   r/   r0   forward   s:   	

z!GraniteMoeHybridAttention.forwardNNN)__name__
__module____qualname____doc__r$   intrd   r*   Tensorr   
LongTensortupler   r   r   __classcell__r/   r/   rs   r0   r^   x   s*    r^   c                   @   s   e Zd ZdZdZejdfdefddZdd Z	d	d
 Z
	ddejdejdedeeef dB deejejf f
ddZdejfddZdejdedeeef fddZddedB defddZdS ) HybridMambaAttentionDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    FNr_   c                    s0  |j | _ d| _|j}|j}g | _g | _g | _t|jD ]^}| j | dkrS|  jt	j
 |j|j d|j |  ||dg7  _|  jt	j
 |j|j||dg7  _q|  jt	jg g  dg7  _|  jt	jg g  dg7  _| j| q fddt|jD | _ fddt|jD | _d S )	NFmambar&   devicerO   r   c                        g | ]}t jg g  d qS r   r*   tensor.0_
batch_sizer   r/   r0   
<listcomp>        z=HybridMambaAttentionDynamicCache.__init__.<locals>.<listcomp>c                    r   r   r   r   r   r/   r0   r      r   )layers_block_typehas_previous_statemamba_d_convmamba_d_stateconv_states
ssm_statestransformer_layersrangenum_hidden_layersr*   zerosmamba_expandrf   mamba_n_groupsmamba_n_headsmamba_d_headr   append	key_cachevalue_cache)rr   r_   r   rO   r   conv_kernel_sizessm_state_sizeir/   r   r0   rd      sB   	
   z)HybridMambaAttentionDynamicCache.__init__c                 C   s
   t | jS N)lenr   rr   r/   r/   r0   __len__   s   
z(HybridMambaAttentionDynamicCache.__len__c                 C   s   | j | | j| fS r   )r   r   rr   r`   r/   r/   r0   __getitem__   s   z,HybridMambaAttentionDynamicCache.__getitem__rY   rZ   r`   r   r>   c                 C   sz   | j | jd dkr|| j |< || j|< ntj| j | |gdd| j |< tj| j| |gdd| j|< | j | | j| fS )Nr%   r   r&   r'   )r   r)   r   r*   r+   )rr   rY   rZ   r`   r   r/   r/   r0   ry      s   
z'HybridMambaAttentionDynamicCache.updatebeam_idxc                 C   s   |   dkrdtt| jD ]X}| j| j}| j| d||| j|< | j| j}| j| d||| j|< | j| j}| j| d||| j|< | j	| j}| j	| d||| j	|< qdS dS )zDReorders the cache for beam search, given the selected beam indices.r   N)
get_seq_lengthr   r   r   r   index_selectrW   r   r   r   )rr   r   r`   r   r/   r/   r0   reorder_cache  s    z.HybridMambaAttentionDynamicCache.reorder_cacherv   c                 C   s$   d}|j d }| || }||fS )zDReturn the length and offset of the cache, used to generate the maskr   )r)   r   )rr   rv   r`   	kv_offsetquery_length	kv_lengthr/   r/   r0   get_mask_sizes  s   
z/HybridMambaAttentionDynamicCache.get_mask_sizesr   c                 C   sN   || j vr
| j d n|}t| j|ks| j| jd dkrdS | j| jd S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   r%   )r   r   r   r)   r   r/   r/   r0   r   !  s   "z/HybridMambaAttentionDynamicCache.get_seq_lengthr   )r   )r   r   r   r   is_compileabler*   float16r$   rd   r   r   r   r   dictstrr   r   ry   r   r   r   r   r/   r/   r/   r0   r      s*    &
 r   input_tensorpad_sizec                 C   sH   t | jdkrddddd|ddfnddd|ddf}tjjj| |dddS )z
    Padding x tensor with `pad_size` on the seq_len dim (dim=1)

    Assumes that we only have tensors of either size 4 or 3
       r   constant)moderJ   )r   r)   r*   r   r   pad)r   r   	pad_shaper/   r/   r0   pad_tensor_by_size-  s   2r   c                 C   sX   t | |} t| jdkr| | jd d|| jd S | | jd d|| jd | jd S )z
    Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
    simultaneously splitting it into chunk sequences.

    Assumes that we only have tensors of either size 4 or 3
    r	   r   r%   r&   )r   r   r)   r@   )r   r   
chunk_sizer/   r/   r0   reshape_into_chunks8  s   
r   c                 C   s   |  d}| d jg |   |R  } tjtj||| jtjddd}| | d} tj| dd}tjtj||| jtjddd}|| tj	 }|S )zo
    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
    r%   .Nr   )diagonalr   r   r'   )
sizer?   r*   trilonesr   boolmasked_fillcumsuminf)r   r   masktensor_segsumr/   r/   r0   segment_sumL  s   
  r   c                 C   sN   |dur%|j d dkr%|j d dkr%| j}| |dddddf  |} | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    Nr#   r   )r)   rO   rW   )r<   rK   rO   r/   r/   r0   apply_mask_to_padding_states`  s   $ r   c                       s   e Zd ZdZdedef fddZ				ddejde	dB d	ej
dB d
ejdB dejdB f
ddZ			dde	dB d	ej
dB d
ejdB fddZ				dde	dB d	ej
dB d
ejdB dejdB fddZ  ZS )GraniteMoeHybridMambaLayeruP  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    The are a few differences between this and Mamba2Mixer:
    - The variable use_precomputed_states is slightly different due to the hybrid cache structure
    - There's a few non-obvious bugs fixed with batching in the slow path that exist in main
    - Some extra variables that our layer doesn't need have been removed
    - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged
    r_   r`   c                    s  t    |j| _|j| _|j| _|j| _t	|j
| j | _|| _|j| _|j| _t|j | _|j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _| jd| j | j  | _ t!j"| j | j |j| j| j | jd d| _#| j| j  | j }t!j$| j|| jd| _%t!&t'(| j| _)t'*d| jd }t!&t'+|| _,t-| j| jd| _.t!&t'(| j| _/t!j$| j| j| jd| _0t1d}t2|dd a3t2|dd a4t1d	}t2|d
d a5t2|dd a6t2|dd a7t8t5t4t3fa9t9st:;d d S t:;d d S )Nr&   r#   )in_channelsout_channelsrb   kernel_sizegroupspaddingra   epszcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmselective_state_updatemamba_chunk_scan_combined mamba_split_conv1d_scan_combineda  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzOThe fast path for GraniteMoeHybrid will be used when running the model on a GPU)<rc   rd   r   	num_headsrf   r   r   r   r   r   r   intermediate_sizer`   mamba_conv_biasuse_conv_bias
hidden_act
activationr   actmamba_proj_biasuse_biasrms_norm_epslayer_norm_epsilonr   n_groupsr   rD   mamba_chunk_sizer   time_step_limittime_step_mintime_step_maxconv_dimr   Conv1dconv1drk   in_proj	Parameterr*   r   dt_biasarangelogA_logGraniteMoeHybridRMSNormGatednormDout_projr   re   r   r   r   r   r   allis_fast_path_availableloggerwarning_once)rr   r_   r`   projection_sizeAcausal_conv1d	mamba_ssmrs   r/   r0   rd   {  sh   

	z#GraniteMoeHybridMambaLayer.__init__Nr<   cache_paramsrv   rK   seq_idxc                 C   s  t ||}| |}|j\}}}	| j| j }
|d uoD|joD|dkoD|j| j jd |j| j jd   ko8|kn  oD|d uoD|d dk}|r)|	dj
| j| j| jgdd\}}}t||j| j | jj	d| jj| j}tj
|| j|
|
gdd\}}}t| j  }|d d d df d d d d d f d| j| jjtjd}|d d d d d f dd| j}| jd d d df d| j}| jd d d df d| j}||| j|jd | j }||| j|jd | j }||| j| j}t|j| j ||||||d |dd
}||| j| j }| ||}|  |d d d df }|S t| j  }| j!d	td
fkr>i nd| j!i}| j"r||d u r|t#|| jj	d| jj| j|f| j| j$|| j| jj| jj%| j j| j j| j| jddd|}|S |j
| j| j| jgdd\}}}|d ur|&dd}t'j()|| j*|jd  df}|j| j +| | jdvr| ,| |&dddd |f &dd}nt-|&dd| jj	d| jj| j|d&dd}t ||}tj
|| j|
|
gdd\}}}t.|||d| j|||||| jd|||| jdf| j$| jd |d| jdd|\}}|d ur:|d ur:|j| j +| |||d}| ||}|  |}|S )Nr#   r   r%   r'   .rO   T)zr   dt_softplusrF   r   dt_limitF)r  r   r  r   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr&   )siluswish)r,   weightrb   r   r  )r   r  r  r  r  r   r  )/r   r   r)   r   r   r   r   r`   r   squeezesplitr   r   r   r   r   r  rb   r   r*   expr   floatr?   rD   rW   rV   r   r  rx   r   r  r  r   rQ   r   r   variance_epsilonrT   r   r   r   r   copy_r   r   r   )rr   r<   r  rv   rK   r  projected_statesr   seq_lenr   groups_time_state_sizeuse_precomputed_statesgatehidden_states_B_CdtBCr	  r   r  hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedr   scan_output	ssm_stater/   r/   r0   cuda_kernels_forward  s  
	




<"
^"V
$




z/GraniteMoeHybridMambaLayer.cuda_kernels_forwardc           3   
      s  |j \}}}|j}t||}|}	|	jjjjgdd\}
}}|d uoQ|joQ|dkoQ|j	j
 j d |jj
 j d   koE|kn  oQ|d uoQ|d dk}|r|j	j
 jddd|j	j
< |d d dd d f |j	j
 j|j	j
 d d d d df< |j	j
 jjjjd}tj|jjd dd}jr|jj }|}n8|d ur|dd}tj|j|j d  df}|j	j
 | |dddd |f dd}t||}tj|jjj jj gdd\}}}tj !  }|r[|jj
 j}|d d dd d f d d d df }|dd"||j d j#}j$d	 "j$j d j#}tjj%|||j }t&|j'd j'd }|d
 "jj#jjtj(d}t|d	 | j|d}|)|jddd d d f }|"|jjj |j d * }|)|d|j d }|d	 |dd d d f  }|)|dj#}||d	  j|d}|jj
 |jj
 | |  |)|jddd d d f }|"|jjj |j d * }|)|d|j d }|jj
 j|j|jd}|+|j j#j}|+|j jd}t,||}|+|jj#}j-d	 "j-j d j#}|||  |j}|)|dd d d df }ntj%|j$ }t&|j'd j'd }|)||dj#! }|)||dj! }|)||dj! }|j.jj djd}|j.jj djd}j/|j/  j/  j-d	 t0|  }||d	  }||j| } fdd||||fD \}}}}|1dddd}tj2|dd}tt3|} |d d d d d d d d d d d f |d d d d d d d d d d d f  }!|!jdd}"|"d	 | 1dddddd	  }#|#jdd}$|$d	 |d d d d d f  jdd}%t|d d d d d d dd f | }&||&1ddddd	  }'|'dd d d f |d	  jdd}(|r|jj
 d d d df j|(jd})nt4|(d d d df })tj5|)|(gdd}(tt3tj|d d d d d d df d}*|*dd}*|*d
 |(d d d d d df  jdd}+|+d d d df |+d d df }(},t|}-|dd d d f |(d d d d d df  }.|-1dddd}/|.d|/d	  }0|%|0 }|)|djj#}|| } dkrB|d d d |d d d d f }|)||d}|,d ur\|d ur\|jj
 |, 6||
}17|1|}2|2S )Nr%   r'   r#   r   )shiftsdimsr   r&   .r   ).NNr  r   )r(   output_sizec                    s   g | ]	}t | jqS r/   )r   r   )r   tr   rr   r/   r0   r     s    z<GraniteMoeHybridMambaLayer.torch_forward.<locals>.<listcomp>r	   r   r   )r#   r   )8r)   rO   r   r   r  r   r   r   r   r   r`   r   rollrW   r   r   r  r*   sumr  r   rb   r   rT   r   r   r   r   r"  r   r   r  r   r   r?   rD   r   softplusclampr   rV   r@   rX   rx   bmmr  repeat_interleaver   r   permuter   r   
zeros_liker+   r  r  )3rr   input_statesr  rv   rK   r   r$  r   rO   r#  r'  r(  r)  r&  r   r/  r<   r*  r+  r	  cache_devicer   dAdBdBxr   ssm_states_reshaped
C_reshapedyr  
D_residualA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesr1  state_decay_outC_times_statesstate_decay_out_permutedY_offr0  contextualized_statesr/   r7  r0   torch_forwards  s   


@,
$"$$$P&*"&0(&
*
 z(GraniteMoeHybridMambaLayer.torch_forwardc                 K   s   t rd| jjjjv rt s| |||||S |d urtd|j}|d urC|j	d dkrC|j	d dkrC||d d d d d f  
|}| ||||S )Ncudaz\`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`r#   r   )r  r   r  r   typer   r2  NotImplementedErrorrO   r)   rW   r[  )rr   r<   r  rv   rK   r  rN   rO   r/   r/   r0   r   B  s   	$ z"GraniteMoeHybridMambaLayer.forward)NNNNr   )r   r   r   r   r$   r   rd   r*   r   r   r   	IntTensorr2  r[  r   r   r/   r/   rs   r0   r   m  sV    Q
 .
 Sr   c                       s(   e Zd Zd fdd	ZdddZ  ZS )	r   ư>c                    s&   t    tt|| _|| _d S r   rc   rd   r   r   r*   r   r  r!  rr   rf   r   rs   r/   r0   rd   Z  s   

z%GraniteMoeHybridRMSNormGated.__init__Nc                 C   sj   |j }|tj}|d ur|tj|tj }|djddd}|t	|| j
  }| j|| S Nr&   r%   T)keepdim)rO   rW   r*   rV   r   r   r  powmeanrsqrtr!  r  )rr   r<   r'  input_dtypevariancer/   r/   r0   r   _  s   z$GraniteMoeHybridRMSNormGated.forwardr`  r   )r   r   r   rd   r   r   r/   r/   rs   r0   r   Y  s    r   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	GraniteMoeHybridMLPz~
    MLP layer for shared experts

    Args:
        config:
            Configuration object with model hyperparameters.
    r_   c                    sZ   t    |j| _|j| _t|j | _tj	| j| jd dd| _
tj	| j| jdd| _d S )Nr&   Fra   )rc   rd   rf   
input_sizeshared_intermediate_sizer   r   r   r   rk   input_linearoutput_linearrr   r_   rs   r/   r0   rd   t  s   
zGraniteMoeHybridMLP.__init__r<   r>   c                 C   s<   |  |}|jddd}| |d |d  }| |}|S )Nr&   r%   r'   r   r#   )rn  chunkr   ro  )rr   r<   chunked_hidden_statesr/   r/   r0   r   }  s
   

zGraniteMoeHybridMLP.forward)
r   r   r   r   r$   rd   r*   r   r   r   r/   r/   rs   r0   rk  k  s    	rk  c                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )GraniteMoeHybridRotaryEmbeddinginv_freqNr_   c                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultrt  F)
persistentoriginal_inv_freq)rc   rd   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr_   rope_parametersru  compute_default_rope_parametersr   attention_scalingregister_bufferclone)rr   r_   r   rope_init_fnrt  rs   r/   r0   rd     s   


z(GraniteMoeHybridRotaryEmbedding.__init__r   ztorch.devicer$  r>   ztorch.Tensorc                 C   sZ   | j d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetarD   Ng      ?r   r&   r  r   )	r|  re   rf   rg   r*   r   int64rW   r   )r_   r   r$  baser(   attention_factorrt  r/   r/   r0   r}    s   
&z?GraniteMoeHybridRotaryEmbedding.compute_default_rope_parametersc           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	|dd+ | |  
dd}tj||fdd	}| | j }| | j }	W d    n1 slw   Y  |j|jd
|	j|jd
fS )Nr   r%   r#   mpscpuF)device_typeenabledr&   r'   r  )rt  r   r?   r)   rW   r   
isinstancer]  r   r    rT   r*   r+   r6   r~  r7   rO   )
rr   r,   position_idsinv_freq_expandedposition_ids_expandedr  freqsembr6   r7   r/   r/   r0   r     s   0&z'GraniteMoeHybridRotaryEmbedding.forwardr   r   )r   r   r   r*   r   __annotations__r$   rd   staticmethodr   r   r   r   r}  no_gradr   r   r   r/   r/   rs   r0   rs    s&   
 

rs  c                       s6   e Zd Zdedededdf fddZdd	 Z  ZS )
GraniteMoeHybridParallelExpertsnum_expertsrl  r5  r>   Nc                    s6   t    tt|||| _|| _|| _|| _	dS )a  
        Initialize the GraniteMoeHybridParallelExperts module.
        The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with
        many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and
        [ScatterMoE](https://github.com/shawntan/scattermoe), as well as the
        [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
        used in vllm.

        Args:
            num_experts (int):
                Number of experts.
            input_size (int):
                Size of the input.
            output_size (int):
                Size of the output.
        N)
rc   rd   r   r   r*   emptyr  r  rl  r5  )rr   r  rl  r5  rs   r/   r0   rd     s
   

z(GraniteMoeHybridParallelExperts.__init__c                 C   sP   |j |dd}g }t| jD ]}|t|| | j|  qtj|dd}|S )a  
        Forward pass of the GraniteMoeHybridParallelExperts module.

        Args:
            inputs (Tensor):
                Input tensor.
            expert_size:
                Expert size information.

        Returns:
            Tensor: Output tensor.
        r   r'   )	r  r   r  r   Flinearr  r*   r+   )rr   inputsexpert_size
input_listoutput_listr   resultsr/   r/   r0   r     s   z'GraniteMoeHybridParallelExperts.forwardr   r   r   r   rd   r   r   r/   r/   rs   r0   r    s    r  c                       s2   e Zd Zdededef fddZdd Z  ZS )GraniteMoeHybridTopKGatingrl  r  top_kc                    s2   t    || _|| _|| _tj||dd| _dS )a  
        Initialize the top-k gating mechanism.

        Args:
            input_size (`int`):
                Size of the input.
            num_experts (`int`):
                Number of experts.
            top_k (`int`):
                Number of top experts to select.
        Fra   N)rc   rd   r  rl  r  r   rk   layer)rr   rl  r  r  rs   r/   r0   rd     s
   
z#GraniteMoeHybridTopKGating.__init__c                 C   s   |  | }|j| jdd\}}tj|dd|}tj|d| j	g|j
|jd}|d|d}| d}| }| }	|	d\}
}|j| jdd}| }|| }|||||fS )Nr#   r'   r   rO   r   trunc)rounding_mode)r  r   topkr  r*   rU   type_asr   r   r  rO   r   scatterlongr9  tolistflattensortdiv)rr   r<   logitstop_k_logitstop_k_indicestop_k_gatesr   gatesr  top_k_expertsr   index_sorted_expertsbatch_indexbatch_gatesr/   r/   r0   r     s   z"GraniteMoeHybridTopKGating.forwardr  r/   r/   rs   r0   r    s    r  c                       s.   e Zd ZdZdef fddZdd Z  ZS )GraniteMoeHybridMoEz
    A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.

    Args:
        config:
            Configuration object with model hyperparameters.
    r_   c                    sl   t    |j| _|j| _t|j | _t|j	| j| jd | _
t|j	| j| j| _t| j|j	|jd| _d S )Nr&   )rl  r  r  )rc   rd   rf   rl  r   r   r   r   r  num_local_expertsrn  ro  r  num_experts_per_tokrouterrp  rs   r/   r0   rd   -  s   
zGraniteMoeHybridMoE.__init__c                 C   s   |  \}}}|d|}| |\}}}}}|| }	| |	|}
|
jddd}| |d |d  }
| |
|}||d d d f  }tj|| | j	f|j
|jd}|d||}|||| j	}|S )Nr%   r&   r'   r   r#   r  )r   r@   r  rn  rq  r   ro  r*   r   rl  rO   r   	index_addrx   )rr   layer_inputbszlengthemb_sizer   r  r  r  expert_inputsr<   rr  expert_outputsr   layer_outputr/   r/   r0   r   @  s   zGraniteMoeHybridMoE.forward)r   r   r   r   r$   rd   r   r   r/   r/   rs   r0   r  $  s    r  c                   @   s@   e Zd ZU dZejed< ejed< eed< eed< ejed< dS )GraniteFlashAttentionKwargsaT  
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    cu_seq_lens_q (`torch.LongTensor`):
        Gets cumulative sequence length for query state.
    cu_seq_lens_k (`torch.LongTensor`):
        Gets cumulative sequence length for key state.
    max_length_q (`int`):
        Maximum sequence length for query state.
    max_length_k (`int`):
        Maximum sequence length for key state.
    seq_idx (`torch.IntTensor):
        Index of each packed sequence.
    cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kr  N)	r   r   r   r   r*   r   r  r   r_  r/   r/   r/   r0   r  S  s   
 

r  F)totalRMSNormc                       sF   e Zd Zddeddf fddZdejdejfdd	Zd
d Z  Z	S )GraniteMoeHybridRMSNormr`  r   r>   Nc                    s&   t    tt|| _|| _dS )zF
        GraniteMoeHybridRMSNorm is equivalent to T5LayerNorm
        Nra  rb  rs   r/   r0   rd   m  s   

z GraniteMoeHybridRMSNorm.__init__r<   c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S rc  )	rO   rW   r*   rV   re  rf  rg  r!  r  )rr   r<   rh  ri  r/   r/   r0   r   u  s
   zGraniteMoeHybridRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)r   r  r)   r!  r   r/   r/   r0   
extra_repr|  s   z"GraniteMoeHybridRMSNorm.extra_reprrj  )
r   r   r   r   rd   r*   r   r   r  r   r/   r/   rs   r0   r  k  s    r  c                       s   e Zd Zdedef fddZe					ddejdejdB d	e	dB d
e
dB dejdB deejejf dB dee deejeejejf dB f fddZ  ZS )GraniteMoeHybridDecoderLayerr_   r`   c                    s   t    |j| _d | _t|j|jd| _t|j|jd| _|jdkr't	|nd | _
|j| _t|| _d | _|j| dkrDt||| _nt||| _|j| | _t|dddk| _d S )Nr   r   r   r  )rc   rd   rf   	self_attnr  r   input_layernormpost_attention_layernormr  r  block_sparse_moeresidual_multiplierrk  
shared_mlpr   r   r   r^   
layer_typere   has_expertsrq   rs   r/   r0   rd     s   

z%GraniteMoeHybridDecoderLayer.__init__NFr<   rK   ru   	use_cacherv   rw   rN   r>   c              	   K   s   |}|  |}| jd ur| jd||||d|}n| jd||||||d|\}}	||| j  }|}| |}| jrI| |}
|
| | }n| |}||| j  }|S )N)r<   rv   r  rK   )r<   rK   ru   r  rv   rw   r/   )r  r   r  r  r  r  r  r  )rr   r<   rK   ru   r  rv   rw   rN   residualr   moe_hidden_statesr/   r/   r0   r     s<   






z$GraniteMoeHybridDecoderLayer.forward)NNFNN)r   r   r   r$   r   rd   r   r*   r   r   r   r   r   r   r  FloatTensorr   r   r/   r/   rs   r0   r    s2    	r  c                       sd   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZdZeedZdZe  fdd	Z  ZS )
GraniteMoeHybridPreTrainedModelr_   modelTr  ru   F)r<   
attentionsc              
      s   t  | t|trtj|jd| jjd t|t	r:t
|j t|jttd|jd  t
|j d S t|trGt
|j d S d S )NrF   )rf  stdr#   )rc   _init_weightsr  r  initnormal_r  r_   initializer_ranger   ones_r   r"  r   r*   r   r   r   r  r   )rr   rG   rs   r/   r0   r    s   

"
z-GraniteMoeHybridPreTrainedModel._init_weights)r   r   r   r$   r  base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr  r^   _can_record_outputs_is_statefulr*   r  r  r   r/   r/   rs   r0   r    s"   
 r  c                       s   e Zd Zdef fddZeee							ddej	dB dej
dB dej	dB dedB d	ejdB d
edB dej	dB dee deeB fddZdd Z  ZS )GraniteMoeHybridModelr_   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _ jdkr;t nd | _d| _ j| _|   d S )Nc                    s   g | ]}t  |qS r/   )r  )r   r`   r_   r/   r0   r         z2GraniteMoeHybridModel.__init__.<locals>.<listcomp>r   ropeF)rc   rd   pad_token_idpadding_idx
vocab_sizer   	Embeddingrf   embed_tokens
ModuleListr   r   layersr  r   r  position_embedding_typers  
rotary_embgradient_checkpointingembedding_multiplier	post_initrp  rs   r  r0   rd     s   zGraniteMoeHybridModel.__init__N	input_idsrK   r  ru   inputs_embedsr  rv   rN   r>   c              	   K   s  |d u |d uA rt d|d u r| |}|| j }|d u r6|d ur&| nd}	tj|	|	|jd  |jd}|d u r?|d}t	| j
||||}
| ||}|}d }| jd ur]| ||}| jD ]}|jdkri|n|
}||f|||||d|}q`| |}|r|jsd|_t||dS )	Nz:You must specify exactly one of input_ids or inputs_embedsr   r#   r   r   )rK   ru   r  rv   rw   T)last_hidden_stateru   )
ValueErrorr  r  r   r*   r   r)   r   r3   r   r_   _update_mamba_maskr  r  r  r  r   r   )rr   r  rK   r  ru   r  r  rv   rN   past_seen_tokenscausal_mask
mamba_maskr<   rw   decoder_layer
layer_maskr/   r/   r0   r     sX   





	
zGraniteMoeHybridModel.forwardc                 C   s.   |}|d dks|durt |dkrd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr#   )r*   r  )rr   rK   rv   r  r/   r/   r0   r  >  s   "z(GraniteMoeHybridModel._update_mamba_mask)NNNNNNN)r   r   r   r$   rd   r   r!   r"   r*   r   r   r   r  r   r   r  r   r   r   r  r   r/   r/   rs   r0   r    s@    	
Br  r&   gate_logitsr  c                    s  | du s	t | tsdS t | tr#| d j tj fdd| D dd}tjjj|dd}tj||dd\}}tjj	||}|du rStj
| dd}	tj
|dd}
ng|j\}}|jd ||  }|dddddddf |||||fd|| }tj| | ddtj|dd }	|ddddddf ||||fd| }tj|| ddtj|dd }
t|	|
d }|| S )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   c                    s   g | ]}|  qS r/   )rW   )r   
layer_gatecompute_devicer/   r0   r   l  r  z,load_balancing_loss_func.<locals>.<listcomp>r'   r%   )r  r   r   r*   r+   r   r   rU   r  one_hotrf  r   r)   r?   r@   rW   r9  r3   )r  r  r  rK   concatenated_gate_logitsrouting_weightsr   selected_expertsexpert_masktokens_per_expertrouter_prob_per_expertr   sequence_lengthr   expert_attention_mask router_per_expert_attention_maskoverall_lossr/   r  r0   load_balancing_loss_funcJ  s>   



r  c                       s   e Zd ZddiZddiZddgdgfiZdef fdd	Zee		
	
	
	
	
	
	
	
	dde
jd
B de
jd
B de
jd
B ded
B de
jd
B de
jd
B ded
B de
jd
B dee
jB deeB fddZ	
	
	
	
	
		d fdd	Z  ZS )GraniteMoeHybridForCausalLMzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr<   r  r_   c                    s`   t  | t|| _|j| _tj|j|jdd| _|j	| _	|j
| _|j| _|j| _|   d S )NFra   )rc   rd   r  r  r  r   rk   rf   r  router_aux_loss_coefr  r  r  logits_scalingr  rp  rs   r/   r0   rd     s   
z$GraniteMoeHybridForCausalLM.__init__Nr   r  rK   r  ru   r  labelsoutput_router_logitsrv   logits_to_keepr>   c
              	   K   s   |dur|n| j j}| jd||||||d|
}|j}t|	tr't|	 dn|	}| |dd|ddf }|| j j }d}|durQ| j	||fd| j j
i|
}d}|rnt|j| j| j|}|durn|| j||j 7 }t||||j|j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, GraniteMoeHybridForCausalLM

        >>> model = GraniteMoeHybridForCausalLM.from_pretrained("ibm-granite/granite-4.0-h-tiny")
        >>> tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granite-4.0-h-tiny")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)r  rK   r  ru   r  rv   r  )lossaux_lossr  ru   r<   r  router_logitsr/   )r_   r  r  r  r  r   slicer  r  loss_functionr  r  r  r  r  r  rW   r   r   ru   r<   r  )rr   r  rK   r  ru   r  r  r  rv   r  rN   outputsr<   slice_indicesr  r  r  r/   r/   r0   r     sZ   &z#GraniteMoeHybridForCausalLM.forwardTFc	              
      sP   |d u r|rt | j|jd | j| jd}t j|f|||||||d|	}
|
S )Nr   r   )ru   rK   r  rv   r  r  is_first_iteration)r   r_   r)   rO   r   rc   prepare_inputs_for_generation)rr   r  ru   rK   r  rv   r  r  r$  rN   model_inputsrs   r/   r0   r%    s$   	z9GraniteMoeHybridForCausalLM.prepare_inputs_for_generation)	NNNNNNNNr   )NNNNNTF)r   r   r   _tied_weights_keys_tp_plan_pp_planr$   rd   r   r   r*   r   r   r   r  r   r   r   r   r   r%  r   r/   r/   rs   r0   r    sZ    	
Xr  )r  r  r  )r#   )rF   )Nr&   N)Xcollections.abcr   typingr   r   r   r*   r   torch.nnr   r  transformers.activationsr    r
   r  cache_utilsr   
generationr   integrationsr   r   r   integrations.hub_kernelsr   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr    r!   utils.output_capturingr"   configuration_granitemoehybridr$   
get_loggerr   r  r1   r;   r   r   rE   Moduler   r]   r^   r   r   r   r   r   r   r   rk  rs  r  r  r  r  r  r  r  r  r   r  r  __all__r/   r/   r/   r0   <module>   s   

Fn   oA-1/Ge
R 