o
    ei                     @   s  d Z ddlZddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZ eeZdejdefddZdd Z dd Z!dd Z"G dd dZ#G dd dejj$Z%G dd dej$Z&G dd  d ej$Z'G d!d" d"eZ(eG d#d$ d$eZ)eed%d&G d'd( d(eZ*eed)d&G d*d+ d+eZ+eG d,d- d-e)Z,ed.d&G d/d0 d0e)eZ-g d1Z.dS )2zPyTorch MAMBA2 model.    N)	dataclass)nn   )initialization)ACT2FN)GenerationMixin)lazy_load_kernel)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringis_torchdynamo_compilinglogging   )Mamba2Configinput_tensorpad_sizec                 C   sH   t | jdkrddddd|ddfnddd|ddf}tjjj| |dddS )z
    Padding x tensor with `pad_size` on the seq_len dim (dim=1)

    Assumes that we only have tensors of either size 4 or 3
       r   constant)modevalue)lenshapetorchr   
functionalpad)r   r   	pad_shape r   h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/mamba2/modeling_mamba2.pypad_tensor_by_size&   s   2r   c                 C   sX   t | |} t| jdkr| | jd d|| jd S | | jd d|| jd | jd S )z
    Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
    simultaneously splitting it into chunk sequences.

    Assumes that we only have tensors of either size 4 or 3
    r   r      )r   r   r   reshape)r   r   
chunk_sizer   r   r   reshape_into_chunks1   s   
r$   c                 C   s   |  d}| d jg |   |R  } tjtj||| jtjddd}| | d} tj| dd}tjtj||| jtjddd}|| tj	 }|S )zo
    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
    r    .Ndevicedtype)diagonalr   dim)
sizeexpandr   trilonesr'   boolmasked_fillcumsuminf)r   r#   masktensor_segsumr   r   r   segment_sumE   s   
  r7   c                 C   sN   |dur%|j d dkr%|j d dkr%| j}| |dddddf  |} | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    Nr   r   )r   r(   to)hidden_statesattention_maskr(   r   r   r   apply_mask_to_padding_statesY   s   $ r;   c                
   @   sv   e Zd ZdZejdfdededejde	dB fddZ
		dd
edejdedejfddZd
edejfddZdd ZdS )Mamba2Cachea  
    Arguments:
        config: Mamba2Config
        batch_size: int
        dtype: torch.dtype
        device: torch.device

    Attributes:
        dtype: (`torch.dtype`):
            The default `dtype` used to initializing the cache.
        conv_kernel_size: (`int`):
            Model's convolution kernel size taken from config.
        n_groups: (`int`):
            Model's number of groups taken from the config - similar to tensor parallel in Transformer.
        state_size: (`int`):
            Model's SSM state size taken from config.
        num_heads: (`int`):
            The number of heads used in the linear attention / SSM.
        head_dim: (`int`):
            The respective dimension of the heads used in the linear attention / SSM.
        intermediate_size: (`int`):
            Model's intermediate_size based on (expand * hidden_dim) from config.
        conv_states: (`torch.Tensor`):
            A tensor of shape `[num_layers, batch_size, conv_kernel_size, intermediate_size + 2 * n_groups * state_size]` that holds convolutional states.
        ssm_states: (`torch.Tensor`):
            A tensor of shape `[num_layers, batch_size, num_heads, head_dim, state_size]` that holds ssm states.
    Nconfig
batch_sizer(   r'   c              	   C   s   || _ |j| _|j| _|j| _|j| _|j| _t|j|j	 | _
tj|j|| j
d| j | j  | j||d| _tj|j|| j| j| j||d| _d S )Nr!   r&   )r(   conv_kernelconv_kernel_sizen_groups
state_size	num_headshead_dimintr.   hidden_sizeintermediate_sizer   zerosnum_hidden_layersconv_states
ssm_states)selfr=   r>   r(   r'   r   r   r   __init__   s0   zMamba2Cache.__init__F	layer_idxnew_conv_state
cache_initreturnc                 C   sv   |r| | jj| j|< n)| j| jddd| j|< |d d dd d f  | jj| j| d d d d df< | j| S )Nr    )shiftsdimsr   )r8   rJ   r'   roll)rL   rN   rO   rP   r   r   r   update_conv_state   s
   8
zMamba2Cache.update_conv_statenew_ssm_statec                 C   s   | | jj| j|< | j| S N)r8   rK   r'   )rL   rN   rV   r   r   r   update_ssm_state   s   
zMamba2Cache.update_ssm_statec                 C   s   | j   | j  d S rW   )rJ   zero_rK   rL   r   r   r   reset   s   
zMamba2Cache.reset)F)__name__
__module____qualname____doc__r   float16r   rE   r(   strrM   Tensorr1   rU   rX   r[   r   r   r   r   r<   e   s0    


r<   c                       s(   e Zd Zd fdd	ZdddZ  ZS )	MambaRMSNormGatedư>c                    s&   t    tt|| _|| _d S rW   superrM   r   	Parameterr   r0   weightvariance_epsilonrL   rF   eps	__class__r   r   rM      s   

zMambaRMSNormGated.__init__Nc                 C   sj   |j }|tj}|d ur|tj|tj }|djddd}|t	|| j
  }| j|| S Nr!   r    T)keepdim)r(   r8   r   float32r   r   silupowmeanrsqrtri   rh   )rL   r9   gateinput_dtypevariancer   r   r   forward   s   zMambaRMSNormGated.forwardrd   rW   r\   r]   r^   rM   rx   __classcell__r   r   rl   r   rc      s    rc   c                
       s   e Zd ZdZdedef fddZ			ddejde	dB d	ej
dB d
ejdB fddZ			ddejde	dB d	ej
dB d
ejdB fddZ			dde	dB d	ej
dB d
ejdB fddZ  ZS )Mamba2Mixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    r=   rN   c                    s  t    |j| _|j| _|j| _|j| _t|j	| j | _
t|j| _|| _|j| _|j| _t|j | _|j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _| j
d| j | j  | _tj| j| j|j|j| j|jd d| _| j
| j | j }tj| j||jd| _ t!t"#| j| _$t"%d| jd }t!t"&|| _'t(| j
| jd| _)t!t"#| j| _*tj| j
| j|jd| _+|j| _t,d}t-|dd a.t-|dd a/t,d	}t-|d
d a0t-|dd a1t-|dd a2t3t0t1t2t/t.fa4t4st56d d S d S )Nr!   r   )in_channelsout_channelsbiaskernel_sizegroupspaddingr   rk   zcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmselective_state_updatemamba_chunk_scan_combined mamba_split_conv1d_scan_combineda  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d)7rf   rM   rC   rF   rB   ssm_state_sizer?   r@   rE   r.   rG   time_step_rankrN   use_conv_bias
hidden_act
activationr   actlayer_norm_epsilonrms_normrA   rD   r#   time_step_limittime_step_mintime_step_maxconv_dimr   Conv1dconv1dLinearuse_biasin_projrg   r   r0   dt_biasarangelogA_logrc   normDout_projr   getattrr   r   r   r   r   allis_fast_path_availableloggerwarning_once)rL   r=   rN   projection_sizeAcausal_conv1d	mamba_ssmrl   r   r   rM      sz   

	
zMamba2Mixer.__init__Nr9   cache_paramscache_positionr:   c                 C   s  t ||}| |}|j\}}}| j| j }	|jd d| j  d| j | j  | j d }
|d ur#|d ur#|d dkr#|dj|
|
| j| j	| jgdd\}}}}}t
||j| j | jjd| jj| j}tj|| j|	|	gdd\}}}t| j  }|d d d df d d d d d f d| j| jjtjd}|d d d d d f dd| j}| jd d d df d| j}| jd d d df d| j}||| j|jd | j }||| j|jd | j }||| j| j}t|j| j ||||||d |dd	
}||| j| j }| ||}| |d d d df }|S t| j  }| j d
tdfkr8i nd| j i}| j!rv|d u rvt"|| jjd| jj| j|f| j| j#d | j| jj| jj$| jj| jj| j| jddd|}|S |j|
|
| j| j	| jgdd\}}}}}|d ur|%dd}t&j'(||j)|jd  df}|j*| j|dd | jdvr| +| |%dddd |f %dd}nt,|%dd| jjd| jj| jd%dd}t ||}tj|| j|	|	gdd\}}}t-|||d| j|||||| jd|||| jdf| j#| jd d d| jdd|\}}|d ur6|d ur6|j.| j|d |||d}| ||}| |}|S )Nr    r!   r   r   r+   .r(   T)zr   dt_softplusg        r4   dt_limitF)r   r#   seq_idxr   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesrN   rO   rP   )rq   swish)xrh   r   r   )r#   r   r   r   r   r   r   rN   rV   )/r;   r   r   rA   r   rG   rC   squeezesplitr   r   rJ   rN   r   rh   r   r   r   expr   floatr.   rD   r8   rp   r   r   viewr   rK   r   r   r   trainingr   r#   ri   	transposer   r   r   r@   rU   r   r   r   rX   )rL   r9   r   r   r:   projected_statesr>   seq_len_groups_time_state_sized_mlpru   hidden_states_B_CdtBCr   r   r   hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedrJ   scan_output	ssm_stater   r   r   cuda_kernels_forward"  s  

"


<"
]"T
$




z Mamba2Mixer.cuda_kernels_forwardc           2   
      s  |j \}}}|j}t||}|}	|	j d dj  dj j  j d }
|	j|
|
jj	jgdd\}}}}}|d ur|d ur|d dkr|j
j|dd |jj jjjjd}tj|jjd dd}jry|jj }|}n8|d ur|dd}tj||j|j d  df}|j
j|d	d |ddd
d |f dd}t||}tj|jjj jj gdd\}}}tj  }|d ur7|d ur7|d dkr7|jj}|d d dd d f d d d d
f }|dd ||j d j!}j"d  j"j d j!}tjj#|||j }t$|j%d j%d }|d  jj!jjtj&d}t|d | j|d}|'|jdd
d d d f }| |jjj |j d ( }|'|d|j d }|d |d
d d d f  }|'|dj!}||d  j|d}|j)j|jj | | d |'|jdd
d d d f }| |jjj |j d ( }|'|d|j d }|jj j|j|jd}|*|j j!j}|*|j jd}t+||}|*|jj!}j,d  j,j d j!}|||  |j}|'|dd d d d
f }ntj#|j" }t$|j%d j%d }|'||dj! }|'||dj }|'||dj }|j-jj djd}|j-jj djd}j.|j.  j.  j,d t/|  }||d  }||j| } fdd||||fD \}}}}|0dddd}tj1|dd}tt2|}|d d d d d d d d d d d f |d d d d d d d d d d d f  } | jdd}!|!d |0dddddd  }"|"jdd}#|#d |d d d d d f  jdd}$t|d d d d d d dd f | }%||%0ddddd  }&|&d
d d d f |d  jdd}'|d ur|d ur|d dkr|jj d d d d
f j|'jd}(nt3|'d d d df }(tj4|(|'gdd}'tt2tj|d d d d d d df d})|)dd})|)d |'d d d d d d
f  jdd}*|*d d d df |*d d df }'}+t|},|d
d d d f |'d d d d d d
f  }-|,0dddd}.|-d|.d  }/|$|/ }|'|djj!}|| } dkr,|d d d |d d d d f }|'||d}|+d urE|d urE|j)j|+d 5||}06|0|}1|1S )Nr    r!   r+   r   Fr   r'   r   T.r%   ).NNr   r   r&   )r,   output_sizec                    s   g | ]	}t | jqS r   )r$   r#   ).0tr   rL   r   r   
<listcomp>D  s    z-Mamba2Mixer.torch_forward.<locals>.<listcomp>r   r   r*   )r   r   )7r   r(   r;   r   rG   rA   r   rC   r   r   rU   rN   rJ   r8   r   rh   r'   r   sumr   r   r   r   r   r   r   r   r@   r   r   r   rK   r.   rD   r   softplusclampr   rp   r"   
contiguousrX   r   bmmr   repeat_interleaver#   r   permuter3   r7   
zeros_likecatr   r   )2rL   r9   r   r   r:   r>   r   r   r(   r   r   ru   r   r   rJ   r   r   r   r   cache_devicer   dAdBdBxrK   ssm_states_reshaped
C_reshapedyr   
D_residualA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesr   state_decay_outC_times_statesstate_decay_out_permutedY_offr   contextualized_statesr   r   r   torch_forward  s   

.,
"$"$$$P&*""&0(&
*
 zMamba2Mixer.torch_forwardc                 C   s:   t rd| jjjjv rt s| ||||S | ||||S )Ncuda)r   r   rh   r'   typer   r   r   )rL   r9   r   r   r:   r   r   r   rx     s   zMamba2Mixer.forwardNNN)r\   r]   r^   r_   r   rE   rM   r   rb   r<   
LongTensorr   r   rx   r{   r   r   rl   r   r|      sN    Y
 '
 Ir|   c                       s&   e Zd Zd fdd	Zdd Z  ZS )Mamba2RMSNormrd   c                    s&   t    tt|| _|| _dS )zM
        Mamba2RMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
        Nre   rj   rl   r   r   rM     s   

zMamba2RMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S rn   )	r(   r8   r   rp   rr   rs   rt   ri   rh   )rL   r9   rv   rw   r   r   r   rx     s
   zMamba2RMSNorm.forwardry   rz   r   r   rl   r   r    s    r  c                       sJ   e Zd Z fddZ			d	dedB dejdB dejdB fddZ  Z	S )
Mamba2Blockc                    sB   t    || _|| _|j| _t|j|jd| _t	||d| _
d S )Nr   rN   )rf   rM   r=   rN   residual_in_fp32r  rF   r   r   r|   mixer)rL   r=   rN   rl   r   r   rM     s   
zMamba2Block.__init__Nr   r   r:   c                 C   sL   |}|  |j| j jjd}| jr|tj}| j||||d}|| }|S )Nr   r   r   r:   )r   r8   rh   r(   r  r   rp   r  )rL   r9   r   r   r:   residualr   r   r   rx     s   zMamba2Block.forwardr  )
r\   r]   r^   rM   r<   r   r  rb   rx   r{   r   r   rl   r   r    s    r  c                   @   s8   e Zd ZU eed< dZdgZdZdZe	
 dd ZdS )Mamba2PreTrainedModelr=   backboner  Tc                 C   s  | j j}t|trtd| j jd }t|j	t
| t|j tt| j jt
| j jt
| j j  t
| j j j| j jd}|t
t|   }t|j| tj|jjtdd |jjdurtt|jj tj|jjtdd | j jr|jj}|t| j j }t|tj rtj!|j|d |jdurt|j dS dS t|t"t#frt|j dS t|tj$rtj!|j|d dS dS )zInitialize the weights.r   )min   )aN)std)%r=   initializer_range
isinstancer|   r   r   rC   initcopy_r   r   ones_r   r   randmathr   r   r   time_step_floorexpm1r   kaiming_uniform_r   rh   sqrtr   zeros_r   rescale_prenorm_residualrI   r   r   normal_r  rc   	Embedding)rL   moduler  r   r   inv_dtpr   r   r   _init_weights  sB   

z#Mamba2PreTrainedModel._init_weightsN)r\   r]   r^   r   __annotations__base_model_prefix_no_split_modulessupports_gradient_checkpointing_is_statefulr   no_gradr"  r   r   r   r   r
    s   
 r
  z-
    Class for the MAMBA2 model outputs.
    )custom_introc                   @   sJ   e Zd ZU dZdZejdB ed< dZe	dB ed< dZ
eej dB ed< dS )Mamba2Outputa:  
    cache_params (`Mamba2Cache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlast_hidden_stater   r9   )r\   r]   r^   r_   r+  r   FloatTensorr#  r   r<   r9   tupler   r   r   r   r*    s
   
 r*  zK
    Base class for causal language model (or autoregressive) outputs.
    c                   @   s\   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
dB ed< dZeej dB ed< dS )Mamba2CausalLMOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    cache_params (`Mamba2Cache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlosslogitsr   r9   )r\   r]   r^   r_   r/  r   r,  r#  r0  r   r<   r9   r-  r   r   r   r   r.    s   
 r.  c                       s   e Zd Z fddZdd Zdd Zdd Ze																dd
ej	d	B dej	d	B de
d	B ded	B ded	B ded	B dej	d	B dejd	B deeB fddZ  ZS )Mamba2Modelc                    sn   t    t j j| _t fddt j	D | _
d| _t j jd| _| | j |   d S )Nc                    s   g | ]}t  |d qS )r  )r  )r   idxr=   r   r   r   2  s    z(Mamba2Model.__init__.<locals>.<listcomp>Fr   )rf   rM   r   r  
vocab_sizerF   
embeddings
ModuleListrangerI   layersgradient_checkpointingr  r   norm_f"_register_load_state_dict_pre_hook	load_hook	post_initrL   r=   rl   r3  r   rM   .  s    zMamba2Model.__init__c                 G   s2   |D ]}d|v r| |||dd<  d S qd S )Nz
embedding.zembeddings.)popreplace)rL   
state_dictprefixargskr   r   r   r<  :  s   zMamba2Model.load_hookc                 C   s   | j S rW   r5  rZ   r   r   r   get_input_embeddings@  s   z Mamba2Model.get_input_embeddingsc                 C   s
   || _ d S rW   rE  rL   new_embeddingsr   r   r   set_input_embeddingsC     
z Mamba2Model.set_input_embeddingsN	input_idsinputs_embedsr   	use_cacheoutput_hidden_statesreturn_dictr   r:   rQ   c	                 K   sd  |dur|n| j j}|dur|n| js| j jnd}|dur|n| j j}|du |duA r/td|du r8| |}| jrB| jrB|rBd}|rk|du rbt| j |	d|j
|jd}tjd| j j|j
d}n|du rjtdnd}|}
|rsdnd}| jD ]}||
|||d	}
|r||
f }qx| |
}
|r||
f }|std
d |
||fD S t|
|r||dS d|dS )a  
        cache_params (`Mamba2Cache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        cache_position (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            The position of the current input in the cache. This is used to ensure that the cache is correctly updated.
            If `cache_params` is passed, `cache_position` should also be passed.
        NFz:You must specify exactly one of input_ids or inputs_embedsr   r&   r   zYou have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will be initialized for you automaticallyr   r  c                 s   s    | ]	}|d ur|V  qd S rW   r   )r   vr   r   r   	<genexpr>  s    z&Mamba2Model.forward.<locals>.<genexpr>)r+  r   r9   )r=   rN  r   rM  use_return_dict
ValueErrorr5  r9  r<   r-   r'   r(   r   r   r?   r8  r:  r-  r*  )rL   rK  rL  r   rM  rN  rO  r   r:   kwargsr9   all_hidden_statesmixer_blockr   r   r   rx   F  s^   





zMamba2Model.forward)NNNNNNNN)r\   r]   r^   rM   r<  rF  rI  r   r   r  r<   r1   rb   r-  r*  rx   r{   r   r   rl   r   r1  ,  sB    	r1  z
    The MAMBA2 Model transformer with a language modeling head on top (linear layer with weights not tied to the input
    embeddings).
    c                       s   e Zd ZddiZ fddZdd Zdd Z											
dded	B dej	d	B dej
d	B ded	B f fddZe																			ddej	d	B dejd	B ded	B dej	d	B ded	B ded	B ded	B dej
d	B dej
d	B deej
B deeB fddZ  ZS )Mamba2ForCausalLMzlm_head.weightzbackbone.embeddings.weightc                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NFr   )
rf   rM   r1  r  r   r   rF   r4  lm_headr=  r>  rl   r   r   rM     s   
zMamba2ForCausalLM.__init__c                 C   s
   | j  S rW   )r  rF  rZ   r   r   r   rF    rJ  z&Mamba2ForCausalLM.get_input_embeddingsc                 C   s   | j |S rW   )r  rI  rG  r   r   r   rI    s   z&Mamba2ForCausalLM.set_input_embeddingsNFr   r   r:   is_first_iterationc              	      s   t  j|f||||||d|}	|rD|d u rDtjd| jjj|jd|	d< |d ur/|d}
n|d}
t	| jj|
| j| j
d|	d< |	S |rP|d dkrPd |	d< |	S )N)rL  rM  r   r   r:   rY  r   r   r   r&   r   r:   )rf   prepare_inputs_for_generationr   r   r  r=   r?   r'   r-   r<   r(   )rL   rK  rL  rM  r   r   r:   rY  rT  model_inputsmax_batch_sizerl   r   r   rZ    s0   

z/Mamba2ForCausalLM.prepare_inputs_for_generationr   rK  rL  labelsrN  rO  rM  logits_to_keeprQ   c              
   K   s   |dur|n| j j}| j||||||||	d}|d }t|
tr&t|
 dn|
}| |dd|ddf | jjj	
 }d}|durR| jd||| j jd|}|sh|f|dd  }|durf|f| S |S t|||j|jdS )ao  
        cache_params (`Mamba2Cache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        cache_position (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            The position of the current input in the cache. This is used to ensure that the cache is correctly updated.
            If `cache_params` is passed, `cache_position` should also be passed.
        N)r   rL  rN  rO  rM  r   r:   r   )r0  r]  r4  r   )r/  r0  r   r9   r   )r=   rR  r  r  rE   slicerX  r8   rh   r(   r   loss_functionr4  r.  r   r9   )rL   rK  rL  r   r]  rN  rO  rM  r   r:   r^  rT  mamba2_outputsr9   slice_indicesr0  r/  outputr   r   r   rx     s4   ,zMamba2ForCausalLM.forward)NNNNNF)
NNNNNNNNNr   )r\   r]   r^   _tied_weights_keysrM   rF  rI  r<   r   r  rb   r1   rZ  r   r,  rE   r-  r.  rx   r{   r   r   rl   r   rW    sl    *	
rW  )rW  r1  r
  )/r_   r  dataclassesr   r   r    r   r  activationsr   
generationr   integrationsr   modeling_layersr	   modeling_utilsr
   utilsr   r   r   r   configuration_mamba2r   
get_loggerr\   r   rb   rE   r   r$   r7   r;   r<   Modulerc   r|   r  r  r
  r*  r.  r1  rW  __all__r   r   r   r   <module>   sZ   
M   U9my