o
    id                     @   s  d Z ddlZddlmZ ddlmZmZ ddlZddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZ eeZe reddlmZ ddlm Z m!Z! nd\Z Z!Ze rvddl"m#Z#m$Z$ nd\Z$Z#e%ee e!e#e$fZ&dej'de(fddZ)dd Z*dd Z+dd Z,G dd dZ-G d d! d!ejj.Z/G d"d# d#ej.Z0G d$d% d%ej.Z1G d&d' d'eZ2eG d(d) d)eZ3eed*d+G d,d- d-eZ4eed.d+G d/d0 d0eZ5eG d1d2 d2e3Z6ed3d+G d4d5 d5e3eZ7g d6Z8dS )7zPyTorch MAMBA2 model.    N)	dataclass)OptionalUnion)nn   )ACT2FN)GenerationMixin)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringlogging)is_causal_conv1d_availableis_mamba_2_ssm_available   )Mamba2Config)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combinedNNN)causal_conv1d_fncausal_conv1d_update)NNinput_tensorpad_sizec                 C   sH   t | jdkrddddd|ddfnddd|ddf}tjjj| |dddS )z
    Padding x tensor with `pad_size` on the seq_len dim (dim=1)

    Assumes that we only have tensors of either size 4 or 3
       r   constant)modevalue)lenshapetorchr   
functionalpad)r   r   	pad_shape r$   g/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/mamba2/modeling_mamba2.pypad_tensor_by_sizeA   s   2r&   c                 C   sX   t | |} t| jdkr| | jd d|| jd S | | jd d|| jd | jd S )z
    Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
    simultaneously splitting it into chunk sequences.

    Assumes that we only have tensors of either size 4 or 3
    r   r      )r&   r   r   reshape)r   r   
chunk_sizer$   r$   r%   reshape_into_chunksL   s   
r+   c                 C   s   |  d}| d jg |   |R  } tjtj||| jtjddd}| | d} tj| dd}tjtj||| jtjddd}|| tj	 }|S )zo
    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
    r'   .Ndevicedtype)diagonalr   dim)
sizeexpandr    trilonesr.   boolmasked_fillcumsuminf)r   r*   masktensor_segsumr$   r$   r%   segment_sum`   s   
  r>   c                 C   sN   |dur%|j d dkr%|j d dkr%| j}| |dddddf  |} | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    Nr   r   )r   r/   to)hidden_statesattention_maskr/   r$   r$   r%   apply_mask_to_padding_statest   s   $ rB   c                
   @   sv   e Zd ZdZejdfdededejde	e
 fddZ		dd
edejdedejfddZd
edejfddZdd ZdS )Mamba2Cachea  
    Arguments:
        config: Mamba2Config
        batch_size: int
        dtype: torch.dtype
        device: torch.device

    Attributes:
        dtype: (`torch.dtype`):
            The default `dtype` used to initializing the cache.
        conv_kernel_size: (`int`):
            Model's convolution kernel size taken from config.
        n_groups: (`int`):
            Model's number of groups taken from the config - similar to tensor parallel in Transformer.
        state_size: (`int`):
            Model's SSM state size taken from config.
        num_heads: (`int`):
            The number of heads used in the linear attention / SSM.
        head_dim: (`int`):
            The respective dimension of the heads used in the linear attention / SSM.
        intermediate_size: (`int`):
            Model's intermediate_size based on (expand * hidden_dim) from config.
        conv_states: (`torch.Tensor`):
            A tensor of shape `[num_layers, batch_size, conv_kernel_size, intermediate_size + 2 * n_groups * state_size]` that holds convolutional states.
        ssm_states: (`torch.Tensor`):
            A tensor of shape `[num_layers, batch_size, num_heads, head_dim, state_size]` that holds ssm states.
    Nconfig
batch_sizer/   r.   c              	   C   s   || _ |j| _|j| _|j| _|j| _|j| _t|j|j	 | _
tj|j|| j
d| j | j  | j||d| _tj|j|| j| j| j||d| _d S )Nr(   r-   )r/   conv_kernelconv_kernel_sizen_groups
state_size	num_headshead_dimintr5   hidden_sizeintermediate_sizer    zerosnum_hidden_layersconv_states
ssm_states)selfrD   rE   r/   r.   r$   r$   r%   __init__   s0   zMamba2Cache.__init__F	layer_idxnew_conv_state
cache_initreturnc                 C   sv   |r| | jj| j|< n)| j| jddd| j|< |d d dd d f  | jj| j| d d d d df< | j| S )Nr'   )shiftsdimsr   )r?   rQ   r.   roll)rS   rU   rV   rW   r$   r$   r%   update_conv_state   s
   8
zMamba2Cache.update_conv_statenew_ssm_statec                 C   s   | | jj| j|< | j| S N)r?   rR   r.   )rS   rU   r]   r$   r$   r%   update_ssm_state   s   
zMamba2Cache.update_ssm_statec                 C   s   | j   | j  d S r^   )rQ   zero_rR   rS   r$   r$   r%   reset   s   
zMamba2Cache.reset)F)__name__
__module____qualname____doc__r    float16r   rL   r/   r   strrT   Tensorr8   r\   r_   rb   r$   r$   r$   r%   rC      s0    


rC   c                       s(   e Zd Zd fdd	ZdddZ  ZS )	MambaRMSNormGatedư>c                    s&   t    tt|| _|| _d S r^   superrT   r   	Parameterr    r7   weightvariance_epsilonrS   rM   eps	__class__r$   r%   rT      s   

zMambaRMSNormGated.__init__Nc                 C   sj   |j }|tj}|d ur|tj|tj }|djddd}|t	|| j
  }| j|| S Nr(   r'   T)keepdim)r/   r?   r    float32r   r!   silupowmeanrsqrtrp   ro   )rS   r@   gateinput_dtypevariancer$   r$   r%   forward   s   zMambaRMSNormGated.forwardrk   r^   rc   rd   re   rT   r   __classcell__r$   r$   rs   r%   rj      s    rj   c                
       s   e Zd ZdZdedef fddZ			ddejde	e
 d	e	ej d
e	ej fddZ			ddejde	e
 d	e	ej d
e	ej fddZ			dde	e
 d	e	ej d
e	ej fddZ  ZS )Mamba2Mixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    rD   rU   c                    s  t    |j| _|j| _|j| _|j| _t|j	| j | _
t|j| _|| _|j| _|j| _t|j | _|j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _| j
d| j | j  | _tj| j| j|j|j| j|jd d| _| j
| j | j }tj| j||jd| _ t!t"#| j| _$t"%d| jd }t!t"&|| _'t(| j
| jd| _)t!t"#| j| _*tj| j
| j|jd| _+|j| _t,st-.d d S d S )Nr(   r   )in_channelsout_channelsbiaskernel_sizegroupspaddingr   rr   a  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d)/rm   rT   rJ   rM   rI   ssm_state_sizerF   rG   rL   r5   rN   time_step_rankrU   use_conv_bias
hidden_act
activationr   actlayer_norm_epsilonrms_normrH   rK   r*   time_step_limittime_step_mintime_step_maxconv_dimr   Conv1dconv1dLinearuse_biasin_projrn   r    r7   dt_biasarangelogA_logrj   normDout_projis_fast_path_availableloggerwarning_once)rS   rD   rU   projection_sizeArs   r$   r%   rT      s\   

	zMamba2Mixer.__init__Nr@   cache_paramscache_positionrA   c                 C   s  t ||}| |}|j\}}}| j| j }	|jd d| j  d| j | j  | j d }
|d ur#|d ur#|d dkr#|dj|
|
| j| j	| jgdd\}}}}}t
||j| j | jjd| jj| j}tj|| j|	|	gdd\}}}t| j  }|d d d df d d d d d f d| j| jjtjd}|d d d d d f dd| j}| jd d d df d| j}| jd d d df d| j}||| j|jd | j }||| j|jd | j }||| j| j}t|j| j ||||||d |dd	
}||| j| j }| ||}| |d d d df }|S t| j  }| j d
tdfkr8i nd| j i}| j!rv|d u rvt"|| jjd| jj| j|f| j| j#d | j| jj| jj$| jj| jj| j| jddd|}|S |j|
|
| j| j	| jgdd\}}}}}|d ur|%dd}t&j'(||j)|jd  df}|j*| j|dd | jdvr| +| |%dddd |f %dd}nt,|%dd| jjd| jj| jd%dd}t ||}tj|| j|	|	gdd\}}}t-|||d| j|||||| jd|||| jdf| j#| jd d d| jdd|\}}|d ur6|d ur6|j.| j|d |||d}| ||}| |}|S )Nr'   r(   r   r   r2   .r/   T)zr   dt_softplusg        r;   dt_limitF)r   r*   seq_idxr   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesrU   rV   rW   )rx   swish)xro   r   r   )r*   r   r   r   r   r   r   rU   r]   )/rB   r   r   rH   r   rN   rJ   squeezesplitr   r   rQ   rU   r   ro   r   r   r    expr   floatr5   rK   r?   rw   r   r   viewr   rR   r   r   r   trainingr   r*   rp   	transposer   r!   r"   rG   r\   r   r   r   r_   )rS   r@   r   r   rA   projected_statesrE   seq_len_groups_time_state_sized_mlpr|   hidden_states_B_CdtBCr   r   r   hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedrQ   scan_output	ssm_stater$   r$   r%   cuda_kernels_forward&  s  

"


<"
]"T
$




z Mamba2Mixer.cuda_kernels_forwardc           2   
      s  |j \}}}|j}t||}|}	|	j d dj  dj j  j d }
|	j|
|
jj	jgdd\}}}}}|d ur|d ur|d dkr|j
j|dd |jj jjjjd}tj|jjd dd}jry|jj }|}n8|d ur|dd}tj||j|j d  df}|j
j|d	d |ddd
d |f dd}t||}tj|jjj jj gdd\}}}tj  }|d ur7|d ur7|d dkr7|jj}|d d dd d f d d d d
f }|dd ||j d j!}j"d  j"j d j!}tjj#|||j }t$|j%d j%d }|d  jj!jjtj&d}t|d | j|d}|'|jdd
d d d f }| |jjj |j d ( }|'|d|j d }|d |d
d d d f  }|'|dj!}||d  j|d}|j)j|jj | | d |'|jdd
d d d f }| |jjj |j d ( }|'|d|j d }|jj j|j|jd}|*|j j!j}|*|j jd}t+||}|*|jj!}j,d  j,j d j!}|||  |j}|'|dd d d d
f }ntj#|j" }t$|j%d j%d }|'||dj! }|'||dj }|'||dj }|j-jj djd}|j-jj djd}j.|j.  j.  j,d t/|  }||d  }||j| } fdd||||fD \}}}}|0dddd}tj1|dd}tt2|}|d d d d d d d d d d d f |d d d d d d d d d d d f  } | jdd}!|!d |0dddddd  }"|"jdd}#|#d |d d d d d f  jdd}$t|d d d d d d dd f | }%||%0ddddd  }&|&d
d d d f |d  jdd}'|d ur|d ur|d dkr|jj d d d d
f j|'jd}(nt3|'d d d df }(tj4|(|'gdd}'tt2tj|d d d d d d df d})|)dd})|)d |'d d d d d d
f  jdd}*|*d d d df |*d d df }'}+t|},|d
d d d f |'d d d d d d
f  }-|,0dddd}.|-d|.d  }/|$|/ }|'|djj!}|| } dkr,|d d d |d d d d f }|'||d}|+d urE|d urE|j)j|+d 5||}06|0|}1|1S )Nr'   r(   r2   r   Fr   r.   r   T.r,   ).NNr   r   r-   )r3   output_sizec                    s   g | ]	}t | jqS r$   )r+   r*   ).0tr   rS   r$   r%   
<listcomp>H  s    z-Mamba2Mixer.torch_forward.<locals>.<listcomp>r   r   r1   )r   r   )7r   r/   rB   r   rN   rH   r   rJ   r   r   r\   rU   rQ   r?   r   ro   r.   r    sumr   r   r   r   r   r   r!   r"   rG   r   r   r   rR   r5   rK   r   softplusclampr   rw   r)   
contiguousr_   r   bmmr   repeat_interleaver*   r&   permuter:   r>   
zeros_likecatr   r   )2rS   r@   r   r   rA   rE   r   r   r/   r   r   r|   r   r   rQ   r   r   r   r   cache_devicer   dAdBdBxrR   ssm_states_reshaped
C_reshapedyr   
D_residualA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesr   state_decay_outC_times_statesstate_decay_out_permutedY_offr   contextualized_statesr$   r   r%   torch_forward  s   

.,
"$"$$$P&*""&0(&
*
 zMamba2Mixer.torch_forwardc                 C   s4   t rd| jjjjv r| ||||S | ||||S )Ncuda)r   r   ro   r.   typer   r   )rS   r@   r   r   rA   r$   r$   r%   r     s   zMamba2Mixer.forwardr   )rc   rd   re   rf   r   rL   rT   r    ri   r   rC   
LongTensorr   r   r   r   r$   r$   rs   r%   r      sN    C
 '
 Ir   c                       s&   e Zd Zd fdd	Zdd Z  ZS )Mamba2RMSNormrk   c                    s&   t    tt|| _|| _dS )zM
        Mamba2RMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
        Nrl   rq   rs   r$   r%   rT     s   

zMamba2RMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S ru   )	r/   r?   r    rw   ry   rz   r{   rp   ro   )rS   r@   r}   r~   r$   r$   r%   r     s
   zMamba2RMSNorm.forwardr   r   r$   r$   rs   r%   r     s    r   c                       sJ   e Zd Z fddZ			d	dee deej deej fddZ	  Z
S )
Mamba2Blockc                    sB   t    || _|| _|j| _t|j|jd| _t	||d| _
d S )Nr   rU   )rm   rT   rD   rU   residual_in_fp32r   rM   r   r   r   mixer)rS   rD   rU   rs   r$   r%   rT     s   
zMamba2Block.__init__Nr   r   rA   c                 C   sL   |}|  |j| j jjd}| jr|tj}| j||||d}|| }|S )Nr   r   r   rA   )r   r?   ro   r/   r  r    rw   r  )rS   r@   r   r   rA   residualr$   r$   r%   r     s   zMamba2Block.forwardr   )rc   rd   re   rT   r   rC   r    r   ri   r   r   r$   r$   rs   r%   r    s    r  c                   @   s0   e Zd ZU eed< dZdgZdZdZdd Z	dS )Mamba2PreTrainedModelrD   backboner  Tc                 C   s  | j j}t|trtd| j jd }|jt	| |j
jd tt| j jt	| j jt	| j j  t	| j j j| j jd}|t	t|   }|j| d|j_tjj|jjtdd |jjdurt|jjdd	stj|jj tjj|j jtdd | j j!r|j j}|t| j j" }t|tj#rt|jdd	stjj$|j|d
 |jdurt|jdd	stj|j dS dS dS t|t%t&fr|jjd dS t|tj'rtjj$|j|d
 dS dS )zInitialize the weights.r   g      ?)minT   )aN
_no_reinitF)std)(rD   initializer_range
isinstancer   r    r   rJ   r   copy_r   r   datafill_r   randmathr   r   r   time_step_floorexpm1r   r  r   initkaiming_uniform_r   ro   sqrtr   getattrzeros_r   rescale_prenorm_residualrP   r   normal_r   rj   	Embedding)rS   moduler  r   r   inv_dtpr$   r$   r%   _init_weights  sL   

z#Mamba2PreTrainedModel._init_weightsN)
rc   rd   re   r   __annotations__base_model_prefix_no_split_modulessupports_gradient_checkpointing_is_statefulr"  r$   r$   r$   r%   r    s   
 r  z-
    Class for the MAMBA2 model outputs.
    )custom_introc                   @   sJ   e Zd ZU dZdZeej ed< dZ	ee
 ed< dZeeej  ed< dS )Mamba2Outputa:  
    cache_params (`Mamba2Cache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlast_hidden_stater   r@   )rc   rd   re   rf   r*  r   r    FloatTensorr#  r   rC   r@   tupler$   r$   r$   r%   r)    s
   
 r)  zK
    Base class for causal language model (or autoregressive) outputs.
    c                   @   s\   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
ee ed< dZeeej  ed< dS )Mamba2CausalLMOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    cache_params (`Mamba2Cache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlosslogitsr   r@   )rc   rd   re   rf   r.  r   r    r+  r#  r/  r   rC   r@   r,  r$   r$   r$   r%   r-    s   
 r-  c                       s   e Zd Z fddZdd Zdd Zdd Ze																dd
ee	j
 dee	j
 dee dee dee dee dee	j
 dee	j deeef fddZ  ZS )Mamba2Modelc                    sn   t    t j j| _t fddt j	D | _
d| _t j jd| _| | j |   d S )Nc                    s   g | ]}t  |d qS )r  )r  )r   idxrD   r$   r%   r   9  s    z(Mamba2Model.__init__.<locals>.<listcomp>Fr   )rm   rT   r   r  
vocab_sizerM   
embeddings
ModuleListrangerP   layersgradient_checkpointingr   r   norm_f"_register_load_state_dict_pre_hook	load_hook	post_initrS   rD   rs   r2  r%   rT   5  s    zMamba2Model.__init__c                 G   s2   |D ]}d|v r| |||dd<  d S qd S )Nz
embedding.zembeddings.)popreplace)rS   
state_dictprefixargskr$   r$   r%   r;  A  s   zMamba2Model.load_hookc                 C   s   | j S r^   r4  ra   r$   r$   r%   get_input_embeddingsG  s   z Mamba2Model.get_input_embeddingsc                 C   s
   || _ d S r^   rD  rS   new_embeddingsr$   r$   r%   set_input_embeddingsJ     
z Mamba2Model.set_input_embeddingsN	input_idsinputs_embedsr   	use_cacheoutput_hidden_statesreturn_dictr   rA   rX   c	                 K   sd  |dur|n| j j}|dur|n| js| j jnd}|dur|n| j j}|du |duA r/td|du r8| |}| jrB| jrB|rBd}|rk|du rbt| j |	d|j
|jd}tjd| j j|j
d}n|du rjtdnd}|}
|rsdnd}| jD ]}||
|||d	}
|r||
f }qx| |
}
|r||
f }|std
d |
||fD S t|
|r||dS d|dS )a  
        cache_params (`Mamba2Cache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        cache_position (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            The position of the current input in the cache. This is used to ensure that the cache is correctly updated.
            If `cache_params` is passed, `cache_position` should also be passed.
        NFz:You must specify exactly one of input_ids or inputs_embedsr   r-   r   zYou have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will be initialized for you automaticallyr$   r  c                 s   s    | ]	}|d ur|V  qd S r^   r$   )r   vr$   r$   r%   	<genexpr>  s    z&Mamba2Model.forward.<locals>.<genexpr>)r*  r   r@   )rD   rM  r   rL  use_return_dict
ValueErrorr4  r8  rC   r4   r.   r/   r    r   rF   r7  r9  r,  r)  )rS   rJ  rK  r   rL  rM  rN  r   rA   kwargsr@   all_hidden_statesmixer_blockr$   r$   r%   r   M  s^   





zMamba2Model.forward)NNNNNNNN)rc   rd   re   rT   r;  rE  rH  r   r   r    r   rC   r8   ri   r   r,  r)  r   r   r$   r$   rs   r%   r0  3  sB    	
r0  z
    The MAMBA2 Model transformer with a language modeling head on top (linear layer with weights not tied to the input
    embeddings).
    c                       s   e Zd Zg Z fddZdd Zdd Z					ddee d	ee	j
 d
ee	j fddZe									ddee	j
 dee	j dee dee	j
 dee dee dee d	ee	j d
ee	j deeef fddZ  ZS )Mamba2ForCausalLMc                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NFr   )
rm   rT   r0  r  r   r   rM   r3  lm_headr<  r=  rs   r$   r%   rT     s   
zMamba2ForCausalLM.__init__c                 C   s
   | j  S r^   )r  rE  ra   r$   r$   r%   rE    rI  z&Mamba2ForCausalLM.get_input_embeddingsc                 C   s   | j |S r^   )r  rH  rF  r$   r$   r%   rH    s   z&Mamba2ForCausalLM.set_input_embeddingsNr   r   rA   c                 K   s   d|  i}|r7|d u r7tjd| jjj|jd}|d ur&d|i}|d}	n|d}	t| jj|	| j| j	d}|rP|d dkrP|d d df 
d  |d< d }|sZ|d urZd|i}|||||d | D ]\}
}|
|vrt|||
< qh|S )NrJ  r   r   rK  r-   r'   )r   rL  r   rA   )r   r    r   r  rD   rF   r.   r4   rC   r/   	unsqueezeupdateitems)rS   rJ  rK  rL  r   r   rA   rS  model_inputsmax_batch_sizekeyr   r$   r$   r%   prepare_inputs_for_generation  s2   

z/Mamba2ForCausalLM.prepare_inputs_for_generationrJ  rK  labelsrM  rN  rL  rX   c
              
   K   s   |dur|n| j j}| j||||||||	d}|d }| || jjj }d}|dur<| jd||| j j	d|
}|sR|f|dd  }|durP|f| S |S t
|||j|jdS )ao  
        cache_params (`Mamba2Cache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        cache_position (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            The position of the current input in the cache. This is used to ensure that the cache is correctly updated.
            If `cache_params` is passed, `cache_position` should also be passed.
        N)r   rK  rM  rN  rL  r   rA   r   )r/  r_  r3  r   )r.  r/  r   r@   r$   )rD   rQ  r  rW  r?   ro   r/   r   loss_functionr3  r-  r   r@   )rS   rJ  rK  r   r_  rM  rN  rL  r   rA   rS  mamba2_outputsr@   r/  r.  outputr$   r$   r%   r     s2   
zMamba2ForCausalLM.forward)NNNNN)	NNNNNNNNN)rc   rd   re   _tied_weights_keysrT   rE  rH  r   rC   r    r   ri   r^  r   r+  r8   r   r,  r-  r   r   r$   r$   rs   r%   rV    s`    
0	

rV  )rV  r0  r  )9rf   r  dataclassesr   typingr   r   r    r   activationsr   
generationr   modeling_layersr	   modeling_utilsr
   utilsr   r   r   utils.import_utilsr   r   configuration_mamba2r   
get_loggerrc   r   +mamba_ssm.ops.triton.selective_state_updater   !mamba_ssm.ops.triton.ssd_combinedr   r   causal_conv1dr   r   allr   ri   rL   r&   r+   r>   rB   rC   Modulerj   r   r   r  r  r)  r-  r0  rV  __all__r$   r$   r$   r%   <module>   sx   

M   ?<m|