o
    	۷iÜ                     @   s  d Z ddlZddlmZ ddlmZmZmZ ddlZddlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZmZmZmZ ddlm Z  e!e"Z#e roddl$m%Z% ndZ%e rddl&m'Z'm(Z( ddl)m*Z* nd\Z*Z(Z'da+dd Z,G dd dZ-G dd de	j.Z/G dd de	j.Z0G dd deZ1eG dd  d eZ2eed!d"G d#d$ d$eZ3eed%d"G d&d' d'eZ4eG d(d) d)e2Z5ed*d"G d+d, d,e2eZ6g d-Z7dS ).zPyTorch MAMBA model.    N)	dataclass)AnyOptionalUnion)nn)CrossEntropyLoss   )ACT2FN)PretrainedConfig)GenerationMixin)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringlogging)is_causal_conv1d_availableis_kernels_availableis_mamba_ssm_availableis_mambapy_available   )MambaConfig)pscan)mamba_inner_fnselective_scan_fn)selective_state_updateNNNc                  C   s`   t d urt S t rddlm}  | d}|j|jfa t S t r,ddlm}m} ||fa t S da t S )Nr   )
get_kernelzkernels-community/causal-conv1d)causal_conv1d_fncausal_conv1d_update)NN)_causal_conv1d_cacher   kernelsr   r   r   r   causal_conv1d)r   _causal_conv1d_kernelr   r    r#   ^/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/mamba/modeling_mamba.py_lazy_load_causal_conv1d<   s   r%   c                   @   s   e Zd ZdZdZejdfdededej	de
ejedf fdd	Zd
edejdejdejfddZd
edejfddZdd ZdS )
MambaCachea.  
    Cache for mamba model which does not have attention mechanism and key value states.

    Arguments:
        config (`PretrainedConfig):
            The configuration file defining the shape-related attributes required to initialize the static cache.
        max_batch_size (`int`):
            The maximum batch size with which the model will be used. Note that a new instance must be instantiated if
            a smaller batch size is used.
        dtype (`torch.dtype`, *optional*, defaults to `torch.float16`):
            The default `dtype` to use when initializing the layer.
        device (`torch.device` or `str`, *optional*):
            The device on which the cache should be initialized. Should be the same as the layer.

    Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MambaForCausalLM, MambaCache

        >>> model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")

        >>> inputs = tokenizer(text="My name is Mamba", return_tensors="pt")

        >>> # Prepare a cache class and pass it to model's forward
        >>> cache_params = MambaCache(config=model.config, max_batch_size=1, device=model.device, dtype=model.dtype)
        >>> cache_position = torch.arange(len(inputs["input_ids"][0]), device=model.device)  # sequence length
        >>> outputs = model(**inputs, cache_params=cache_params, cache_position=cache_position, use_cache=True)
        >>> outputs.cache_params
        ```
    TNconfigmax_batch_sizedtypedevicec                 C   s   || _ || _|j| _|j| _|j| _g | _g | _|d ur!t	
|nd }t|jD ]6}t	j| j | j| j|| jd}t	j| j | j| j|| jd}t	j| t	j| | j| | j| q(d S )Nr*   r)   )r(   _dtypeintermediate_size
state_sizessm_state_sizeconv_kernelconv_kernel_sizeconv_states
ssm_statestorchr*   rangenum_hidden_layerszeros_dynamomark_static_addressappend)selfr'   r(   r)   r*   _
conv_state	ssm_stater#   r#   r$   __init__t   s8   zMambaCache.__init__	layer_idxnew_conv_statecache_positionreturnc                 C   s   | j | j|jkr| j | |j| j |< | j | }|d| jd }|jddd}|j|j|jd|d d d d |f< | j |   | j |  |7  < | j | S )Nr   r   )shiftsdimsr+   )r2   r*   toclampr1   rollr)   zero_)r;   r@   rA   rB   r=   r#   r#   r$   update_conv_state   s   
$
zMambaCache.update_conv_statenew_ssm_statec                 C   s8   | j |   | j |  || j | j7  < | j | S N)r3   rJ   rG   r*   )r;   r@   rL   r#   r#   r$   update_ssm_state   s    
zMambaCache.update_ssm_statec                 C   s4   t t| jD ]}| j|   | j|   qd S rM   )r5   lenr2   rJ   r3   )r;   r@   r#   r#   r$   reset   s   zMambaCache.reset)__name__
__module____qualname____doc__is_compileabler4   float16r
   intr)   r   r*   strr?   Tensor
LongTensorrK   rN   rP   r#   r#   r#   r$   r&   O   s2    !
%
r&   c                
       s   e Zd ZdZdedef fddZdd Z			dd	ej	d
e
e de
ej de
ej fddZdd
e
e de
ej de
ej fddZ			dd
e
e de
ej de
ej fddZ  ZS )
MambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    r'   r@   c                    sh  t    || _|j| _|j| _|j| _|j| _t	|j
| _
|| _|j| _tj| j| j|j|j| j|jd d| _|j| _t|j | _|j| _tj| j| jd |jd| _tj| j| j
| jd  dd| _tj| j
| jdd| _tjd| jd tjdd d d f }|| jd }tt || _!tt"| j| _#tj| j| j|jd| _$|j| _| %  d S )	Nr   )in_channelsout_channelsbiaskernel_sizegroupspadding   r^   FTr)   rD   )&superr?   r'   hidden_sizer.   r/   r0   r1   r-   rW   time_step_rankr@   use_conv_biasr   Conv1dconv1d
hidden_act
activationr	   actuse_mambapyLinearuse_biasin_projx_projdt_projr4   arangefloat32expand
contiguous	ParameterlogA_logonesDout_projwarn_slow_implementation)r;   r'   r@   A	__class__r#   r$   r?      s<   
	 $zMambaMixer.__init__c                 C   sT   t  \}}ttt||tf}|s(| jr!t rtd d S t	dtd d S d S )Na  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the mamba.py backend. To install follow https://github.com/state-spaces/mamba/#installation for mamba-ssm and install the kernels library using `pip install kernels` or https://github.com/Dao-AILab/causal-conv1d for causal-conv1dzuse_mambapy is set to True but the mambapy package is not installed. To install it follow https://github.com/alxndrTL/mamba.py.a  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the sequential implementation of Mamba, as use_mambapy is set to False. To install follow https://github.com/state-spaces/mamba/#installation for mamba-ssm and install the kernels library using `pip install kernels` or https://github.com/Dao-AILab/causal-conv1d for causal-conv1d. For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py.)
r%   allr   r   r   rn   r   loggerwarning_onceImportError)r;   r   r   is_fast_path_availabler#   r#   r$   r~      s"   
z#MambaMixer.warn_slow_implementationNhidden_statescache_paramsrB   attention_maskc                 C   s  |  |dd}| jrI|d u rIt|| jj| jr| jjnd | jj| j	j| j
j| jr/| j
j nd t| j  d d | j | j	j dd}|S t \}}|jddd\}}	|d urb||d }| jj| jjd| jjd}
|d ur|d dkr||d|j| j |
| jj| j}|d}n&|d urtj|| j|jd  df}|| j|| |||
| jj| jd}|d ur||d }| |dd}tj || j!| j"| j"gdd\}}}| j	j|dd }t| j  }t#| j	d	r| j	j nd }|d ur8|d dkr8t$|j%| j |d
 |d
 ||d d df |d d df | j|	d
 |dd
d}n,t&||||dd|dd| j |	|ddd
\}}|d urd|d urd|'| j| | 
|dd}|S )Nr   rb   T)
delta_biasdelta_softplusdimr   rD   )rl   r^   ).r   )dt_softplus)r   return_last_state)(rq   	transposetrainingr   rj   weightrh   r^   rr   rs   r}   rp   floatr4   exprz   r|   r%   chunk	unsqueezeviewsizesqueezer2   r@   rl   r   
functionalpadr1   shaperK   splitrg   r/   hasattrr   r3   r   rN   )r;   r   r   rB   r   projected_statescontextualized_statesr   r   gateconv_weightsr2   ssm_parameters	time_stepBCdiscrete_time_stepr   time_proj_biasscan_outputsr>   r#   r#   r$   cuda_kernels_forward  s   
Y
$




zMambaMixer.cuda_kernels_forwardc              	   C   s  |j \}}}|j}| |dd}	|	jddd\}
}|d ur&|
|d }
|d ur|j| j  }|	|
j
}|j d | jkrftj|
| j|
j d  df}|| j|| | | |
dd |f }
nU|| j|
|}|	| jjj
}tj|| jjd d dd d f  dd}
| jr|
| jj7 }
| |
	|d}
ntj|| j| jf|
j
|d}| | |
dd |f }
|d ur|
|d }
| |
dd}tj|| j| j| jgdd\}}}| |}tj|dd}t| j !  }t|d d d d d d f |d d d d d d d f  }|d d d d d d d f |d d d d d d d f !  }||
d d d d d d d f !  }| j"r| j#r|d u rt$|dd|dd}||d %ddd}||
| j&d d d d f   }|| | }nug }t'|D ]D}|d d d d |d d f | |d d d d |d d f  }t(|	||d d |d d f d}|)|d d d d df  qtj*|dd}||
| j&d d d d f   }|| | }|d ur|j| j +| | ,|dd}|S )	Nr   rb   r   r   rD   .r+   r   )-r   r)   rq   r   r   r   r3   r@   clonerG   r*   r1   r   r   r   rK   rm   rj   r   r4   sumrh   r^   r7   r-   r/   rr   r   rg   rs   softplusr   rz   r   rn   r   r   r   r|   r5   matmulr:   stackcopy_r}   )r;   input_statesr   rB   r   
batch_sizeseq_lenr<   r)   r   r   r   r>   r=   r   r   r   r   r   r   
discrete_A
discrete_BdeltaB_uhsscan_outputr   ir   r#   r#   r$   slow_forwardh  sp   (
:<$<* 
zMambaMixer.slow_forwardc                 C   sZ   t  \}}ttt||tf}|r%d| jjjjv r%t	j
 s%| ||||S | ||||S )Ncuda)r%   r   r   r   r   rr   r   r*   typer4   r8   is_compilingr   r   )r;   r   r   rB   r   r   r   r   r#   r#   r$   forward  s   
zMambaMixer.forwardr   )rQ   rR   rS   rT   r   rW   r?   r~   r4   rY   r   r&   rZ   r   r   r   __classcell__r#   r#   r   r$   r[      s6    +
(gUr[   c                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	MambaRMSNormư>c                    s&   t    tt|| _|| _dS )zL
        MambaRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
        N)re   r?   r   rx   r4   r{   r   variance_epsilon)r;   rf   epsr   r#   r$   r?     s   

zMambaRMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )Nrb   rD   T)keepdim)	r)   rG   r4   ru   powmeanrsqrtr   r   )r;   r   input_dtypevariancer#   r#   r$   r     s
   zMambaRMSNorm.forwardc                 C   s   | j jd  d| j S )Nr   z, eps=)r   r   r   r;   r#   r#   r$   
extra_repr  s   zMambaRMSNorm.extra_repr)r   )rQ   rR   rS   r?   r   r   r   r#   r#   r   r$   r     s    r   c                       sJ   e Zd Z fddZ			d	dee deej deej fddZ  Z	S )

MambaBlockc                    sB   t    || _|| _|j| _t|j|jd| _t	||d| _
d S )Nr   r@   )re   r?   r'   r@   residual_in_fp32r   rf   layer_norm_epsilonnormr[   mixer)r;   r'   r@   r   r#   r$   r?     s   
zMambaBlock.__init__Nr   rB   r   c                 C   sL   |}|  |j| j jjd}| jr|tj}| j||||d}|| }|S )Nrd   r   rB   r   )r   rG   r   r)   r   r4   ru   r   )r;   r   r   rB   r   residualr#   r#   r$   r     s   zMambaBlock.forwardr   )
rQ   rR   rS   r?   r   r&   r4   rZ   r   r   r#   r#   r   r$   r     s    r   c                   @   s2   e Zd ZU eed< dZddgZdZdZdd Z	dS )	MambaPreTrainedModelr'   backboner   r[   Tc                 C   sd  | j j}t|trtjd|jd tjddddf }||j	d
 }|jt| |jjd | j jd | j j }| j jdkrNtj|jj| n| j jdkr_tj|jj| | tt| j j	t| j jt| j j  t| j j j| j j d	}|tt!|   }|jj"| d
|jj"_#tjj$|j%jt&dd |j%j"durt'|j%j"ddstj(|j%j" tjj$|j)jt&dd | j j*r|j)j}|t&| j j+ }t|tj,rt'|jddstjj-|j|d |j"durt'|j"ddstj(|j" dS dS dS t|t.r|jjd dS t|tj/r0tjj-|j|d dS dS )zInitialize the weights.r   rd   NrD   g      ?g      constantrandom)minT   )a
_no_reinitF)std)0r'   initializer_range
isinstancer[   r4   rt   r/   ru   rv   r-   rw   rz   r   ry   r|   datafill_rg   time_step_scaletime_step_init_schemer   init	constant_rs   r   uniform_r   randmathtime_step_maxtime_step_minrH   time_step_floorexpm1r^   r   kaiming_uniform_rj   sqrtgetattrzeros_r}   rescale_prenorm_residualr6   ro   normal_r   	Embedding)r;   moduler   r   dt_init_stddtinv_dtpr#   r#   r$   _init_weights  sX   
$
z"MambaPreTrainedModel._init_weightsN)
rQ   rR   rS   r   __annotations__base_model_prefix_no_split_modulessupports_gradient_checkpointing_is_statefulr   r#   r#   r#   r$   r     s   
 r   z,
    Class for the MAMBA model outputs.
    )custom_introc                   @   sJ   e Zd ZU dZdZeej ed< dZ	ee
 ed< dZeeej  ed< dS )MambaOutputa9  
    cache_params (`MambaCache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlast_hidden_stater   r   )rQ   rR   rS   rT   r  r   r4   FloatTensorr   r   r&   r   tupler#   r#   r#   r$   r   =  s
   
 r   zK
    Base class for causal language model (or autoregressive) outputs.
    c                   @   s\   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
ee ed< dZeeej  ed< dS )MambaCausalLMOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    cache_params (`MambaCache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlosslogitsr   r   )rQ   rR   rS   rT   r  r   r4   r  r   r  r   r&   r   r  r#   r#   r#   r$   r  Q  s   
 r  c                       s   e Zd Z fddZdd Zdd Zdd Ze																dd
ee	j
 dee	j
 dee dee dee dee dee	j
 dee	j
 deeef fddZ  ZS )
MambaModelc                    sn   t    t j j| _t fddt j	D | _
d| _t j jd| _| | j |   d S )Nc                    s   g | ]}t  |d qS )r   )r   ).0idxr'   r#   r$   
<listcomp>p  s    z'MambaModel.__init__.<locals>.<listcomp>Fr   )re   r?   r   r   
vocab_sizerf   
embeddings
ModuleListr5   r6   layersgradient_checkpointingr   r   norm_f"_register_load_state_dict_pre_hook	load_hook	post_initr;   r'   r   r
  r$   r?   l  s    zMambaModel.__init__c                 G   s2   |D ]}d|v r| |||dd<  d S qd S )Nz
embedding.zembeddings.)popreplace)r;   
state_dictprefixargskr#   r#   r$   r  x  s   zMambaModel.load_hookc                 C   s   | j S rM   r  r   r#   r#   r$   get_input_embeddings~  s   zMambaModel.get_input_embeddingsc                 C   s
   || _ d S rM   r  r;   new_embeddingsr#   r#   r$   set_input_embeddings     
zMambaModel.set_input_embeddingsN	input_idsinputs_embedsr   	use_cacheoutput_hidden_statesreturn_dictrB   r   rC   c	                 C   sd  |dur|n| j j}|dur|n| js| j jnd}|dur|n| j j}|du |duA r/td|du r8| |}| jrB| jrB|rBd}|rk|du rbt| j |	d|j
|jd}tjd| j j|j
d}n|du rjtdnd}|}	|rsdnd}
| jD ]}||	|||d	}	|r|
|	f }
qx| |	}	|r|
|	f }
|std
d |	||
fD S t|	|r||
dS d|
dS )a  
        cache_params (`MambaCache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        NFz:You must specify exactly one of input_ids or inputs_embedsr   r+   r*   zYou have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will be initialized for you automaticallyr#   r   c                 s   s    | ]	}|d ur|V  qd S rM   r#   )r  vr#   r#   r$   	<genexpr>  s    z%MambaModel.forward.<locals>.<genexpr>)r  r   r   )r'   r%  r   r$  use_return_dict
ValueErrorr  r  r&   r   r*   r)   r4   rt   r0   r  r  r  r   )r;   r"  r#  r   r$  r%  r&  rB   r   r   all_hidden_statesmixer_blockr#   r#   r$   r     s^   





zMambaModel.forward)NNNNNNNN)rQ   rR   rS   r?   r  r  r   r   r   r4   rZ   r&   boolr   r  r   r   r   r#   r#   r   r$   r  j  sB    	

r  z
    The MAMBA Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                       s  e Zd ZdgZ fddZdd Zdd Z	dd	ed
ee	e
f dedee	e
f fddZ					ddee deej deej fddZe									ddeej deej deej dee deej dee dee dee deej deeef fddZ  ZS ) MambaForCausalLMzlm_head.weightc                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NFrc   )
re   r?   r  r   r   ro   rf   r  lm_headr  r  r   r#   r$   r?     s   
zMambaForCausalLM.__init__c                 C   s
   | j  S rM   )r   r  r   r#   r#   r$   r    r!  z%MambaForCausalLM.get_input_embeddingsc                 C   s   | j |S rM   )r   r   r  r#   r#   r$   r     s   z%MambaForCausalLM.set_input_embeddingsr   outputsmodel_kwargsnum_new_tokensrC   c                 K   s   | dd |d< | ddr$d|v r$|d d ur$|d dd  | |d< d|v r?|d }tj|||jd dfgdd	|d< |S )
Nr   r$  TrB   rD   r   r   r   r   )getr4   catnew_onesr   )r;   r1  r2  r3  kwargsr   r#   r#   r$   #_update_model_kwargs_for_generation  s   

z4MambaForCausalLM._update_model_kwargs_for_generationNr   rB   r   c                 K   s   d|  i}|r7|d u r7tjd| jjj|jd}|d ur&d|i}|d}	n|d}	t| jj|	| j| j	d}|rP|d dkrP|d d df 
d  |d< d }|sZ|d urZd|i}|||||d | D ]\}
}|
|vrt|||
< qh|S )Nr"  r   r'  r#  r+   rD   )r   r$  rB   r   )rw   r4   rt   r   r'   r0   r*   r   r&   r)   r   updateitems)r;   r"  r#  r$  r   rB   r   r7  model_inputsr(   keyvaluer#   r#   r$   prepare_inputs_for_generation  s2   

z.MambaForCausalLM.prepare_inputs_for_generationr"  r#  labelsr%  r&  r$  c
              
   K   s   |dur|n| j j}| j|||||||	|d}|d }| || jjj }d}|dur]||j}|dddddf 	 }|dddf 	 }t
 }||d|d|d}|ss|f|dd  }|durq|f| S |S t|||j|jdS )aS  
        cache_params (`MambaCache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        N)r   r#  r%  r&  r$  rB   r   r   .rD   r   )r  r  r   r   )r'   r*  r   r0  rG   r   r)   r   r*   rw   r   r   r   r  r   r   )r;   r"  r   r#  r   r?  r%  r&  r$  rB   r7  mamba_outputsr   r  r  shift_logitsshift_labelsloss_fctoutputr#   r#   r$   r   -  s:   
zMambaForCausalLM.forward)r   )NNNNN)	NNNNNNNNN)rQ   rR   rS   _tied_weights_keysr?   r  r   r   dictrX   r   rW   r8  r   r&   r4   rZ   r>  r   r  r.  rY   r   r  r  r   r   r#   r#   r   r$   r/    st    



0	

r/  )r/  r  r   r&   )8rT   r   dataclassesr   typingr   r   r   r4   r   torch.nnr   activationsr	   configuration_utilsr
   
generationr   modeling_layersr   modeling_utilsr   utilsr   r   r   utils.import_utilsr   r   r   r   configuration_mambar   
get_loggerrQ   r   mambapy.pscanr   &mamba_ssm.ops.selective_scan_interfacer   r   +mamba_ssm.ops.triton.selective_state_updater   r   r%   r&   Moduler[   r   r   r   r   r  r  r/  __all__r#   r#   r#   r$   <module>   sf   

g  Bi 