o
    ei                     @   s  d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZmZ ddl m!Z! e"e#Z$e rsddl%m&Z& ndZ&G dd dZ'G dd dej(Z)G dd dej(Z*G dd deZ+eG dd deZ,eeddG d d! d!eZ-eed"dG d#d$ d$eZ.eG d%d& d&e,Z/ed'dG d(d) d)e,eZ0g d*Z1dS )+zPyTorch MAMBA model.    N)	dataclass)Any)nn)CrossEntropyLoss   )initialization)ACT2FN)PreTrainedConfig)GenerationMixin)lazy_load_kernel)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringlogging)is_mambapy_availableis_torchdynamo_compiling   )MambaConfig)pscanc                
   @   s~   e Zd ZdZdZejdfdededej	dej
eB dB fdd	Zd
edejdejdejfddZd
edejfddZdd ZdS )
MambaCachea.  
    Cache for mamba model which does not have attention mechanism and key value states.

    Arguments:
        config (`PreTrainedConfig):
            The configuration file defining the shape-related attributes required to initialize the static cache.
        max_batch_size (`int`):
            The maximum batch size with which the model will be used. Note that a new instance must be instantiated if
            a smaller batch size is used.
        dtype (`torch.dtype`, *optional*, defaults to `torch.float16`):
            The default `dtype` to use when initializing the layer.
        device (`torch.device` or `str`, *optional*):
            The device on which the cache should be initialized. Should be the same as the layer.

    Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MambaForCausalLM, MambaCache

        >>> model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")

        >>> inputs = tokenizer(text="My name is Mamba", return_tensors="pt")

        >>> # Prepare a cache class and pass it to model's forward
        >>> cache_params = MambaCache(config=model.config, max_batch_size=1, device=model.device, dtype=model.dtype)
        >>> cache_position = torch.arange(len(inputs["input_ids"][0]), device=model.device)  # sequence length
        >>> outputs = model(**inputs, cache_params=cache_params, cache_position=cache_position, use_cache=True)
        >>> outputs.cache_params
        ```
    TNconfigmax_batch_sizedtypedevicec                 C   s   || _ || _|j| _|j| _|j| _g | _g | _|d ur!t	
|nd }t|jD ]6}t	j| j | j| j|| jd}t	j| j | j| j|| jd}t	j| t	j| | j| | j| q(d S )Nr   r   )r   _dtypeintermediate_size
state_sizessm_state_sizeconv_kernelconv_kernel_sizeconv_states
ssm_statestorchr   rangenum_hidden_layerszeros_dynamomark_static_addressappend)selfr   r   r   r   _
conv_state	ssm_state r/   f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/mamba/modeling_mamba.py__init__U   s8   zMambaCache.__init__	layer_idxnew_conv_statecache_positionreturnc                 C   s   | j | j|jkr| j | |j| j |< | j | }|d| jd }|jddd}|j|j|jd|d d d d |f< | j |   | j |  |7  < | j | S )Nr   r   )shiftsdimsr   )r"   r   toclampr!   rollr   zero_)r+   r2   r3   r4   r-   r/   r/   r0   update_conv_statez   s   
$
zMambaCache.update_conv_statenew_ssm_statec                 C   s8   | j |   | j |  || j | j7  < | j | S N)r#   r<   r9   r   )r+   r2   r>   r/   r/   r0   update_ssm_state   s    
zMambaCache.update_ssm_statec                 C   s4   t t| jD ]}| j|   | j|   qd S r?   )r%   lenr"   r<   r#   )r+   r2   r/   r/   r0   reset   s   zMambaCache.reset)__name__
__module____qualname____doc__is_compileabler$   float16r	   intr   r   strr1   Tensor
LongTensorr=   r@   rB   r/   r/   r/   r0   r   0   s2    !
%
r   c                
       s   e Zd ZdZdedef fddZdd Z			dd	ej	d
e
dB dejdB dejdB fddZdd
e
dB dejdB dejdB fddZ			dd
e
dB dejdB dejdB fddZ  ZS )
MambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    r   r2   c                    s  t    || _|j| _|j| _|j| _|j| _t	|j
| _
|| _|j| _tj| j| j|j|j| j|jd d| _|j| _t|j | _|j| _tj| j| jd |jd| _tj| j| j
| jd  dd| _tj| j
| jdd| _tjd| jd tjdd d d f }|| jd }tt || _!tt"| j| _#tj| j| j|jd| _$|j| _t%d	a&t&d urt&j't&j(fnd
\a'a(t%da)t)d urt)j*t)j+t)j,fnd\a*a+a,| -  d S )Nr   )in_channelsout_channelsbiaskernel_sizegroupspadding   rP   FTr   r6   zcausal-conv1d)NNz	mamba-ssmNNN).superr1   r   hidden_sizer   r   r    r!   r   rI   time_step_rankr2   use_conv_biasr   Conv1dconv1d
hidden_act
activationr   actuse_mambapyLinearuse_biasin_projx_projdt_projr$   arangefloat32expand
contiguous	ParameterlogA_logonesDout_projr   causal_conv1dcausal_conv1d_updatecausal_conv1d_fn	mamba_ssmselective_state_updateselective_scan_fnmamba_inner_fnwarn_slow_implementation)r+   r   r2   A	__class__r/   r0   r1      sP   
	 $zMambaMixer.__init__c                 C   sJ   t tttttf}|s#| jrt rt	d d S t
dt	d d S d S )Na  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the mamba.py backend. To install follow https://github.com/state-spaces/mamba/#installation for mamba-ssm and install the kernels library using `pip install kernels` or https://github.com/Dao-AILab/causal-conv1d for causal-conv1dzuse_mambapy is set to True but the mambapy package is not installed. To install it follow https://github.com/alxndrTL/mamba.py.a  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the sequential implementation of Mamba, as use_mambapy is set to False. To install follow https://github.com/state-spaces/mamba/#installation for mamba-ssm and install the kernels library using `pip install kernels` or https://github.com/Dao-AILab/causal-conv1d for causal-conv1d. For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py.)allru   rv   rs   rr   rw   ra   r   loggerwarning_onceImportError)r+   is_fast_path_availabler/   r/   r0   rx      s    z#MambaMixer.warn_slow_implementationNhidden_statescache_paramsr4   attention_maskc                 C   s  |  |dd}| jrI|d u rIt|| jj| jr| jjnd | jj| j	j| j
j| jr/| j
j nd t| j  d d | j | j	j dd}|S |jddd\}}|d ur]||d }| jj| jjd| jjd}|d ur|d dkrt|d|j| j || jj| j}|d}n&|d urtj|| j|jd  df}	|| j|	| t ||| jj| jd}|d ur||d }| |dd}
tj!|
| j"| j#| j#gdd\}}}| j	j|dd }t| j  }t$| j	d	r| j	j nd }|d ur2|d dkr2t%|j&| j |d
 |d
 ||d d df |d d df | j|d
 |dd
d}n,t'||||dd|dd| j ||ddd
\}}|d ur^|d ur^|(| j| | 
|dd}|S )Nr   rT   T)
delta_biasdelta_softplusdimr   r6   )r_   rP   ).r   )dt_softplus)r   return_last_state))rd   	transposetrainingrw   r]   weightr[   rP   re   rf   rp   rc   floatr$   exprm   ro   chunk	unsqueezeviewsizerr   squeezer"   r2   r_   r   
functionalpadr!   shaper=   rs   splitrZ   r   hasattrru   r#   rv   r@   )r+   r   r   r4   r   projected_statescontextualized_statesgateconv_weightsr"   ssm_parameters	time_stepBCdiscrete_time_stepry   time_proj_biasscan_outputsr.   r/   r/   r0   cuda_kernels_forward   s   
X$




zMambaMixer.cuda_kernels_forwardc              	   C   s  |j \}}}|j}| |dd}	|	jddd\}
}|d ur&|
|d }
|d ur|j| j  }|	|
j
}|j d | jkrftj|
| j|
j d  df}|| j|| | | |
dd |f }
nU|| j|
|}|	| jjj
}tj|| jjd d dd d f  dd}
| jr|
| jj7 }
| |
	|d}
ntj|| j| jf|
j
|d}| | |
dd |f }
|d ur|
|d }
| |
dd}tj|| j| j| jgdd\}}}| |}tj|dd}t| j !  }t|d d d d d d f |d d d d d d d f  }|d d d d d d d f |d d d d d d d f !  }||
d d d d d d d f !  }| j"r| j#r|d u rt$|dd|dd}||d %ddd}||
| j&d d d d f   }|| | }nug }t'|D ]D}|d d d d |d d f | |d d d d |d d f  }t(|	||d d |d d f d}|)|d d d d df  qtj*|dd}||
| j&d d d d f   }|| | }|d ur|j| j +| | ,|dd}|S )	Nr   rT   r   r   r6   .r   r   )-r   r   rd   r   r   r   r#   r2   cloner9   r   r!   r   r   r   r=   r`   r]   r   r$   sumr[   rP   r'   r   r   re   r   rZ   rf   softplusr   rm   r   ra   r   r   r   ro   r%   matmulr*   stackcopy_rp   )r+   input_statesr   r4   r   
batch_sizeseq_lenr,   r   r   r   r   r.   r-   r   r   r   r   r   ry   
discrete_A
discrete_BdeltaB_uhsscan_outputr   ir   r/   r/   r0   slow_forwardV  sp   (
:<$<* 
zMambaMixer.slow_forwardc                 C   sL   t tttttf}|rd| jjjj	v rt
 s| ||||S | ||||S )Ncuda)r|   ru   rv   rs   rr   rw   re   r   r   typer   r   r   )r+   r   r   r4   r   r   r/   r/   r0   forward  s   zMambaMixer.forwardrW   )rC   rD   rE   rF   r   rI   r1   rx   r$   rK   r   rL   r   r   r   __classcell__r/   r/   rz   r0   rM      s6    :
(fUrM   c                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	MambaRMSNormư>c                    s&   t    tt|| _|| _dS )zL
        MambaRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
        N)rX   r1   r   rk   r$   rn   r   variance_epsilon)r+   rY   epsrz   r/   r0   r1     s   

zMambaRMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )NrT   r6   T)keepdim)	r   r9   r$   rh   powmeanrsqrtr   r   )r+   r   input_dtypevariancer/   r/   r0   r     s
   zMambaRMSNorm.forwardc                 C   s   | j jd  d| j S )Nr   z, eps=)r   r   r   r+   r/   r/   r0   
extra_repr  s   zMambaRMSNorm.extra_repr)r   )rC   rD   rE   r1   r   r   r   r/   r/   rz   r0   r     s    r   c                       sJ   e Zd Z fddZ			d	dedB dejdB dejdB fddZ  ZS )

MambaBlockc                    sB   t    || _|| _|j| _t|j|jd| _t	||d| _
d S )Nr   r2   )rX   r1   r   r2   residual_in_fp32r   rY   layer_norm_epsilonnormrM   mixer)r+   r   r2   rz   r/   r0   r1     s   
zMambaBlock.__init__Nr   r4   r   c                 C   sL   |}|  |j| j jjd}| jr|tj}| j||||d}|| }|S )NrV   r   r4   r   )r   r9   r   r   r   r$   rh   r   )r+   r   r   r4   r   residualr/   r/   r0   r     s   zMambaBlock.forwardrW   )	rC   rD   rE   r1   r   r$   rL   r   r   r/   r/   rz   r0   r     s    r   c                   @   s:   e Zd ZU eed< dZddgZdZdZe	
 dd ZdS )	MambaPreTrainedModelr   backboner   rM   Tc                 C   s  | j j}t|trtjd|jd tjddddf }||j	d
 }t|jt| t|j | j jd | j j }| j jdkrMt|jj| n| j jdkr]t|jj| | tt| j j	t| j jt| j j  t| j j j| j jd}|tt|   }t|jj | tj!|j"jt#d	d
 |j"j durt$|j"j  tj!|j%jt#d	d
 | j j&r|j%j}|t#| j j' }t|t(j)rtj*|j|d |j durt$|j  dS dS t|t+rt|j dS t|t(j,rtj*|j|d dS dS )zInitialize the weights.r   rV   Nr6   g      constantrandom)min   )a)std)-r   initializer_range
isinstancerM   r$   rg   r   rh   ri   r   rj   initr   rm   rl   ones_ro   rZ   time_step_scaletime_step_init_scheme	constant_rf   r   uniform_r   randmathtime_step_maxtime_step_minr:   time_step_floorexpm1rP   kaiming_uniform_r]   sqrtzeros_rp   rescale_prenorm_residualr&   r   rb   normal_r   	Embedding)r+   moduler   ry   dt_init_stddtinv_dtpr/   r/   r0   _init_weights  sN   
$

z"MambaPreTrainedModel._init_weightsN)rC   rD   rE   r   __annotations__base_model_prefix_no_split_modulessupports_gradient_checkpointing_is_statefulr$   no_gradr   r/   r/   r/   r0   r     s   
 r   z,
    Class for the MAMBA model outputs.
    )custom_introc                   @   sJ   e Zd ZU dZdZejdB ed< dZe	dB ed< dZ
eej dB ed< dS )MambaOutputa9  
    cache_params (`MambaCache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlast_hidden_stater   r   )rC   rD   rE   rF   r   r$   FloatTensorr   r   r   r   tupler/   r/   r/   r0   r   '  s
   
 r   zK
    Base class for causal language model (or autoregressive) outputs.
    c                   @   s\   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
dB ed< dZeej dB ed< dS )MambaCausalLMOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    cache_params (`MambaCache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlosslogitsr   r   )rC   rD   rE   rF   r   r$   r   r   r   r   r   r   r   r/   r/   r/   r0   r   ;  s   
 r   c                       s   e Zd Z fddZdd Zdd Zdd Ze																dd
ej	d	B dej	d	B de
d	B ded	B ded	B ded	B dej	d	B dej	d	B deeB fddZ  ZS )
MambaModelc                    sn   t    t j j| _t fddt j	D | _
d| _t j jd| _| | j |   d S )Nc                    s   g | ]}t  |d qS )r   )r   ).0idxr   r/   r0   
<listcomp>Z  s    z'MambaModel.__init__.<locals>.<listcomp>Fr   )rX   r1   r   r   
vocab_sizerY   
embeddings
ModuleListr%   r&   layersgradient_checkpointingr   r   norm_f"_register_load_state_dict_pre_hook	load_hook	post_initr+   r   rz   r  r0   r1   V  s    zMambaModel.__init__c                 G   s2   |D ]}d|v r| |||dd<  d S qd S )Nz
embedding.zembeddings.)popreplace)r+   
state_dictprefixargskr/   r/   r0   r
  b  s   zMambaModel.load_hookc                 C   s   | j S r?   r  r   r/   r/   r0   get_input_embeddingsh  s   zMambaModel.get_input_embeddingsc                 C   s
   || _ d S r?   r  r+   new_embeddingsr/   r/   r0   set_input_embeddingsk     
zMambaModel.set_input_embeddingsN	input_idsinputs_embedsr   	use_cacheoutput_hidden_statesreturn_dictr4   r   r5   c	                 K   sd  |dur|n| j j}|dur|n| js| j jnd}|dur|n| j j}|du |duA r/td|du r8| |}| jrB| jrB|rBd}|rk|du rbt| j |	d|j
|jd}tjd| j j|j
d}n|du rjtdnd}|}
|rsdnd}| jD ]}||
|||d	}
|r||
f }qx| |
}
|r||
f }|std
d |
||fD S t|
|r||dS d|dS )a  
        cache_params (`MambaCache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        NFz:You must specify exactly one of input_ids or inputs_embedsr   r   r   zYou have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will be initialized for you automaticallyr/   r   c                 s   s    | ]	}|d ur|V  qd S r?   r/   )r   vr/   r/   r0   	<genexpr>  s    z%MambaModel.forward.<locals>.<genexpr>)r   r   r   )r   r  r   r  use_return_dict
ValueErrorr  r  r   r   r   r   r$   rg   r    r  r  r   r   )r+   r  r  r   r  r  r  r4   r   kwargsr   all_hidden_statesmixer_blockr/   r/   r0   r   n  s^   





zMambaModel.forward)NNNNNNNN)rC   rD   rE   r1   r
  r  r  r   r$   rL   r   boolr   r   r   r   r/   r/   rz   r0   r   T  sB    	r   z
    The MAMBA Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                       s*  e Zd ZddiZ fddZdd Zdd Z		d"d
edee	e
f dedee	e
f fddZ						d#dedB dejdB dejdB dedB f fddZe										d$dejdB dejdB dejdB dedB dejdB dedB dedB dedB dejdB deejB deeB fd d!Z  ZS )%MambaForCausalLMzlm_head.weightzbackbone.embeddings.weightc                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NFrU   )
rX   r1   r   r   r   rb   rY   r  lm_headr  r  rz   r/   r0   r1     s   
zMambaForCausalLM.__init__c                 C   s
   | j  S r?   )r   r  r   r/   r/   r0   r    r  z%MambaForCausalLM.get_input_embeddingsc                 C   s   | j |S r?   )r   r  r  r/   r/   r0   r    s   z%MambaForCausalLM.set_input_embeddingsr   outputsmodel_kwargsnum_new_tokensr5   c                 K   s   | dd |d< | ddr$d|v r$|d d ur$|d dd  | |d< d|v r?|d }tj|||jd dfgdd	|d< |S )
Nr   r  Tr4   r6   r   r   r   r   )getr$   catnew_onesr   )r+   r)  r*  r+  r#  r   r/   r/   r0   #_update_model_kwargs_for_generation  s   

z4MambaForCausalLM._update_model_kwargs_for_generationNFr   r4   r   is_first_iterationc              	      s   t  j|f||||||d|}	|rD|d u rDtjd| jjj|jd|	d< |d ur/|d}
n|d}
t	| jj|
| j| j
d|	d< |	S |rP|d dkrPd |	d< |	S )N)r  r  r   r4   r   r0  r   r  r4   r   r   r   )rX   prepare_inputs_for_generationr$   rg   r   r   r    r   r   r   r   )r+   r  r  r  r   r4   r   r0  r#  model_inputsr   rz   r/   r0   r1    s0   

z.MambaForCausalLM.prepare_inputs_for_generationr   r  r  labelsr  r  r  logits_to_keepc              
   K   s&  |dur|n| j j}| j|||||||	|d}|d }t|
tr&t|
 dn|
}| |dd|ddf | jjj	
 }d}|durs||j}|dddddf  }|dddf  }t }||d|d|d}|s|f|dd  }|dur|f| S |S t|||j|jdS )aS  
        cache_params (`MambaCache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        N)r   r  r  r  r  r4   r   r   .r6   r   )r   r   r   r   )r   r!  r   r   rI   slicer(  r9   r   r   r   r   rj   r   r   r   r   r   r   )r+   r  r   r  r   r3  r  r  r  r4   r4  r#  mamba_outputsr   slice_indicesr   r   shift_logitsshift_labelsloss_fctoutputr/   r/   r0   r     s<   ,zMambaForCausalLM.forward)r   )NNNNNF)
NNNNNNNNNr   )rC   rD   rE   _tied_weights_keysr1   r  r  r   dictrJ   r   rI   r/  r   r$   rL   r&  r1  r   r   rK   r   r   r   r   r/   r/   rz   r0   r'    s    


)	
r'  )r'  r   r   r   )2rF   r   dataclassesr   typingr   r$   r   torch.nnr    r   r   activationsr   configuration_utilsr	   
generationr
   integrationsr   modeling_layersr   modeling_utilsr   utilsr   r   r   utils.import_utilsr   r   configuration_mambar   
get_loggerrC   r}   mambapy.pscanr   r   ModulerM   r   r   r   r   r   r   r'  __all__r/   r/   r/   r0   <module>   s^   
g  "?j 