o
    wi!                     @   s  d Z ddlZddlmZ ddlmZmZmZ ddlZddl	Zddlm
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZmZmZ ddlm Z  e!e"Z#e rqddl$m%Z% ndZ%e rddl&m'Z'm(Z( ddl)m*Z* nd\Z*Z(Z'e rddl+m,Z,m-Z- nd\Z-Z,e.e*e(e,e-e'fZ/G dd de
j0Z1G dd de
j0Z2G dd deZ3eG dd deZ4eedd G d!d" d"eZ5eed#d G d$d% d%eZ6eG d&d' d'e4Z7ed(d G d)d* d*e4eZ8g d+Z9dS ),zPyTorch MAMBA model.    N)	dataclass)AnyOptionalUnion)nn)CrossEntropyLoss   )ACT2FN)
MambaCache)GenerationMixin)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringlogging)is_causal_conv1d_availableis_mamba_ssm_availableis_mambapy_available   )MambaConfig)pscan)mamba_inner_fnselective_scan_fn)selective_state_updateNNN)causal_conv1d_fncausal_conv1d_update)NNc                
       s   e Zd ZdZdedef fddZ			ddejde	e
 d	e	ej d
e	ej fddZdde	e
 d	e	ej d
e	ej fddZ			dde	e
 d	e	ej d
e	ej fddZ  ZS )
MambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    config	layer_idxc                    s  t    || _|j| _|j| _|j| _|j| _t	|j
| _
|| _|j| _tj| j| j|j|j| j|jd d| _|j| _t|j | _|j| _tj| j| jd |jd| _tj| j| j
| jd  dd| _tj| j
| jdd| _tjd| jd tjdd d d f }|| jd }tt || _!tt"| j| _#tj| j| j|jd| _$|j| _t%s| jrt& rt'(d	 d S t)d
t'(d d S d S )Nr   )in_channelsout_channelsbiaskernel_sizegroupspadding   r"   FTdtypea7  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the mamba.py backend. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzuse_mambapy is set to True but the mambapy package is not installed. To install it follow https://github.com/alxndrTL/mamba.py.a  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the sequential implementation of Mamba, as use_mambapy is set to False. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d. For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py.)*super__init__r   hidden_size
state_sizessm_state_sizeconv_kernelconv_kernel_sizeintermediate_sizeinttime_step_rankr   use_conv_biasr   Conv1dconv1d
hidden_act
activationr	   actuse_mambapyLinearuse_biasin_projx_projdt_projtorcharangefloat32expand
contiguous	ParameterlogA_logonesDout_projis_fast_path_availabler   loggerwarning_onceImportError)selfr   r   A	__class__ e/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/mamba/modeling_mamba.pyr,   G   sT   
	 $zMambaMixer.__init__Nhidden_statescache_paramscache_positionattention_maskc                 C   s  |  |dd}| jrI|d u rIt|| jj| jr| jjnd | jj| j	j| j
j| jr/| j
j nd t| j  d d | j | j	j dd}|S |jddd\}}|d ur]||d }| jj| jjd| jjd}|d ur|d dkrt|d|j| j || jj| j}|d}n&|d urtj|| j|jd  df}	|| j|	| t ||| jj| jd}|d ur||d }| |dd}
tj!|
| j"| j#| j#gdd\}}}| j	j|dd }t| j  }t$| j	d	r| j	j nd }|d ur2|d dkr2t%|j&| j |d
 |d
 ||d d df |d d df | j|d
 |dd
d}n,t'||||dd|dd| j ||ddd
\}}|d ur^|d ur^|(| j| | 
|dd}|S )Nr   r&   T)
delta_biasdelta_softplusdimr   r*   )r9   r"   ).r   )dt_softplus)r[   return_last_state))r>   	transposetrainingr   r7   weightr5   r"   r?   r@   rK   r=   floatrA   exprH   rJ   chunk	unsqueezeviewsizer   squeezeconv_statesr   r9   r   
functionalpadr1   shapeupdate_conv_stater   splitr4   r/   hasattrr   
ssm_statesr   update_ssm_state)rP   rV   rW   rX   rY   projected_statescontextualized_statesgateconv_weightsrj   ssm_parameters	time_stepBCdiscrete_time_steprQ   time_proj_biasscan_outputs	ssm_staterT   rT   rU   cuda_kernels_forward   s   
X$




zMambaMixer.cuda_kernels_forwardc              	   C   s  |j \}}}|j}| |dd}	|	jddd\}
}|d ur&|
|d }
|d ur|j| j  }|	|
j
}|j d | jkrftj|
| j|
j d  df}|| j|| | | |
dd |f }
nU|| j|
|}|	| jjj
}tj|| jjd d dd d f  dd}
| jr|
| jj7 }
| |
	|d}
ntj|| j| jf|
j
|d}| | |
dd |f }
|d ur|
|d }
| |
dd}tj|| j| j| jgdd\}}}| |}tj|dd}t| j !  }t|d d d d d d f |d d d d d d d f  }|d d d d d d d f |d d d d d d d f !  }||
d d d d d d d f !  }| j"r| j#r|d u rt$|dd|dd}||d %ddd}||
| j&d d d d f   }|| | }nug }t'|D ]D}|d d d d |d d f | |d d d d |d d f  }t(|	||d d |d d f d}|)|d d d d df  qtj*|dd}||
| j&d d d d f   }|| | }|d ur|j| j +| | ,|dd}|S )	Nr   r&   r\   r   r*   .devicer)   r   )-rm   r)   r>   r`   re   rf   rq   r   clonetor   r1   r   rk   rl   rn   r:   r7   rb   rA   sumr5   r"   zerosr2   r/   r?   ro   r4   r@   softplusrd   rH   rc   r;   ra   r   ri   rJ   rangematmulappendstackcopy_rK   )rP   input_statesrW   rX   rY   
batch_sizeseq_len_r)   rs   rV   ru   r~   
conv_staterw   rx   ry   rz   r{   rQ   
discrete_A
discrete_BdeltaB_uhsscan_outputr}   irt   rT   rT   rU   slow_forward   sp   (
:<$<* 
zMambaMixer.slow_forwardc                 C   s>   t rd| jjjjv rtj s| ||||S | 	||||S )Ncuda)
rL   r?   rb   r   typerA   _dynamois_compilingr   r   )rP   rV   rW   rX   rY   rT   rT   rU   forward;  s   zMambaMixer.forwardr   )__name__
__module____qualname____doc__r   r3   r,   rA   Tensorr   r
   
LongTensorr   r   r   __classcell__rT   rT   rR   rU   r   ?   s4    ?
(fUr   c                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	MambaRMSNormư>c                    s&   t    tt|| _|| _dS )zL
        MambaRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
        N)r+   r,   r   rF   rA   rI   rb   variance_epsilon)rP   r-   epsrR   rT   rU   r,   H  s   

zMambaRMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )Nr&   r*   T)keepdim)	r)   r   rA   rC   powmeanrsqrtr   rb   )rP   rV   input_dtypevariancerT   rT   rU   r   P  s
   zMambaRMSNorm.forwardc                 C   s   | j jd  d| j S )Nr   z, eps=)rb   rm   r   rP   rT   rT   rU   
extra_reprW  s   zMambaRMSNorm.extra_repr)r   )r   r   r   r,   r   r   r   rT   rT   rR   rU   r   G  s    r   c                       sJ   e Zd Z fddZ			d	dee deej deej fddZ  Z	S )

MambaBlockc                    sB   t    || _|| _|j| _t|j|jd| _t	||d| _
d S )Nr   r   )r+   r,   r   r   residual_in_fp32r   r-   layer_norm_epsilonnormr   mixer)rP   r   r   rR   rT   rU   r,   \  s   
zMambaBlock.__init__NrW   rX   rY   c                 C   sL   |}|  |j| j jjd}| jr|tj}| j||||d}|| }|S )Nr(   rW   rX   rY   )r   r   rb   r)   r   rA   rC   r   )rP   rV   rW   rX   rY   residualrT   rT   rU   r   d  s   zMambaBlock.forwardr   )
r   r   r   r,   r   r
   rA   r   r   r   rT   rT   rR   rU   r   [  s    r   c                   @   s,   e Zd ZeZdZddgZdZdZdd Z	dS )MambaPreTrainedModelbackboner   r   Tc              	   C   s  t |trd|j_d|j_| jjd | jj }| jjdkr't	j
|jj| n| jjdkr8t	j
|jj| | tt| jjt| jjt| jj  t| jj j| jjd}|tt|   }t  |jj| W d   n1 sw   Y  d|jj_t |t	jr|jdurt|jddst	j
 |j nt |t	j!rt	j
j"|j| jj#d	 | jj$r|% D ]2\}}|d
v rt	j
j&|t'dd t  |t'| jj( }W d   n1 sw   Y  qdS dS )zInitialize the weights.Tg      constantrandom)minN
_no_reinitF)std)zout_proj.weight   )a))
isinstancer   rH   _no_weight_decayrJ   r   r4   time_step_scaletime_step_init_schemer   init	constant_r@   rb   uniform_rA   rd   randr2   mathrG   time_step_maxtime_step_minclamptime_step_floorexpm1no_gradr"   r   r   r<   getattrzeros_	Embeddingnormal_initializer_rangerescale_prenorm_residualnamed_parameterskaiming_uniform_sqrtnum_hidden_layers)rP   moduledt_init_stddtinv_dtnameprT   rT   rU   _init_weights  sN   




z"MambaPreTrainedModel._init_weightsN)
r   r   r   r   config_classbase_model_prefix_no_split_modulessupports_gradient_checkpointing_is_statefulr   rT   rT   rT   rU   r   w  s    r   z,
    Class for the MAMBA model outputs.
    )custom_introc                   @   sJ   e Zd ZU dZdZeej ed< dZ	ee
 ed< dZeeej  ed< dS )MambaOutputa9  
    cache_params (`MambaCache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlast_hidden_staterW   rV   )r   r   r   r   r   r   rA   FloatTensor__annotations__rW   r
   rV   tuplerT   rT   rT   rU   r     s
   
 r   zK
    Base class for causal language model (or autoregressive) outputs.
    c                   @   s\   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
ee ed< dZeeej  ed< dS )MambaCausalLMOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    cache_params (`MambaCache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    NlosslogitsrW   rV   )r   r   r   r   r   r   rA   r   r   r   rW   r
   rV   r   rT   rT   rT   rU   r     s   
 r   c                       s   e Zd Z fddZdd Zdd Zdd Ze																dd
ee	j
 dee	j
 dee dee dee dee dee	j
 dee	j
 deeef fddZ  ZS )
MambaModelc                    sn   t    t j j| _t fddt j	D | _
d| _t j jd| _| | j |   d S )Nc                    s   g | ]}t  |d qS )r   )r   ).0idxr   rT   rU   
<listcomp>  s    z'MambaModel.__init__.<locals>.<listcomp>Fr   )r+   r,   r   r   
vocab_sizer-   
embeddings
ModuleListr   r   layersgradient_checkpointingr   r   norm_f"_register_load_state_dict_pre_hook	load_hook	post_initrP   r   rR   r   rU   r,     s    zMambaModel.__init__c                 G   s2   |D ]}d|v r| |||dd<  d S qd S )Nz
embedding.zembeddings.)popreplace)rP   
state_dictprefixargskrT   rT   rU   r     s   zMambaModel.load_hookc                 C      | j S Nr   r   rT   rT   rU   get_input_embeddings     zMambaModel.get_input_embeddingsc                 C   
   || _ d S r  r  rP   new_embeddingsrT   rT   rU   set_input_embeddings     
zMambaModel.set_input_embeddingsN	input_idsinputs_embedsrW   	use_cacheoutput_hidden_statesreturn_dictrX   rY   returnc	                 C   sd  |dur|n| j j}|dur|n| js| j jnd}|dur|n| j j}|du |duA r/td|du r8| |}| jrB| jrB|rBd}|rk|du rbt| j |	d|j
|jd}tjd| j j|j
d}n|du rjtdnd}|}	|rsdnd}
| jD ]}||	|||d	}	|r|
|	f }
qx| |	}	|r|
|	f }
|std
d |	||
fD S t|	|r||
dS d|
dS )a  
        cache_params (`MambaCache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        NFz:You must specify exactly one of input_ids or inputs_embedsr   r   r   zYou have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will be initialized for you automaticallyrT   r   c                 s   s    | ]	}|d ur|V  qd S r  rT   )r   vrT   rT   rU   	<genexpr>=  s    z%MambaModel.forward.<locals>.<genexpr>)r   rW   rV   )r   r  ra   r  use_return_dict
ValueErrorr   r   r
   rh   r   r)   rA   rB   r0   r   r   r   r   )rP   r  r  rW   r  r  r  rX   rY   rV   all_hidden_statesmixer_blockrT   rT   rU   r     s^   





zMambaModel.forward)NNNNNNNN)r   r   r   r,   r   r  r  r   r   rA   r   r
   boolr   r   r   r   r   rT   rT   rR   rU   r     sB    	

r   z
    The MAMBA Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                       s"  e Zd ZdgZ fddZdd Zdd Zdd	 Zd
d Z	d!de	de
eef dede
eef fddZ					d"dee deej deej fddZe									d#deej deej deej dee deej dee dee dee deej deeef fdd Z  ZS )$MambaForCausalLMzlm_head.weightc                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NFr'   )
r+   r,   r   r   r   r<   r-   r   lm_headr   r   rR   rT   rU   r,   O  s   
zMambaForCausalLM.__init__c                 C   r  r  r  r   rT   rT   rU   get_output_embeddingsV  r	  z&MambaForCausalLM.get_output_embeddingsc                 C   r
  r  r  r  rT   rT   rU   set_output_embeddingsY  r  z&MambaForCausalLM.set_output_embeddingsc                 C   s
   | j  S r  )r   r  r   rT   rT   rU   r  \  r  z%MambaForCausalLM.get_input_embeddingsc                 C   s   | j |S r  )r   r  r  rT   rT   rU   r  _  s   z%MambaForCausalLM.set_input_embeddingsr   outputsmodel_kwargsnum_new_tokensr  c                 K   s   | dd |d< | ddr$d|v r$|d d ur$|d dd  | |d< d|v r?|d }tj|||jd dfgdd	|d< |S )
NrW   r  TrX   r*   rY   r   r   r\   )getrA   catnew_onesrm   )rP   r"  r#  r$  kwargsrY   rT   rT   rU   #_update_model_kwargs_for_generationb  s   

z4MambaForCausalLM._update_model_kwargs_for_generationNrW   rX   rY   c           	      K   s   |r-|d u r
t d|d dkr"|d d df d}|d ur!d }ntjd| jj|jd}|d ur:|d u r:d|i}nd| i}|||||d |S )Nz`cache_position` should not be None as it should have been initialized in `model.generate`, you are responsible for passing in a valid `cache_position` if you are calling `prepare_inputs_for_generation` directly with `use_cache=True`r   r*   r  r  r  )rW   r  rX   rY   )	r  rf   rA   rB   r   r0   r   rE   update)	rP   r  r  r  rW   rX   rY   r(  model_inputsrT   rT   rU   prepare_inputs_for_generationu  s,   
z.MambaForCausalLM.prepare_inputs_for_generationr  r  labelsr  r  r  c
              
   K   s   |dur|n| j j}| j|||||||	|d}|d }| || jjj }d}|dur]||j}|dddddf 	 }|dddf 	 }t
 }||d|d|d}|ss|f|dd  }|durq|f| S |S t|||j|jdS )aS  
        cache_params (`MambaCache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        N)rW   r  r  r  r  rX   rY   r   .r*   r   )r   r   rW   rV   )r   r  r   r  r   rb   r)   rc   r   rE   r   rg   rh   r   rW   rV   )rP   r  rY   r  rW   r-  r  r  r  rX   r(  mamba_outputsrV   r   r   shift_logitsshift_labelsloss_fctoutputrT   rT   rU   r     s:   
zMambaForCausalLM.forward)r   )NNNNN)	NNNNNNNNN)r   r   r   _tied_weights_keysr,   r   r!  r  r  r   dictstrr   r3   r)  r   r
   rA   r   r,  r   r   r  r   r   r   r   r   r   rT   rT   rR   rU   r  F  sx    



0	

r  )r  r   r   ):r   r   dataclassesr   typingr   r   r   rA   torch.utils.checkpointr   torch.nnr   activationsr	   cache_utilsr
   
generationr   modeling_layersr   modeling_utilsr   utilsr   r   r   utils.import_utilsr   r   r   configuration_mambar   
get_loggerr   rM   mambapy.pscanr   &mamba_ssm.ops.selective_scan_interfacer   r   +mamba_ssm.ops.triton.selective_state_updater   causal_conv1dr   r   allrL   Moduler   r   r   r   r   r   r   r  __all__rT   rT   rT   rU   <module>   sn   

  
7i 