o
    wi                     @   sN  d Z ddlmZmZmZ ddlZddlZddlmZ ddlm	  m
  mZ ddlmZ ddlmZ ddlmZmZmZmZmZmZ ddlmZmZmZmZ d	d
lmZ d	dlmZmZ d	dl m!Z! d	dl"m#Z# d	dl$m%Z%m&Z&m'Z' d	dl(m)Z)m*Z* ddl+m,Z, e* rddl-m.Z. ddl/m0Z0m1Z1 ndZ.e) rddl2m3Z3m4Z4 nd\Z4Z3e5e.e3e4fZ6e'7e8Z9G dd deddZ:G dd dej;Z;G dd deZ<d5ddZ=G d d! d!eZ>G d"d# d#eZ?d$d% Z@G d&d' d'ejAZBG d(d) d)eZCG d*d+ d+eZDG d,d- d-eZEe%G d.d/ d/e!ZFe%G d0d1 d1eFZGG d2d3 d3eZHg d4ZIdS )6zPyTorch Bamba model.    )Optional	TypedDictUnionN)nn)ACT2FN)JambaAttentionDecoderLayer)LlamaAttentionLlamaForCausalLMLlamaMLPLlamaRMSNormLlamaRotaryEmbeddingrotate_half)MambaRMSNormGatedpad_tensor_by_sizereshape_into_chunkssegment_sum   )AttentionMaskConverter)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)Unpack)auto_docstringcan_return_tuplelogging)is_causal_conv1d_availableis_mamba_2_ssm_available   )BambaConfig)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combined)causal_conv1d_fncausal_conv1d_update)NNc                   @   s@   e Zd ZU dZejed< ejed< eed< eed< ejed< dS )BambaFlashAttentionKwargsa  
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    Attributes:
        cu_seq_lens_q (`torch.LongTensor`)
            Gets cumulative sequence length for query state.
        cu_seq_lens_k (`torch.LongTensor`)
            Gets cumulative sequence length for key state.
        max_length_q (`int`):
            Maximum sequence length for query state.
        max_length_k (`int`):
            Maximum sequence length for key state.
        seq_idx (`torch.IntTensor):
            Index of each packed sequence.
    cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kseq_idxN)	__name__
__module____qualname____doc__torch
LongTensor__annotations__int	IntTensor r3   r3   d/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/bamba/modular_bamba.pyr$   L   s   
 

r$   F)totalc                       s.   e Zd ZdZejdfdef fddZ  ZS ) HybridMambaAttentionDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    Nconfigc                    sB  t  | | |j| _d| _|j}|j}g | _g | _g | _t	|j
D ]^}| j| dkr\|  jtj |j|j d|j |  ||dg7  _|  jtj |j|j||dg7  _q$|  jtjg g  dg7  _|  jtjg g  dg7  _| j| q$ fddt	|j
D | _ fddt	|j
D | _d S )	NFmamba   devicedtyper;   c                        g | ]}t jg g  d qS r=   r.   tensor.0_
batch_sizer;   r3   r4   
<listcomp>        z=HybridMambaAttentionDynamicCache.__init__.<locals>.<listcomp>c                    r>   r?   r@   rB   rE   r3   r4   rG      rH   )super__init__layers_block_typehas_previous_statemamba_d_convmamba_d_stateconv_states
ssm_statestransformer_layersrangenum_hidden_layersr.   zerosmamba_expandhidden_sizemamba_n_groupsmamba_n_headsmamba_d_headrA   append	key_cachevalue_cache)selfr7   rF   r<   r;   conv_kernel_sizessm_state_sizei	__class__rE   r4   rJ   t   sD   	
   z)HybridMambaAttentionDynamicCache.__init__)	r*   r+   r,   r-   r.   float16r   rJ   __classcell__r3   r3   ra   r4   r6   f   s    "r6   c                   @      e Zd ZdS )BambaRotaryEmbeddingNr*   r+   r,   r3   r3   r3   r4   rf          rf   c                 C   s   | |}| |}|jd }| dd|f | d|df }}|dd|f |d|df }	}
|| t||  }|	| t|	|  }tj||gdd}tj||
gdd}||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Removes the interleaving of cos and sin from GLM

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    .Ndim)	unsqueezeshaper   r.   cat)qkcossinposition_idsunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embedr3   r3   r4   apply_rotary_pos_emb   s   


""r|   c                   @   re   )BambaAttentionNrg   r3   r3   r3   r4   r}      rh   r}   c                   @   re   )BambaRMSNormGatedNrg   r3   r3   r3   r4   r~      rh   r~   c                 C   sN   |dur%|j d dkr%|j d dkr%| j}| |dddddf  |} | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    Nr   r   )rm   r<   to)hidden_statesattention_maskr<   r3   r3   r4   apply_mask_to_padding_states   s   $ r   c                       s   e Zd ZdZdedef fddZ				ddejde	e
 d	e	ej d
e	ej de	ej f
ddZ			dde	e
 d	e	ej d
e	ej fddZ				dde	e
 d	e	ej d
e	ej de	ej fddZ  ZS )
BambaMixeruO  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    The are a few differences between this and Mamba2Mixer:
    - The variable use_precomputed_states is slightly different due to the HybridCache structure
    - There's a few non-obvious bugs fixed with batching in the slow path that exist in main
    - Some extra variables that our layer doesn't need have been removed
    - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged
    r7   	layer_idxc                    s  t    |j| _|j| _|j| _|j| _t	|j
| j | _|| _|j| _|j| _t|j | _|j| _|j| _|j| _|j| _|j| _dtdf| _d| _d| _ | jd| j | j  | _!t"j#| j!| j!|j| j| j!| jd d| _$| j| j! | j }t"j%| j|| jd| _&t"'t()| j| _*t(+d| jd }t"'t(,|| _-d	| j-_.t/| j| jd
| _0t"'t()| j| _1d	| j1_.t"j%| j| j| jd| _2t3st45d d S t45d d S )N        infgMbP?g?r9   r   )in_channelsout_channelsbiaskernel_sizegroupspadding)r   Tepsa  The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzDThe fast path for Bamba will be used when running the model on a GPU)6rI   rJ   rX   	num_headsrV   rN   r_   rM   r^   r1   rU   intermediate_sizer   mamba_conv_biasuse_conv_bias
hidden_act
activationr   actmamba_proj_biasuse_biasrms_norm_epslayer_norm_epsilonrW   n_groupsrY   head_dimmamba_chunk_size
chunk_sizefloattime_step_limittime_step_mintime_step_maxconv_dimr   Conv1dconv1dLinearin_proj	Parameterr.   onesdt_biasarangelogA_log_no_weight_decayr~   normDout_projis_fast_path_availableloggerwarning_once)r]   r7   r   projection_sizeAra   r3   r4   rJ      s\   

	zBambaMixer.__init__Nr   cache_paramscache_positionr   r)   c                 C   s  t ||}| |}|j\}}}	| j| j }
|d uoD|joD|dkoD|j| j jd |j| j jd   ko8|kn  oD|d uoD|d dk}|r)|	dj
| j| j| jgdd\}}}t||j| j | jj	d| jj| j}tj
|| j|
|
gdd\}}}t| j  }|d d d df d d d d d f d| j| jjtjd}|d d d d d f dd| j}| jd d d df d| j}| jd d d df d| j}||| j|jd | j }||| j|jd | j }||| j| j}t|j| j ||||||d |dd
}||| j| j }| ||}|  |d d d df }|S t| j  }| j!d	td
fkr>i nd| j!i}| j"r||d u r|t#|| jj	d| jj| j|f| j| j$|| j| jj| jj%| j j| j j| j| jddd|}|S |j
| j| j| jgdd\}}}|d ur|&dd}t'j()|| j*|jd  df}|j| j +| | jdvr| ,| |&dddd |f &dd}nt-|&dd| jj	d| jj| j|d&dd}t ||}tj
|| j|
|
gdd\}}}t.|||d| j|||||| jd|||| jdf| j$| jd |d| jdd|\}}|d ur:|d ur:|j| j +| |||d}| ||}|  |}|S )Nr   r   ri   rj   .r<   T)zr   dt_softplusr   r   dt_limitF)r   r   r)   r   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr9   )siluswish)xweightr   r   r)   )r   r   r   r)   r   r   r   )/r   r   rm   r   r_   rL   rO   r   rP   squeezesplitr   r   r   r#   r   r   r   r   r.   expr   r   expandr   r   float32r   r   viewr   r   r   r   trainingr!   r   variance_epsilon	transposer   
functionalpadr^   copy_r   r"   r    )r]   r   r   r   r   r)   projected_statesrF   seq_lenrD   groups_time_state_sizeuse_precomputed_statesgatehidden_states_B_CdtBCr   r   r   hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedrO   scan_output	ssm_stater3   r3   r4   cuda_kernels_forward.  s  
	




<"
^"V
$




zBambaMixer.cuda_kernels_forwardc           3   
      s  |j \}}}|j}t||}|}	|	jjjjgdd\}
}}|d uoQ|joQ|dkoQ|j	j
 j d |jj
 j d   koE|kn  oQ|d uoQ|d dk}|r|j	j
 jddd|j	j
< |d d dd d f |j	j
 j|j	j
 d d d d df< |j	j
 jjjjd}tj|jjd dd}jr|jj }|}n8|d ur|dd}tj|j|j d  df}|j	j
 | |dddd |f dd}t||}tj|jjj jj gdd\}}}tj !  }|r[|jj
 j}|d d dd d f d d d df }|dd"||j d j#}j$d	 "j$j d j#}tjj%|||j }t&|j'd j'd }|d
 "jj#jjtj(d}t|d	 | j|d}|)|jddd d d f }|"|jjj |j d * }|)|d|j d }|d	 |dd d d f  }|)|dj#}||d	  j|d}|jj
 |jj
 | |  |)|jddd d d f }|"|jjj |j d * }|)|d|j d }|jj
 j|j|jd}|+|j j#j}|+|j jd}t,||}|+|jj#}j-d	 "j-j d j#}|||  |j}|)|dd d d df }ntj%|j$ }t&|j'd j'd }|)||dj#! }|)||dj! }|)||dj! }|j.jj djd}|j.jj djd}j/|j/  j/  j-d	 t0|  }||d	  }||j| } fdd||||fD \}}}}|1dddd}tj2|dd}tt3|} |d d d d d d d d d d d f |d d d d d d d d d d d f  }!|!jdd}"|"d	 | 1dddddd	  }#|#jdd}$|$d	 |d d d d d f  jdd}%t|d d d d d d dd f | }&||&1ddddd	  }'|'dd d d f |d	  jdd}(|r|jj
 d d d df j|(jd})nt4|(d d d df })tj5|)|(gdd}(tt3tj|d d d d d d df d}*|*dd}*|*d
 |(d d d d d df  jdd}+|+d d d df |+d d df }(},t|}-|dd d d f |(d d d d d df  }.|-1dddd}/|.d|/d	  }0|%|0 }|)|djj#}|| } dkrB|d d d |d d d d f }|)||d}|,d ur\|d ur\|jj
 |, 6||
}17|1|}2|2S )Nri   rj   r   r   )shiftsdimsr=   r9   .).N).NNr   r:   )rk   output_sizec                    s   g | ]	}t | jqS r3   )r   r   )rC   tpad_sizer]   r3   r4   rG   a  s    z,BambaMixer.torch_forward.<locals>.<listcomp>r      )r   r   )8rm   r<   r   r   r   r   r   r   rL   rO   r   rP   rollr   r;   r   r   r.   sumr   r   r   r   r   r   r   r   r^   r   r   r_   r   r   r   r   r   r   softplusclampr   r   reshape
contiguousr   bmmr   repeat_interleaver   r   permutecumsumr   
zeros_likern   r   r   )3r]   input_statesr   r   r   rF   r   rD   r<   r   r   r   r   r   rO   r   r   r   r   r   cache_devicer   dAdBdBxrP   ssm_states_reshaped
C_reshapedyr   
D_residualA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesr   state_decay_outC_times_statesstate_decay_out_permutedY_offr   contextualized_statesr3   r   r4   torch_forward  s   


@,
$"$$$P&*"&0(&
*
 zBambaMixer.torch_forwardc                 K   s   t rd| jjjjv r| |||||S |d urtd|j}|d ur@|jd dkr@|jd dkr@||d d d d d f  	|}| 
||||S )Ncudaz\`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`r   r   )r   r   r   r;   typer   NotImplementedErrorr<   rm   r   r  )r]   r   r   r   r   r)   kwargsr<   r3   r3   r4   forward  s   	$ zBambaMixer.forward)NNNN)NNN)r*   r+   r,   r-   r   r1   rJ   r.   Tensorr   r6   r/   r2   r   r  r  rd   r3   r3   ra   r4   r      sV    F
 .
 Sr   c                   @   re   )BambaMLPNrg   r3   r3   r3   r4   r    rh   r  c                   @   re   )BambaRMSNormNrg   r3   r3   r3   r4   r    rh   r  c                       s   e Zd Zddededef fddZ							dd	ejd
e	ej de	ej
 de	e de	e de	e de	ej
 de	eejejf  dee deeje	eejejf  f fddZ  ZS )BambaDecoderLayerr8   r7   r   
layer_typec                    sl   t    | `d}|dkrtnd }||| _|| _|dkr&t||d| _d S |dkr2t||| _d S t	d)Nr   r8   )r7   r   	attentionzInvalid layer_type)
rI   rJ   	self_attnr  feed_forwardr  r   r8   r}   
ValueError)r]   r7   r   r  num_expertsffn_layer_classra   r3   r4   rJ     s   

zBambaDecoderLayer.__init__NFr   r   rs   past_key_valueoutput_attentions	use_cacher   position_embeddingsr  returnc	                 K   s   |}
|  |}| jdkr| jd||||d|	}d}n| jdkr4| jd||||||||d|	\}}|
| }|}
| |}| |}|
| }|f}|rR||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs. Can be used to provide `BambaFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
        r8   )r   r   r   r   Nr  )r   r   rs   r#  r$  r%  r   r&  r3   )input_layernormr  r8   r  pre_ff_layernormr  )r]   r   r   rs   r#  r$  r%  r   r&  r  residualself_attn_weightsoutputsr3   r3   r4   r    sD   "


	



zBambaDecoderLayer.forward)r8   )NNNFFNN)r*   r+   r,   r   r1   strrJ   r.   r  r   r/   r6   booltupler   r$   FloatTensorr  rd   r3   r3   ra   r4   r    s<    	
r  c                   @   s:   e Zd ZeZdZdZdgZdZdZ	dZ
dZdZdd ZdS )BambaPreTrainedModelmodelTr  past_key_valuesc                 C   s   | j j}t|tjtjfr%|jjjd|d |j	d ur#|j	j
  d S d S t|ttfr5|jjd d S t|tjrV|jjjd|d |jd urT|jj|j 
  d S d S t|try|jjd ttd|jd |j_|jjd d S d S )Nr   )meanstdg      ?r   )r7   initializer_range
isinstancer   r   r   r   datanormal_r   zero_r~   r  fill_	Embeddingpadding_idxr   r   r.   r   r   r   r   r   )r]   moduler5  r3   r3   r4   _init_weights2  s$   


z"BambaPreTrainedModel._init_weightsN)r*   r+   r,   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_2_supports_sdpa_supports_cache_class_is_statefulr?  r3   r3   r3   r4   r1  &  s    r1  c                       s  e Zd Zdef fddZdd Zdd Zee									d!d	e	e
j d
e	e
j de	e
j de	e de	e
j de	e de	e de	e de	e
j dee defddZd
e
jde
jde
jdedef
ddZed
e
jdedede
jde
jdefddZdd  Z  ZS )"
BambaModelr7   c                    s   t  | |j| _|j| _t|j|j| j| _g }t	|j
D ]}|t|||j| d q t|| _|j| _t|j|jd| _t|d| _d| _|   d S )N)r   r  r   )r7   F)rI   rJ   pad_token_idr=  
vocab_sizer   r<  rV   embed_tokensrR   rS   rZ   r  rK   
ModuleListlayers_attn_implementationr  r   final_layernormrf   
rotary_embgradient_checkpointing	post_init)r]   r7   decoder_layersr`   ra   r3   r4   rJ   F  s   zBambaModel.__init__c                 C   s   | j S NrL  )r]   r3   r3   r4   get_input_embeddingsY  s   zBambaModel.get_input_embeddingsc                 C   s
   || _ d S rU  rV  )r]   valuer3   r3   r4   set_input_embeddings\  s   
zBambaModel.set_input_embeddingsN	input_idsr   rs   r3  inputs_embedsr%  r$  output_hidden_statesr   r  r'  c
                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}|d u rB| 	|}|}|rO|d u rOtd |	d u r^t
j|jd |jd}	|d u rg|	d}| |||	||}| ||	}| ||}|rdnd }|rdnd }| jD ]5}|jd	kr|n|}|r||f7 }||f||||||	|d
|
}|d }|r|d d ur||d f7 }q| |}|r||f7 }|r|jsd|_|sd n|}t||||dS )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzBamba requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was provided, so no cache will be returned.r   r=   r   r3   r8   )r   rs   r#  r$  r%  r   r&  T)last_hidden_stater3  r   
attentions)r7   r$  r\  r%  r   rR  r   r   r   rL  r.   r   rm   r;   rl   _update_causal_mask_update_mamba_maskrQ  rN  r  rP  rL   r   )r]   rZ  r   rs   r3  r[  r%  r$  r\  r   r  r   causal_mask
mamba_maskr&  all_hidden_statesall_self_attnsdecoder_layer
layer_masklayer_outputs
next_cacher3   r3   r4   r  _  s~   




	


zBambaModel.forwardinput_tensorc                 C   s   | j jdkr|d urd|v r|S d S |d ur| nd}| j jdkr0|s0tj|||| jdr0d S |j}|jd }t|t	j
rC|jd n|| d }	| j|||	|||jd d}
| j jdkru|d uru|jjd	v ru|sut	|j}t|
|}
|
S )
Nflash_attention_2r   r   sdpa)r[  past_key_values_lengthis_trainingr   ri   )sequence_lengthtarget_lengthr<   r   rF   )r  xpunpu)r7   rO  get_seq_lengthr   _ignore_causal_mask_sdpar   r<   rm   r7  r.   r  5_prepare_4d_causal_attention_mask_with_cache_positionr;   r  finfomin_unmask_unattended)r]   r   ri  r   r3  r$  past_seen_tokensr<   rn  ro  ra  	min_dtyper3   r3   r4   r_    sF   



zBambaModel._update_causal_maskrn  ro  r<   rF   c                 K   s|  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	| ddddddf | ddddddf kdddd| dddf |}
|ddddddd|	f |
 }|dk}|ddddddd|	f |||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nr   )
fill_valuer<   r;   r   )diagonalr=   ri   r   )rk   r.   ru  rv  fullr;   triur   r   r   clonerm   r   masked_fill)r   rn  ro  r<   r   rF   r  ra  ry  mask_lengthpadding_attention_maskpadding_maskr3   r3   r4   rt    s2    $
.$  z@BambaModel._prepare_4d_causal_attention_mask_with_cache_positionc                 C   s.   |}|d dks|durt |dkrd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr   )r.   all)r]   r   r   rb  r3   r3   r4   r`  7  s   "zBambaModel._update_mamba_mask)	NNNNNNNNN)r*   r+   r,   r   rJ   rW  rY  r   r   r   r.   r/   r  r6   r0  r.  r   r$   r   r  r_  staticmethodr1   r<   rt  r`  rd   r3   r3   ra   r4   rI  D  s    	
b
<7rI  c                       s   e Zd Z fddZ											ddeej deej deej dee d	eej	 d
eej dee
 dee
 dee
 deej deeejf defddZ						dddZ  ZS )BambaForCausalLMc                    s    t  | |j| _|   d S rU  )rI   rJ   z_loss_coefficientrS  )r]   r7   ra   r3   r4   rJ   D  s   zBambaForCausalLM.__init__Nr   rZ  r   rs   r3  r[  labelsr%  r$  r\  r   logits_to_keepr'  c                 K   s   |dur|n| j j}|	dur|	n| j j}	| jd
||||||||	|
d	|}|j}t|tr4t| dn|}| |dd|ddf }d}|durt| j	d
||| j j
d|}| jdkrt|jddj|jdd }|| j|  }t|||j|j|jd	S )aJ  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BambaForCausalLM

        >>> model = BambaForCausalLM.from_pretrained("...")
        >>> tokenizer = AutoTokenizer.from_pretrained("...")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)	rZ  r   rs   r3  r[  r%  r$  r\  r   )logitsr  rK  r   ri   rj   r   r9   )lossr  r3  r   r^  r3   )r7   r$  r\  r2  r]  r7  r1   slicelm_headloss_functionrK  r  	logsumexpr   r<   powr4  r   r3  r   r^  )r]   rZ  r   rs   r3  r[  r  r%  r$  r\  r   r  r  r,  r   slice_indicesr  r  z_lossr3   r3   r4   r  K  s@   %

 zBambaForCausalLM.forwardTc              	   K   s  |d u }	|	s5|d us|d |j d kr"|d d |j d  d f }n!|j d |j d kr4|d d |f }nt| j|j d | j| jd}|d url|d u rl| dd }||dkd |	sl|d d |j d  d f }|d urw|	rwd|i}
nd| i}
|
	||||| jj
|d |
S )Nri   r   r   r=   r[  rZ  )rs   r3  r%  r   r  r   )rm   r6   r7   r<   r;   longr   masked_fill_r   updatenum_logits_to_keep)r]   rZ  r3  r   r[  r   rs   r%  r  empty_past_kvmodel_inputsr3   r3   r4   prepare_inputs_for_generation  s:   

z.BambaForCausalLM.prepare_inputs_for_generation)NNNNNNNNNNr   )NNNNNT)r*   r+   r,   rJ   r   r.   r/   r  r6   r0  r.  r   r1   r   r  r  rd   r3   r3   ra   r4   r  C  sZ    		

Pr  )rI  r  r1  )Nr   )Jr-   typingr   r   r   r.   torch.utils.checkpointr   (transformers.models.jamba.modeling_jambamodelsjambamodeling_jambatransformers.activationsr   r   (transformers.models.llama.modeling_llamar   r	   r
   r   r   r   *transformers.models.mamba2.modeling_mamba2r   r   r   r   modeling_attn_mask_utilsr   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.import_utilsr   r   configuration_bambar   +mamba_ssm.ops.triton.selective_state_updater   !mamba_ssm.ops.triton.ssd_combinedr    r!   causal_conv1dr"   r#   r  r   
get_loggerr*   r   r$   r6   rf   r|   r}   r~   r   Moduler   r  r  r  r1  rI  r  __all__r3   r3   r3   r4   <module>   s`    
6
(   d`  