o
    ei                     @   s  d dl mZ d dlmZmZmZ d dlZd dlmZ d dlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZ ddlmZmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z. e*/e0Z1G dd deddZ2G dd dZ3G dd dej4Z5dd Z6d ej7d!e8d"ej7fd#d$Z9	%dPd&ej4d'ej7d(ej7d)ej7d*ej7dB d+e:d,e:d-e$e& fd.d/Z;dQd0d1Z<ee<G d2d3 d3ej4Z=G d4d5 d5ejj4Z>d6ej7d7e8fd8d9Z?d:d; Z@d<d= ZAd>d? ZBG d@dA dAej4ZCG dBdC dCej4ZDedDG dEdF dFej4ZEG dGdH dHeZFe'G dIdJ dJe"ZGe'G dKdL dLeGZHe'G dMdN dNeGeZIg dOZJdS )R    )Callable)AnyOptional	TypedDictN)nn)ACT2FN   )initialization)Cache)GenerationMixin)use_kernel_forward_from_hubuse_kernelized_func)lazy_load_kernel)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torchdynamo_compilinglogging)maybe_autocast   )BambaConfigc                   @   s@   e Zd ZU dZejed< ejed< eed< eed< ejed< dS )BambaFlashAttentionKwargsaU  
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    cu_seq_lens_q (`torch.LongTensor`):
        Gets cumulative sequence length for query state.
    cu_seq_lens_k (`torch.LongTensor`):
        Gets cumulative sequence length for key state.
    max_length_q (`int`):
        Maximum sequence length for query state.
    max_length_k (`int`):
        Maximum sequence length for key state.
    seq_idx (`torch.IntTensor`):
        Index of each packed sequence.
    cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kseq_idxN)	__name__
__module____qualname____doc__torch
LongTensor__annotations__int	IntTensor r/   r/   f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/bamba/modeling_bamba.pyr    5   s   
 

r    F)totalc                   @   s   e Zd ZdZdZejdfdefddZdd Z	d	d
 Z
	ddejdejdedeeef dB deejejf f
ddZdejfddZdejdedeeef fddZddedB defddZdS ) HybridMambaAttentionDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    FNconfigc                    s0  |j | _ d| _|j}|j}g | _g | _g | _t|jD ]^}| j | dkrS|  jt	j
 |j|j d|j |  ||dg7  _|  jt	j
 |j|j||dg7  _q|  jt	jg g  dg7  _|  jt	jg g  dg7  _| j| q fddt|jD | _ fddt|jD | _d S )	NFmamba   devicedtyper7   c                        g | ]}t jg g  d qS r9   r*   tensor.0_
batch_sizer7   r/   r0   
<listcomp>        z=HybridMambaAttentionDynamicCache.__init__.<locals>.<listcomp>c                    r:   r;   r<   r>   rA   r/   r0   rC      rD   )layers_block_typehas_previous_statemamba_d_convmamba_d_stateconv_states
ssm_statestransformer_layersrangenum_hidden_layersr*   zerosmamba_expandhidden_sizemamba_n_groupsmamba_n_headsmamba_d_headr=   append	key_cachevalue_cache)selfr3   rB   r8   r7   conv_kernel_sizessm_state_sizeir/   rA   r0   __init__]   sB   	
   z)HybridMambaAttentionDynamicCache.__init__c                 C   s
   t | jS N)lenrU   rW   r/   r/   r0   __len__   s   
z(HybridMambaAttentionDynamicCache.__len__c                 C   s   | j | | j| fS r\   )rU   rV   rW   	layer_idxr/   r/   r0   __getitem__   s   z,HybridMambaAttentionDynamicCache.__getitem__
key_statesvalue_statesra   cache_kwargsreturnc                 C   sz   | j | jd dkr|| j |< || j|< ntj| j | |gdd| j |< tj| j| |gdd| j|< | j | | j| fS )Nr   r5   dim)rU   shaperV   r*   cat)rW   rc   rd   ra   re   r/   r/   r0   update   s   
z'HybridMambaAttentionDynamicCache.updatebeam_idxc                 C   s   |   dkrdtt| jD ]X}| j| j}| j| d||| j|< | j| j}| j| d||| j|< | j| j}| j| d||| j|< | j	| j}| j	| d||| j	|< qdS dS )zDReorders the cache for beam search, given the selected beam indices.r   N)
get_seq_lengthrL   r]   rU   r7   index_selecttorV   rI   rJ   )rW   rm   ra   r7   r/   r/   r0   reorder_cache   s    z.HybridMambaAttentionDynamicCache.reorder_cachecache_positionc                 C   s$   d}|j d }| || }||fS )zDReturn the length and offset of the cache, used to generate the maskr   )rj   rn   )rW   rr   ra   	kv_offsetquery_length	kv_lengthr/   r/   r0   get_mask_sizes   s   
z/HybridMambaAttentionDynamicCache.get_mask_sizesr   c                 C   sN   || j vr
| j d n|}t| j|ks| j| jd dkrdS | j| jd S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   rg   )rK   r]   rU   rj   r`   r/   r/   r0   rn      s   "z/HybridMambaAttentionDynamicCache.get_seq_lengthr\   )r   )r&   r'   r(   r)   is_compileabler*   float16r   r[   r_   rb   Tensorr-   dictstrr   tuplerl   r+   rq   rv   rn   r/   r/   r/   r0   r2   M   s*    &
 r2   c                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )BambaRotaryEmbeddinginv_freqNr3   c                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultr   F)
persistentoriginal_inv_freq)superr[   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr3   rope_parametersr   compute_default_rope_parametersr   attention_scalingregister_bufferclone)rW   r3   r7   rope_init_fnr   	__class__r/   r0   r[      s   


zBambaRotaryEmbedding.__init__r7   ztorch.deviceseq_lenrf   ztorch.Tensorc                 C   sZ   | j d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNg      ?r   r5   r8   r6   )	r   getattrrP   num_attention_headsr*   arangeint64rp   float)r3   r7   r   baseri   attention_factorr   r/   r/   r0   r      s   
&z4BambaRotaryEmbedding.compute_default_rope_parametersc           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	|dd+ | |  
dd}tj||fdd	}| | j }| | j }	W d    n1 slw   Y  |j|jd
|	j|jd
fS )Nr   rg   r   mpscpuF)device_typeenabledr5   rh   r   )r   r   expandrj   rp   r7   
isinstancetyper|   r   	transposer*   rk   cosr   sinr8   )
rW   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   r/   r/   r0   forward   s   0&zBambaRotaryEmbedding.forwardr\   NNN)r&   r'   r(   r*   rz   r,   r   r[   staticmethodr   r-   r}   r   r   no_gradr   r   __classcell__r/   r/   r   r0   r~      s&   
 

r~   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nrg   r5   rh   )rj   r*   rk   )r   x1x2r/   r/   r0   rotate_half   s   r   hidden_statesn_reprf   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rj   r   reshape)r   r   batchnum_key_value_headsslenr   r/   r/   r0   	repeat_kv   s
   0r           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur |
| }
tjj|
dtjd	|j
}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr5   r   rg   )ri   r8   )ptrainingr   )r   num_key_value_groupsr*   matmulr   r   
functionalsoftmaxfloat32rp   r8   r   r   
contiguous)r   r   r   r   r   r   r   r   rc   rd   attn_weightsattn_outputr/   r/   r0   eager_attention_forward  s   
r   c                 C   s   | |}| |}|jd }| dd|f | d|df }}|dd|f |d|df }}	|| t||  }
|| t||  }tj|
|gdd}
tj||	gdd}|
|fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Removes the interleaving of cos and sin from GLM

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    rg   .Nrh   )	unsqueezerj   r   r*   rk   )qkr   r   unsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embedr/   r/   r0   apply_rotary_pos_emb&  s   


""r   c                       s   e Zd ZdZdedef fddZ				ddejde	ejejf dB d	ejdB d
e
dB dejdB dee de	ejejf fddZ  ZS )BambaAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr3   ra   c                    s   t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _d S )Nr   g      Tbias)r   r[   r3   ra   r   rP   r   r   r   r   r   attention_dropout	is_causalr   Linearattention_biasq_projk_projv_projo_proj)rW   r3   ra   r   r/   r0   r[   P  s(   
zBambaAttention.__init__Nr   position_embeddingsr   past_key_valuesrr   r   rf   c                 K   s  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
| jjt}|| |	|
||f| jskdn| j| jd|\}}|jg |dR   }| |}||fS )Nrg   r   r5   )r   r   rr   r   )r   r   )rj   r   r   viewr   r   r   r   rl   ra   r   get_interfacer3   _attn_implementationr   r   r   r   r   r   r   )rW   r   r   r   r   rr   r   input_shapehidden_shapequery_statesrc   rd   r   r   re   attention_interfacer   r   r/   r/   r0   r   g  s8   	

zBambaAttention.forwardNNNN)r&   r'   r(   r)   r   r-   r[   r*   rz   r}   r
   r+   r   r   r   r   r/   r/   r   r0   r   L  s,    r   c                       s(   e Zd Zd fdd	ZdddZ  ZS )	BambaRMSNormGatedư>c                    s&   t    tt|| _|| _d S r\   r   r[   r   	Parameterr*   onesweightvariance_epsilonrW   rP   epsr   r/   r0   r[     s   

zBambaRMSNormGated.__init__Nc                 C   sj   |j }|tj}|d ur|tj|tj }|djddd}|t	|| j
  }| j|| S Nr5   rg   T)keepdim)r8   rp   r*   r   r   r   silupowmeanrsqrtr   r   )rW   r   gateinput_dtypevariancer/   r/   r0   r     s   zBambaRMSNormGated.forwardr   r\   r&   r'   r(   r[   r   r   r/   r/   r   r0   r     s    r   input_tensorpad_sizec                 C   sH   t | jdkrddddd|ddfnddd|ddf}tjjj| |dddS )z
    Padding x tensor with `pad_size` on the seq_len dim (dim=1)

    Assumes that we only have tensors of either size 4 or 3
       r   constant)moder   )r]   rj   r*   r   r   pad)r  r  	pad_shaper/   r/   r0   pad_tensor_by_size  s   2r
  c                 C   sX   t | |} t| jdkr| | jd d|| jd S | | jd d|| jd | jd S )z
    Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
    simultaneously splitting it into chunk sequences.

    Assumes that we only have tensors of either size 4 or 3
    r   r   rg   r5   )r
  r]   rj   r   )r  r  
chunk_sizer/   r/   r0   reshape_into_chunks  s   
r  c                 C   s   |  d}| d jg |   |R  } tjtj||| jtjddd}| | d} tj| dd}tjtj||| jtjddd}|| tj	 }|S )zo
    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
    rg   .Nr6   )diagonalr   rw   rh   )
sizer   r*   trilr   r7   boolmasked_fillcumsuminf)r  r  masktensor_segsumr/   r/   r0   segment_sum  s   
  r  c                 C   sN   |dur%|j d dkr%|j d dkr%| j}| |dddddf  |} | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    Nr   r   )rj   r8   rp   )r   r   r8   r/   r/   r0   apply_mask_to_padding_states  s   $ r  c                       s   e Zd ZdZdedef fddZ				ddejde	dB d	ej
dB d
ejdB dejdB f
ddZ			dde	dB d	ej
dB d
ejdB fddZ				dde	dB d	ej
dB d
ejdB dejdB fddZ  ZS )
BambaMixeruP  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    The are a few differences between this and Mamba2Mixer:
    - The variable use_precomputed_states is slightly different due to the hybrid cache structure
    - There's a few non-obvious bugs fixed with batching in the slow path that exist in main
    - Some extra variables that our layer doesn't need have been removed
    - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged
    r3   ra   c                    s  t    |j| _|j| _|j| _|j| _t	|j
| j | _|| _|j| _|j| _t|j | _|j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _| jd| j | j  | _ t!j"| j | j |j| j| j | jd d| _#| j| j  | j }t!j$| j|| jd| _%t!&t'(| j| _)t'*d| jd }t!&t'+|| _,t-| j| jd| _.t!&t'(| j| _/t!j$| j| j| jd| _0t1d}t2|dd a3t2|dd a4t1d	}t2|d
d a5t2|dd a6t2|dd a7t8t5t4t3fa9t9st:;d d S t:;d d S )Nr5   r   )in_channelsout_channelsr   kernel_sizegroupspaddingr   r   zcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmselective_state_updatemamba_chunk_scan_combined mamba_split_conv1d_scan_combineda  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzDThe fast path for Bamba will be used when running the model on a GPU)<r   r[   rR   	num_headsrP   rH   rY   rG   rX   r-   rO   intermediate_sizera   mamba_conv_biasuse_conv_bias
hidden_act
activationr   actmamba_proj_biasuse_biasrms_norm_epslayer_norm_epsilonrQ   n_groupsrS   r   mamba_chunk_sizer  time_step_limittime_step_mintime_step_maxconv_dimr   Conv1dconv1dr   in_projr   r*   r   dt_biasr   logA_logr   normDout_projr   r   r   r!  r"  r#  r$  allis_fast_path_availableloggerwarning_once)rW   r3   ra   projection_sizeAcausal_conv1d	mamba_ssmr   r/   r0   r[     sh   

	zBambaMixer.__init__Nr   cache_paramsrr   r   r%   c                 C   s  t ||}| |}|j\}}}	| j| j }
|d uoD|joD|dkoD|j| j jd |j| j jd   ko8|kn  oD|d uoD|d dk}|r)|	dj
| j| j| jgdd\}}}t||j| j | jj	d| jj| j}tj
|| j|
|
gdd\}}}t| j  }|d d d df d d d d d f d| j| jjtjd}|d d d d d f dd| j}| jd d d df d| j}| jd d d df d| j}||| j|jd | j }||| j|jd | j }||| j| j}t|j| j ||||||d |dd
}||| j| j }| ||}|  |d d d df }|S t| j  }| j!d	td
fkr>i nd| j!i}| j"r||d u r|t#|| jj	d| jj| j|f| j| j$|| j| jj| jj%| j j| j j| j| jddd|}|S |j
| j| j| jgdd\}}}|d ur|&dd}t'j()|| j*|jd  df}|j| j +| | jdvr| ,| |&dddd |f &dd}nt-|&dd| jj	d| jj| j|d&dd}t ||}tj
|| j|
|
gdd\}}}t.|||d| j|||||| jd|||| jdf| j$| jd |d| jdd|\}}|d ur:|d ur:|j| j +| |||d}| ||}|  |}|S )Nr   r   rg   rh   .r   T)zr9  dt_softplusr   r  dt_limitF)r=  r  r%   r*  rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr5   )r   swish)r   r   r   r*  r%   )r  r=  rH  r%   rR  r9  rI  )/r  r8  rj   r0  rY   rF   rI   ra   rJ   squeezesplitr&  r5  r%  r   r7  r   r   r*  r*   expr;  r   r   r   rp   r   r9  r=  r   r"  r<  r>  r2  r   r$  r  r   r   r   r   r  rX   copy_r+  r!  r#  )rW   r   rG  rr   r   r%   projected_statesrB   r   r@   groups_time_state_sizeuse_precomputed_statesr   hidden_states_B_CdtBCrD  r9  r=  hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedrI   scan_output	ssm_stater/   r/   r0   cuda_kernels_forwardD  s  
	




<"
^"V
$




zBambaMixer.cuda_kernels_forwardc           3   
      s  |j \}}}|j}t||}|}	|	jjjjgdd\}
}}|d uoQ|joQ|dkoQ|j	j
 j d |jj
 j d   koE|kn  oQ|d uoQ|d dk}|r|j	j
 jddd|j	j
< |d d dd d f |j	j
 j|j	j
 d d d d df< |j	j
 jjjjd}tj|jjd dd}jr|jj }|}n8|d ur|dd}tj|j|j d  df}|j	j
 | |dddd |f dd}t||}tj|jjj jj gdd\}}}tj !  }|r[|jj
 j}|d d dd d f d d d df }|dd"||j d j#}j$d	 "j$j d j#}tjj%|||j }t&|j'd j'd }|d
 "jj#jjtj(d}t|d	 | j|d}|)|jddd d d f }|"|jjj |j d * }|)|d|j d }|d	 |dd d d f  }|)|dj#}||d	  j|d}|jj
 |jj
 | |  |)|jddd d d f }|"|jjj |j d * }|)|d|j d }|jj
 j|j|jd}|+|j j#j}|+|j jd}t,||}|+|jj#}j-d	 "j-j d j#}|||  |j}|)|dd d d df }ntj%|j$ }t&|j'd j'd }|)||dj#! }|)||dj! }|)||dj! }|j.jj djd}|j.jj djd}j/|j/  j/  j-d	 t0|  }||d	  }||j| } fdd||||fD \}}}}|1dddd}tj2|dd}tt3|} |d d d d d d d d d d d f |d d d d d d d d d d d f  }!|!jdd}"|"d	 | 1dddddd	  }#|#jdd}$|$d	 |d d d d d f  jdd}%t|d d d d d d dd f | }&||&1ddddd	  }'|'dd d d f |d	  jdd}(|r|jj
 d d d df j|(jd})nt4|(d d d df })tj5|)|(gdd}(tt3tj|d d d d d d df d}*|*dd}*|*d
 |(d d d d d df  jdd}+|+d d d df |+d d df }(},t|}-|dd d d f |(d d d d d df  }.|-1dddd}/|.d|/d	  }0|%|0 }|)|djj#}|| } dkrB|d d d |d d d d f }|)||d}|,d ur\|d ur\|jj
 |, 6||
}17|1|}2|2S )Nrg   rh   r   r   )shiftsdimsr9   r5   .r  ).NNr   r6   )ri   output_sizec                    s   g | ]	}t | jqS r/   )r  r  )r?   tr  rW   r/   r0   rC   w  s    z,BambaMixer.torch_forward.<locals>.<listcomp>r   r  rw   )r   r   )8rj   r8   r  r8  rU  r&  r5  r%  rF   rI   ra   rJ   rollrp   r7   r7  r   r*   sumrT  r(  r   r+  r   r   r   r  rX   rW  r0  rY   rV  r;  r   r   r   r9  softplusclampr2  r   r   r   r   bmmr=  repeat_interleaver  r
  permuter  r  
zeros_likerk   r<  r>  )3rW   input_statesrG  rr   r   rB   r   r@   r8   rX  r   r[  r\  rZ  rI   rb  r   r]  r^  rD  cache_devicer9  dAdBdBxrJ   ssm_states_reshaped
C_reshapedyr=  
D_residualA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesrd  state_decay_outC_times_statesstate_decay_out_permutedY_offrc  contextualized_statesr/   rj  r0   torch_forward  s   


@,
$"$$$P&*"&0(&
*
 zBambaMixer.torch_forwardc                 K   s   t rd| jjjjv rt s| |||||S |d urtd|j}|d urC|j	d dkrC|j	d dkrC||d d d d d f  
|}| ||||S )Ncudaz\`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`r   r   )r@  r8  r   r7   r   r   re  NotImplementedErrorr8   rj   rp   r  )rW   r   rG  rr   r   r%   r   r8   r/   r/   r0   r     s   	$ zBambaMixer.forwardr   r   )r&   r'   r(   r)   r   r-   r[   r*   rz   r2   r+   r.   re  r  r   r   r/   r/   r   r0   r    sV    Q
 .
 Sr  c                       s$   e Zd Z fddZdd Z  ZS )BambaMLPc                    sx   t    || _|j| _|j| _tj| j| j|jd| _tj| j| j|jd| _	tj| j| j|jd| _
t|j | _d S )Nr   )r   r[   r3   rP   r&  r   r   mlp_bias	gate_projup_proj	down_projr   r)  act_fnrW   r3   r   r/   r0   r[     s   
zBambaMLP.__init__c                 C   s$   |  | | || | }|S r\   )r  r  r  r  )rW   r   r  r/   r/   r0   r     s    zBambaMLP.forwardr  r/   r/   r   r0   r    s    
r  RMSNormc                       sF   e Zd Zddeddf fddZdejdejfdd	Zd
d Z  Z	S )BambaRMSNormr   r   rf   Nc                    s&   t    tt|| _|| _dS )z;
        BambaRMSNorm is equivalent to T5LayerNorm
        Nr   r   r   r/   r0   r[     s   

zBambaRMSNorm.__init__r   c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S r   )	r8   rp   r*   r   r   r   r   r   r   )rW   r   r   r   r/   r/   r0   r     s
   zBambaRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)r}   r   rj   r   r^   r/   r/   r0   
extra_repr  s   zBambaRMSNorm.extra_reprr  )
r&   r'   r(   r   r[   r*   rz   r   r  r   r/   r/   r   r0   r    s    r  c                       s   e Zd Zddededef fddZ							dd	ejd
ejdB dej	dB de
dB dedB dedB dej	dB deejejf dB dee deejeejejf dB f fddZ  ZS )BambaDecoderLayerr4   r3   ra   
layer_typec                    s   t    d}|dkrtnd }||| _t|j|jd| _t|j|jd| _|| _	|dkr6t
||d| _d S |dkrBt||| _d S td)Nr   r  r4   )r3   ra   	attentionzInvalid layer_type)r   r[   r  feed_forwardr  rP   r.  input_layernormpre_ff_layernormr  r  r4   r   	self_attn
ValueError)rW   r3   ra   r  num_expertsffn_layer_classr   r/   r0   r[     s   

zBambaDecoderLayer.__init__NFr   r   r   r   output_attentions	use_cacherr   r   r   rf   c	                 K   s   |}
|  |}| jdkr| jd||||d|	}d}n| jdkr4| jd||||||||d|	\}}|
| }|}
| |}| |}|
| }|f}|rR||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs. Can be used to provide `BambaFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
        r4   )r   rG  rr   r   Nr  )r   r   r   r   r  r  rr   r   r/   )r  r  r4   r  r  r  )rW   r   r   r   r   r  r  rr   r   r   residualself_attn_weightsoutputsr/   r/   r0   r     sD   "


	



zBambaDecoderLayer.forward)r4   )NNNFFNN)r&   r'   r(   r   r-   r|   r[   r*   rz   r+   r2   r  r}   r   r    FloatTensorr   r   r/   r/   r   r0   r    s<    	
r  c                       sL   e Zd ZU eed< dZdZdgZdZdZ	dZ
dZe  fddZ  ZS )BambaPreTrainedModelr3   modelTr  r   c              
      sX   t  | t|tr*t|j t|jt	
t	d|jd  t|j d S d S )Nr   )r   _init_weightsr   r  initones_r9  rW  r;  r*   r:  r   r%  r=  )rW   r   r   r/   r0   r  e  s   
"z"BambaPreTrainedModel._init_weights)r&   r'   r(   r   r,   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulr*   r   r  r   r/   r/   r   r0   r  Y  s   
 r  c                       s   e Zd Zdef fddZee									ddejdB dej	dB dejdB de
dB d	ejdB d
edB dedB dedB dejdB dee defddZdd Z  ZS )
BambaModelr3   c                    s   t  | |j| _|j| _t|j|j| j| _g }t	|j
D ]}|t|||j| d q t|| _|j| _t|j|jd| _t|d| _d| _|   d S )N)ra   r  r  )r3   F)r   r[   pad_token_idpadding_idx
vocab_sizer   	EmbeddingrP   embed_tokensrL   rM   rT   r  rE   
ModuleListlayersr   r  r.  final_layernormr~   
rotary_embgradient_checkpointing	post_init)rW   r3   decoder_layersrZ   r   r/   r0   r[   p  s   zBambaModel.__init__N	input_idsr   r   r   inputs_embedsr  r  output_hidden_statesrr   r   rf   c
                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}|d u rB| 	|}|}|rO|d u rOtd |	d u r^t
j|jd |jd}	|d u rg|	d}t| j |||	||d}| ||	}| j||d	}|rd
nd }|rd
nd }| jD ]5}|jdkr|n|}|r||f7 }||f||||||	|d|
}|d }|r|d d ur||d f7 }q| |}|r||f7 }|r|jsd|_|sd n|}t||||dS )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzBamba requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was provided, so no cache will be returned.r   r9   r   )r3   r  r   rr   r   r   )r   r/   r4   )r   r   r   r  r  rr   r   T)last_hidden_stater   r   
attentions)r3   r  r  r  r  r  r   rA  rB  r  r*   r   rj   r7   r   r   _update_mamba_maskr  r  r  r  rF   r   )rW   r  r   r   r   r  r  r  r  rr   r   r   causal_mask
mamba_maskr   all_hidden_statesall_self_attnsdecoder_layer
layer_masklayer_outputs
next_cacher/   r/   r0   r     s   



	


zBambaModel.forwardc                 C   s.   |}|d dks|durt |dkrd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr   )r*   r?  )rW   r   rr   r  r/   r/   r0   r    s   "zBambaModel._update_mamba_mask)	NNNNNNNNN)r&   r'   r(   r   r[   r   r   r*   r+   rz   r2   r  r  r   r    r   r   r  r   r/   r/   r   r0   r  n  sJ    	
er  c                       s   e Zd ZddiZddiZddgdgfiZ fddZee																					
dde	j
d	B de	jd	B de	j
d	B ded	B de	jd	B de	j
d	B ded	B ded	B ded	B de	j
d	B dee	jB defddZ												d fdd	Z  ZS )BambaForCausalLMzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr   logitsc                    sH   t  | t|| _|j| _tj|j|jdd| _|j	| _	| 
  d S )NFr   )r   r[   r  r  r  r   r   rP   r  z_loss_coefficientr  r  r   r/   r0   r[     s   
zBambaForCausalLM.__init__Nr   r  r   r   r   r  labelsr  r  r  rr   logits_to_keeprf   c                 K   s   |dur|n| j j}|	dur|	n| j j}	| jd
||||||||	|
d	|}|j}t|tr4t| dn|}| |dd|ddf }d}|durt| j	d
||| j j
d|}| jdkrt|jddj|jdd }|| j|  }t|||j|j|jd	S )aJ  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BambaForCausalLM

        >>> model = BambaForCausalLM.from_pretrained("...")
        >>> tokenizer = AutoTokenizer.from_pretrained("...")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)	r  r   r   r   r  r  r  r  rr   )r  r  r  r   rg   rh   r   r5   )lossr  r   r   r  r/   )r3   r  r  r  r  r   r-   slicer  loss_functionr  r  	logsumexprp   r8   r   r   r   r   r   r  )rW   r  r   r   r   r  r  r  r  r  rr   r  r   r  r   slice_indicesr  r  z_lossr/   r/   r0   r     s@   '

 zBambaForCausalLM.forwardTFc	              
      sX   |d u rt | j|jd | j| jd}| jj|	d< t j|f|||||||d|	}
|
S )Nr   r9   r  )r   r   r  rr   r   r  is_first_iteration)r2   r3   rj   r8   r7   num_logits_to_keepr   prepare_inputs_for_generation)rW   r  r   r   r  rr   r   r  r  r   model_inputsr   r/   r0   r  U  s&   	z.BambaForCausalLM.prepare_inputs_for_generation)NNNNNNNNNNr   )NNNNNTF)r&   r'   r(   _tied_weights_keys_tp_plan_pp_planr[   r   r   r*   r+   rz   r2   r  r  r-   r   r   r  r   r/   r/   r   r0   r    sf    
	
Pr  )r  r  r  )r   )r   )Kcollections.abcr   typingr   r   r   r*   r   transformers.activationsr    r	   r  cache_utilsr
   
generationr   integrationsr   r   integrations.hub_kernelsr   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   configuration_bambar   
get_loggerr&   rA  r    r2   Moduler~   r   rz   r-   r   r   r   r   r   r   r
  r  r  r  r  r  r  r  r  r  r  __all__r/   r/   r/   r0   <module>   s   
kA

&F   o`  