o
    wi                     @   sh  d dl Z d dlZd dlmZ d dlmZmZmZ d dlZd dl	Zd dlm
Z
 ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z. e rd dl/m0Z0 d dl1m2Z2m3Z3 nd\Z0Z2Z3e rd dl4m5Z5m6Z6 nd\Z6Z5e7e0e5e6fZ8dZ9e:e;Z<G dd dej
j=Z>G dd de+Z?G dd de'Z@G dd  d eZAG d!d" d"e#ZBG d#d$ d$e
j=ZCG d%d& d&e
j=ZDG d'd( d(e$ZEG d)d* d*e)ZFG d+d, d,e(ZGG d-d. d.eZHG d/d0 d0e*eHZIG d1d2 d2e%ZJG d3d4 d4e&ZKg d5ZLdS )6    N)cycle)CallableOptionalUnion)nn   )ACT2FN)FlashAttentionKwargs)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)logging)is_causal_conv1d_availableis_mamba_ssm_available   )LlamaRotaryEmbeddingapply_rotary_pos_emb)pad_tensor_by_sizereshape_into_chunkssegment_sum)
ZambaAttentionZambaAttentionDecoderLayerZambaForCausalLMZambaForSequenceClassificationZambaHybridDynamicCacheZambaHybridLayerZambaMambaDecoderLayer
ZambaModelZambaRMSNormeager_attention_forward   )Zamba2Config)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combinedNNN)causal_conv1d_fncausal_conv1d_updateNNzZyphra/Zamba2-2.7Bc                       s(   e Zd Zd fdd	ZdddZ  ZS )	Zamba2RMSNormGatedư>c                    s,   t    tt|| _|| _|| _d S N)	super__init__r   	Parametertorchonesweightvariance_epsilon
group_size)selfhidden_sizer4   eps	__class__ f/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/zamba2/modular_zamba2.pyr.   J   s   

zZamba2RMSNormGated.__init__Nc           	      C   s   |j }|tj}|d ur|tj|tj }|j^ }}|| j }|j	g ||| jR  }|
djddd}|t|| j  }|j	g ||| j R  }| j|| S )Nr   T)keepdim)dtypetor0   float32r   
functionalsilushaper4   viewpowmeanrsqrtr3   r2   )	r5   hidden_statesgateinput_dtypeprefix_dimslast_dimgroup_counthidden_states_groupvariancer:   r:   r;   forwardP   s   
zZamba2RMSNormGated.forward)r+   r,   )__name__
__module____qualname__r.   rP   __classcell__r:   r:   r8   r;   r*   I   s    r*   c                   @      e Zd ZdS )Zamba2RMSNormNrQ   rR   rS   r:   r:   r:   r;   rV   ^       rV   c                
   @   sx   e Zd ZdZejdfdededejde	e
 fddZd	ed
ejdejdejfddZdd Zdd	e	e defddZdS )Zamba2HybridDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    Nconfig
batch_sizer>   devicec              	      s  || _ |j| _d| _t|j|j | _|j| _|j	| _
|j| _g | _i | _i | _i | _i | _i | _t|jD ]7}tj | jd|j |j  | j
|d| j|< tj | j|j| j|d| j|< | j| dkrm| j| q6 fddt|jD | _ fddt|jD | _d S )NFr   r\   r>   hybridc                        g | ]}t jg g  d qS r\   r0   tensor.0_r[   r\   r:   r;   
<listcomp>        z5Zamba2HybridDynamicCache.__init__.<locals>.<listcomp>c                    r_   r`   rb   rd   rg   r:   r;   rh      ri   )r>   layers_block_typehas_previous_stateintmamba_expandr6   intermediate_sizemamba_d_statessm_state_sizemamba_d_convconv_kernel_sizen_mamba_headstransformer_layers_modules_parameters_buffersconv_states
ssm_statesrangenum_hidden_layersr0   zerosmamba_ngroupsmamba_headdimappend	key_cachevalue_cache)r5   rZ   r[   r>   r\   ir:   rg   r;   r.   p   s:    z!Zamba2HybridDynamicCache.__init__	layer_idxnew_conv_statecache_positionreturnc                 C   sr   | j | }|d| jd }|jddd}||j|d d d d |f< | j |   | j |  |7  < | j | S )Nr   r!   r<   shiftsdims)rx   clamprr   rollr?   r\   zero_)r5   r   r   r   
conv_stater:   r:   r;   update_conv_state   s   

z*Zamba2HybridDynamicCache.update_conv_statec                 C   s   | j   | j  d S r,   )rx   r   ry   )r5   r:   r:   r;   reset   s   
zZamba2HybridDynamicCache.resetr   c                 C   sL   || j vr
| j d n|}t| j|ks| j|  dkrdS | j| jd S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   )rt   lenr   numelrC   )r5   r   r:   r:   r;   get_seq_length   s    z'Zamba2HybridDynamicCache.get_seq_length)r   )rQ   rR   rS   __doc__r0   float16r"   rl   r>   r   strr.   Tensor
LongTensorr   r   r   r:   r:   r:   r;   rY   b   s.    
 
rY   c                       s&   e Zd Z	ddef fddZ  ZS )Zamba2RotaryEmbeddingNrZ   c                    s,   t  || | j||j|jd\}| _d S )N)r\   basedim)r-   r.   rope_init_fn
rope_thetaattention_head_dimattention_scaling)r5   rZ   r\   inv_freqr8   r:   r;   r.      s   
zZamba2RotaryEmbedding.__init__r,   )rQ   rR   rS   r"   r.   rT   r:   r:   r8   r;   r      s
    r   c                       s   e Zd ZdZ			ddedee dee dee f fddZ			dd	ej	ded
eej	 dee
 deeej	ej	f  dee deej	eej	 eeej	  f fddZ  ZS )Zamba2AttentionaZ  
    Multi-headed attention from 'Attention Is All You Need' paper.

    Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
    The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
    The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
    (see fig. 2 in https://huggingface.co/papers/2405.16712).
    Additionally, replaced
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
    Finally, this attention layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this
    layer is tied, un-tied adapters (formally the same as LoRA but used in the base model) modules are added to the q, k, v projectors to increase
    expressivity with a small memory overhead (see Fig. 2 of https://huggingface.co/papers/2411.15242).
    NrZ   r   num_fwd_mem_blocksblock_idc           	   	      sR  t  || || _|j| _|| _|jrtg | _	tg | _
tg | _t| jD ]p}||j |kr}ttj| j| jjddtj| jj| jdd}ttj| j| jjddtj| jj| jdd}ttj| j| jjddtj| jj| jdd}nt }t }t }| j	| | j
| | j| q+dd t| jD | _d S )NFbiasc                 S      i | ]\}}||qS r:   r:   re   indexvaluer:   r:   r;   
<dictcomp>       z,Zamba2Attention.__init__.<locals>.<dictcomp>)r-   r.   r   hybrid_layer_idslayer_block_mapr   use_shared_attention_adapterr   
ModuleListlinear_q_adapter_listlinear_k_adapter_listlinear_v_adapter_listrz   num_mem_blocks
SequentialLinearattention_hidden_sizerZ   adapter_rankIdentityr   	enumerate	layer_dic)	r5   rZ   r   r   r   r   linear_q_adapterlinear_k_adapterlinear_v_adapterr8   r:   r;   r.      s:   zZamba2Attention.__init__rH   attention_maskpast_key_valueposition_embeddingskwargsr   c                 K   sp  |j d d }g |d| jR }| |}	| |}
| |}| jjrD| j| }|	| j| | }	|
| j	| | }
|| j
| | }|	|dd}	|
|dd}
||dd}| jjrp|\}}t|	|
||\}	}
|d ur}||
||\}
}t}| jjdkrt| jj }|| |	|
||f| jsdn| j| jd|\}}|jg |dR   }| |}||fS )Nr<   r!   r   eager        )dropoutscaling)rC   head_dimq_projk_projv_projrZ   r   r   r   r   r   rD   	transposeuse_mem_roper   updater    _attn_implementationr   trainingattention_dropoutr   reshape
contiguouso_proj)r5   rH   r   r   r   r   r   input_shapehidden_shapequery_states
key_statesvalue_statesadapter_layer_idxcossinattention_interfaceattn_outputattn_weightsr:   r:   r;   rP      sH   	





zZamba2Attention.forwardr&   )rQ   rR   rS   r   r"   r   rl   r.   r0   r   rY   tupler   r	   rP   rT   r:   r:   r8   r;   r      s@    -r   c                       s   e Zd ZdZddedee f fddZ		ddej	dee
 d	eej	 fd
dZddee
 d	eej	 fddZ		ddee
 d	eej	 fddZ  ZS )Zamba2MambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    NrZ   r   c                    s  t    || _|j| _|j| _|j| _t|j	| j | _
|| _|j| _d| _t | _|j| _|j| _|j| _| jj| _|j| _|j| _|j| _|j| _| j
d| j | j  | _tj| j| jd|j| j|jd d| _| j
| j | j }tj| j||j d| _!t"t#$| j| _%t#&d| jd }t"t#'|| _(d| j(_)t*| j
| j
| j dd| _+t"t#$| j| _,d| j,_)tj| j
| j|j d| _-t.st/0d	 d S d S )
NrB   r   Tr!   )in_channelsout_channelsr   kernel_sizegroupspaddingr   gh㈵>)r4   r7   a  The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d)1r-   r.   rZ   r6   ro   rp   rq   rr   rl   rm   rn   r   use_conv_bias
activationr   SiLUactuse_mem_eff_pathr}   n_groupsr~   r   rs   	num_heads
chunk_sizetime_step_limittime_step_mintime_step_maxconv_dimConv1dconv1dr   add_bias_linearin_projr/   r0   r1   dt_biasarangelogA_log_no_weight_decayr*   normDout_projis_fast_path_availableloggerwarning_once)r5   rZ   r   projection_sizeAr8   r:   r;   r.   +  s`   



	zZamba2MambaMixer.__init__rH   cache_paramsr   c                 C   sF  |j \}}}| j| j }d| j d| j | j  | j }|d ur|jr| |d}	|	j d | d }
|
|
| j| j| jg}t	j
|	|dd\}}}}}t||j| j | jjd| jj| j}t	j
|| j||gdd\}}}t	| j  }|d d d df d d d d d f d| j| jjt	jd}|d d d d d f dd| j}| jd d d df d| j}| jd d d df d| j}||| j|j d | j }||| j|j d | j }||| j| j}t|j| j ||||||d |dd
}||| j| j }| ||}| |d d d df }|S |d ur;t	 |dks;|j!}||d d d d d f  |}| |}t	| j  }| j"d u rQi nd	| j"i}|d urct	 |dk}nd}| j#r| j$r|d u r|rt%|| jjd| jj| j|f| j| j&d | j| jj| jj'| jj| jj| j| jd
dd|\}}|S t	j
|| j| j| jgdd\}}}|d ur|(dd}t)j*+|| j,|j d  df}|j| j -| t.d u s| jdvr| /| |(dd(ddd d d |f }n t.|(dd| jjd| jj| jd(ddd d d |f }t	j
|| j||gdd\}}}|d urNt	 |dksN|j!}||d d d d d f  |}t0|||d| j|||||| jd|||| jdf| j&| jd d d| jdd|\}}|d ur|d ur|j| j -| |||d}| ||}| |}|S )Nr   r!   r<   r   .r>   T)zr   dt_softplusdt_limitF)r   r   seq_idxr   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr   )rB   swish)xr2   r   r   )r   r   r  r  r  r   r  )1rC   r   rp   rn   r   rk   r   squeezer   r0   splitr(   rx   r   r   r2   r   r   expr   floatexpandr   r?   r@   r   r   rD   r#   ry   r   r   allr>   r   r   r   r%   r   r3   r   r   rA   padrr   copy_r'   r   r$   )r5   rH   r  r   r[   seq_lenrf   groups_time_state_sized_to_removein_projected_statesd_mlpsplit_projection_dimrI   hidden_states_B_CdtBCr  r   r   hidden_states_reshapedoutr>   projected_statesdt_limit_kwargsinput_not_masked	ssm_state	time_stephidden_states_B_C_tr   scan_outputr:   r:   r;   cuda_kernels_forwardl  s   

<"
] 

 
L
(

 

z%Zamba2MambaMixer.cuda_kernels_forwardc           1   
      s	  |j \}}}|j}|d ur|jr|d}n |d ur4t|dks4||d d d d d f  |}|}|j d dj  dj	 j
  j d }	|j|	|	jjjgdd\}}}
}}|d ur8|jj  }||j}|jr|
d}
|jj }tj|ddd}|jdkr|d d dd d f n||d d d d df< |jj | tj||jjjd d dd d f  dd}jr|jj7 }||d d d df }n||dd}tj |j!|j d  df}|jj | |ddd d d |d d f }|d ur7t|dks7|j}||d d d d d f  |}n&tj"|jj#j
f|j|d	}|dddd |f dd}tj|jj	j
 j	j
 gdd\}}}t$j%&  }|d ur|jr|jdkr|d d d df n|d d dd d f d d d df }|dd'||j d j#}j(d
 'j(j d j#}tjj)|||j }t*|j+}|d 'jj#j
jtj,d}t$|d
 | }|-|j	ddd d d f }|'|j	jj	 |j d . }|-|d|j d }|d
 |dd d d f  }|-|dj#}||d
  }|jj |jj | |  |-|j	ddd d d f }|'|j	jj	 |j d . }|-|d|j d }|jj |j}|/|j j#j
}|/|j j
d}t0||}|/|jj#}j1d
 'j1j d j#}|||  |j}|-|dd d d df }ntj)|j( }t*|j+}|-||dj#& }|-||dj
& }|-||dj
& }|j2jj	 djd}|j2jj	 djd}j3|j3  j3  j1d
 t4|  }||d
  }||j| } fdd||||fD \}}}}|5dddd}tj6|dd}t$t7|}|d d d d d d d d d d d f |d d d d d d d d d d d f  }|jdd}|d
 |5dddddd
  } | jdd}!|!d
 |d d d d d f  d}"t$|d d d d d d dd f | }#||#5ddddd
  }$|$5dddddd
 |5ddddddd d d f  jdd5ddddd}%|d ur|jr|jj d d d df }&nt8|%d d d df }&tj9|&|%gdd}%t$t7tj |d d d d d d df d}'|%5ddddd}(|'d |(d d d d d df  jdd})|)5ddddd}*|*d d d df |*d d df }%}t$|}+|dd d d f |%d d d d d df  },|+5dddd}-|,d|-d
  }.|"|. }|-|djj#}|| } dkr|d d d |d d d d f }|-||d}|d ur|d ur|jj | :||
}/;|/|}0|0S )Nr!   r<   r   r  r   r   r   .r]   ).N).NNr  )r   output_sizec                    s   g | ]	}t | jqS r:   )r   r   )re   tpad_sizer5   r:   r;   rh   ~  s    z2Zamba2MambaMixer.torch_forward.<locals>.<listcomp>   )r!   r   )<rC   r>   rk   r   r  r0   r  r?   rn   r   rp   r   r  r   ry   r   cloner\   	unsqueezerx   r   ndimr  sumr   r2   r   r   r   r   r   rA   r  rr   r|   r   r  r   r  r  r   softplusr   r   r@   r   r   rD   bmmr   repeat_interleaver   r   permutecumsumr   
zeros_likecatr   r   )1r5   input_statesr  r   r[   r  rf   r>   r'  r  rI   rH   r"  r*  r   r#  r$  r  r   dAdBdBxry   ssm_states_reshaped
C_reshapedyr   
D_residualA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decay_contractionstatesprevious_statesdecay_chunkstates_permutedresult
new_statesstate_decay_outC_times_statesstate_decay_out_permutedY_offr-  contextualized_statesr:   r1  r;   torch_forward  s    
.

60 . ,.B"$$$P$*L0(&
*
 zZamba2MambaMixer.torch_forwardc                 C   s0   t rd| jjjjv r| |||S | |||S )Ncuda)r   r   r2   r\   typer.  r[  )r5   rH   r  r   r:   r:   r;   rP     s   zZamba2MambaMixer.forwardr,   r)   )rQ   rR   rS   r   r"   r   rl   r.   r0   r   rY   r.  r[  rP   rT   r:   r:   r8   r;   r   #  s,    D
  Fr   c                       s6   e Zd Zddedee f fddZd	ddZ  ZS )
	Zamba2MLPNrZ   r   c              	      s   t    || _|j| _|j| _|| _|| _tj| jd| j |j	d| _
tj| j| j|j	d| _t|j | _tg | _t| jD ]/}||j |krfttj| jj| jjddtj| jjd| j dd}nt }| j| qA|j}dd t|D | _dS )aQ  
        This MLP layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this layer
        is tied, un-tied adapter modules (formally same as LoRA, but used in the base model) are added to the up and gate projectors to increase expressivity with a small memory overhead.
        r   r   Fc                 S   r   r:   r:   r   r:   r:   r;   r     r   z&Zamba2MLP.__init__.<locals>.<dictcomp>N)r-   r.   rZ   r6   rn   r   r   r   r   r   gate_up_proj	down_projr   
hidden_actact_fnr   gate_up_proj_adapter_listrz   r   r   r   r   r   r   r   r   )r5   rZ   r   r   r   gate_up_proj_adapterr   r8   r:   r;   r.     s(   
zZamba2MLP.__init__c                 C   sZ   |  |}| j| }|| j| | }tj|ddd}| |d |d  }| |}|S )Nr   r<   r  r   r!   )r_  r   rc  r0   chunkrb  r`  )r5   hidden_stater   gate_up_stateoutputr:   r:   r;   rP     s   


zZamba2MLP.forwardr)   r,   )	rQ   rR   rS   r"   r   rl   r.   rP   rT   r:   r:   r8   r;   r^    s    r^  c                       s   e Zd Zddedee dee f fddZ				ddejd	ejded
eej dee	 dee
 deej dee deejeeejejf  f fddZ  ZS )Zamba2AttentionDecoderLayerNrZ   r   r   c                    sD   || _ t|j}t || t|d||d| _t|||d| _d S )Nr<   )r   r   r   )r   r   )	r   r   r   r-   r.   r   	self_attnr^  feed_forward)r5   rZ   r   r   num_gsr8   r:   r;   r.     s
   
z$Zamba2AttentionDecoderLayer.__init__FrH   original_hidden_statesr   r   output_attentionsr   r   r   c              	   K   sl   t j||gdd}| |}| jd||||||d|\}}	| |}| ||}|f}
|r4|
|	f7 }
|
S )a  
        Args:
            hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
                This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
                concatenated tensor is then used as input of the pre-attention RMSNorm
                (see fig. 2 in https://huggingface.co/papers/2405.16712).
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        r<   r  )rH   r   r   r   rn  r   Nr:   )r0   concatenateinput_layernormrj  pre_ff_layernormrk  )r5   rH   rm  r   r   r   rn  r   r   self_attn_weightsoutputsr:   r:   r;   rP     s$   




z#Zamba2AttentionDecoderLayer.forwardr)   )NNFN)rQ   rR   rS   r"   r   rl   r.   r0   r   rY   boolr   r   r	   r   FloatTensorrP   rT   r:   r:   r8   r;   ri    s2    $	
ri  c                       s&   e Zd Zdedef fddZ  ZS )Zamba2MambaDecoderLayerrZ   r   c                    s2   t  || t||d| _t|j|jd| _d S )N)rZ   r   r7   )r-   r.   r   mambarV   r6   rms_norm_epsrp  )r5   rZ   r   r8   r:   r;   r.   :  s   z Zamba2MambaDecoderLayer.__init__)rQ   rR   rS   r"   rl   r.   rT   r:   r:   r8   r;   rv  9  s    rv  c                       s   e Zd Zdedejdef fddZ								ddej	d	e
ej	 d
e
e de
ej	 de
ej	 de
e de
e de
e de
ej deeje
eejejf  f fddZ  ZS )Zamba2HybridLayershared_transformerlinearrx  c                    s   t  ||| | `|| _d S r,   )r-   r.   shared_transfr{  )r5   r{  r|  rx  r8   r:   r;   r.   A  s   
zZamba2HybridLayer.__init__NFrH   rm  r   r   causal_maskr   rn  	use_cacher   r   c
              	   C   sn   | j |||||||	d}
|
d }|r|
d }| |}| j|||||||	d}
|r5|
d |f|
dd  }
|
S )aX  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
            hidden activations to form the input of the shared transformer layer.
            layer_idx (`int`): layer number.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        )rm  r   r   r   rn  r   r   r!   )transformer_hidden_statesr   r   rn  r  r   r   N)r{  r|  mamba_decoder)r5   rH   rm  r   r   r~  r   rn  r  r   layer_outputsr  rr  r:   r:   r;   rP   H  s2    


zZamba2HybridLayer.forward)NNNNNFFN)rQ   rR   rS   ri  r   r   rv  r.   r0   r   r   rl   rY   rt  r   r   ru  rP   rT   r:   r:   r8   r;   rz  @  sJ    
	
rz  c                   @   s@   e Zd ZeZdZdZddgZdZdZ	dZ
dZdZdZdd ZdS )	Zamba2PreTrainedModelmodelTri  rv  past_key_valuesc                 C   sb  | j j}t|tjtjfr%|jjjd|d |j	d ur#|j	j
  d S d S t|tjrF|jjjd|d |jd urD|jj|j 
  d S d S t|ttfrV|jjd d S t|trtt| j jt| j jt| j j  t| j j j| j jd}|tt|   }|jj| td|jd }|j jt| |j!jd d S d S )Nr   )rF   stdg      ?)minr!   )"rZ   initializer_range
isinstancer   r   r   r2   datanormal_r   r   	Embeddingpadding_idxrV   r*   fill_r   r0   r  randrs   mathr   r   r   r   time_step_floorexpm1r   r  r   r   r   r   )r5   moduler  r"  inv_dtr  r:   r:   r;   _init_weights  s:   


z#Zamba2PreTrainedModel._init_weightsN)rQ   rR   rS   r"   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_2_supports_flex_attn_supports_sdpa_supports_cache_class_is_statefulr  r:   r:   r:   r;   r    s    r  c                   @   s   e Zd ZdZdefddZdd Z										ddeej	 d	eej
 d
eej	 dee deej dee dee dee dee deej	 deeef fddZdS )Zamba2Modelzh
    Model consisting of *config.num_hidden_layers* layers.

    Args:
        config: Zamba2Config
    rZ   c                    sN  t |    | _ j| _ j| _t j j| j| _	 fddt
 jD }g }g } j| _t
 jD ]2} j| dkrH|t |d q5 j| dkrg|tj| jj| jjdd |t |d q5t|}t|}t|}| |||}t|| _ j| _t j jd| _ jr jrtd	 t | _d| _ | !  d S )
Nc                    s   g | ]}t  |d qS ))r   )ri  )re   krZ   r:   r;   rh     s    z(Zamba2Model.__init__.<locals>.<listcomp>rx  r   r^   Fr   rw  ze`use_long_context` set to `True`: using rescaled `rope_theta` and extended `max_position_embeddings`.)"r  r.   rZ   pad_token_idr  
vocab_sizer   r  r6   embed_tokensrz   r   rj   r{   r   rv  r   iterr   
get_layersr   layersr   rV   ry  final_layernormr   use_long_contextr   r   r   
rotary_embgradient_checkpointing	post_init)r5   rZ   blocksmamba_layerslinear_layersr   r  r:   r  r;   r.     s>   
zZamba2Model.__init__c                 C   sp  g }g | _ d| _t| jD ]\}}|dkr| jdkr|| _t|}| jjt| jj dkrd| d}t	
|d d d d	 d
 }	| j |	 d}
| jD ]$}|dkrm|
| jj |jkrmt	
dt|
 d }| j | |
d7 }
qM| jjrd}
| jD ]$}|dkr|
| jj |jkrt	
dt|
 d }| j | |
d7 }
q{|t|t|t| q|t| q|S )Nr   r^   r!   z	^layers\.z\.shared_transformer\.z(?:z3self_attn\.(?:q_proj|k_proj|v_proj|o_proj)\.weight|z1feed_forward\.(?:gate_up_proj|down_proj)\.weight|z,(?:input_layernorm|pre_ff_layernorm)\.weightz)$z>^shared_transformer\.feed_forward\.gate_up_proj_adapter_list\.z\.(?:0|1)\.weight$zg^shared_transformer\.self_attn\.(?:linear_q_adapter_list|linear_k_adapter_list|linear_v_adapter_list)\.)_tied_weights_keysfirst_transformer_layer_idr   rj   nextrZ   r   r   r   recompiler   r   r   r   rz  )r5   r  r  r  r  layer_id
layer_typeblockprefix_patternmain_keys_pattern
adapter_id_layer_typeadapter_patternattn_adapter_patternr:   r:   r;   r    sh   




zZamba2Model.get_layersN	input_idsr   position_idsr  inputs_embedsr  rn  output_hidden_statesreturn_dictr   r   c                 C   s`  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|	d ur$|	n| j j}	|d u |d uA r4td| jrC| jrC|rCt	d d}|d u rL| 
|}|}t|}|rr|d u rr|d urb|jd n|jd }t| j || j| jd}|
d u r|d ur|j| jdnd}tj|||jd  |jd}
|d u r|
d}| |||
}| j jr| ||}nd }|rd	nd }|rd	nd }t| jD ]C\}}|r||f7 }| jr| jr| |j|||||||||
}n||||||||||d
	}|d }|r|d d ur||d f7 }q| |}|r||f7 }|r|jsd|_t||r!|nd ||d}|	r,|S | S )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onezX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   )r>   r\   r  r!   ra   r:   )rm  r   r   r~  r   rn  r  r   T)last_hidden_stater  rH   
attentions) rZ   rn  r  r  use_return_dict
ValueErrorr  r   r   r   r  r0   r4  rC   rY   r>   r\   r   r  r   r5  _update_causal_maskr   r  r   r  _gradient_checkpointing_func__call__r  rk   r
   to_tuple)r5   r  r   r  r  r  r  rn  r  r  r   rH   rm  r[   past_seen_tokensr~  r   all_hidden_statesall_self_attnsr   layerr  rh  r:   r:   r;   rP     s   





zZamba2Model.forward)
NNNNNNNNNN)rQ   rR   rS   r   r"   r.   r  r   r0   r   r   rY   ru  rt  r   r   r
   rP   r:   r:   r:   r;   r    sJ    $2	

r  c                   @   rU   )Zamba2ForCausalLMNrW   r:   r:   r:   r;   r    rX   r  c                   @   rU   )Zamba2ForSequenceClassificationNrW   r:   r:   r:   r;   r    rX   r  )r  r  r  r  )Mr  r  	itertoolsr   typingr   r   r   r0   torch.utils.checkpointr   activationsr   modeling_flash_attention_utilsr	   modeling_outputsr
   modeling_utilsr   r   processing_utilsr   utilsr   utils.import_utilsr   r   llama.modeling_llamar   r   mamba2.modeling_mamba2r   r   r   zamba.modeling_zambar   r   r   r   r   r   r   r   r   r    configuration_zamba2r"   +mamba_ssm.ops.triton.selective_state_updater#   !mamba_ssm.ops.triton.ssd_combinedr$   r%   causal_conv1dr'   r(   r  r   _CONFIG_FOR_DOC
get_loggerrQ   r   Moduler*   rV   rY   r   r   r   r^  ri  rv  rz  r  r  r  r  __all__r:   r:   r:   r;   <module>   s^   0

Gm   1*>I' V