o
    i                     @   sl  d dl Z d dlZd dlmZ d dlmZmZmZ d dlZd dlm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZ ddlmZmZ ddlm Z m!Z!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/ e rd dl0m1Z1 d dl2m3Z3m4Z4 nd\Z1Z3Z4e rd dl5m6Z6m7Z7 nd\Z7Z6e8e1e6e7fZ9dZ:e;e<Z=G dd dej	j>Z?G dd de,Z@G dd de(ZAG d d! d!eZBG d"d# d#e$ZCG d$d% d%e	j>ZDG d&d' d'e	j>ZEG d(d) d)e%ZFG d*d+ d+e*ZGG d,d- d-e)ZHG d.d/ d/eZIG d0d1 d1e+eIZJG d2d3 d3e&ZKG d4d5 d5e'ZLg d6ZMdS )7    N)cycle)CallableOptionalUnion)nn   )ACT2FN)FlashAttentionKwargs)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)logging)deprecate_kwarg)is_causal_conv1d_availableis_mamba_ssm_available   )LlamaRotaryEmbeddingapply_rotary_pos_emb)pad_tensor_by_sizereshape_into_chunkssegment_sum)
ZambaAttentionZambaAttentionDecoderLayerZambaForCausalLMZambaForSequenceClassificationZambaHybridDynamicCacheZambaHybridLayerZambaMambaDecoderLayer
ZambaModelZambaRMSNormeager_attention_forward   )Zamba2Config)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combinedNNN)causal_conv1d_fncausal_conv1d_updateNNzZyphra/Zamba2-2.7Bc                       s(   e Zd Zd fdd	ZdddZ  ZS )	Zamba2RMSNormGatedư>c                    s,   t    tt|| _|| _|| _d S N)	super__init__r   	Parametertorchonesweightvariance_epsilon
group_size)selfhidden_sizer5   eps	__class__ f/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/zamba2/modular_zamba2.pyr/   J   s   

zZamba2RMSNormGated.__init__Nc           	      C   s   |j }|tj}|d ur|tj|tj }|j^ }}|| j }|j	g ||| jR  }|
djddd}|t|| j  }|j	g ||| j R  }| j|| S )Nr   T)keepdim)dtypetor1   float32r   
functionalsilushaper5   viewpowmeanrsqrtr4   r3   )	r6   hidden_statesgateinput_dtypeprefix_dimslast_dimgroup_counthidden_states_groupvariancer;   r;   r<   forwardP   s   
zZamba2RMSNormGated.forward)r,   r-   )__name__
__module____qualname__r/   rQ   __classcell__r;   r;   r9   r<   r+   I   s    r+   c                   @      e Zd ZdS )Zamba2RMSNormNrR   rS   rT   r;   r;   r;   r<   rW   ^       rW   c                
   @   sx   e Zd ZdZejdfdededejde	e
 fddZd	ed
ejdejdejfddZdd Zdd	e	e defddZdS )Zamba2HybridDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    Nconfig
batch_sizer?   devicec              	      s  || _ |j| _d| _t|j|j | _|j| _|j	| _
|j| _g | _i | _i | _i | _i | _i | _t|jD ]7}tj | jd|j |j  | j
|d| j|< tj | j|j| j|d| j|< | j| dkrm| j| q6 fddt|jD | _ fddt|jD | _d S )NFr   r]   r?   hybridc                        g | ]}t jg g  d qS r]   r1   tensor.0_r\   r]   r;   r<   
<listcomp>        z5Zamba2HybridDynamicCache.__init__.<locals>.<listcomp>c                    r`   ra   rc   re   rh   r;   r<   ri      rj   )r?   layers_block_typehas_previous_stateintmamba_expandr7   intermediate_sizemamba_d_statessm_state_sizemamba_d_convconv_kernel_sizen_mamba_headstransformer_layers_modules_parameters_buffersconv_states
ssm_statesrangenum_hidden_layersr1   zerosmamba_ngroupsmamba_headdimappend	key_cachevalue_cache)r6   r[   r\   r?   r]   ir;   rh   r<   r/   p   s:    z!Zamba2HybridDynamicCache.__init__	layer_idxnew_conv_statecache_positionreturnc                 C   sr   | j | }|d| jd }|jddd}||j|d d d d |f< | j |   | j |  |7  < | j | S )Nr   r"   r=   shiftsdims)ry   clamprs   rollr@   r]   zero_)r6   r   r   r   
conv_stater;   r;   r<   update_conv_state   s   

z*Zamba2HybridDynamicCache.update_conv_statec                 C   s   | j   | j  d S r-   )ry   r   rz   )r6   r;   r;   r<   reset   s   
zZamba2HybridDynamicCache.resetr   c                 C   sL   || j vr
| j d n|}t| j|ks| j|  dkrdS | j| jd S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   )ru   lenr   numelrD   )r6   r   r;   r;   r<   get_seq_length   s    z'Zamba2HybridDynamicCache.get_seq_length)r   )rR   rS   rT   __doc__r1   float16r#   rm   r?   r   strr/   Tensor
LongTensorr   r   r   r;   r;   r;   r<   rZ   b   s.    
 
rZ   c                   @   rV   )Zamba2RotaryEmbeddingNrX   r;   r;   r;   r<   r      rY   r   c                       s   e Zd ZdZ			ddedee dee dee f fddZed	d
dd			dde	j
dedee	j
 d
ee deee	j
e	j
f  dee dee	j
ee	j
 eee	j
  f fddZ  ZS )Zamba2AttentionaZ  
    Multi-headed attention from 'Attention Is All You Need' paper.

    Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
    The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
    The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
    (see fig. 2 in https://huggingface.co/papers/2405.16712).
    Additionally, replaced
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
    Finally, this attention layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this
    layer is tied, un-tied adapters (formally the same as LoRA but used in the base model) modules are added to the q, k, v projectors to increase
    expressivity with a small memory overhead (see Fig. 2 of https://huggingface.co/papers/2411.15242).
    Nr[   r   num_fwd_mem_blocksblock_idc           	   	      sR  t  || || _|j| _|| _|jrtg | _	tg | _
tg | _t| jD ]p}||j |kr}ttj| j| jjddtj| jj| jdd}ttj| j| jjddtj| jj| jdd}ttj| j| jjddtj| jj| jdd}nt }t }t }| j	| | j
| | j| q+dd t| jD | _d S )NFbiasc                 S      i | ]\}}||qS r;   r;   rf   indexvaluer;   r;   r<   
<dictcomp>       z,Zamba2Attention.__init__.<locals>.<dictcomp>)r.   r/   r   hybrid_layer_idslayer_block_mapr   use_shared_attention_adapterr   
ModuleListlinear_q_adapter_listlinear_k_adapter_listlinear_v_adapter_listr{   num_mem_blocks
SequentialLinearattention_hidden_sizer[   adapter_rankIdentityr   	enumerate	layer_dic)	r6   r[   r   r   r   r   linear_q_adapterlinear_k_adapterlinear_v_adapterr9   r;   r<   r/      s:   zZamba2Attention.__init__past_key_valuepast_key_values4.58new_nameversionrI   attention_maskposition_embeddingskwargsr   c                 K   sp  |j d d }g |d| jR }| |}	| |}
| |}| jjrD| j| }|	| j| | }	|
| j	| | }
|| j
| | }|	|dd}	|
|dd}
||dd}| jjrp|\}}t|	|
||\}	}
|d ur}||
||\}
}t}| jjdkrt| jj }|| |	|
||f| jsdn| j| jd|\}}|jg |dR   }| |}||fS )Nr=   r"   r   eagerg        )dropoutscaling)rD   head_dimq_projk_projv_projr[   r   r   r   r   r   rE   	transposeuse_mem_roper   updater!   _attn_implementationr   trainingattention_dropoutr   reshape
contiguouso_proj)r6   rI   r   r   r   r   r   input_shapehidden_shapequery_states
key_statesvalue_statesadapter_layer_idxcossinattention_interfaceattn_outputattn_weightsr;   r;   r<   rQ      sH   






zZamba2Attention.forwardr'   )rR   rS   rT   r   r#   r   rm   r/   r   r1   r   rZ   tupler   r	   rQ   rU   r;   r;   r9   r<   r      sB    )r   c                       s   e Zd ZdZddedee f fddZ		ddej	dee
 d	eej	 fd
dZddee
 d	eej	 fddZ		ddee
 d	eej	 fddZ  ZS )Zamba2MambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    Nr[   r   c                    s  t    || _|j| _|j| _|j| _t|j	| j | _
|| _|j| _d| _t | _|j| _|j| _|j| _| jj| _|j| _|j| _|j| _|j| _| j
d| j | j  | _tj| j| jd|j| j|jd d| _| j
| j | j }tj| j||j d| _!t"t#$| j| _%t#&d| jd }t"t#'|| _(t)| j
| j
| j dd| _*t"t#$| j| _+tj| j
| j|j d| _,t-st./d	 d S d S )
NrC   r   Tr"   )in_channelsout_channelsr   kernel_sizegroupspaddingr   gh㈵>)r5   r8   a  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d)0r.   r/   r[   r7   rp   rq   rr   rs   rm   rn   ro   r   use_conv_bias
activationr   SiLUactuse_mem_eff_pathr~   n_groupsr   r   rt   	num_heads
chunk_sizetime_step_limittime_step_mintime_step_maxconv_dimConv1dconv1dr   add_bias_linearin_projr0   r1   r2   dt_biasarangelogA_logr+   normDout_projis_fast_path_availableloggerwarning_once)r6   r[   r   projection_sizeAr9   r;   r<   r/   #  s\   



	zZamba2MambaMixer.__init__rI   cache_paramsr   c                 C   sF  |j \}}}| j| j }d| j d| j | j  | j }|d ur|jr| |d}	|	j d | d }
|
|
| j| j| jg}t	j
|	|dd\}}}}}t||j| j | jjd| jj| j}t	j
|| j||gdd\}}}t	| j  }|d d d df d d d d d f d| j| jjt	jd}|d d d d d f dd| j}| jd d d df d| j}| jd d d df d| j}||| j|j d | j }||| j|j d | j }||| j| j}t|j| j ||||||d |dd
}||| j| j }| ||}| |d d d df }|S |d ur;t	 |dks;|j!}||d d d d d f  |}| |}t	| j  }| j"d u rQi nd	| j"i}|d urct	 |dk}nd}| j#r| j$r|d u r|rt%|| jjd| jj| j|f| j| j&d | j| jj| jj'| jj| jj| j| jd
dd|\}}|S t	j
|| j| j| jgdd\}}}|d ur|(dd}t)j*+|| j,|j d  df}|j| j -| t.d u s| jdvr| /| |(dd(ddd d d |f }n t.|(dd| jjd| jj| jd(ddd d d |f }t	j
|| j||gdd\}}}|d urNt	 |dksN|j!}||d d d d d f  |}t0|||d| j|||||| jd|||| jdf| j&| jd d d| jdd|\}}|d ur|d ur|j| j -| |||d}| ||}| |}|S )Nr   r"   r=   dim.r?   T)zr   dt_softplusdt_limitF)r   r   seq_idxr   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr   )rC   swish)xr3   r   r   )r   r   r  r  r  r   r  )1rD   r   rq   ro   r   rl   r   squeezer   r1   splitr)   ry   r   r   r3   r   r   expr   floatexpandr   r@   rA   r   r   rE   r$   rz   r   r   allr?   r   r   r   r&   r   r4   r   r   rB   padrs   copy_r(   r   r%   )r6   rI   r   r   r\   seq_lenrg   groups_time_state_sized_to_removein_projected_statesd_mlpsplit_projection_dimrJ   hidden_states_B_CdtBCr   r   r   hidden_states_reshapedoutr?   projected_statesdt_limit_kwargsinput_not_masked	ssm_state	time_stephidden_states_B_C_tr   scan_outputr;   r;   r<   cuda_kernels_forwardb  s   

<"
] 

 
L
(

 

z%Zamba2MambaMixer.cuda_kernels_forwardc           1   
      s	  |j \}}}|j}|d ur|jr|d}n |d ur4t|dks4||d d d d d f  |}|}|j d dj  dj	 j
  j d }	|j|	|	jjjgdd\}}}
}}|d ur8|jj  }||j}|jr|
d}
|jj }tj|ddd}|jdkr|d d dd d f n||d d d d df< |jj | tj||jjjd d dd d f  dd}jr|jj7 }||d d d df }n||dd}tj |j!|j d  df}|jj | |ddd d d |d d f }|d ur7t|dks7|j}||d d d d d f  |}n&tj"|jj#j
f|j|d	}|dddd |f dd}tj|jj	j
 j	j
 gdd\}}}t$j%&  }|d ur|jr|jdkr|d d d df n|d d dd d f d d d df }|dd'||j d j#}j(d
 'j(j d j#}tjj)|||j }t*|j+}|d 'jj#j
jtj,d}t$|d
 | }|-|j	ddd d d f }|'|j	jj	 |j d . }|-|d|j d }|d
 |dd d d f  }|-|dj#}||d
  }|jj |jj | |  |-|j	ddd d d f }|'|j	jj	 |j d . }|-|d|j d }|jj |j}|/|j j#j
}|/|j j
d}t0||}|/|jj#}j1d
 'j1j d j#}|||  |j}|-|dd d d df }ntj)|j( }t*|j+}|-||dj#& }|-||dj
& }|-||dj
& }|j2jj	 djd}|j2jj	 djd}j3|j3  j3  j1d
 t4|  }||d
  }||j| } fdd||||fD \}}}}|5dddd}tj6|dd}t$t7|}|d d d d d d d d d d d f |d d d d d d d d d d d f  }|jdd}|d
 |5dddddd
  } | jdd}!|!d
 |d d d d d f  d}"t$|d d d d d d dd f | }#||#5ddddd
  }$|$5dddddd
 |5ddddddd d d f  jdd5ddddd}%|d ur|jr|jj d d d df }&nt8|%d d d df }&tj9|&|%gdd}%t$t7tj |d d d d d d df d}'|%5ddddd}(|'d |(d d d d d df  jdd})|)5ddddd}*|*d d d df |*d d df }%}t$|}+|dd d d f |%d d d d d df  },|+5dddd}-|,d|-d
  }.|"|. }|-|djj#}|| } dkr|d d d |d d d d f }|-||d}|d ur|d ur|jj | :||
}/;|/|}0|0S )Nr"   r=   r   r   r   r   r   .r^   ).N).NNr  )r  output_sizec                    s   g | ]	}t | jqS r;   )r   r   )rf   tpad_sizer6   r;   r<   ri   t  s    z2Zamba2MambaMixer.torch_forward.<locals>.<listcomp>   )r"   r   )<rD   r?   rl   r   r  r1   r  r@   ro   r   rq   r   r  r   rz   r   cloner]   	unsqueezery   r   ndimr  sumr   r3   r   r   r   r   r   rB   r  rs   r}   r   r  r   r  r  r   softplusr   r   rA   r   r   rE   bmmr   repeat_interleaver   r   permutecumsumr   
zeros_likecatr   r   )1r6   input_statesr   r   r\   r  rg   r?   r%  r  rJ   rI   r   r(  r   r!  r"  r   r   dAdBdBxrz   ssm_states_reshaped
C_reshapedyr   
D_residualA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decay_contractionstatesprevious_statesdecay_chunkstates_permutedresult
new_statesstate_decay_outC_times_statesstate_decay_out_permutedY_offr+  contextualized_statesr;   r/  r<   torch_forward  s    
.

60 . ,.B"$$$P$*L0(&
*
 zZamba2MambaMixer.torch_forwardc                 C   s0   t rd| jjjjv r| |||S | |||S )Ncuda)r   r   r3   r]   typer,  rY  )r6   rI   r   r   r;   r;   r<   rQ     s   zZamba2MambaMixer.forwardr-   r*   )rR   rS   rT   r   r#   r   rm   r/   r1   r   rZ   r,  rY  rQ   rU   r;   r;   r9   r<   r     s,    B
  Fr   c                       s6   e Zd Zddedee f fddZd	ddZ  ZS )
	Zamba2MLPNr[   r   c              	      s   t    || _|j| _|j| _|| _|| _tj| jd| j |j	d| _
tj| j| j|j	d| _t|j | _tg | _t| jD ]/}||j |krfttj| jj| jjddtj| jjd| j dd}nt }| j| qA|j}dd t|D | _dS )aQ  
        This MLP layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this layer
        is tied, un-tied adapter modules (formally same as LoRA, but used in the base model) are added to the up and gate projectors to increase expressivity with a small memory overhead.
        r   r   Fc                 S   r   r;   r;   r   r;   r;   r<   r     r   z&Zamba2MLP.__init__.<locals>.<dictcomp>N)r.   r/   r[   r7   ro   r   r   r   r   r   gate_up_proj	down_projr   
hidden_actact_fnr   gate_up_proj_adapter_listr{   r   r   r   r   r   r   r   r   )r6   r[   r   r   r   gate_up_proj_adapterr   r9   r;   r<   r/     s(   
zZamba2MLP.__init__c                 C   sZ   |  |}| j| }|| j| | }tj|ddd}| |d |d  }| |}|S )Nr   r=   r   r   r"   )r]  r   ra  r1   chunkr`  r^  )r6   hidden_stater   gate_up_stateoutputr;   r;   r<   rQ     s   


zZamba2MLP.forwardr*   r-   )	rR   rS   rT   r#   r   rm   r/   rQ   rU   r;   r;   r9   r<   r\    s    r\  c                       s   e Zd Zddedee dee f fddZeddd	d
				ddej	dej	dedeej	 dee
 dee deej dee deejeeejejf  f fddZ  ZS )Zamba2AttentionDecoderLayerNr[   r   r   c                    sD   || _ t|j}t || t|d||d| _t|||d| _d S )Nr=   )r   r   r   )r   r   )	r   r   r   r.   r/   r   	self_attnr\  feed_forward)r6   r[   r   r   num_gsr9   r;   r<   r/     s
   
z$Zamba2AttentionDecoderLayer.__init__r   r   r   r   FrI   original_hidden_statesr   output_attentionsr   r   r   c              	   K   sl   t j||gdd}| |}| jd||||||d|\}}	| |}| ||}|f}
|r4|
|	f7 }
|
S )a  
        Args:
            hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
                This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
                concatenated tensor is then used as input of the pre-attention RMSNorm
                (see fig. 2 in https://huggingface.co/papers/2405.16712).
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        r=   r   )rI   r   r   r   rl  r   Nr;   )r1   concatenateinput_layernormrh  pre_ff_layernormri  )r6   rI   rk  r   r   r   rl  r   r   self_attn_weightsoutputsr;   r;   r<   rQ     s$    




z#Zamba2AttentionDecoderLayer.forwardr*   )NNFN)rR   rS   rT   r#   r   rm   r/   r   r1   r   rZ   boolr   r   r	   r   FloatTensorrQ   rU   r;   r;   r9   r<   rg    s4    $	
rg  c                       s&   e Zd Zdedef fddZ  ZS )Zamba2MambaDecoderLayerr[   r   c                    s2   t  || t||d| _t|j|jd| _d S )N)r[   r   r8   )r.   r/   r   mambarW   r7   rms_norm_epsrn  )r6   r[   r   r9   r;   r<   r/   1  s   z Zamba2MambaDecoderLayer.__init__)rR   rS   rT   r#   rm   r/   rU   r;   r;   r9   r<   rt  0  s    rt  c                       s   e Zd Zdedejdef fddZedddd		
	
	
	
	
			
dde	j
dee	j
 dee dee	j
 dee	j
 dee dee dee dee	j dee	jeee	je	jf  f fddZ  ZS )Zamba2HybridLayershared_transformerlinearrv  c                    s   t  ||| | `|| _d S r-   )r.   r/   shared_transfry  )r6   ry  rz  rv  r9   r;   r<   r/   8  s   
zZamba2HybridLayer.__init__r   r   r   r   NFrI   rk  r   r   causal_maskrl  	use_cacher   r   c
              	   C   sn   | j |||||||	d}
|
d }|r|
d }| |}| j|||||||	d}
|r5|
d |f|
dd  }
|
S )aY  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
            hidden activations to form the input of the shared transformer layer.
            layer_idx (`int`): layer number.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        )rk  r   r   r   rl  r   r   r"   )transformer_hidden_statesr   r   rl  r}  r   r   N)ry  rz  mamba_decoder)r6   rI   rk  r   r   r|  r   rl  r}  r   layer_outputsr~  rp  r;   r;   r<   rQ   ?  s2   !


zZamba2HybridLayer.forward)NNNNNFFN)rR   rS   rT   rg  r   r   rt  r/   r   r1   r   r   rm   rZ   rr  r   r   rs  rQ   rU   r;   r;   r9   r<   rx  7  sL    	
rx  c                       sJ   e Zd ZU eed< dZdZddgZdZdZ	dZ
dZdZ fddZ  ZS )	Zamba2PreTrainedModelr[   modelTrg  rt  r   c                    s   t  | t|tr_tt| jjt	
| jjt	
| jj  t	
| jj j| jjd}|t
t|   }|jj| td|jd }|jjt
| |jjd d S d S )N)minr"   g      ?)r.   _init_weights
isinstancer   r1   r  randr[   rt   mathr   r   r   r   time_step_floorexpm1r   datar  r   r   r   r   fill_)r6   moduler   inv_dtr   r9   r;   r<   r    s"   
z#Zamba2PreTrainedModel._init_weights)rR   rS   rT   r#   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_flex_attn_supports_sdpa_is_statefulr  rU   r;   r;   r9   r<   r    s   
 r  c                   @   s   e Zd ZdZdefddZdd Z										ddeej	 d	eej
 d
eej	 dee deej dee dee dee dee deej	 deeef fddZdS )Zamba2Modelzh
    Model consisting of *config.num_hidden_layers* layers.

    Args:
        config: Zamba2Config
    r[   c                    sN  t |    | _ j| _ j| _t j j| j| _	 fddt
 jD }g }g } j| _t
 jD ]2} j| dkrH|t |d q5 j| dkrg|tj| jj| jjdd |t |d q5t|}t|}t|}| |||}t|| _ j| _t j jd| _ jr jrtd	 t | _d| _ | !  d S )
Nc                    s   g | ]}t  |d qS ))r   )rg  )rf   kr[   r;   r<   ri     s    z(Zamba2Model.__init__.<locals>.<listcomp>rv  r   r_   Fr   ru  ze`use_long_context` set to `True`: using rescaled `rope_theta` and extended `max_position_embeddings`.)"r  r/   r[   pad_token_idpadding_idx
vocab_sizer   	Embeddingr7   embed_tokensr{   r   rk   r|   r   rt  r   iterr   
get_layersr   layersr   rW   rw  final_layernormr   use_long_contextr   r   r   
rotary_embgradient_checkpointing	post_init)r6   r[   blocksmamba_layerslinear_layersr   r  r;   r  r<   r/     s>   
zZamba2Model.__init__c                 C   sp  g }g | _ d| _t| jD ]\}}|dkr| jdkr|| _t|}| jjt| jj dkrd| d}t	
|d d d d	 d
 }	| j |	 d}
| jD ]$}|dkrm|
| jj |jkrmt	
dt|
 d }| j | |
d7 }
qM| jjrd}
| jD ]$}|dkr|
| jj |jkrt	
dt|
 d }| j | |
d7 }
q{|t|t|t| q|t| q|S )Nr   r_   r"   z	^layers\.z\.shared_transformer\.z(?:z3self_attn\.(?:q_proj|k_proj|v_proj|o_proj)\.weight|z1feed_forward\.(?:gate_up_proj|down_proj)\.weight|z,(?:input_layernorm|pre_ff_layernorm)\.weightz)$z>^shared_transformer\.feed_forward\.gate_up_proj_adapter_list\.z\.(?:0|1)\.weight$zg^shared_transformer\.self_attn\.(?:linear_q_adapter_list|linear_k_adapter_list|linear_v_adapter_list)\.)_tied_weights_keysfirst_transformer_layer_idr   rk   nextr[   r   r   r   recompiler   r   r   r   rx  )r6   r  r  r  r  layer_id
layer_typeblockprefix_patternmain_keys_pattern
adapter_id_layer_typeadapter_patternattn_adapter_patternr;   r;   r<   r    sh   




zZamba2Model.get_layersN	input_idsr   position_idsr   inputs_embedsr}  rl  output_hidden_statesreturn_dictr   r   c                 C   sd  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|	d ur$|	n| j j}	|d u |d uA r4td| jrC| jrC|rCt	d d}|d u rL| 
|}|}t|}|rr|d u rr|d urb|jd n|jd }t| j || j| jd}|
d u r|d ur|j| jdnd}tj|||jd  |jd}
|d u r|
d}| |||
}| j jr| ||}nd }|rd	nd }|rd	nd }t| jD ]C\}}|r||f7 }| jr| jr| |j|||||||||
}n||||||||||d
	}|d }|r|d d ur||d f7 }q| |}|r||f7 }|d ur|jsd|_t||r#|nd ||d}|	r.|S | S )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onezX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   )r?   r]   r  r"   rb   r;   )rk  r   r   r|  r   rl  r}  r   T)last_hidden_stater   rI   
attentions) r[   rl  r  r}  use_return_dict
ValueErrorr  r   r   r   r  r1   r2  rD   rZ   r?   r]   r   r  r   r3  _update_causal_maskr   r  r   r  _gradient_checkpointing_func__call__r  rl   r
   to_tuple)r6   r  r   r  r   r  r}  rl  r  r  r   rI   rk  r\   past_seen_tokensr|  r   all_hidden_statesall_self_attnsr   layerr  rf  r;   r;   r<   rQ     s   





zZamba2Model.forward)
NNNNNNNNNN)rR   rS   rT   r   r#   r/   r  r   r1   r   r   rZ   rs  rr  r   r   r
   rQ   r;   r;   r;   r<   r    sJ    $2	

r  c                   @   rV   )Zamba2ForCausalLMNrX   r;   r;   r;   r<   r  s  rY   r  c                   @   rV   )Zamba2ForSequenceClassificationNrX   r;   r;   r;   r<   r  w  rY   r  )r  r  r  r  )Nr  r  	itertoolsr   typingr   r   r   r1   r   activationsr   modeling_flash_attention_utilsr	   modeling_outputsr
   modeling_utilsr   r   processing_utilsr   utilsr   utils.deprecationr   utils.import_utilsr   r   llama.modeling_llamar   r   mamba2.modeling_mamba2r   r   r   zamba.modeling_zambar   r   r   r   r   r   r   r   r    r!   configuration_zamba2r#   +mamba_ssm.ops.triton.selective_state_updater$   !mamba_ssm.ops.triton.ssd_combinedr%   r&   causal_conv1dr(   r)   r  r   _CONFIG_FOR_DOC
get_loggerrR   r   Moduler+   rW   rZ   r   r   r   r\  rg  rt  rx  r  r  r  r  __all__r;   r;   r;   r<   <module>   s^   0

Gn   /*?J V