o
    ei                     @   sl  d dl Z d dlmZ d dlmZ d dlZd dlmZ ddlmZ	 ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZmZ ddl m!Z!m"Z"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0 e rd dl1m2Z2 d dl3m4Z4m5Z5 nd\Z2Z4Z5e rd dl6m7Z7m8Z8 nd\Z8Z7e9e2e7e8fZ:dZ;e<e=Z>G dd dejj?Z@G dd de-ZAG dd  d e)ZBG d!d" d"eZCG d#d$ d$e%ZDG d%d& d&ej?ZEG d'd( d(ej?ZFG d)d* d*e&ZGG d+d, d,e+ZHG d-d. d.e*ZIG d/d0 d0eZJG d1d2 d2e,eJZKG d3d4 d4e'ZLG d5d6 d6e(ZMg d7ZNdS )8    N)Callable)cycle)nn   )initialization)ACT2FN)create_causal_mask)FlashAttentionKwargs)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)is_torchdynamo_compilinglogging)is_causal_conv1d_availableis_mamba_ssm_available   )LlamaRotaryEmbeddingapply_rotary_pos_emb)pad_tensor_by_sizereshape_into_chunkssegment_sum)
ZambaAttentionZambaAttentionDecoderLayerZambaForCausalLMZambaForSequenceClassificationZambaHybridDynamicCacheZambaHybridLayerZambaMambaDecoderLayer
ZambaModelZambaRMSNormeager_attention_forward   )Zamba2Config)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combinedNNN)causal_conv1d_fncausal_conv1d_updateNNzZyphra/Zamba2-2.7Bc                       s(   e Zd Zd fdd	ZdddZ  ZS )	Zamba2RMSNormGatedư>c                    s,   t    tt|| _|| _|| _d S N)	super__init__r   	Parametertorchonesweightvariance_epsilon
group_size)selfhidden_sizer5   eps	__class__ g/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/zamba2/modular_zamba2.pyr/   G   s   

zZamba2RMSNormGated.__init__Nc           	      C   s   |j }|tj}|d ur|tj|tj }|j^ }}|| j }|j	g ||| jR  }|
djddd}|t|| j  }|j	g ||| j R  }| j|| S )Nr   T)keepdim)dtypetor1   float32r   
functionalsilushaper5   viewpowmeanrsqrtr4   r3   )	r6   hidden_statesgateinput_dtypeprefix_dimslast_dimgroup_counthidden_states_groupvariancer;   r;   r<   forwardM   s   
zZamba2RMSNormGated.forward)r,   r-   )__name__
__module____qualname__r/   rQ   __classcell__r;   r;   r9   r<   r+   F   s    r+   c                   @      e Zd ZdS )Zamba2RMSNormNrR   rS   rT   r;   r;   r;   r<   rW   [       rW   c                
   @   s   e Zd ZdZejdfdededejde	dB fddZ
d	ed
ejdejdejfddZdd Zdd	edB defddZdejd	edeeef fddZdS )Zamba2HybridDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    Nconfig
batch_sizer?   devicec              	      s  || _ |j| _d| _t|j|j | _|j| _|j	| _
|j| _g | _i | _i | _i | _i | _i | _t|jD ]7}tj | jd|j |j  | j
|d| j|< tj | j|j| j|d| j|< | j| dkrm| j| q6 fddt|jD | _ fddt|jD | _d S )NFr   r]   r?   hybridc                        g | ]}t jg g  d qS r]   r1   tensor.0_r\   r]   r;   r<   
<listcomp>        z5Zamba2HybridDynamicCache.__init__.<locals>.<listcomp>c                    r`   ra   rc   re   rh   r;   r<   ri      rj   )r?   layers_block_typehas_previous_stateintmamba_expandr7   intermediate_sizemamba_d_statessm_state_sizemamba_d_convconv_kernel_sizen_mamba_headstransformer_layers_modules_parameters_buffersconv_states
ssm_statesrangenum_hidden_layersr1   zerosmamba_ngroupsmamba_headdimappend	key_cachevalue_cache)r6   r[   r\   r?   r]   ir;   rh   r<   r/   m   s:    z!Zamba2HybridDynamicCache.__init__	layer_idxnew_conv_statecache_positionreturnc                 C   sr   | j | }|d| jd }|jddd}||j|d d d d |f< | j |   | j |  |7  < | j | S )Nr   r"   r=   shiftsdims)ry   clamprs   rollr@   r]   zero_)r6   r   r   r   
conv_stater;   r;   r<   update_conv_state   s   

z*Zamba2HybridDynamicCache.update_conv_statec                 C   s   | j   | j  d S r-   )ry   r   rz   )r6   r;   r;   r<   reset   s   
zZamba2HybridDynamicCache.resetr   c                 C   sL   || j vr
| j d n|}t| j|ks| j|  dkrdS | j| jd S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   )ru   lenr   numelrD   )r6   r   r;   r;   r<   get_seq_length   s    z'Zamba2HybridDynamicCache.get_seq_lengthc                 C   s$   d}|j d }| || }||fS )zDReturn the length and offset of the cache, used to generate the maskr   )rD   r   )r6   r   r   	kv_offsetquery_length	kv_lengthr;   r;   r<   get_mask_sizes   s   
z'Zamba2HybridDynamicCache.get_mask_sizes)r   )rR   rS   rT   __doc__r1   float16r#   rm   r?   strr/   Tensor
LongTensorr   r   r   tupler   r;   r;   r;   r<   rZ   _   s0    
 
$rZ   c                   @   rV   )Zamba2RotaryEmbeddingNrX   r;   r;   r;   r<   r      rY   r   c                       s   e Zd ZdZ			ddededB dedB dedB f fddZ			dd	ejded
ejdB de	dB de
ejejf dB dee de
ejejdB e
ej dB f fddZ  ZS )Zamba2AttentionaZ  
    Multi-headed attention from 'Attention Is All You Need' paper.

    Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
    The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
    The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
    (see fig. 2 in https://huggingface.co/papers/2405.16712).
    Additionally, replaced
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
    Finally, this attention layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this
    layer is tied, un-tied adapters (formally the same as LoRA but used in the base model) modules are added to the q, k, v projectors to increase
    expressivity with a small memory overhead (see Fig. 2 of https://huggingface.co/papers/2411.15242).
    Nr[   r   num_fwd_mem_blocksblock_idc           	   	      sR  t  || || _|j| _|| _|jrtg | _	tg | _
tg | _t| jD ]p}||j |kr}ttj| j| jjddtj| jj| jdd}ttj| j| jjddtj| jj| jdd}ttj| j| jjddtj| jj| jdd}nt }t }t }| j	| | j
| | j| q+dd t| jD | _d S )NFbiasc                 S      i | ]\}}||qS r;   r;   rf   indexvaluer;   r;   r<   
<dictcomp>       z,Zamba2Attention.__init__.<locals>.<dictcomp>)r.   r/   r   hybrid_layer_idslayer_block_mapr   use_shared_attention_adapterr   
ModuleListlinear_q_adapter_listlinear_k_adapter_listlinear_v_adapter_listr{   num_mem_blocks
SequentialLinearattention_hidden_sizer[   adapter_rankIdentityr   	enumerate	layer_dic)	r6   r[   r   r   r   r   linear_q_adapterlinear_k_adapterlinear_v_adapterr9   r;   r<   r/      s:   zZamba2Attention.__init__rI   attention_maskpast_key_valuesposition_embeddingskwargsr   c                 K   sd  |j d d }g |d| jR }| |}	| |}
| |}| jjrD| j| }|	| j| | }	|
| j	| | }
|| j
| | }|	|dd}	|
|dd}
||dd}| jjrp|\}}t|	|
||\}	}
|d ur}||
||\}
}t| jjt}|| |	|
||f| jsdn| j| jd|\}}|jg |dR   }| |}||fS )Nr=   r"   r   g        )dropoutscaling)rD   head_dimq_projk_projv_projr[   r   r   r   r   r   rE   	transposeuse_mem_roper   updater   get_interface_attn_implementationr!   trainingattention_dropoutr   reshape
contiguouso_proj)r6   rI   r   r   r   r   r   input_shapehidden_shapequery_states
key_statesvalue_statesadapter_layer_idxcossinattention_interfaceattn_outputattn_weightsr;   r;   r<   rQ      sH   	





zZamba2Attention.forwardr'   )rR   rS   rT   r   r#   rm   r/   r1   r   rZ   r   r   r	   rQ   rU   r;   r;   r9   r<   r      s@    -r   c                       s   e Zd ZdZddededB f fddZ		ddejde	dB d	ejdB fd
dZ
dde	dB d	ejdB fddZ		dde	dB d	ejdB fddZ  ZS )Zamba2MambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    Nr[   r   c                    s  t    || _|j| _|j| _|j| _t|j	| j | _
|| _|j| _d| _t | _|j| _|j| _|j| _| jj| _|j| _|j| _|j| _|j| _| j
d| j | j  | _tj| j| jd|j| j|jd d| _| j
| j | j }tj| j||j d| _!t"t#$| j| _%t#&d| jd }t"t#'|| _(t)| j
| j
| j dd| _*t"t#$| j| _+tj| j
| j|j d| _,t-st./d	 d S d S )
NrC   r   Tr"   )in_channelsout_channelsr   kernel_sizegroupspaddingr   gh㈵>)r5   r8   a  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d)0r.   r/   r[   r7   rp   rq   rr   rs   rm   rn   ro   r   use_conv_bias
activationr   SiLUactuse_mem_eff_pathr~   n_groupsr   r   rt   	num_heads
chunk_sizetime_step_limittime_step_mintime_step_maxconv_dimConv1dconv1dr   add_bias_linearin_projr0   r1   r2   dt_biasarangelogA_logr+   normDout_projis_fast_path_availableloggerwarning_once)r6   r[   r   projection_sizeAr9   r;   r<   r/   &  s\   



	zZamba2MambaMixer.__init__rI   cache_paramsr   c                 C   sF  |j \}}}| j| j }d| j d| j | j  | j }|d ur|jr| |d}	|	j d | d }
|
|
| j| j| jg}t	j
|	|dd\}}}}}t||j| j | jjd| jj| j}t	j
|| j||gdd\}}}t	| j  }|d d d df d d d d d f d| j| jjt	jd}|d d d d d f dd| j}| jd d d df d| j}| jd d d df d| j}||| j|j d | j }||| j|j d | j }||| j| j}t|j| j ||||||d |dd
}||| j| j }| ||}| |d d d df }|S |d ur;t	 |dks;|j!}||d d d d d f  |}| |}t	| j  }| j"d u rQi nd	| j"i}|d urct	 |dk}nd}| j#r| j$r|d u r|rt%|| jjd| jj| j|f| j| j&d | j| jj| jj'| jj| jj| j| jd
dd|\}}|S t	j
|| j| j| jgdd\}}}|d ur|(dd}t)j*+|| j,|j d  df}|j| j -| t.d u s| jdvr| /| |(dd(ddd d d |f }n t.|(dd| jjd| jj| jd(ddd d d |f }t	j
|| j||gdd\}}}|d urNt	 |dksN|j!}||d d d d d f  |}t0|||d| j|||||| jd|||| jdf| j&| jd d d| jdd|\}}|d ur|d ur|j| j -| |||d}| ||}| |}|S )Nr   r"   r=   dim.r?   T)zr   dt_softplusdt_limitF)r   r   seq_idxr   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr   )rC   swish)xr3   r   r   )r   r   r  r  r  r   r  )1rD   r   rq   ro   r   rl   r   squeezer   r1   splitr)   ry   r   r   r3   r   r   expr   floatexpandr   r@   rA   r   r   rE   r$   rz   r   r   allr?   r   r   r   r&   r   r4   r   r   rB   padrs   copy_r(   r   r%   )r6   rI   r   r   r\   seq_lenrg   groups_time_state_sized_to_removein_projected_statesd_mlpsplit_projection_dimrJ   hidden_states_B_CdtBCr   r   r   hidden_states_reshapedoutr?   projected_statesdt_limit_kwargsinput_not_masked	ssm_state	time_stephidden_states_B_C_tr   scan_outputr;   r;   r<   cuda_kernels_forwarde  s   

<"
] 

 
L
(

 

z%Zamba2MambaMixer.cuda_kernels_forwardc           1   
      s	  |j \}}}|j}|d ur|jr|d}n|d ur-||d d d d d f  |}|}|j d dj  dj j  j	 d }	|j
|	|	jjj	gdd\}}}
}}|d ur)|jj  }||j}|jr|
d}
|jj }tj|ddd}|jdkr|d d dd d f n||d d d d df< |jj | tj||jjjd d dd d f  dd}jr|jj7 }||d d d df }nt|dd}tj|j |j d  df}|jj | |ddd d d |d d f }|d ur(|j}||d d d d d f  |}n&tj!|j	j"jf|j|d	}|dddd |f dd}tj
|jjj jj gdd\}}}t#j$%  }|d ur|jr|jdkr|d d d df n|d d dd d f d d d df }|dd&||j d j"}j'd
 &j'j d j"}tjj(|||j }t)|j*}|d &j	j"jjtj+d}t#|d
 | }|,|jddd d d f }|&|jj	j |j d - }|,|d|j d }|d
 |dd d d f  }|,|dj"}||d
  }|jj |jj | |  |,|jddd d d f }|&|jj	j |j d - }|,|d|j d }|jj |j}|.|j	 j"j}|.|j	 jd}t/||}|.|j	j"}j0d
 &j0j d j"}|||  |j}|,|dd d d df }ntj(|j' }t)|j*}|,||dj"% }|,||dj% }|,||dj% }|j1j	j dj	d}|j1j	j dj	d}j2|j2  j2  j0d
 t3|  }||d
  }||j| } fdd||||fD \}}}}|4dddd}tj5|dd}t#t6|}|d d d d d d d d d d d f |d d d d d d d d d d d f  }|jdd}|d
 |4dddddd
  } | jdd}!|!d
 |d d d d d f  d}"t#|d d d d d d dd f | }#||#4ddddd
  }$|$4dddddd
 |4ddddddd d d f  jdd4ddddd}%|d ur|jr|jj d d d df }&nt7|%d d d df }&tj8|&|%gdd}%t#t6tj|d d d d d d df d}'|%4ddddd}(|'d |(d d d d d df  jdd})|)4ddddd}*|*d d d df |*d d df }%}t#|}+|dd d d f |%d d d d d df  },|+4dddd}-|,d|-d
  }.|"|. }|,|dj	j"}|| } dkr|d d d |d d d d f }|,||d}|d ur|d ur|jj | 9||
}/:|/|}0|0S )Nr"   r=   r   r   r   r   r   .r^   ).N).NNr  )r   output_sizec                    s   g | ]	}t | jqS r;   )r   r   )rf   tpad_sizer6   r;   r<   ri   w  s    z2Zamba2MambaMixer.torch_forward.<locals>.<listcomp>   )r"   r   );rD   r?   rl   r   r  r@   ro   r   rq   r   r  r   rz   r   cloner]   	unsqueezery   r1   r   ndimr  sumr   r3   r   r   r   r   r   rB   r  rs   r}   r   r  r   r  r  r   softplusr   r   rA   r   r   rE   bmmr   repeat_interleaver   r   permutecumsumr   
zeros_likecatr   r   )1r6   input_statesr   r   r\   r  rg   r?   r$  r  rJ   rI   r  r'  r   r   r!  r   r   dAdBdBxrz   ssm_states_reshaped
C_reshapedyr   
D_residualA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decay_contractionstatesprevious_statesdecay_chunkstates_permutedresult
new_statesstate_decay_outC_times_statesstate_decay_out_permutedY_offr*  contextualized_statesr;   r.  r<   torch_forward  s    
.

60 .
 ,.B"$$$P$*L0(&
*
 zZamba2MambaMixer.torch_forwardc                 C   s6   t rd| jjjjv rt s| |||S | |||S )Ncuda)r   r   r3   r]   typer   r+  rX  )r6   rI   r   r   r;   r;   r<   rQ     s   zZamba2MambaMixer.forwardr-   r*   )rR   rS   rT   r   r#   rm   r/   r1   r   rZ   r+  rX  rQ   rU   r;   r;   r9   r<   r     s,    B
  Fr   c                       s6   e Zd ZddededB f fddZd	ddZ  ZS )
	Zamba2MLPNr[   r   c              	      s   t    || _|j| _|j| _|| _|| _tj| jd| j |j	d| _
tj| j| j|j	d| _t|j | _tg | _t| jD ]/}||j |krfttj| jj| jjddtj| jjd| j dd}nt }| j| qA|j}dd t|D | _dS )aQ  
        This MLP layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this layer
        is tied, un-tied adapter modules (formally same as LoRA, but used in the base model) are added to the up and gate projectors to increase expressivity with a small memory overhead.
        r   r   Fc                 S   r   r;   r;   r   r;   r;   r<   r     r   z&Zamba2MLP.__init__.<locals>.<dictcomp>N)r.   r/   r[   r7   ro   r   r   r   r   r   gate_up_proj	down_projr   
hidden_actact_fnr   gate_up_proj_adapter_listr{   r   r   r   r   r   r   r   r   )r6   r[   r   r   r   gate_up_proj_adapterr   r9   r;   r<   r/     s(   
zZamba2MLP.__init__c                 C   sZ   |  |}| j| }|| j| | }tj|ddd}| |d |d  }| |}|S )Nr   r=   r   r   r"   )r\  r   r`  r1   chunkr_  r]  )r6   hidden_stater   gate_up_stateoutputr;   r;   r<   rQ     s   


zZamba2MLP.forwardr*   r-   )rR   rS   rT   r#   rm   r/   rQ   rU   r;   r;   r9   r<   r[    s    r[  c                       s   e Zd ZddededB dedB f fddZ				ddejd	ejded
ejdB dedB de	dB dej
dB dee deejeejejf dB f fddZ  ZS )Zamba2AttentionDecoderLayerNr[   r   r   c                    sD   || _ t|j}t || t|d||d| _t|||d| _d S )Nr=   )r   r   r   )r   r   )	r   r   r   r.   r/   r   	self_attnr[  feed_forward)r6   r[   r   r   num_gsr9   r;   r<   r/     s
   
z$Zamba2AttentionDecoderLayer.__init__FrI   original_hidden_statesr   r   output_attentionsr   r   r   c              	   K   sl   t j||gdd}| |}| jd||||||d|\}}	| |}| ||}|f}
|r4|
|	f7 }
|
S )a  
        Args:
            hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
                This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
                concatenated tensor is then used as input of the pre-attention RMSNorm
                (see fig. 2 in https://huggingface.co/papers/2405.16712).
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        r=   r   )rI   r   r   r   rk  r   Nr;   )r1   concatenateinput_layernormrg  pre_ff_layernormrh  )r6   rI   rj  r   r   r   rk  r   r   self_attn_weightsoutputsr;   r;   r<   rQ     s$   




z#Zamba2AttentionDecoderLayer.forwardr*   )NNFN)rR   rS   rT   r#   rm   r/   r1   r   rZ   boolr   r   r	   r   FloatTensorrQ   rU   r;   r;   r9   r<   rf    s2    $	
rf  c                       s&   e Zd Zdedef fddZ  ZS )Zamba2MambaDecoderLayerr[   r   c                    s2   t  || t||d| _t|j|jd| _d S )N)r[   r   r8   )r.   r/   r   mambarW   r7   rms_norm_epsrm  )r6   r[   r   r9   r;   r<   r/   3  s   z Zamba2MambaDecoderLayer.__init__)rR   rS   rT   r#   rm   r/   rU   r;   r;   r9   r<   rs  2  s    rs  c                       s   e Zd Zdedejdef fddZ									ddej	d	ej	dB d
e
dB dej	dB dej	dB dedB dedB dedB dejdB dejdB deejeejejf dB f fddZ  ZS )Zamba2HybridLayershared_transformerlinearru  c                    s   t  ||| | `|| _d S r-   )r.   r/   shared_transfrx  )r6   rx  ry  ru  r9   r;   r<   r/   :  s   
zZamba2HybridLayer.__init__NFrI   rj  r   r   causal_maskr   rk  	use_cacher   position_idsr   c              
   C   sp   | j |||||||	|
d}|d }|r|d }| |}| j|||||||	d}|r6|d |f|dd  }|S )aY  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
            hidden activations to form the input of the shared transformer layer.
            layer_idx (`int`): layer number.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        )rj  r   r   r   rk  r   r}  r   r"   )transformer_hidden_statesr   r   rk  r|  r   r   N)rx  ry  mamba_decoder)r6   rI   rj  r   r   r{  r   rk  r|  r   r}  layer_outputsr~  ro  r;   r;   r<   rQ   A  s4   !

zZamba2HybridLayer.forward)	NNNNNFFNN)rR   rS   rT   rf  r   r   rs  r/   r1   r   rm   rZ   rq  r   r   rr  rQ   rU   r;   r;   r9   r<   rw  9  sP    
	
rw  c                       sR   e Zd ZU eed< dZdZddgZdZdZ	dZ
dZdZe  fddZ  ZS )	Zamba2PreTrainedModelr[   modelTrf  rs  r   c                    s   t  | t|tr^tt| jjt	
| jjt	
| jj  t	
| jj j| jjd}|t
t|   }t|j| td|jd }t|jt
| t|j d S d S )N)minr"   )r.   _init_weights
isinstancer   r1   r  randr[   rt   mathr   r   r   r   time_step_floorexpm1initr  r   r   r   r   ones_r   )r6   moduler  inv_dtr   r9   r;   r<   r    s"   
z#Zamba2PreTrainedModel._init_weights)rR   rS   rT   r#   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_flex_attn_supports_sdpa_is_statefulr1   no_gradr  rU   r;   r;   r9   r<   r    s   
 r  c                   @   s   e Zd ZdZdefddZdd Z										ddejdB d	ej	dB d
ejdB de
dB dejdB dedB dedB dedB dedB dejdB deeB fddZdS )Zamba2Modelzh
    Model consisting of *config.num_hidden_layers* layers.

    Args:
        config: Zamba2Config
    r[   c                 C   s   t | | || _|j| _|j| _t|j|j| j| _	|j
| _
|  | _|j| _t|j|jd| _|jrB|jr=td t|| _d| _|   d S )Nrt  ze`use_long_context` set to `True`: using rescaled `rope_theta` and extended `max_position_embeddings`.F)r  r/   r[   pad_token_idpadding_idx
vocab_sizer   	Embeddingr7   embed_tokensrk   
get_layerslayersr   rW   rv  final_layernormr   use_long_contextr   r   r   
rotary_embgradient_checkpointing	post_init)r6   r[   r;   r;   r<   r/     s"   

zZamba2Model.__init__c                 C   s   g }i | _ d| _g }t| jD ]e\}}t| j|d}|dkrod| d}t|tr1t|| jj	krGt|tr:t
|}t|}| j ||i n|| || jj	 }t| j|d}	tj| jj| jjdd}
|t|	|
| q|| qt|S )	Nr   r   r_   zlayers.z.shared_transformer)r   Fr   )_tied_weights_keysfirst_transformer_layer_idr   rk   rs  r[   r  listr   r   r   nextr   r   rf  r   r   r7   rw  r   )r6   r  unique_hybrid_blockslayer_id
layer_typemamba_layerprefix_patterntarget_patternr   
attn_blocklinear_layerr;   r;   r<   r    s,   


zZamba2Model.get_layersN	input_idsr   r}  r   inputs_embedsr|  rk  output_hidden_statesreturn_dictr   r   c                 K   s0  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|	d ur$|	n| j j}	|d u |d uA r4td| jrC| jrC|rCt	d d}|d u rL| 
|}|}t|}|rr|d u rr|d urb|jd n|jd }t| j || j| jd}|
d u r|d ur|j| jdnd}tj|||jd  |jd}
|d u r|
d}t| j |||
||d	}| j||d
}|rdnd }|rdnd }t| jD ],\}}|r||f7 }|||||||||||d
}|d }|r|d d ur||d f7 }q| |}|r||f7 }|d ur|jsd|_t||r	|nd ||d}|	r|S | S )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onezX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   )r?   r]   r  r"   rb   )r[   r  r   r   r   r}  )r}  r;   )r   rk  r|  r   r}  T)last_hidden_stater   rI   
attentions)r[   rk  r  r|  use_return_dict
ValueErrorr  r   r   r   r  r1   r1  rD   rZ   r?   r]   r   r  r   r2  r   r  r   r  r  rl   r
   to_tuple)r6   r  r   r}  r   r  r|  rk  r  r  r   r   rI   rj  r\   past_seen_tokensr{  r   all_hidden_statesall_self_attnsr   layerr  re  r;   r;   r<   rQ     s   





zZamba2Model.forward)
NNNNNNNNNN)rR   rS   rT   r   r#   r/   r  r1   r   r   rZ   rr  rq  r   r
   rQ   r;   r;   r;   r<   r    sJ    %	
r  c                   @   rV   )Zamba2ForCausalLMNrX   r;   r;   r;   r<   r  T  rY   r  c                   @   rV   )Zamba2ForSequenceClassificationNrX   r;   r;   r;   r<   r  X  rY   r  )r  r  r  r  )Or  collections.abcr   	itertoolsr   r1   r    r   r  activationsr   masking_utilsr   modeling_flash_attention_utilsr	   modeling_outputsr
   modeling_utilsr   r   processing_utilsr   utilsr   r   utils.import_utilsr   r   llama.modeling_llamar   r   mamba2.modeling_mamba2r   r   r   zamba.modeling_zambar   r   r   r   r   r   r   r   r    r!   configuration_zamba2r#   +mamba_ssm.ops.triton.selective_state_updater$   !mamba_ssm.ops.triton.ssd_combinedr%   r&   causal_conv1dr(   r)   r  r   _CONFIG_FOR_DOC
get_loggerrR   r   Moduler+   rW   rZ   r   r   r   r[  rf  rs  rw  r  r  r  r  __all__r;   r;   r;   r<   <module>   s^   0

Nm   /*>K 3