o
    ei:9                    @   s4  d dl Z d dlmZ d dlmZ d dlmZmZ d dlZd dlm	Z	 d dl
mZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5 e3 rd dl6m7Z7 d dl8m9Z9m:Z: nd\Z7Z9Z:e2 rd dl;m<Z<m=Z= nd\Z=Z<e.>e?Z@G dd dej	jAZBG d d! d!e	jAZCG d"d# d#ZDG d$d% d%e	jAZEd&ejFd'eGd(ejFfd)d*ZH	+dYd,e	jAd-ejFd.ejFd/ejFd0ejFdB d1eId2eIfd3d4ZJd5d6 ZKed7dZd8d9ZLG d:d; d;e	jAZMd<ejFd=eGfd>d?ZNd@dA ZOdBdC ZPeQe7e<e=fZRG dDdE dEe	jAZSG dFdG dGe	jAZTG dHdI dIe	jAZUG dJdK dKeZVG dLdM dMeZWG dNdO dOe(ZXe,G dPdQ dQeXZYG dRdS dSeXeZZe,dTdUG dVdW dWeXZ[g dXZ\dS )[    N)Callable)cycle)AnyOptional)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)Cache)GenerationMixin)use_kernel_func_from_hub)create_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast SequenceClassifierOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringis_torchdynamo_compilinglogging)maybe_autocast)is_causal_conv1d_availableis_mamba_ssm_available   )Zamba2Config)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combinedNNN)causal_conv1d_fncausal_conv1d_updateNNc                       s(   e Zd Zd fdd	ZdddZ  ZS )	Zamba2RMSNormGatedư>c                    s,   t    tt|| _|| _|| _d S N)	super__init__r   	Parametertorchonesweightvariance_epsilon
group_size)selfhidden_sizer4   eps	__class__ h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/zamba2/modeling_zamba2.pyr.   ?   s   

zZamba2RMSNormGated.__init__Nc           	      C   s   |j }|tj}|d ur|tj|tj }|j^ }}|| j }|j	g ||| jR  }|
djddd}|t|| j  }|j	g ||| j R  }| j|| S N   T)keepdim)dtypetor0   float32r   
functionalsilushaper4   viewpowmeanrsqrtr3   r2   )	r5   hidden_statesgateinput_dtypeprefix_dimslast_dimgroup_counthidden_states_groupvariancer:   r:   r;   forwardE   s   
zZamba2RMSNormGated.forwardr+   r,   )__name__
__module____qualname__r.   rR   __classcell__r:   r:   r8   r;   r*   >   s    r*   c                       sF   e Zd Zddeddf fddZdejdejfdd	Zd
d Z  Z	S )Zamba2RMSNormr+   r7   returnNc                    s&   t    tt|| _|| _dS )z<
        Zamba2RMSNorm is equivalent to T5LayerNorm
        N)r-   r.   r   r/   r0   r1   r2   r3   )r5   r6   r7   r8   r:   r;   r.   T   s   

zZamba2RMSNorm.__init__rJ   c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S r<   )	r@   rA   r0   rB   rG   rH   rI   r3   r2   )r5   rJ   rL   rQ   r:   r:   r;   rR   \   s
   zZamba2RMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler2   rE   r3   r5   r:   r:   r;   
extra_reprc   s   zZamba2RMSNorm.extra_reprrS   )
rT   rU   rV   floatr.   r0   TensorrR   r\   rW   r:   r:   r8   r;   rX   S   s    rX   c                   @   s   e Zd ZdZdZejdfdededej	de
dB fdd	Zd
d Z	d!dejdejdedee
ef dB deejejf f
ddZdejfddZd"dedB defddZdejdedeeef fddZdedejdejdejfddZdd  ZdS )#Zamba2HybridDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    FNconfig
batch_sizer@   devicec              	      s  || _ |j| _d| _t|j|j | _|j| _|j	| _
|j| _g | _i | _i | _i | _i | _i | _t|jD ]7}tj | jd|j |j  | j
|d| j|< tj | j|j| j|d| j|< | j| dkrm| j| q6 fddt|jD | _ fddt|jD | _d S )NFr=   rb   r@   hybridc                        g | ]}t jg g  d qS rb   r0   tensor.0_ra   rb   r:   r;   
<listcomp>        z5Zamba2HybridDynamicCache.__init__.<locals>.<listcomp>c                    re   rf   rh   rj   rm   r:   r;   rn      ro   )r@   layers_block_typehas_previous_stateintmamba_expandr6   intermediate_sizemamba_d_statessm_state_sizemamba_d_convconv_kernel_sizen_mamba_headstransformer_layers_modules_parameters_buffersconv_states
ssm_statesrangenum_hidden_layersr0   zerosmamba_ngroupsmamba_headdimappend	key_cachevalue_cache)r5   r`   ra   r@   rb   ir:   rm   r;   r.   w   s:    z!Zamba2HybridDynamicCache.__init__c                 C   s
   t | jS r,   )lenr   r[   r:   r:   r;   __len__   s   
z Zamba2HybridDynamicCache.__len__
key_statesvalue_states	layer_idxcache_kwargsrY   c                 C   sz   | j | jd dkr|| j |< || j|< ntj| j | |gdd| j |< tj| j| |gdd| j|< | j | | j| fS )Nr>   r   r=   dim)r   rE   r   r0   cat)r5   r   r   r   r   r:   r:   r;   update   s   
zZamba2HybridDynamicCache.updatebeam_idxc                 C   s   |   dkrdtt| jD ]X}| j| j}| j| d||| j|< | j| j}| j| d||| j|< | j| j}| j| d||| j|< | j	| j}| j	| d||| j	|< qdS dS )zDReorders the cache for beam search, given the selected beam indices.r   N)
get_seq_lengthr   r   r   rb   index_selectrA   r   r~   r   )r5   r   r   rb   r:   r:   r;   reorder_cache   s    z&Zamba2HybridDynamicCache.reorder_cacher   c                 C   sL   || j vr
| j d n|}t| j|ks| j|  dkrdS | j| jd S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   )rz   r   r   numelrE   )r5   r   r:   r:   r;   r      s    z'Zamba2HybridDynamicCache.get_seq_lengthcache_positionc                 C   s$   d}|j d }| || }||fS )zDReturn the length and offset of the cache, used to generate the maskr   )rE   r   )r5   r   r   	kv_offsetquery_length	kv_lengthr:   r:   r;   get_mask_sizes   s   
z'Zamba2HybridDynamicCache.get_mask_sizesnew_conv_statec                 C   sr   | j | }|d| jd }|jddd}||j|d d d d |f< | j |   | j |  |7  < | j | S )Nr   r!   r>   shiftsdims)r~   clamprx   rollrA   rb   zero_)r5   r   r   r   
conv_stater:   r:   r;   update_conv_state   s   

z*Zamba2HybridDynamicCache.update_conv_statec                 C   s   | j   | j  d S r,   )r~   r   r   r[   r:   r:   r;   reset   s   
zZamba2HybridDynamicCache.resetr,   )r   )rT   rU   rV   __doc__is_compileabler0   float16r"   rr   r@   strr.   r   r^   dictr   rZ   r   
LongTensorr   r   r   r   r   r:   r:   r:   r;   r_   g   sN    
 
 
r_   c                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )Zamba2RotaryEmbeddinginv_freqNr`   c                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultr   F)
persistentoriginal_inv_freq)r-   r.   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr`   rope_parametersr   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r5   r`   rb   rope_init_fnr   r8   r:   r;   r.      s   


zZamba2RotaryEmbedding.__init__rb   ztorch.deviceseq_lenrY   ztorch.Tensorc                 C   sZ   | j d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNg      ?r   r=   r@   rc   )	r   getattrr6   num_attention_headsr0   arangeint64rA   r]   )r`   rb   r   baser   attention_factorr   r:   r:   r;   r      s   
&z5Zamba2RotaryEmbedding.compute_default_rope_parametersc           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	|dd+ | |  
dd}tj||fdd	}| | j }| | j }	W d    n1 slw   Y  |j|jd
|	j|jd
fS )Nr   r>   r!   mpscpuF)device_typeenabledr=   r   r   )r   r]   expandrE   rA   rb   
isinstancetyper   r   	transposer0   r   cosr   sinr@   )
r5   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   r:   r:   r;   rR   
  s   0&zZamba2RotaryEmbedding.forwardr,   r&   )rT   rU   rV   r0   r^   __annotations__r"   r.   staticmethodr   rr   rZ   r]   r   no_gradr   rR   rW   r:   r:   r8   r;   r      s&   
 

r   rJ   n_reprY   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r!   N)rE   r   reshape)rJ   r   batchnum_key_value_headsslenr   r:   r:   r;   	repeat_kv  s
   0r           modulequerykeyvalueattention_maskscalingdropoutc                 K   s   t || j}t || j}	t||dd| }
|d ur |
| }
tjj|
dtjd	|j
}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr=   r
   r>   )r   r@   )ptrainingr!   )r   num_key_value_groupsr0   matmulr   r   rC   softmaxrB   rA   r@   r   r   
contiguous)r   r   r   r   r   r   r   kwargsr   r   attn_weightsattn_outputr:   r:   r;   eager_attention_forward&  s   
r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr>   r=   r   )rE   r0   r   )r   x1x2r:   r:   r;   rotate_half?  s   r   rotary_pos_embc                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   unsqueeze_dimq_embedk_embedr:   r:   r;   apply_rotary_pos_embF  s
   

r   c                       s   e Zd ZdZ			ddededB dedB dedB f fddZ			dd	ejded
ejdB de	dB de
ejejf dB dee de
ejejdB e
ej dB f fddZ  ZS )Zamba2AttentionaZ  
    Multi-headed attention from 'Attention Is All You Need' paper.

    Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
    The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
    The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
    (see fig. 2 in https://huggingface.co/papers/2405.16712).
    Additionally, replaced
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
    Finally, this attention layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this
    layer is tied, un-tied adapters (formally the same as LoRA but used in the base model) modules are added to the q, k, v projectors to increase
    expressivity with a small memory overhead (see Fig. 2 of https://huggingface.co/papers/2411.15242).
    Nr`   r   num_fwd_mem_blocksblock_idc           	   	      s  t    || _|| _|j| _|j| _|j|j | _	|j
| _
| jd d | _d| _|j| _tj|j|j| j dd| _tj|j|j| j dd| _tj|j|j| j dd| _tj|j| j |jdd| _|| _|j| _|| _|jrtg | _tg | _tg | _t| jD ]p}||j |krt tj| j| jj!ddtj| jj!| jdd}t tj| j| jj!ddtj| jj!| jdd}t tj| j| jj!ddtj| jj!| jdd}nt" }t" }t" }| j#| | j#| | j#| qdd t$| jD | _%d S )Nr=   g      TFbiasc                 S      i | ]\}}||qS r:   r:   rk   indexr   r:   r:   r;   
<dictcomp>      z,Zamba2Attention.__init__.<locals>.<dictcomp>)&r-   r.   r`   r   attention_hidden_sizeattention_head_dimr   r   r   r   r   r   	is_causalattention_dropoutr   Linearq_projk_projv_projr6   o_projr   hybrid_layer_idslayer_block_mapr   use_shared_attention_adapter
ModuleListlinear_q_adapter_listlinear_k_adapter_listlinear_v_adapter_listr   num_mem_blocks
Sequentialadapter_rankIdentityr   	enumerate	layer_dic)	r5   r`   r   r   r   r   linear_q_adapterlinear_k_adapterlinear_v_adapterr8   r:   r;   r.   p  sT   
zZamba2Attention.__init__rJ   r   past_key_valuesposition_embeddingsr   rY   c                 K   sd  |j d d }g |d| jR }| |}	| |}
| |}| jjrD| j| }|	| j| | }	|
| j	| | }
|| j
| | }|	|dd}	|
|dd}
||dd}| jjrp|\}}t|	|
||\}	}
|d ur}||
||\}
}t| jjt}|| |	|
||f| jsdn| j| jd|\}}|jg |dR   }| |}||fS )Nr>   r!   r=   r   )r   r   )rE   r   r  r  r  r`   r  r  r  r  r  rF   r   use_mem_roper   r   r   get_interface_attn_implementationr   r   r
  r   r   r   r  )r5   rJ   r   r   r   r!  r   input_shapehidden_shapequery_statesr   r   adapter_layer_idxr   r   attention_interfacer   r   r:   r:   r;   rR     sH   	





zZamba2Attention.forwardr&   )rT   rU   rV   r   r"   rr   r.   r0   r^   r_   rZ   r   r   rR   rW   r:   r:   r8   r;   r   `  s@    <r   input_tensorpad_sizec                 C   sH   t | jdkrddddd|ddfnddd|ddf}tjjj| |dddS )z
    Padding x tensor with `pad_size` on the seq_len dim (dim=1)

    Assumes that we only have tensors of either size 4 or 3
       r   constant)moder   )r   rE   r0   r   rC   pad)r*  r+  	pad_shaper:   r:   r;   pad_tensor_by_size  s   2r1  c                 C   sX   t | |} t| jdkr| | jd d|| jd S | | jd d|| jd | jd S )z
    Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
    simultaneously splitting it into chunk sequences.

    Assumes that we only have tensors of either size 4 or 3
    r
   r   r>   r=   )r1  r   rE   r   )r*  r+  
chunk_sizer:   r:   r;   reshape_into_chunks  s   
r3  c                 C   s   |  d}| d jg |   |R  } tjtj||| jtjddd}| | d} tj| dd}tjtj||| jtjddd}|| tj	 }|S )zo
    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
    r>   .Nrc   )diagonalr   r   r   )
sizer   r0   trilr1   rb   boolmasked_fillcumsuminf)r*  r2  masktensor_segsumr:   r:   r;   segment_sum  s   
  r>  c                       s   e Zd ZdZddededB f fddZ		ddejde	dB d	ejdB fd
dZ
dde	dB d	ejdB fddZ		dde	dB d	ejdB fddZ  ZS )Zamba2MambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    Nr`   r   c                    s  t    || _|j| _|j| _|j| _t|j	| j | _
|| _|j| _d| _t | _|j| _|j| _|j| _| jj| _|j| _|j| _|j| _|j| _| j
d| j | j  | _tj| j| jd|j| j|jd d| _| j
| j | j }tj| j||j d| _!t"t#$| j| _%t#&d| jd }t"t#'|| _(t)| j
| j
| j dd| _*t"t#$| j| _+tj| j
| j|j d| _,t-st./d	 d S d S )
NrD   r=   Tr!   )in_channelsout_channelsr  kernel_sizegroupspaddingr   gh㈵>)r4   r7   a  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d)0r-   r.   r`   r6   ru   rv   rw   rx   rr   rs   rt   r   use_conv_bias
activationr   SiLUactuse_mem_eff_pathr   n_groupsr   r   ry   	num_headsr2  time_step_limittime_step_mintime_step_maxconv_dimConv1dconv1dr  add_bias_linearin_projr/   r0   r1   dt_biasr   logA_logr*   normDout_projis_fast_path_availableloggerwarning_once)r5   r`   r   projection_sizeAr8   r:   r;   r.     s\   



	zZamba2MambaMixer.__init__rJ   cache_paramsr   c                 C   sF  |j \}}}| j| j }d| j d| j | j  | j }|d ur|jr| |d}	|	j d | d }
|
|
| j| j| jg}t	j
|	|dd\}}}}}t||j| j | jjd| jj| j}t	j
|| j||gdd\}}}t	| j  }|d d d df d d d d d f d| j| jjt	jd}|d d d d d f dd| j}| jd d d df d| j}| jd d d df d| j}||| j|j d | j }||| j|j d | j }||| j| j}t|j| j ||||||d |dd
}||| j| j }| ||}| |d d d df }|S |d ur;t	 |dks;|j!}||d d d d d f  |}| |}t	| j  }| j"d u rQi nd	| j"i}|d urct	 |dk}nd}| j#r| j$r|d u r|rt%|| jjd| jj| j|f| j| j&d | j| jj| jj'| jj| jj| j| jd
dd|\}}|S t	j
|| j| j| jgdd\}}}|d ur|(dd}t)j*+|| j,|j d  df}|j| j -| t.d u s| jdvr| /| |(dd(ddd d d |f }n t.|(dd| jjd| jj| jd(ddd d d |f }t	j
|| j||gdd\}}}|d urNt	 |dksN|j!}||d d d d d f  |}t0|||d| j|||||| jd|||| jdf| j&| jd d d| jdd|\}}|d ur|d ur|j| j -| |||d}| ||}| |}|S )Nr=   r!   r>   r   .r   T)zrT  dt_softplusdt_limitF)rX  r2  seq_idxrF  rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr   )rD   swish)r   r2   r  rF  )r2  rX  r`  rc  rk  rT  ra  )1rE   rJ  rv   rt   rK  rq   rS  squeezerO  r0   splitr(   r~   r   rQ  r2   r  rF  exprV  r]   r   r   rA   rB   rT  rX  rF   r#   r   rW  rY  allr@   rL  rI  r   r%   r2  r3   r   r   rC   r/  rx   copy_r'   rH  r$   )r5   rJ   r_  r   ra   r   rl   groups_time_state_sized_to_removein_projected_statesd_mlpsplit_projection_dimrK   hidden_states_B_CdtBCr^  rT  rX  hidden_states_reshapedoutr@   projected_statesdt_limit_kwargsinput_not_masked	ssm_state	time_stephidden_states_B_C_tr   scan_outputr:   r:   r;   cuda_kernels_forward\  s   

<"
] 

 
L
(

 

z%Zamba2MambaMixer.cuda_kernels_forwardc           1   
      s	  |j \}}}|j}|d ur|jr|d}n|d ur-||d d d d d f  |}|}|j d dj  dj j  j	 d }	|j
|	|	jjj	gdd\}}}
}}|d ur)|jj  }||j}|jr|
d}
|jj }tj|ddd}|jdkr|d d dd d f n||d d d d df< |jj | tj||jjjd d dd d f  dd}jr|jj7 }||d d d df }nt|dd}tj|j |j d  df}|jj | |ddd d d |d d f }|d ur(|j}||d d d d d f  |}n&tj!|j	j"jf|j|d	}|dddd |f dd}tj
|jjj jj gdd\}}}t#j$%  }|d ur|jr|jdkr|d d d df n|d d dd d f d d d df }|dd&||j d j"}j'd
 &j'j d j"}tjj(|||j }t)|j*}|d &j	j"jjtj+d}t#|d
 | }|,|jddd d d f }|&|jj	j |j d - }|,|d|j d }|d
 |dd d d f  }|,|dj"}||d
  }|jj |jj | |  |,|jddd d d f }|&|jj	j |j d - }|,|d|j d }|jj |j}|.|j	 j"j}|.|j	 jd}t/||}|.|j	j"}j0d
 &j0j d j"}|||  |j}|,|dd d d df }ntj(|j' }t)|j*}|,||dj"% }|,||dj% }|,||dj% }|j1j	j dj	d}|j1j	j dj	d}j2|j2  j2  j0d
 t3|  }||d
  }||j| } fdd||||fD \}}}}|4dddd}tj5|dd}t#t6|}|d d d d d d d d d d d f |d d d d d d d d d d d f  }|jdd}|d
 |4dddddd
  } | jdd}!|!d
 |d d d d d f  d}"t#|d d d d d d dd f | }#||#4ddddd
  }$|$4dddddd
 |4ddddddd d d f  jdd4ddddd}%|d ur|jr|jj d d d df }&nt7|%d d d df }&tj8|&|%gdd}%t#t6tj|d d d d d d df d}'|%4ddddd}(|'d |(d d d d d df  jdd})|)4ddddd}*|*d d d df |*d d df }%}t#|}+|dd d d f |%d d d d d df  },|+4dddd}-|,d|-d
  }.|"|. }|,|dj	j"}|| } dkr|d d d |d d d d f }|,||d}|d ur|d ur|jj | 9||
}/:|/|}0|0S )Nr!   r>   r=   r   r   r
   r   .rc   r4  ).NNr   )r   output_sizec                    s   g | ]	}t | jqS r:   )r3  r2  )rk   tr+  r5   r:   r;   rn   n  s    z2Zamba2MambaMixer.torch_forward.<locals>.<listcomp>r,  )r!   r   );rE   r@   rq   rS  rm  rA   rt   rJ  rv   rK  rn  rO  r   r   r   rb   r   r~   r0   r   ndimrq  sumrQ  r2   rE  r  rH  r   r   rC   r/  rx   r   r   ro  rV  r]   r   rT  softplusr   rM  rB   r   r   rF   bmmrX  repeat_interleaver2  r1  permuter:  r>  
zeros_liker   rW  rY  )1r5   input_statesr_  r   ra   r   rl   r@   r}  ru  rK   rJ   rx  r  r   ry  rz  r^  rT  dAdBdBxr   ssm_states_reshaped
C_reshapedyrX  
D_residualA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decay_contractionstatesprevious_statesdecay_chunkstates_permutedresult
new_statesstate_decay_outC_times_statesstate_decay_out_permutedY_offr  contextualized_statesr:   r  r;   torch_forward  s    
.

60 .
 ,.B"$$$P$*L0(&
*
 zZamba2MambaMixer.torch_forwardc                 C   s6   t rd| jjjjv rt s| |||S | |||S )Ncuda)rZ  rS  r2   rb   r   r   r  r  )r5   rJ   r_  r   r:   r:   r;   rR     s   zZamba2MambaMixer.forwardr,   r)   )rT   rU   rV   r   r"   rr   r.   r0   r^   r_   r  r  rR   rW   r:   r:   r8   r;   r?    s,    B
  Fr?  c                       s6   e Zd ZddededB f fddZd	ddZ  ZS )
	Zamba2MLPNr`   r   c              	      s   t    || _|j| _|j| _|| _|| _tj| jd| j |j	d| _
tj| j| j|j	d| _t|j | _tg | _t| jD ]/}||j |krfttj| jj| jjddtj| jjd| j dd}nt }| j| qA|j}dd t|D | _dS )aQ  
        This MLP layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this layer
        is tied, un-tied adapter modules (formally same as LoRA, but used in the base model) are added to the up and gate projectors to increase expressivity with a small memory overhead.
        r=   r   Fc                 S   r  r:   r:   r  r:   r:   r;   r    r  z&Zamba2MLP.__init__.<locals>.<dictcomp>N)r-   r.   r`   r6   rt   r   r   r   r  rR  gate_up_proj	down_projr   
hidden_actact_fnr  gate_up_proj_adapter_listr   r  r  r  r  r   r  r  r  )r5   r`   r   r   r   gate_up_proj_adapterr  r8   r:   r;   r.     s(   
zZamba2MLP.__init__c                 C   sZ   |  |}| j| }|| j| | }tj|ddd}| |d |d  }| |}|S )Nr=   r>   r   r   r!   )r  r  r  r0   chunkr  r  )r5   hidden_stater   gate_up_stateoutputr:   r:   r;   rR     s   


zZamba2MLP.forwardr)   r,   )rT   rU   rV   r"   rr   r.   rR   rW   r:   r:   r8   r;   r    s    r  c                       s   e Zd ZddededB dedB f fddZ				ddejd	ejded
ejdB dedB de	dB dej
dB dee deejeejejf dB f fddZ  ZS )Zamba2AttentionDecoderLayerNr`   r   r   c                    sd   t    || _t|j}t|d||d| _t|||d| _t	|j
|jd| _t	|j|jd| _d S )Nr>   )r   r   r   )r   r   r7   )r-   r.   r   r   r  r   	self_attnr  feed_forwardrX   r  rms_norm_epsinput_layernormr6   pre_ff_layernorm)r5   r`   r   r   num_gsr8   r:   r;   r.     s   

z$Zamba2AttentionDecoderLayer.__init__FrJ   original_hidden_statesr   r   output_attentionsr!  r   rY   c              	   K   sl   t j||gdd}| |}| jd||||||d|\}}	| |}| ||}|f}
|r4|
|	f7 }
|
S )a  
        Args:
            hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
                This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
                concatenated tensor is then used as input of the pre-attention RMSNorm
                (see fig. 2 in https://huggingface.co/papers/2405.16712).
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        r>   r   )rJ   r   r   r   r  r!  Nr:   )r0   concatenater  r  r  r  )r5   rJ   r  r   r   r   r  r!  r   self_attn_weightsoutputsr:   r:   r;   rR     s$   




z#Zamba2AttentionDecoderLayer.forwardr)   )NNFN)rT   rU   rV   r"   rr   r.   r0   r^   r_   r8  r   r   r   rZ   FloatTensorrR   rW   r:   r:   r8   r;   r    s2    $	
r  c                       s   e Zd Zdedef fddZ										ddejdejdB dedB d	ejdB d
ejdB dedB de	dB de	dB dej
dB dej
dB dejdB deejeejejf dB f fddZ  ZS )Zamba2MambaDecoderLayerr`   r   c                    s4   t    t||d| _t|j|jd| _|| _d S )N)r`   r   r  )	r-   r.   r?  mambarX   r6   r  r  r   )r5   r`   r   r8   r:   r;   r.   ,  s   

z Zamba2MambaDecoderLayer.__init__NFrJ   r  r   causal_maskr   r  	use_cacher   r   transformer_hidden_statesrY   c                 K   sd   |}|dur
|| n|}|  |}| j|||d}d}|| }|f}|r)||f7 }|r0||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        N)rJ   r_  r   )r  r  )r5   rJ   r  r   r   r  r   r  r  r   r   r  r   residualr  r  r:   r:   r;   rR   2  s"   


zZamba2MambaDecoderLayer.forward)
NNNNNFFNNN)rT   rU   rV   r"   rr   r.   r0   r^   r_   r8  r   rZ   r  rR   rW   r:   r:   r8   r;   r  +  sJ    		
r  c                       s   e Zd Zdedejdef fddZ									ddej	d	ej	dB d
e
dB dej	dB dej	dB dedB dedB dedB dejdB dejdB deejeejejf dB f fddZ  ZS )Zamba2HybridLayershared_transformerlinearr  c                    s    t    || _|| _|| _d S r,   )r-   r.   r  mamba_decoderr  )r5   r  r  r  r8   r:   r;   r.   q  s   

zZamba2HybridLayer.__init__NFrJ   r  r   r   r  r   r  r  r!  r   rY   c              
   C   sp   | j |||||||	|
d}|d }|r|d }| |}| j|||||||	d}|r6|d |f|dd  }|S )aY  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
            hidden activations to form the input of the shared transformer layer.
            layer_idx (`int`): layer number.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        )r  r   r   r   r  r!  r   r   r!   )r  r   r   r  r  r!  r=   N)r  r  r  )r5   rJ   r  r   r   r  r   r  r  r!  r   layer_outputsr  r  r:   r:   r;   rR   y  s4   !

zZamba2HybridLayer.forward)	NNNNNFFNN)rT   rU   rV   r  r   r  r  r.   r0   r^   rr   r_   r8  r   rZ   r  rR   rW   r:   r:   r8   r;   r  p  sP    	
r  c                       sR   e Zd ZU eed< dZdZddgZdZdZ	dZ
dZdZe  fddZ  ZS )	Zamba2PreTrainedModelr`   modelTr  r  r   c                    s   t  | t|tr^tt| jjt	
| jjt	
| jj  t	
| jj j| jjd}|t
t|   }t|j| td|jd }t|jt
| t|j d S d S )N)minr!   )r-   _init_weightsr   r?  r0   ro  randr`   ry   mathrU  rN  rM  r   time_step_floorexpm1initrq  rT  r   rK  rV  ones_rX  )r5   r   rx  inv_dtr^  r8   r:   r;   r    s"   
z#Zamba2PreTrainedModel._init_weights)rT   rU   rV   r"   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_flex_attn_supports_sdpa_is_statefulr0   r   r  rW   r:   r:   r8   r;   r    s   
 r  c                       s   e Zd ZdZdef fddZe										ddejdB dej	dB dejdB d	e
dB d
ejdB dedB dedB dedB dedB dejdB deeB fddZdd Z  ZS )Zamba2Modelzh
    Model consisting of *config.num_hidden_layers* layers.

    Args:
        config: Zamba2Config
    r`   c                    s   t  | || _|j| _|j| _t|j|j| j| _	|j
| _
|  | _|j| _t|j|jd| _|jrB|jr=td t|| _d| _|   d S )Nr  ze`use_long_context` set to `True`: using rescaled `rope_theta` and extended `max_position_embeddings`.F)r-   r.   r`   pad_token_idpadding_idx
vocab_sizer   	Embeddingr6   embed_tokensrp   
get_layerslayersr$  rX   r  final_layernormr"  use_long_contextr[  r\  r   
rotary_embgradient_checkpointing	post_initr5   r`   r8   r:   r;   r.     s"   

zZamba2Model.__init__N	input_idsr   r   r   inputs_embedsr  r  output_hidden_statesreturn_dictr   rY   c                 K   s0  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|	d ur$|	n| j j}	|d u |d uA r4td| jrC| jrC|rCt	d d}|d u rL| 
|}|}t|}|rr|d u rr|d urb|jd n|jd }t| j || j| jd}|
d u r|d ur|j| jdnd}tj|||jd  |jd}
|d u r|
d}t| j |||
||d	}| j||d
}|rdnd }|rdnd }t| jD ],\}}|r||f7 }|||||||||||d
}|d }|r|d d ur||d f7 }q| |}|r||f7 }|d ur|jsd|_t||r	|nd ||d}|	r|S | S )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onezX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r@   rb   r   r!   rg   )r`   r  r   r   r   r   )r   r:   )r   r  r  r!  r   T)last_hidden_stater   rJ   
attentions)r`   r  r  r  use_return_dict
ValueErrorr  r   r[  r\  r  r0   r   rE   r_   r@   rb   r   first_transformer_layer_idr   r   r   r  r  r  r  rq   r   to_tuple)r5   r  r   r   r   r  r  r  r  r  r   r   rJ   r  ra   past_seen_tokensr  r!  all_hidden_statesall_self_attnsr   layerr  r  r:   r:   r;   rR     s   





zZamba2Model.forwardc                 C   s   g }i | _ d| _g }t| jD ]e\}}t| j|d}|dkrod| d}t|tr1t|| jj	krGt|tr:t
|}t|}| j ||i n|| || jj	 }t| j|d}	tj| jj| jjdd}
|t|	|
| q|| qt|S )	Nr   r  rd   zlayers.z.shared_transformer)r   Fr   )_tied_weights_keysr  r  rp   r  r`   r   listr   r  r   nextr   r   r  r   r  r6   r  r  )r5   r  unique_hybrid_blockslayer_id
layer_typemamba_layerprefix_patterntarget_patternr   
attn_blocklinear_layerr:   r:   r;   r  j  s,   


zZamba2Model.get_layers
NNNNNNNNNN)rT   rU   rV   r   r"   r.   r   r0   r   r^   r_   r  r8  rZ   r   rR   r  rW   r:   r:   r8   r;   r    sL    	
or  c                       s   e Zd ZddiZdef fddZe												ddejdB d	ej	dB d
ejdB de
dB dejdB dejdB dedB dedB dedB dedB dejdB deej	B deeB fddZ							d fdd	Z  ZS )Zamba2ForCausalLMzlm_head.weightzmodel.embed_tokens.weightr`   c                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S NFr   )
r-   r.   r  r  r  r   r  r6   lm_headr  r  r8   r:   r;   r.     s
   
zZamba2ForCausalLM.__init__Nr   r  r   r   r   r  labelsr  r  r  r  r   logits_to_keeprY   c                 K   s   |dur|n| j j}|	dur|	n| j j}	|
dur|
n| j j}
| j||||||||	||
d
}|d }t|tr<t| dn|}| |dd|ddf }d}|dur^| j	||| j
fi |}|
st|f|dd  }|durr|f| S |S t|||j|j|jdS )al  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Zamba2ForCausalLM

        >>> model = Zamba2ForCausalLM.from_pretrained("Zyphra/Zamba2-7B-v1")
        >>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba2-7B-v1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)
r  r   r   r   r  r  r  r  r   r  r   r!   losslogitsr   rJ   r  )r`   r  r  r  r  r   rr   slicer  loss_functionr  r   r   rJ   r  )r5   r  r   r   r   r  r  r  r  r  r  r   r  r   r  rJ   slice_indicesr  r  r  r:   r:   r;   rR     s@   (zZamba2ForCausalLM.forwardTFc	              
      sX   |d u rt | j|jd | j| jd}| jj|	d< t j|f|||||||d|	}
|
S )Nr   r  r  )r   r   r  r   r   r  is_first_iteration)r_   r`   rE   r@   rb   num_logits_to_keepr-   prepare_inputs_for_generation)r5   r  r   r   r  r   r   r  r  r   model_inputsr8   r:   r;   r    s&   	z/Zamba2ForCausalLM.prepare_inputs_for_generation)NNNNNNNNNNNr   )NNNNNTF)rT   rU   rV   r  r"   r.   r   r0   r   r^   r_   r  r8  rr   rZ   r   rR   r  rW   r:   r:   r8   r;   r    sf    		
Tr  a  
    The Zamba2 Model with a sequence classification head on top (linear layer).

    [`Zamba2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    )custom_introc                       s   e Zd Z fddZe										ddejdB dejdB dejdB dedB dej	dB d	ejdB d
e
dB de
dB de
dB de
dB deeB fddZ  ZS )Zamba2ForSequenceClassificationc                    s@   t  | |j| _t|| _tj|j| jdd| _| 	  d S r  )
r-   r.   
num_labelsr  r  r   r  r6   scorer  r  r8   r:   r;   r.     s
   
z(Zamba2ForSequenceClassification.__init__Nr  r   r   r   r  r  r  r  r  r  rY   c                 K   sB  |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}|dur+|jd }n|jd }| j jdu r>|dkr>td| j jdu rGd}n1|durl|| j jk|jt	j
}t	j|jd |jt	j
d}|| d}nd}t| jj d |t	j||jd	|f }d}|dur||j}| j jdu r| jdkrd
| j _n| jdkr|jt	jks|jt	jkrd| j _nd| j _| j jd
krt }| jdkr|| | }n+|||}n%| j jdkrt }||d| j|d}n| j jdkrt }|||}|
s|f|dd  }|dur|f| S |S t|||j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r   r   r  r  r  r  r  r   r!   z=Cannot handle batch sizes > 1 if no padding token is defined.r>   rc   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`rg   
regressionsingle_label_classificationmulti_label_classificationr  )r`   r  r  r   rE   r  r  rA   rb   r0   int32r   argmaxr[  r\  r9   rT   problem_typer  r@   longrr   r	   rm  r   rF   r   r   r   rJ   r  )r5   r  r   r   r   r  r  r  r  r  r  r   transformer_outputsrJ   r  ra   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsr  loss_fctr  r:   r:   r;   rR   (  sx   



"


z'Zamba2ForSequenceClassification.forwardr  )rT   rU   rV   r.   r   r0   r   r^   r   r  r8  rZ   r   rR   rW   r:   r:   r8   r;   r    sH    		
r  )r  r  r  r  )r   )r!   )]r  collections.abcr   	itertoolsr   typingr   r   r0   r   torch.nnr   r   r	    r   r  activationsr   cache_utilsr   
generationr   integrationsr   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   utils.import_utilsr   r    configuration_zamba2r"   +mamba_ssm.ops.triton.selective_state_updater#   !mamba_ssm.ops.triton.ssd_combinedr$   r%   causal_conv1dr'   r(   
get_loggerrT   r[  Moduler*   rX   r_   r   r^   rr   r   r]   r   r   r   r   r1  r3  r>  rp  rZ  r?  r  r  r  r  r  r  r  r  __all__r:   r:   r:   r;   <module>   s   

rA
   /*@EL 5 j