o
    eiD                     @   s  d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z	m
Z
mZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ e) rddl,m-Z-m.Z. ddl/m0Z0 nd\Z0Z.Z-e( rddl1m2Z2m3Z3 nd\Z3Z2e4e0e.e2e3e-fZ5e&6e7Z8G dd dej9Z:dej;de<dej;fd d!Z=G d"d# d#Z>	$dEd%ej9d&ej;d'ej;d(ej;d)ej;dB d*e?d+e?fd,d-Z@G d.d/ d/ej9ZAG d0d1 d1ej9ZBG d2d3 d3ej9ZCG d4d5 d5ej9ZDG d6d7 d7eZEG d8d9 d9eZFe%G d:d; d;e!ZGe%G d<d= d=eGZHG d>d? d?eGeZIe%d@dAG dBdC dCeGZJg dDZKdS )FzPyTorch Zamba model.    N)Callable)Any)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)Cache)GenerationMixin)create_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast SequenceClassifierOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringlogging)is_causal_conv1d_availableis_mamba_ssm_available   )ZambaConfig)mamba_inner_fnselective_scan_fn)selective_state_update)NNN)causal_conv1d_fncausal_conv1d_updateNNc                       sF   e Zd Zddeddf fddZdejdejfdd	Zd
d Z  Z	S )ZambaRMSNormư>epsreturnNc                    s&   t    tt|| _|| _dS )z;
        ZambaRMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizer$   	__class__ f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/zamba/modeling_zamba.pyr'   A   s   

zZambaRMSNorm.__init__hidden_statesc                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )N   T)keepdim)	dtypetor)   float32powmeanrsqrtr,   r+   )r-   r3   input_dtypevariancer1   r1   r2   forwardI   s
   zZambaRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler+   shaper,   r-   r1   r1   r2   
extra_reprP   s   zZambaRMSNorm.extra_repr)r#   )
__name__
__module____qualname__floatr'   r)   Tensorr?   rC   __classcell__r1   r1   r/   r2   r"   @   s    r"   r3   n_repr%   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rA   expandreshape)r3   rJ   batchnum_key_value_headsslenhead_dimr1   r1   r2   	repeat_kvU   s
   0rQ   c                   @   s   e Zd ZdZdZejdfddZdd Z	ddej	d	ej	d
e
deeef dB deej	ej	f f
ddZdejfddZdd
e
dB de
fddZdej	d
e
dee
e
f fddZdS )ZambaHybridDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    FNc              
      s"  || _ d| _|j| _d| _|j|j | _|j| _|j	| _
|j| _g | _g | _g | _i | _i | _i | _t|jD ];}|  jtj | j| j
|dg7  _ | j| j| j | jf}|  jtj||dg7  _| j| dkrr| j| q7 fddt|jD | _ fddt|jD | _d S )NFdevicer7   hybridc                        g | ]}t jg g  d qS rT   r)   tensor.0_
batch_sizerT   r1   r2   
<listcomp>        z4ZambaHybridDynamicCache.__init__.<locals>.<listcomp>c                    rV   rW   rY   r[   r^   r1   r2   r`      ra   )r7   is_compileablelayers_block_typehas_previous_statemamba_expandr.   intermediate_sizemamba_d_statessm_state_sizemamba_d_convconv_kernel_sizen_mamba_headsconv_states
ssm_statestransformer_layers_modules_parameters_buffersrangenum_hidden_layersr)   zerosappend	key_cachevalue_cache)r-   configr_   r7   rT   icache_shaper1   r^   r2   r'   q   s:   
 z ZambaHybridDynamicCache.__init__c                 C   s
   t | jS N)lenrv   rB   r1   r1   r2   __len__   s   
zZambaHybridDynamicCache.__len__
key_statesvalue_states	layer_idxcache_kwargsr%   c                 C   sz   | j | jd dkr|| j |< || j|< ntj| j | |gdd| j |< tj| j| |gdd| j|< | j | | j| fS )Nr5   r   r4   dim)rv   rA   rw   r)   cat)r-   r~   r   r   r   r1   r1   r2   update   s   
zZambaHybridDynamicCache.updatebeam_idxc                 C   s   |   dkrdtt| jD ]X}| j| j}| j| d||| j|< | j| j}| j| d||| j|< | j| j}| j| d||| j|< | j	| j}| j	| d||| j	|< qdS dS )zDReorders the cache for beam search, given the selected beam indices.r   N)
get_seq_lengthrr   r|   rv   rT   index_selectr8   rw   rl   rm   )r-   r   r   rT   r1   r1   r2   reorder_cache   s    z%ZambaHybridDynamicCache.reorder_cacher   c                 C   sN   || j vr
| j d n|}t| j|ks| j| jd dkrdS | j| jd S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   r5   )rn   r|   rv   rA   )r-   r   r1   r1   r2   r      s   "z&ZambaHybridDynamicCache.get_seq_lengthcache_positionc                 C   s$   d}|j d }| || }||fS )zDReturn the length and offset of the cache, used to generate the maskr   )rA   r   )r-   r   r   	kv_offsetquery_length	kv_lengthr1   r1   r2   get_mask_sizes   s   
z&ZambaHybridDynamicCache.get_mask_sizesr{   )r   )rD   rE   rF   __doc__rb   r)   float16r'   r}   rH   intdictstrr   r@   r   
LongTensorr   r   r   r1   r1   r1   r2   rR   a   s(     	
$	rR           modulequerykeyvalueattention_maskscalingdropoutc                 K   s   t || j}t || j}	t||dd| }
|d ur |
| }
tjj|
dtjd	|j
}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr4   r   r5   )r   r7   )ptrainingr   )rQ   num_key_value_groupsr)   matmul	transposer   
functionalsoftmaxr9   r8   r7   r   r   
contiguous)r   r   r   r   r   r   r   kwargsr~   r   attn_weightsattn_outputr1   r1   r2   eager_attention_forward   s   
r   c                       s|   e Zd ZdZdedef fddZ	ddejdedejdB d	e	dB d
e
e deejejdB eej dB f fddZ  ZS )ZambaAttentionaA  
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".

    Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
    The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
    The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
    (see fig. 2 in https://huggingface.co/papers/2405.16712).
    Additionally, replaced
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
    rx   r   c                    s   t    || _|| _|j| _|j| _|j|j | _	|j
| _
| jd d | _d| _|j| _tj|j|j| j dd| _tj|j|j| j dd| _tj|j|j| j dd| _tj|j| j |jdd| _d S )Nr4         TFbias)r&   r'   rx   r   attention_hidden_sizeattention_head_dimrP   num_attention_headsrN   r   max_position_embeddingsr   	is_causalattention_dropoutr   Linearq_projk_projv_projr.   o_projr-   rx   r   r/   r1   r2   r'      s   
 zZambaAttention.__init__Nr3   r   past_key_valuesr   r%   c                 K   s   |j d d }g |d| jR }| ||dd}| ||dd}	| ||dd}
|d urB||	|
|\}	}
t	| j
jt}|| ||	|
|f| jsVdn| j| jd|\}}|jg |dR   }| |}||fS )Nr5   r   r4   r   )r   r   )rA   rP   r   viewr   r   r   r   r   get_interfacerx   _attn_implementationr   r   r   r   rL   r   r   )r-   r3   r   r   r   r   input_shapehidden_shapequery_statesr~   r   attention_interfacer   r   r1   r1   r2   r?      s2   

zZambaAttention.forwardr{   )rD   rE   rF   r   r   r   r'   r)   rH   rR   r   r   r@   r?   rI   r1   r1   r/   r2   r      s"    r   c                       s^   e Zd ZdZdef fddZ	ddejdefdd	Z	ddefd
dZ
ddefddZ  ZS )ZambaMambaMixeruE  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    This module differs from `transformers.models.mamba.modeling_mamba.MambaMixer` in two ways:
    - Added multi-head: the output of `self.in_proj` is split into `self.n_mamba_heads` heads, and each head
    undergoes an independent forward pass, identical to the original `MambaMixer`, up until the pre-activations of
    `self.out_proj`. The pre-activations, coming from different mamba heads, are then concatenated and fed into `self.out_proj`.
    rx   c                    s  t    || _|| _|j| _|j| _|j| _|j	|j | _
|j| _|j| _| j
| j | _|j| _|j| _tj| j
| j
| j| j| j
| jd d| _|j| _t|j | _|j| _tj| j| j
d | jd| _tt | j| j| jd  | j| _!tt | j| j| jd d | jd  | _"tt | j| j| _#tj$d| jd tj%dd d d f }|&| j
d' }tt(|)| j| jd| _*tt+| j| j| _,tj| j
| j| jd| _-t.st/0d d S d S )	Nr   )in_channelsout_channelsr   kernel_sizegroupspaddingr4   r   g      ?r7   r5   aq  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d. If you want to use the naive implementation, set `use_mamba_kernels=False` in the model config)1r&   r'   rx   r   r.   rg   rh   ri   rj   re   rf   mamba_dt_ranktime_step_rankrk   mamba_head_dimmamba_conv_biasuse_conv_biasmamba_proj_biasuse_biasr   Conv1dconv1dhidden_mamba_act
activationr
   actuse_mamba_kernelsuse_fast_kernelsr   in_projr(   r)   rt   x_proj_weightdt_proj_weightdt_proj_biasaranger9   rK   r   logrL   A_logr*   Dout_projis_fast_path_availableloggerwarning_once)r-   rx   r   Ar/   r1   r2   r'   3  sb   
	$ zZambaMambaMixer.__init__Nr3   cache_paramsc                 C   s  |j \}}}|d uo|jo|dk}| |dd}||dd|jddd\}}	|d }|	d}	|	|| j	d|dd}	| j
j| j
jd| j
jd}
|rnt|d|j| j |
| j
j| j}|d}nK|d urt|dks||d }|d urtj|| j|j d  df}|j| j | t||
| j
j| jd}|d urt|dks||d }|d| j	| j|dd}| jd d d d d d d f | dd}tj|| j| j| jgdd\}}}| j d d d f |dd }t!| j"#  }| j$d ur| j$# nd }tj%|d|f|j&|j'd}|rtt(| j	D ]K}t)|j*| j d d |f ||d	df ||d	df || ||d d df ||d d df | j+| |	|d	df || d
d
d}tj,||fdd}q'nntj%|d| j| jf|j&|j'd}t(| j	D ]E}t-|| || || || dd|| dd| j+| # |	| || d
d
d
\}}tj,||fdd }tj,||dfdd}q|d ur|d ur|j*| j | | .|dd}|S )Nr   r4   r5   r   r   )r   r   rS   .T)dt_softplus)delta_softplusreturn_last_state)/rA   rd   r   r   r   chunksqueezer   rL   rk   r   r+   sizer    rl   r   r   r   	unsqueezer)   allr   r   padrj   copy_r   r   r   splitr   rh   r   expr   rG   r   emptyrT   r7   rr   r   rm   r   r   r   r   )r-   r3   r   r   r_   seq_lenr]   use_precomputed_statesprojected_statesgateconv_weightsrl   ssm_parameters	time_stepBCdiscrete_time_stepr   time_proj_biasscan_outputsnscan_outputs_	ssm_state
ssm_state_contextualized_statesr1   r1   r2   cuda_kernels_forwardp  s   
$
*
z$ZambaMambaMixer.cuda_kernels_forwardc              
   C   sX  |j \}}}|j}| |dd}||dd|jddd\}	}
|	d }	|
d}
|
|| j	d|dd}
t
|t}|r|j| j j d |kr| jrZ|j| j  }n|j| j }||	j}|jr|dkr|j| j j d |kr|j| j }tj|ddd}|	d d d d df |d d d d df< ||j| j< tj|| jjd d dd d f  dd}	| jr|	| jj7 }	| |	|d}	n|d ur|	|d d |	j d  d f d }	tj|	| j |	j d  df}||j| j< | | |	dd |f }	|d ur|	|d d |	j d  d f d }	n6tj!|| j	| j"| j#f|	j|d}|d ur8|	|d }	| | |	dd |f }	|d urR|	|d }	|	d| j	| j"|dd}	| j$d d d d d d d f |	 dd	}tj%|| j&| j#| j#gdd\}}}| j'd d d f |dd	 | j(d d d d d d f  }tj)|}t*| j+,  }t*|d d d d d d d d f |d d d d d d d d d f  }|d d d d d d d d d f |d d d d d d d d d f ,  }||	d d d d d d d d d f ,  }g }t-|D ]\}|d d d d d d |d d f dd| |d d d d d d |d d f dd }t.|dd||d d d d |d d f d}|/|d d d d d d df  qtj0|dd}||	| j1d d d d d d f   }|| |
 }|r||j| j< | 2|dd|d|dd}|S )
Nr   r4   r5   r   r   )shiftsdims.rS   r   )3rA   r7   r   r   r   r   r   r   rL   rk   
isinstancerR   rm   r   r   cloner8   rT   rd   rl   r)   rollsumr   r+   r   r   r   r   r   r   r   rj   rt   r   rh   r   r   r   r   r   softplusr   r   rG   rr   r   ru   stackr   r   )r-   input_statesr   r   r_   r   r]   r7   r   r3   r   	use_cacher   
conv_stater   r   r   r   r   r   
discrete_A
discrete_BdeltaB_ur   ry   scan_outputr  r1   r1   r2   slow_forward  s   

((&
&

* FH*X8&"zZambaMambaMixer.slow_forwardc                 C   s@   | j rtrd| jjjvrtd| j|||dS | j|||dS )NcudazFast Mamba kernels are not available. Make sure to they are installed and that the mamba module is on a CUDA device. lease run 'pip install causal-conv1d>=1.2.0' and 'pip install mamba-ssm', or set use_mamba_kernels=False in the model's config.)r   )r   r   r   rT   type
ValueErrorr  r  )r-   r3   r   r   r1   r1   r2   r?   .  s   zZambaMambaMixer.forwardr!   )rD   rE   rF   r   r   r'   r)   rH   rR   r  r  r?   rI   r1   r1   r/   r2   r   &  s    >
a]r   c                       s$   e Zd Z fddZdd Z  ZS )ZambaMLPc                    sr   t    || _|j| _|j| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _d S NFr   )r&   r'   rx   r.   rf   r   r   	gate_projup_proj	down_projr
   
hidden_actact_fnr-   rx   r/   r1   r2   r'   <  s   
zZambaMLP.__init__c                 C   s$   |  | | || | }|S r{   )r  r  r  r  )r-   xr  r1   r1   r2   r?   F  s    zZambaMLP.forward)rD   rE   rF   r'   r?   rI   r1   r1   r/   r2   r  ;  s    
r  c                       s   e Zd ZddededB f fddZ				ddejdejded	ejdB d
edB de	dB de	dB de
e deejeejejf dB f fddZ  ZS )ZambaAttentionDecoderLayerNrx   r   c                    sH   t    t||| _t|| _t|j|jd| _	t|j
|jd| _d S )Nr$   )r&   r'   r   	self_attnr  feed_forwardr"   r   rms_norm_epsinput_layernormr.   pre_ff_layernormr   r/   r1   r2   r'   L  s
   

z#ZambaAttentionDecoderLayer.__init__Fr3   original_hidden_statesr   r   output_attentionsr  r   r%   c              	   K   sj   t j||gdd}| |}| jd||||||d|\}}	| |}| |}|f}
|r3|
|	f7 }
|
S )a  
        Args:
            hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
                This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
                concatenated tensor is then used as input of the pre-attention RMSNorm
                (see fig. 2 in https://huggingface.co/papers/2405.16712).
            layer_idx (`int`): layer_idx in the forward pass. Used to distinguish Zamba's tied transformer layers.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        r5   r   )r3   r   r   r   r(  r  Nr1   )r)   concatenater%  r"  r&  r#  )r-   r3   r'  r   r   r   r(  r  r   self_attn_weightsoutputsr1   r1   r2   r?   T  s$   





z"ZambaAttentionDecoderLayer.forwardr{   )NNFF)rD   rE   rF   r   r   r'   r)   rH   rR   boolr   r   r@   FloatTensorr?   rI   r1   r1   r/   r2   r   K  s2    	
r   c                       s   e Zd Zdedef fddZ										ddejdejdB dedB d	ejdB d
ejdB dedB de	dB de	dB dej
dB dej
dB dejdB deejeejejf dB f fddZ  ZS )ZambaMambaDecoderLayerrx   r   c                    s4   t    t||d| _t|j|jd| _|| _d S )N)rx   r   r!  )	r&   r'   r   mambar"   r.   r$  r%  r   r   r/   r1   r2   r'     s   

zZambaMambaDecoderLayer.__init__NFr3   r'  r   causal_maskr   r(  r  r   position_idstransformer_hidden_statesr%   c                 K   sd   |}|dur
|| n|}|  |}| j|||d}d}|| }|f}|r)||f7 }|r0||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        N)r3   r   r   )r%  r/  )r-   r3   r'  r   r   r0  r   r(  r  r   r1  r2  r   residualr*  r+  r1   r1   r2   r?     s"   


zZambaMambaDecoderLayer.forward)
NNNNNFFNNN)rD   rE   rF   r   r   r'   r)   rH   rR   r,  r   r@   r-  r?   rI   r1   r1   r/   r2   r.    sJ    		
r.  c                       s   e Zd Zdedejdef fddZ								ddej	d	ej	dB d
e
dB dej	dB dej	dB dedB dedB dedB dejdB deejeejejf dB f fddZ  ZS )ZambaHybridLayershared_transflinearr/  c                    s    t    || _|| _|| _d S r{   )r&   r'   r5  r6  mamba_decoder)r-   r5  r6  r/  r/   r1   r2   r'     s   

zZambaHybridLayer.__init__NFr3   r'  r   r   r0  r   r(  r  r   r%   c
              
   C   sp   | j ||||||||	d}
|
d }|r|
d }| |}| j|||||||	d}
|r6|
d |f|
dd  }
|
S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
            hidden activations to form the input of the shared transformer layer.
            layer_idx (`int`): layer number.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        )r'  r   r   r   r(  r  r   r   r   )r2  r   r   r(  r  r   r4   N)r5  r6  r7  )r-   r3   r'  r   r   r0  r   r(  r  r   layer_outputsr2  r*  r1   r1   r2   r?     s4   

zZambaHybridLayer.forward)NNNNNFFN)rD   rE   rF   r   r   r   r.  r'   r)   rH   r   rR   r,  r   r@   r-  r?   rI   r1   r1   r/   r2   r4    s>    		
r4  c                       sN   e Zd ZU eed< dZdZddgZdZdZ	dZ
dZe  fdd	Z  ZS )
ZambaPreTrainedModelrx   modelTr   r.  r   Fc                    sD  | j j}t | t|trtj|jd|d | j j	d }t
|j| | | j j| j j | j j }tt| j j|t| j jt| j j  t| j j j| j jd}|tt|   }t|j| tjd|jd tjdd d d f }||jd  }t|j!t|"|j|j#d t$|j% d S d S )Nr   )r;   stdr   )minr   r   r5   )&rx   initializer_ranger&   _init_weightsr  r   initnormal_r   r   uniform_r   re   r.   rk   r)   r   randmathr   time_step_maxtime_step_minclamptime_step_floorexpm1r   r   r   rh   r9   rK   rf   r   r   rL   r   ones_r   )r-   r   r;  dt_init_stdr   dtinv_dtr   r/   r1   r2   r>  #  s.   
$"z"ZambaPreTrainedModel._init_weights)rD   rE   rF   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulr)   no_gradr>  rI   r1   r1   r/   r2   r9    s   
 r9  c                       s   e Zd ZdZdef fddZe										ddejdB dej	dB dejdB d	e
dB d
ejdB dedB dedB dedB dedB dejdB deeB fddZ  ZS )
ZambaModelz
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`ZambaDecoderLayer`]

    Args:
        config: ZambaConfig
    rx   c                    s  t  | |j| _|j| _t|j|j| j| _|j	| _	g }d | _
t| j	D ]=\}}t||d}|dkr_tj| jj| jjdd}|tt||| | j
d u r^d| dd| di| _
q'|| q't|| _|j| _t|j|jd	| _d| _|   d S )
N)r   rU   Fr   z
layers.(?!z\.)\d+.shared_transfzlayers.z.shared_transfr!  )r&   r'   pad_token_idpadding_idx
vocab_sizer   	Embeddingr.   embed_tokensrc   _tied_weights_keys	enumerater.  r   rx   ru   r4  r   
ModuleListlayersr   r"   r$  final_layernormgradient_checkpointing	post_init)r-   rx   r_  layer_id
layer_typer/  r6  r/   r1   r2   r'   E  s,   
zZambaModel.__init__N	input_idsr   r1  r   inputs_embedsr  r(  output_hidden_statesreturn_dictr   r%   c                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|	d ur$|	n| j j}	|d u |d uA r4td| jrC| jrC|rCt	d d}|d u rL| 
|}|}t|}|r^|d u r^t	d |
d u rmtj|jd |jd}
|d u rv|
d}t| j |||
||d}|rd	nd }|rd	nd }t| jD ]+\}}|r||f7 }||||||||||
d
	}|d }|r|d d ur||d f7 }q| |}|r||f7 }|r|jsd|_t||r|nd ||d}|	r|S | S )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onezX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fz{Zamba requires an initialized `ZambaHybridDynamicCache` to return a cache. None was provided, so no cache will be returned.r   rX   r   )rx   rf  r   r   r   r1  r1   )r   r(  r  r   T)last_hidden_stater   r3   
attentions)rx   r(  rg  r  use_return_dictr  ra  r   r   r   r[  r)   r  r   rA   rT   r   r   r]  r_  r`  rd   r   to_tuple)r-   re  r   r1  r   rf  r  r(  rg  rh  r   r   r3   r'  r0  all_hidden_statesall_self_attnsr   layerr8  outputr1   r1   r2   r?   b  s   


	




zZambaModel.forward
NNNNNNNNNN)rD   rE   rF   r   r   r'   r   r)   r   rH   rR   r-  r,  r@   r   r?   rI   r1   r1   r/   r2   rV  <  sJ    	
rV  c                       s   e Zd ZddiZdef fddZe												ddejdB d	ej	dB d
ejdB de
dB dejdB dejdB dedB dedB dedB dedB dejdB deej	B deeB fddZ							d fdd	Z  ZS )ZambaForCausalLMzlm_head.weightzmodel.embed_tokens.weightrx   c                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S r  )
r&   r'   rV  r:  rY  r   r   r.   lm_headrb  r  r/   r1   r2   r'     s
   
zZambaForCausalLM.__init__Nr   re  r   r1  r   rf  labelsr  r(  rg  rh  r   logits_to_keepr%   c                 K   s   |dur|n| j j}|	dur|	n| j j}	|
dur|
n| j j}
| j||||||||	||
d
}|d }t|tr<t| dn|}| |dd|ddf }d}|dur^| j	||| j
fi |}|
st|f|dd  }|durr|f| S |S t|||j|j|jdS )ah  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ZambaForCausalLM

        >>> model = ZambaForCausalLM.from_pretrained("Zyphra/Zamba-7B-v1")
        >>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba-7B-v1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)
re  r   r1  r   rf  r  r(  rg  r   rh  r   r   losslogitsr   r3   rj  )rx   r(  rg  rk  r:  r  r   slicers  loss_functionrY  r   r   r3   rj  )r-   re  r   r1  r   rf  rt  r  r(  rg  rh  r   ru  r   r+  r3   slice_indicesrx  rw  rp  r1   r1   r2   r?     s@   (zZambaForCausalLM.forwardTFc	              
      sX   |d u rt | j|jd | j| jd}| jj|	d< t j|f|||||||d|	}
|
S )Nr   )r7   rT   ru  )r   r   rf  r   r1  r  is_first_iteration)rR   rx   rA   r7   rT   num_logits_to_keepr&   prepare_inputs_for_generation)r-   re  r   r   rf  r   r1  r  r|  r   model_inputsr/   r1   r2   r~  +  s&   	z.ZambaForCausalLM.prepare_inputs_for_generation)NNNNNNNNNNNr   )NNNNNTF)rD   rE   rF   r\  r   r'   r   r)   r   rH   rR   r-  r,  r   r@   r   r?   r~  rI   r1   r1   r/   r2   rr    sf    		
Trr  a  
    The Zamba Model with a sequence classification head on top (linear layer).

    [`ZambaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    )custom_introc                       s   e Zd Z fddZe										ddejdB dejdB dejdB dedB dej	dB d	ejdB d
e
dB de
dB de
dB de
dB deeB fddZ  ZS )ZambaForSequenceClassificationc                    s@   t  | |j| _t|| _tj|j| jdd| _| 	  d S r  )
r&   r'   
num_labelsrV  r:  r   r   r.   scorerb  r  r/   r1   r2   r'   ]  s
   
z'ZambaForSequenceClassification.__init__Nre  r   r1  r   rf  rt  r  r(  rg  rh  r%   c                 K   sB  |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}|dur+|jd }n|jd }| j jdu r>|dkr>td| j jdu rGd}n1|durl|| j jk|jt	j
}t	j|jd |jt	j
d}|| d}nd}t| jj d |t	j||jd	|f }d}|dur||j}| j jdu r| jdkrd
| j _n| jdkr|jt	jks|jt	jkrd| j _nd| j _| j jd
krt }| jdkr|| | }n+|||}n%| j jdkrt }||d| j|d}n| j jdkrt }|||}|
s|f|dd  }|dur|f| S |S t|||j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r1  r   rf  r  r(  rg  rh  r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r5   rS   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`rX   
regressionsingle_label_classificationmulti_label_classificationrv  )rx   rk  r:  r  rA   rW  r  r8   rT   r)   int32r   argmaxr   r   r0   rD   problem_typer  r7   longr   r   r   r   r   r   r   r   r3   rj  )r-   re  r   r1  r   rf  rt  r  r(  rg  rh  r   transformer_outputsr3   rx  r_   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsrw  loss_fctrp  r1   r1   r2   r?   f  sx   



"


z&ZambaForSequenceClassification.forwardrq  )rD   rE   rF   r'   r   r)   r   rH   r   r-  r,  r@   r   r?   rI   r1   r1   r/   r2   r  N  sH    		
r  )rr  r  rV  r9  )r   )Lr   rC  collections.abcr   typingr   r)   r   torch.nnr   r   r    r	   r?  activationsr
   cache_utilsr   
generationr   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   utils.import_utilsr   r   configuration_zambar   &mamba_ssm.ops.selective_scan_interfacer   r   +mamba_ssm.ops.triton.selective_state_updater   causal_conv1dr   r    r   r   
get_loggerrD   r   Moduler"   rH   r   rQ   rR   rG   r   r   r   r  r   r.  r4  r9  rV  rr  r  __all__r1   r1   r1   r2   <module>   s   

m
F  ?EH$  j