o
    eiZN                     @   s  d dl mZ d dlmZ d dlZd dlm  mZ d dlmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& e r}d dl'm(Z(m)Z) nd\Z(Z)e(e)fZ*e+e*Z,e-e.Z/G dd de"Z0G dd deZ1G dd dej2Z3G dd dZ4G dd deZ5G dd  d ej2Z6G d!d" d"eZ7G d#d$ d$e!Z8G d%d& d&e Z9G d'd( d(eZ:g d)Z;dS )*    )Callable)AnyN)nn   )create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargslogging)is_causal_conv1d_availableis_torchdynamo_compiling   )apply_mask_to_padding_states)Gemma2RotaryEmbedding)LlamaAttentionLlamaForCausalLM
LlamaModelLlamaPreTrainedModelLlamaRMSNormapply_rotary_pos_embeager_attention_forward   )
Lfm2Config)causal_conv1d_fncausal_conv1d_updateNNc                   @      e Zd ZdS )Lfm2RMSNormN__name__
__module____qualname__ r$   r$   c/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/lfm2/modular_lfm2.pyr   7       r   c                   @   r   )Lfm2RotaryEmbeddingNr    r$   r$   r$   r%   r'   ;   r&   r'   c                       s*   e Zd Zdef fddZdd Z  ZS )Lfm2MLPconfigc                    s   t    |j}|jr,td| d }|jd ur,t|j| }|j||j d |j  }tj|j	|dd| _
tj|j	|dd| _tj||j	dd| _d S )Nr   r   r   Fbias)super__init__intermediate_sizeblock_auto_adjust_ff_dimintblock_ffn_dim_multiplierblock_multiple_ofr   Linearhidden_sizew1w3w2)selfr)   r.   	__class__r$   r%   r-   @   s   

zLfm2MLP.__init__c                 C   s    |  t| || | S N)r7   Fsilur5   r6   )r8   xr$   r$   r%   forwardO   s    zLfm2MLP.forward)r!   r"   r#   r   r-   r?   __classcell__r$   r$   r9   r%   r(   ?   s    r(   c                   @   s   e Zd ZdZdZdZdZdZej	dfde
dedejdejeB dB fdd	Z	d!d
ejdejdedeeef dB deejejf f
ddZdejfddZd"dedB defddZdejdedeeef fddZdefddZdefddZdd  ZdS )#Lfm2HybridConvCachea  
    Attention and conv cache for Lfm2.

    It stores the Key and Value states as a list of tensors, one for each layer.
    Attention layer cache shape: `[batch_size, num_heads, seq_len, head_dim]`.
    Conv layer cache shape: `[batch_size, hidden_size, L_cache-1]`.
    NFr)   max_batch_sizedtypedevicec                 C   s   g | _ g | _|| _|j| _| jd| _|j| _|| _g | _|d ur't	
|nd }t|jD ](}t	j| j|j| j| j|d}| j| | j t	g  | jt	g  q.d S )Nfull_attention)rC   rD   )	key_cachevalue_cacherB   layer_typesindexfirst_attention_layerconv_L_cache_dtype
conv_cachetorchrD   rangenum_hidden_layerszerosr4   appendtensor)r8   r)   rB   rC   rD   _
conv_stater$   r$   r%   r-   b   s*   zLfm2HybridConvCache.__init__
key_statesvalue_states	layer_idxcache_kwargsreturnc                 C   sx   | j |  dkr|| j |< || j|< ntj| j | |gdd| j |< tj| j| |gdd| j|< | j | | j| fS )a  
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.

        Return:
            A tuple containing the updated key and value states.
        r   dim)rF   numelrG   rN   cat)r8   rV   rW   rX   rY   r$   r$   r%   update   s   
zLfm2HybridConvCache.updatebeam_idxc                 C   s   t t| jD ]O}| j|  r:| j| j}| j| d||| j|< | j| j}| j| d||| j|< | j|  rV| j| j}| j| d||| j|< qdS )zDReorders the cache for beam search, given the selected beam indices.r   N)	rO   lenrF   r^   rD   index_selecttorG   rM   )r8   ra   rX   rD   r$   r$   r%   reorder_cache   s   z!Lfm2HybridConvCache.reorder_cacher   c                 C   sL   | j | dkr
| jn|}t| j|ks| j|  dkrdS | j| jd S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.rE   r   r[   )rH   rJ   rb   rF   r^   shaper8   rX   r$   r$   r%   get_seq_length   s    z"Lfm2HybridConvCache.get_seq_lengthcache_positionc                 C   s&   d}|j d }|  }|| }||fS )aB  
        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
        the given layer at `layer_idx`.
        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns (i.e. sliding_window, chunk_size),
        for each layer.
        r   )rf   rh   )r8   ri   rX   full_mask_kv_offsetquery_lengthpast_seen_tokens	kv_lengthr$   r$   r%   get_mask_sizes   s
   
z"Lfm2HybridConvCache.get_mask_sizes
max_lengthc                 C   s   |dk r|   t| }|   |krdS tt| jD ]+}| j|  rF| j| dd|ddf | j|< | j| dd|ddf | j|< qdS )z"Crop the cache to the given lengthr   N.)rh   absrO   rb   rF   r^   rG   )r8   ro   idxr$   r$   r%   crop   s   ""zLfm2HybridConvCache.cropc                 C   s
   t | jS r;   )rb   rF   )r8   r$   r$   r%   __len__   s   
zLfm2HybridConvCache.__len__c                 C   s&   t t| jD ]	}| j|   qd S r;   )rO   rb   rM   zero_rg   r$   r$   r%   reset   s   zLfm2HybridConvCache.resetr;   )r   )r!   r"   r#   __doc__rB   is_compileablerF   rG   rN   float32r   r0   rC   rD   strr-   Tensordictr   tupler`   
LongTensorre   rh   rn   rr   rs   ru   r$   r$   r$   r%   rA   S   sF    	
#
! rA   c                       s|   e Zd Zdedef fddZ		ddejdeejejf dejdB d	e	dB d
ej
dB deejejdB f fddZ  ZS )Lfm2Attentionr)   rX   c                    s   t  || tj|j|j| j dd| _tj|j|j| j dd| _	tj|j|j| j dd| _
tj|j| j |jdd| _t| j|jd| _t| j|jd| _| `| `d S )NFr*   eps)r,   r-   r   r3   r4   num_attention_headshead_dimq_projnum_key_value_headsk_projv_projout_projr   norm_epsq_layernormk_layernormo_projattention_dropoutr8   r)   rX   r9   r$   r%   r-      s   zLfm2Attention.__init__Nhidden_statesposition_embeddingsattention_maskpast_key_valuesri   rZ   c                 K   s  |j d d }g |d| jR }| | |j| dd}	| | |j| dd}
| |j| dd}|\}}t	|	|
||\}	}
|d ur]|||d}|
|
|| j|\}
}t| jjt}|| |	|
||fd| jd|\}}|jg |dR   }| |}||fS )Nr   r   )sincosri   g        )dropoutscaling)rf   r   r   r   view	transposer   r   r   r   r`   rX   r	   get_interfacer)   _attn_implementationr   r   reshape
contiguousr   )r8   r   r   r   r   ri   kwargsinput_shapehidden_shapequery_statesrV   rW   r   r   rY   attention_interfaceattn_outputattn_weightsoutputr$   r$   r%   r?      s8   	


zLfm2Attention.forwardr   )r!   r"   r#   r   r0   r-   rN   rz   r|   rA   r}   r?   r@   r$   r$   r9   r%   r~      s"    r~   c                
       s   e Zd Zdedef fddZ			ddejdedB dej	dB d	ejdB fd
dZ
			ddejdedB dej	dB d	ejdB fddZ			ddejdedB dej	dB d	ejdB fddZ  ZS )Lfm2ShortConvr)   rX   c                    s   t    || _|| _|j| _|j| _tj	|j
|j
| j|j
| j| jd d| _tj|j
d|j
 | jd| _tj|j
|j
| jd| _d S )Nr   )in_channelsout_channelskernel_sizegroupsr+   paddingr   r*   )r,   r-   r)   rX   rK   L_cache	conv_biasr+   r   Conv1dr4   convr3   in_projr   r   r9   r$   r%   r-     s   
zLfm2ShortConv.__init__Nr>   r   ri   r   c                 C   s  t ||}| |dd}|jddd\}}}|| }| jj| jjd| jjd}	|d urO|d dkrOt|	d|j
| j |	| jjd }
|
d}
n&|d urktj|| j|jd  df}|j
| j | t||	| jjd d}
||
 }| |dd }|S )Nr   r[   r   r\   r   r   )
activation)r   r   r   chunkr   weightr   sizer   squeezerM   rX   r+   	unsqueezer   
functionalpadr   rf   copy_r   r   r   )r8   r>   r   ri   r   BCxBCBxconv_weightsconv_outrU   yr$   r$   r%   cuda_kernels_forward&  s*   
$
z"Lfm2ShortConv.cuda_kernels_forwardc                 C   sz  |j d }t||}| |dd}|jddd\}}}|| }	|d ur|d dkr|j| j }
|d| jd }|
j	ddd}
|	j
|
j|
jd|
d d d d |f< |j| j |
 tj|

|	j| jjd d dd d f  dd}| jr}|| jj7 }|d}n'|d urtj|	| j|	j d  df}
|j| j |
 | |	d	d |f }|| }|dd }| |}|S )
Nr   r   r[   r   r\   r   )shiftsdims)rD   rC   .)rf   r   r   r   r   rM   rX   clampr   rollrd   rD   rC   r   rN   sumr   r   r+   r   r   r   r   r   r   )r8   r>   r   ri   r   seqlenr   r   r   r   rU   r   r   r$   r$   r%   slow_forwardH  s.   

$0
zLfm2ShortConv.slow_forwardr   c                 C   s6   t rd|jjv rt s| ||||S | ||||S )Ncuda)is_fast_path_availablerD   typer   r   r   )r8   r   r   ri   r   r$   r$   r%   r?   n  s   zLfm2ShortConv.forward)NNN)r!   r"   r#   r   r0   r-   rN   rz   rA   r}   r   r   r?   r@   r$   r$   r9   r%   r     sT    
%
)r   c                       s   e Zd Zdedef fddZ					ddejdeejejf dB dejdB d	ej	dB d
e
dB dej	dB dejfddZ  ZS )Lfm2DecoderLayerr)   rX   c                    sl   t    |j| dk| _| jrt||| _nt||| _t|| _	t
|j|jd| _t
|j|jd| _d S )NrE   r   )r,   r-   rH   is_attention_layerr~   	self_attnr   r   r(   feed_forwardr   r4   r   operator_normffn_normr   r9   r$   r%   r-   {  s   

zLfm2DecoderLayer.__init__Nr   r   r   position_idsr   ri   rZ   c           
   	   K   sl   |}| j r| jd| ||||||d|\}}	n| j| ||||d}|| }|| | | }|S )N)r   r   r   r   r   ri   )r   r   ri   r   r$   )r   r   r   r   r   r   )
r8   r   r   r   r   r   ri   r   residualrT   r$   r$   r%   r?     s*   

zLfm2DecoderLayer.forward)NNNNN)r!   r"   r#   r   r0   r-   rN   rz   r|   r}   rA   r?   r@   r$   r$   r9   r%   r   z  s,    	r   c                   @   s   e Zd ZdZdS )Lfm2PreTrainedModelFN)r!   r"   r#   _can_compile_fullgraphr$   r$   r$   r%   r     s    r   c                       s   e Zd Zdef fddZ							ddejdB dejdB dejdB dedB d	ej	dB d
e
dB dejdB dee defddZ  ZS )	Lfm2Modelr)   c                    s&   t  | t|j|jd| _| `d S )Nr   )r,   r-   r   r4   r   embedding_normnorm)r8   r)   r9   r$   r%   r-     s   zLfm2Model.__init__N	input_idsr   r   r   inputs_embeds	use_cacheri   r   rZ   c              	   K   s6  |d u |d uA rt d|d u r| |}|r+|d u r+|jd }	t| j|	| j| jd}|d u rG|d ur7| nd}
tj	|
|
|jd  |jd}|d u rP|
d}t| j|||||d}|jd dkrd|nd }|}| j||d}| jd | jj D ]}|jr|n|}||f|||||d|}qx| |}t||d	S )
Nz:You must specify exactly one of input_ids or inputs_embedsr   )r)   rB   rC   rD   r   )rD   )r)   r   r   ri   r   r   )r   )r   r   r   r   ri   )last_hidden_stater   )
ValueErrorembed_tokensrf   rA   r)   rC   rD   rh   rN   aranger   r   
rotary_emblayersrP   r   r   r   )r8   r   r   r   r   r   r   ri   r   
batch_sizerl   causal_masklinear_attentionr   r   decoder_layer
layer_maskr$   r$   r%   r?     sZ   


	

zLfm2Model.forward)NNNNNNN)r!   r"   r#   r   r-   rN   r}   rz   rA   FloatTensorboolr
   r   r   r?   r@   r$   r$   r9   r%   r     s8    	
r   c                   @   r   )Lfm2ForCausalLMNr    r$   r$   r$   r%   r     r&   r   )r   r   r   )<collections.abcr   typingr   rN   torch.nn.functionalr   r   r<   masking_utilsr   modeling_layersr   modeling_outputsr   modeling_utilsr	   processing_utilsr
   utilsr   r   utils.import_utilsr   r   bamba.modeling_bambar   gemma2.modeling_gemma2r   llama.modeling_llamar   r   r   r   r   r   r   configuration_lfm2r   causal_conv1dr   r   kernel_modulesallr   
get_loggerr!   loggerr   r'   Moduler(   rA   r~   r   r   r   r   r   __all__r$   r$   r$   r%   <module>   sD   $	
 6k/I