o
    ίiQ                     @   s  d dl Z d dlmZ d dlmZ d dlmZmZmZ d dl	Z	d dl	m
Z
 d dlmZ d dlmZmZmZ d dlmZmZ d d	lmZ G d
d deZeG dd dZdd Zde	jdedede	jfddZde	jde	jdefddZde	jde	jdede	jdee	je	jf f
ddZdd Zd d! Z d"d# Z!	d7d$ed%e	jd&ee	j defd'd(Z"G d)d* d*e	j
j#Z$G d+d, d,e
j#Z%G d-d. d.e
j#Z&G d/d0 d0e
j#Z'G d1d2 d2e
j#Z(G d3d4 d4e
j#Z)G d5d6 d6e
j#Z*dS )8    N)	dataclass)Enum)OptionalTupleUnion)nn)
functional)	BlockMask_mask_mod_signatureflex_attention)AttentionBiasfmha)probec                   @   s   e Zd ZdZdZdZdZdS )InitStdFactordisabledglobal_depthcurrent_depth	dim_ratioN)__name__
__module____qualname__DISABLEDGLOBAL_DEPTHCURRENT_DEPTH	DIM_RATIO r   r   D/home/ubuntu/.local/lib/python3.10/site-packages/core/transformer.pyr      s
    r   c                   @   s   e Zd ZU dZeed< dZeed< dZee ed< dZ	ee ed< dZ
ee ed< dZee ed	< d
Zeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZee ed< dZeed< dZeed< dS )BaseTransformerArgsi   dim   n_layersNhead_dimn_heads
n_kv_headsffn_dim_multiplier   multiple_ofgh㈵>norm_eps     @
rope_theta    old_context_len   rope_scale_factorlow_freq_factor    high_freq_factorinit_base_stdr   init_std_factor   
max_seqlen)r   r   r   r   int__annotations__r    r!   r   r"   r#   r$   floatr&   r'   r)   r+   r-   r.   r0   r1   r2   strr4   r   r   r   r   r      s"   
 r   c                 K   s0   t jt | jdd d|jddfi |S )N)end_dim)Fnll_losslog_softmaxflattenr7   )predtargetkwargsr   r   r   cross_entropy4   s   
rC   xn_repr   returnc                 C   sj   |dksJ d| j \}}}}|dkr| S | dddddddddf ||||||||| |S )z0torch.repeat_interleave(x, dim=2, repeats=n_rep)   zAOnly dim=2 is supported. Check the implementation for other dims.r,   N)shapeexpandreshape)rD   rE   r   bsslenr#   r!   r   r   r   	repeat_kv<   s    rM   	freqs_cisseq_dimc                    s   |j  d  kr k sJ  J | j|j |jd ddfks+J d| j|jf  fddt|jdd D ddg }| j| S )	a  
    Reshape frequency tensor for broadcasting it with another tensor.

    This function reshapes the frequency tensor to have the same shape as the target tensor 'x'
    for the purpose of broadcasting the frequency tensor during element-wise operations.

    Args:
        freqs_cis (torch.Tensor): Frequency tensor to be reshaped.
        x (torch.Tensor): Target tensor for broadcasting compatibility.
        seq_dim (int): Sequence dimension index.

    Returns:
        torch.Tensor: Reshaped frequency tensor.
    r   rG   zfreqs_cis vs x: c                    s,   g | ]\}}|ks| d  kr|ndqS )   r,   r   ).0idndimrO   r   r   
<listcomp>`   s     z)reshape_for_broadcast.<locals>.<listcomp>Nr9   )rV   rH   	enumerateview)rN   rD   rO   rH   r   rU   r   reshape_for_broadcastI   s    
rZ   xqxkc                 C   s   | j g | jd d dddR  }|j g |jd d dddR  }t||| }|| dd}|| dd}|| ||fS )Nr;   r,   rG      rQ   )rJ   rH   rZ   r7   sumr?   type_as)r[   r\   rO   rN   xq_xk_xq_outxk_outr   r   r   apply_rotary_embf   s   &&rd   c                 C   s   ||kS Nr   )bhq_idxkv_idxr   r   r   causal_maskv   s   rj   c                 C   s    |  d}|d}d|d< |S )Nr   r,   )cumsumroll)lengths	doc_startr   r   r   lengths_to_start_idsz   s   

ro   c                 C   sV   | j dksJ | d}|  }t| }t| }|| }tj|| jd| }||fS )Nr,   r   device)rV   sizer^   torchrepeat_interleavero   arangerq   )rm   nb_seqstotal_seqlendoc_idrn   tok_idr   r   r   lengths_to_local_ids   s   

rz   mask_modrm   
kv_lengthsc                    s\   |dur|n|}t |\t |\ | d | d  fdd}|S )a  Generates mask mods that apply to inputs to flex attention in the sequence stacked
    format.

    Args:
        mask_mod: The mask mod to apply to the documents
        lengths: Lengths of each document

    Note:
        What is the sequence stacked format? When assembling batches of inputs, we
        take multiple sequences and stack them together to form 1 large sequence. We then
        use masking to ensure that the attention scores are only applied to tokens within
        the same document.

    Example:

    - Square mask
      doc_mask         lengths
      a a b b b c c    2 3 2
    a 1 0 0 0 0 0 0
    a 1 1 0 0 0 0 0
    b 0 0 1 0 0 0 0
    b 0 0 1 1 0 0 0
    b 0 0 1 1 1 0 0
    c 0 0 0 0 0 1 0
    c 0 0 0 0 0 1 1

    Nr,   c                    sb   t |}t |}|k|k@ }|  | k}| }| }	| |||	}
||
@ |@ S re   )rs   minimum)rf   rg   rh   ri   	q_idx_cap
kv_idx_cap	valid_idxsame_doc	q_logical
kv_logical
inner_maskkv_document_id
kv_max_idxkv_token_idr{   q_document_id	q_max_idx
q_token_idr   r   doc_mask_mod   s   z+generate_doc_mask_mod.<locals>.doc_mask_mod)rz   r^   )r{   rm   r|   r   r   r   r   generate_doc_mask_mod   s    
r   c                       s   e Zd ZdZ					ddededed	ed
ededef fddZdd Zdd Z	ddededefddZ		dde
e de
ej fddZ  ZS ) RotaryEmbeddingz 
    RotaryEmbedding Module
    r3   r,   r/   r*   thetar!   r4   scale_factorr.   r0   r+   c                    sh   t    || _|| _|| _|| _|| _|| _|| _|dkr0|| | _	|| | _
| j	| j
ks2J d S d S )Nr,   )super__init__r   r!   r4   r   r.   r0   r+   low_freq_wavelenhigh_freq_wavelen)selfr   r!   r4   r   r.   r0   r+   	__class__r   r   r      s   



zRotaryEmbedding.__init__c                 C   s&   | j d| j| j| j| jddd d S )NrN   )r   endr   F)
persistent)register_bufferprecompute_freqs_cisr!   r4   r   r   r   r   r   reset_parameters   s   
z RotaryEmbedding.reset_parametersc                 C   s   | j dkr|S g }|D ]H}dtj | }|| jk r|| q|| jkr-||| j   q| j| jks5J | j| | j | j| j  }|d| | | j  ||   qt	j
||j|jdS )Nr,   rG   )dtyperq   )r   mathpir   appendr   r+   r.   r0   rs   tensorr   rq   )r   freqs	new_freqsfreqwavelensmoothr   r   r   apply_scaling   s"   



zRotaryEmbedding.apply_scalingr(   r   r   c                 C   s   d|t d|dd|d   |   }| |}t j||jd}t || }| | }}t j|| ||fddj	g |
 ddR  S )a  
        Precompute the frequency tensor for complex exponentials (cis) with given dimensions.

        This function calculates a frequency tensor with complex exponentials using the given dimension 'dim'
        and the end index 'end'. The 'theta' parameter scales the frequencies.
        The returned tensor contains complex values in complex64 data type.

        Args:
            dim (int): Dimension of the frequency tensor.
            end (int): End index for precomputing frequencies.
            theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.

        Returns:
            torch.Tensor: Precomputed frequency tensor with complex exponentials.
              ?r   rG   Nrp   r;   r   )rs   ru   r7   r   rq   outercossinstackrY   rr   )r   r   r   r   r   tr   r   r   r   r   r      s   *
0z$RotaryEmbedding.precompute_freqs_cisNseqlentok_idxc                 C   sH   |dup|du}|sJ d|dur| j | S |dur"| j d| S dS )a}  
        Return freqs_cis corresponding to consecutive seqlen positions or the corresponding tok_idx positions
        Args:
            seqlen (int): Contiguous sequence length
            tok_idx (torch.Tensor[int]): Position indices of each token this overrides seqlen

        Returns:
            Tuple(torch.Tensor, torch.Tensor): Embedded input tensor and freqs_cis
        Nz(Should provide atleast seqlen or tok_idxr   )rN   )r   r   r   testr   r   r   forward  s   
zRotaryEmbedding.forward)r3   r,   r,   r/   r*   )r(   )NN)r   r   r   __doc__r7   r5   r   r   r   r   r   rs   Tensorr   __classcell__r   r   r   r   r      sL    	
 r   c                       sT   e Zd ZdZddedef fddZdejfdd	Z	dejfd
dZ
dd Z  ZS )RMSNorma  
    Initialize the RMSNorm normalization layer.

    Args:
        dim (int): The dimension of the input tensor.
        eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.

    Attributes:
        eps (float): A small value added to the denominator for numerical stability.
        weight (nn.Parameter): Learnable scaling parameter.

    ư>r   epsc                    s&   t    || _tt|| _d S re   )r   r   r   r   	Parameterrs   onesweight)r   r   r   r   r   r   r   @  s   
zRMSNorm.__init__rD   c                 C   s"   |t || jddd| j  S )Nr;   T)keepdim)rs   rsqrtmeanr   r   rD   r   r   r   _normE  s   "zRMSNorm._normc                 C   s.   t |d}| | }|| j  |S )Nresid)r   	log_statsr   r7   r   r_   )r   rD   outputr   r   r   r   H  s   zRMSNorm.forwardc                 C   s   t jj| j d S re   )rs   r   initones_r   r   r   r   r   r   M  s   zRMSNorm.reset_parameters)r   )r   r   r   r   r5   r7   r   rs   r   r   r   r   r   r   r   r   r   r   2  s    r   c                       s>   e Zd Zdejddf fddZdejdejfddZ  Z	S )	
TiedLineartied_modulerF   Nc                    s&   t    || _t|dstdd S )Nr   zPProvided module does not have attribute 'weight'. Please check your tied_module.)r   r   r   hasattrAttributeError)r   r   r   r   r   r   R  s   

zTiedLinear.__init__rD   c                 C   s   t || jjS re   )r<   linearr   r   r   r   r   r   __call__Z  s   zTiedLinear.__call__)
r   r   r   r   Moduler   rs   r   r   r   r   r   r   r   r   Q  s    r   c                       s   e Zd Zdededededef
 fddZ				dd
ejdejdeej dee	e
eef  dedejfddZdddZ  ZS )	Attentionr   r!   r"   r#   r)   c                    s   t    || _|| _|| _|| _|| _| j| j | _tj	||| dd| _
tj	||| dd| _tj	||| dd| _tj	|| |dd| _d S )NFbias)r   r   r   r!   r)   r"   r#   heads_per_groupr   Linearwqwkwvwo)r   r   r!   r"   r#   r)   r   r   r   r   _  s6   
zAttention.__init__NsdparD   freq_cisr   mask	attn_implrF   c                 C   s  |j \}}}| ||}	| ||}
| ||}|	j }|	||| j| j}	|
||| j| j}
|||| j| j}t	|	|
d|d| \}	}
t
| dr[| j|
||\}
}t|
| jdd}
t|| jdd}|dkr|d u szt|tszJ tdd |	|
|f\}	}
}t|	|
||d	}|dd }nh|d
kr|d u st|tsJ tj|	|
||d}nO|dkrtdd |	|
|f\}	}
}|d u st|ttjfsJ t|tr|dknd}t|tjr|nd }tj|	|
|||d}|dd }ntd| d| ||}|S )Nr,   r   kv_cacherG   r   r   c                 S      |  ddS Nr,   rG   	transposeer   r   r   <lambda>      z#Attention.forward.<locals>.<lambda>)
block_maskr   )	attn_biasr   c                 S   r   r   r   r   r   r   r   r     r   causalF)	is_causal	attn_maskzAttention implementation z not supported) rH   r   view_asr   r   rY   r"   r!   r#   rd   r   r   updaterM   r   
isinstancer	   mapr   r   
contiguousr   r   memory_efficient_attentionr8   rs   r   r<   scaled_dot_product_attentionNotImplementedErrorr   rJ   )r   rD   r   r   r   r   bszseq_lenr   r[   r\   xvoutput_shaper   r   r   r   r   r     sN   	

zAttention.forwardr   c                 C   sn   |p| j d }| j| j| jfD ]}tjj|jd|d| d| d qtjj| jjd|| d| d| d d S Ng      g        rP   rQ   )r   stdarf   )	r   r   r   r   r   r   trunc_normal_r   r   )r   init_stdfactorwr   r   r   r     s    
zAttention.reset_parametersNNr   Nr   )r   r   r   r5   r7   r   rs   r   r   r   r	   r   r8   r   r   r   r   r   r   r   r   ^  s:    ,
?r   c                       sZ   e Zd Z	ddedededee def
 fddZd	ejd
ejfddZ	dddZ
  ZS )FeedForwardr,   r   
hidden_dimr&   r$   mp_sizec                    s   t    td| d }|d urt|| }||| d |  }|| dks)J || _|| _tj||dd| _tj||dd| _tj||dd| _	d S )NrG   rQ   r,   r   Fr   )
r   r   r5   r   r   r   r   w1w3w2)r   r   r   r&   r$   r   r   r   r   r     s.   
zFeedForward.__init__rD   rF   c                 C   s8   |  ||}| ||}| t|| }|S re   )r  r   r  r  r<   silu)r   rD   x1x3r   r   r   r   r     s   zFeedForward.forwardNr   c                 C   s   |p| j d }|p| jd }|}|| }| j| jfD ]}tjj|jd|d| d| d qtjj| jjd|d| d| d d S r   )	r   r   r  r  r   r   r   r   r  )r   r   r   in_init_stdout_init_stdr   r   r   r   r     s&   
zFeedForward.reset_parameters)r,   r   )r   r   r   r5   r   r7   r   rs   r   r   r   r   r   r   r   r   r     s    #r   c                       sp   e Zd Zdef fddZ			ddejdejdeej d	eee	e
ef  d
edejfddZdddZ  ZS )TransformerBlockargsc                    s   t    |jd us|jd usJ d|jp|j|j | _|jp%|j|j | _|jp,| j| _|j| j dks8J |j|j dksBJ t|j| j| j| j|jd| _t	|jd|j |j
|jd| _t|j|jd| _t|j|jd| _d S )Nz+Should specify at least head_dim or n_headsr   )r   r!   r"   r#   r)      )r   r   r&   r$   )r   )r   r   r!   r"   r   r#   r   r)   	attentionr   r&   r$   feed_forwardr   r'   attention_normffn_norm)r   r
  r   r   r   r     s2   

zTransformerBlock.__init__Nr   rD   r   r   r   r   rF   c                 C   s6   || j | |||||d }|| | | }|S )Nr   r   r   )r  r  r  r  )r   rD   r   r   r   r   rg   outr   r   r   r   :  s   	zTransformerBlock.forwardr   c                 C   s4   | j || | j  | j|| | j  d S re   )r  r   r  r  r  )r   r   r   r   r   r   init_weightsM  s   
zTransformerBlock.init_weightsr   r   )r   r   r   r   r   rs   r   r   r   r	   r   r8   r   r  r   r   r   r   r   r	    s&    !
r	  c                	       sd   e Zd Zdef fddZ			ddeej deee	e
ef  defd	d
Zdd Zdd Z  ZS )BaseTransformerr
  c              	      s   t    |j| _|j| _t|j| _|j| _t|j|j	p"|j|j
 |j|j|j|j|jd| _t | _t|jD ]
}| jt| q:d S )N)r   r!   r4   r   r.   r0   r+   )r   r   r   r1   r   r2   r4   r   r)   r!   r"   r-   r.   r0   r+   rope_embeddingsr   
ModuleListlayersranger    r   r	  )r   r
  _r   r   r   r   V  s$   


zBaseTransformer.__init__Nr   r   r   r   c                 C   s:   | j | j|d}t| jD ]\}}||||||d}q|S )N)r   r   r  )r  r4   rX   r  )r   rg   r   r   r   r   rS   layerr   r   r   r   j  s   zBaseTransformer.forwardc                 C   s   | j   d S re   )r  r   r   r   r   r   r   x  s   z BaseTransformer.reset_parametersc              	   C   st   |    t| jD ].\}}tjd|d  d tjdt| jd  d tj| jd tj	di| j
 }|| j| q	d S )NrG   r,   g      ?i   r   )r   rX   r  r   r   r   lenr   r   r   r2   r  r1   )r   depthr  r   r   r   r   r  |  s   zBaseTransformer.init_weightsr   )r   r   r   r   r   r   rs   r   r   r	   r   r8   r   r   r  r   r   r   r   r   r  U  s    
r  re   )+r   dataclassesr   enumr   typingr   r   r   rs   r   torch.nnr   r<   !torch.nn.attention.flex_attentionr	   r
   r   xformers.opsr   r   corer   r   r   rC   r   r5   rM   rZ   rd   rj   ro   rz   r   r   r   r   r   r   r   r	  r  r   r   r   r   <module>   s^   

4m}A9