o
    i/                     @  s  d dl mZ d dlmZ d dlmZ d dlZd dlmZ d dlm  m	Z
 d dlmZmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ dd Zdd Zdd Zdd Zd%ddZG dd deZG dd deZd&ddZG dd deZed d!d"gZ G d#d$ d$eZ!dS )'    )annotations)deepcopy)
namedtupleN)nn)Module
ModuleList)	rearrangeeinsum)LocalAttention)apply_rotary_pos_emb)+get_init_and_expand_reduce_stream_functionsc                 C  s   | d uS N )valr   r   O/home/ubuntu/.local/lib/python3.10/site-packages/local_attention/transformer.pyexists   s   r   c                 C  s   t | r| S |S r   )r   )r   dr   r   r   default   s   r   c                 C  s   t j| ddS )Ndim)F	normalizetr   r   r   l2norm   s   r   c                   s    fdd}|S )Nc                   s2   | j }|    | g|R i |}| | |S r   )trainingevaltrain)modelargskwargswas_trainingoutfnr   r   inner   s
   
zeval_decorator.<locals>.innerr   )r%   r&   r   r$   r   eval_decorator   s   r'   ?c                 C  sH   t d| | jd  }t| |\}}t| td}|d|| |S )N   r   z-inf)intshapetorchtopk	full_likefloatscatter_)logitsthreskr   indprobsr   r   r   top_k'   s
   r6   c                      sH   e Zd Zdddddddddddd fdd
Z				dd	d
Z  ZS )LocalMHA@              FN)dim_headheadsdropoutcausalprenorm
qk_rmsnormqk_scaleuse_xposxpos_scale_baseexact_windowsizegate_values_per_headc                  s   t    || }|rt|nd | _|| _tj||d dd| _|| _|r8t	t
|| _t	t
|| _|| _|| _t|d| _td|||d|rN|	nd | j|
|d|| _d | _|ritt||| _tj||dd| _d S )N   FbiasT)r   window_sizer>   autopadscalerD   rB   rC   r   )super__init__r   	LayerNormnormr<   Linearto_qkvr@   	Parameterr,   onesq_scalek_scaler>   rI   r   rD   r
   attn_fn	to_v_gate
Sequentialto_out)selfr   rI   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   r!   	inner_dim	__class__r   r   rM   1   s<   

	
zLocalMHA.__init__c                   s`  |j d }tjr|}|jddd\}}}	tfdd|||	f\}}}	jrAtt||f\}}|j }|j	 }t|r|dksKJ j
rRt|rVJ d|\}
}||j d d	  }tj|
|fdd}tj||	fdd}	jjj }jr|d   n|j d }||j    t fd
d||	fD \}}	tjjrjj}||\}}t||||d\}}t||d}t|r|j d }|ddd | d f }|j d |j d ksJ || }|jdd}t||	d}n
j|||	||d}|rt||	f}tjr|}t|d}||  }t|d}|}|s,|S ||fS )NrF   r   r   c                   s   t | d jdS )Nzb n (h d) -> b h n d)h)r   r<   r   rZ   r   r   <lambda>x   s    z"LocalMHA.forward.<locals>.<lambda>r)   z-only allow caching for specific configurationg      c                 3  s&    | ]}|d  dddf V  qdS ).Nr   ).0r   )kv_start_indexr   r   	<genexpr>   s   $ z#LocalMHA.forward.<locals>.<genexpr>)rK   zb h i d, b h j d -> b h i j.zb h i j, b h j d -> b h i d)mask	attn_biaszb n h -> b h n 1zb h n d -> b n (h d))r+   r   rO   rQ   chunkmapr@   r   rT   rU   r>   r,   catrV   look_backwardrI   rD   tuplerel_posr   r	   softmaxstackrW   r   sigmoidrY   )rZ   xre   rf   cachereturn_cacheseq_lenqr3   vckcveffective_window_sizerl   pos_emb
xpos_scalesimk_lenattnr#   kvgatesr   )rc   rZ   r   forwardj   s\   










zLocalMHA.forward)NNNF)__name__
__module____qualname__rM   r   __classcell__r   r   r\   r   r7   0   s$    <r7   c                   @  s   e Zd Zdd ZdS )GEGLUc                 C  s    |j ddd\}}|t| S )N   r   r   )rg   r   gelu)rZ   rp   gater   r   r   r      s   zGEGLU.forwardN)r   r   r   r   r   r   r   r   r      s    r      r:   c                 C  sP   t | | d d }tt| tj| |d ddt t|tj|| ddS )Nr   rF   FrG   )r*   r   rX   rN   rP   r   Dropout)r   multr=   r[   r   r   r   FeedForward   s   r   c                      s0   e Zd Z fddZedd Zdd Z  ZS )DynamicPositionBiasc              
     sB   t    ttd|t t||t t||| _d S )Nr)   )rL   rM   r   rX   rP   SiLUmlp)rZ   r   r<   r\   r   r   rM      s   




zDynamicPositionBias.__init__c                 C  s   t |  jS r   )next
parametersdevicer`   r   r   r   r      s   zDynamicPositionBias.devicec           	      C  s   | j }||ks	J tj|tj|d}| t|d}tj|| ||d}tj||d}t|dt|d  }t|| d}|S )N)dtyper   z... -> ... 1r   zi -> i 1zj -> 1 jzi j h -> h i j)r   r,   aranger/   r   r   abs)	rZ   ijr   rel_distrH   i_seqj_seqrel_dist_indicesr   r   r   r      s   zDynamicPositionBias.forward)r   r   r   rM   propertyr   r   r   r   r   r\   r   r      s
    
r   Cachecache_kvmaybe_cached_attn_biasc                      sl   e Zd Zdddddddddd	dd	d	dd
d fddZe e			dddZ						dddZ  Z	S )LocalTransformerTi   r8   r9   r   r:   r   FN)r>   local_attn_window_sizer;   r<   ff_multattn_dropout
ff_dropoutignore_indexrB   rC   use_dynamic_pos_biasglobal_attn_layerlayers_insert_global_attnnum_residual_streamsr   Module | Noner   tuple[int, ...] | Nonec                  s  t    t|| _| jrt||| _t||| _|| _t	g | _
|| _d | _|r4t|d |d| _t||dkd\}| _| _t|ttd d }t fdd|D sZJ t|}t	g | _t D ]E}|d }| jt|r||v r||t|dnd  | j
t	||td||||
||||| dd	
|d||t||	|d
dg qg|| _| jrtt|tj||dd| _d S d S )Nr   )r   r<   r)   )disablec                   s$   g | ]}d |  k o kn  qS )r   r   )rb   layerdepthr   r   
<listcomp>  s   $ z-LocalTransformer.__init__.<locals>.<listcomp>)r   branchT)
r   r;   r<   r=   r>   rI   rB   rC   use_rotary_pos_embr?   )r   r   r=   FrG   r   ) rL   rM   r   has_embed_unembedr   	Embedding	token_embry   max_seq_lenr   layersr   dynamic_pos_biasr   r   expand_streamsreduce_streamsr   rk   rangeallsetglobal_layersappendr   r7   r   r   rX   rN   rP   	to_logits)rZ   
num_tokensr   r   r   r>   r   r;   r<   r   r   r   r   rB   rC   r   r   r   r   r!   init_hyper_connglobal_attn_layersindexr   r\   r   r   rM      s<   



,
,

zLocalTransformer.__init__      ?r(   c                 K  s   | j sJ |dksJ |jd |j}}|}	d }
t|D ]N}| j|	d d | j d f f|
dd|\}}|r:|}
t|d d df |d}|dkrR|jddd}ntj	|| dd}t
|d}t
j|	|fdd}	q|	d d |d f S )	Nr:   r)   T)rq   rr   r   )r2   )r   keepdimr   )r   r+   r   r   r   r   r6   argmaxr   rm   r,   multinomialri   )rZ   primers   temperaturefilter_thresuse_kv_cacher!   nr   r#   rq   _r1   	new_cachefiltered_logitssampledr5   r   r   r   generate9  s.   

zLocalTransformer.generatec              	   C  s  |r|d d d df |d d dd f }}|j d |j}}| jr;| |}|| jks/J || tj||d }t|}	d  }
}|	rI|\}
}g }t	t
|
g }|	r^|d d dd f }|}t|stt| jrt| j}| ||d }| |}t| j| jD ]%\\}}}t|r||}||||dt|d d\}}|| ||}q| |}| js|S | |}|s|s|S |t||fS tjt|d|| jd}|S )	Nr   r)   r   r   T)re   rf   rr   rq   zb n c -> b c n)r   )r+   r   r   r   r   ry   r,   r   r   iterr   r   r   r   zipr   r   r   r   r   r   r   r   cross_entropyr   r   )rZ   rp   re   rq   return_lossrr   labelsr   r   	has_cache	cached_kvcached_attn_biasnew_cached_kviter_cached_kvrf   wr}   ffglobal_layerlayer_cached_kvr1   lossr   r   r   r   e  s\   *






zLocalTransformer.forward)r   r   r   r   )r   r(   T)NNFF)
r   r   r   rM   r,   no_gradr'   r   r   r   r   r   r\   r   r      s6    F-r   )r(   )r   r:   )"
__future__r   copyr   collectionsr   r,   r   torch.nn.functional
functionalr   torch.nnr   r   einopsr   r	   local_attention.local_attentionr
   local_attention.rotaryr   hyper_connectionsr   r   r   r   r'   r6   r7   r   r   r   r   r   r   r   r   r   <module>   s.    
	 
$