o
    Ni                    @  s  d dl mZ d dlmZ d dlZd dlmZ d dlmZmZ d dl	m
Z
mZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlZd dlmZ d dlm  mZ d dlmZmZmZmZm Z m!Z!m"Z"m#Z# d dl$m%Z%m&Z&m'Z' d dl(m)Z)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/m0Z0 d dl1m2Z2 d dl3Z3d dl4m5Z5 d dl6m7Z7m8Z8m9Z9m:Z:m;Z; dZ<eG dd dZ=e
ej>ddZ?dd Z@dd ZAdd ZBdd d!ZCd"d# ZDdd%d&ZEd'd( ZFd)d* ZGdd+d,ZHd-d. ZIG d/d0 d0ZJG d1d2 d2ZKG d3d4 d4ZLd5d6 ZMG d7d8 d8e)ZNdd:d;ZOd<d= ZPdd>d?ZQd@dA ZRddBdCZSdddHdIZTdJdK ZUdLdM ZVddQdRZW	dddUdVZX		WdddZd[ZYd\d] ZZd^d_ Z[d`da Z\dbdc Z]ddde Z^dfdg Z_dhdi Z`G djdk dke)ZaG dldm dme)ZbG dndo doe)ZcG dpdq dqe)ZdG drds dse)ZeG dtdu due)ZfG dvdw dwe)ZgG dxdy dye)ZhG dzd{ d{e)ZiG d|d} d}e)ZjG d~d de)ZkG dd de)Zldd ZmeddddddZnG dd de)Zoeddddd ZpG dd de)ZqG dd de)ZrG dd de)ZsG dd de)ZtG dd de)ZuG dd de)ZvG dd de)ZwG dd de)ZxG dd de)ZyG dd de)ZzG dd de)Z{G dd de)Z|dddZ}G dd de)Z~G dd de)ZdddZG dd de)ZG dd de)ZG dd de)ZG dd de)ZG dd de)ZG dd de)ZG dd de)ZG dd de)ZG dd de)ZG dd deZG dd deZG ddĄ deZG ddƄ deZG ddȄ de)ZG ddʄ de)ZG dd̄ de)ZG dd΄ de)ZdS )    )annotations)CallableN)deepcopy)random	randrange)partialwraps)chain)
namedtuple)nullcontext)	dataclass)version)autocast)nneinsumtensorTensorcatstackarange	is_tensor)tree_flattentree_unflattentree_map)Module
ModuleList
ModuleDict)logger)AttendIntermediates)AutoregressiveWrapper)	Rearrange)	rearrangerepeatreducepackunpack@   c                   @  s   e Zd ZU dZded< dZded< dZded< dZded< dZded	< dZ	ded
< dZ
ded< dZded< dZded< dZded< dZded< dZded< dZded< dS )LayerIntermediatesNzlist[Tensor] | NonehiddensTensor | Nonelast_hiddenzlist[Intermediates] | Noneattn_intermediateslayer_hiddensattn_z_lossmemslast_layer_hiddensinitial_embedsattn_pooled_tokensmemory_tokenslogit_entropieslogitsr   intcache_length)__name__
__module____qualname__r)   __annotations__r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r7    r<   r<   Q/home/ubuntu/.local/lib/python3.10/site-packages/x_transformers/x_transformers.pyr(   *   s   
 r(   Fbiasc                 C  s   | d uS Nr<   valr<   r<   r=   exists>   s   rC   c                 C  s   t | r| S t|r| S |S r@   )rC   callable)rB   dr<   r<   r=   defaultA   s   rF   c                 O  s   | S r@   r<   )targskwargsr<   r<   r=   identityF      rJ   c                 C  s   t | dkr
| d S |S Nr   len)itrF   r<   r<   r=   firstI      rP   c                 C  s   t | dkS rL   rM   )xr<   r<   r=   is_emptyL      rS      c                 C  s   t | tr| S | f| S r@   )
isinstancetuple)rB   depthr<   r<   r=   
cast_tupleO   rQ   rY   c                 C  s   | | dkS rL   r<   )numdenr<   r<   r=   divisible_byR   rT   r\   c                 C  s   t dd | S )Nc                 S  s   t | r| jr|  S | S r@   )r   requires_graddetachrG   r<   r<   r=   <lambda>V       zdetach_all.<locals>.<lambda>)r   )objr<   r<   r=   
detach_allU      rc   c                   s$   t  st t  fdd}|S )Nc                   s"   t | s| S  | g|R i |S r@   )rC   )rR   rH   rI   fnr<   r=   inner\   s   zmaybe.<locals>.inner)rC   rJ   r   )rf   rg   r<   re   r=   maybeX   s
   rh   c                  G  s   t tt| dkS NrU   )summapr6   )boolsr<   r<   r=   at_most_one_ofc   s   rm   c                   @     e Zd Zdd Zdd ZdS )alwaysc                 C  
   || _ d S r@   rA   selfrB   r<   r<   r=   __init__g      
zalways.__init__c                 O  s   | j S r@   rA   )rr   rH   rI   r<   r<   r=   __call__i   s   zalways.__call__Nr8   r9   r:   rs   ru   r<   r<   r<   r=   ro   f       ro   c                   @  rn   )
not_equalsc                 C  rp   r@   rA   rq   r<   r<   r=   rs   m   rt   znot_equals.__init__c                 O  s
   || j kS r@   rA   rr   rR   rH   rI   r<   r<   r=   ru   o   rt   znot_equals.__call__Nrv   r<   r<   r<   r=   rx   l   rw   rx   c                   @  rn   )equalsc                 C  rp   r@   rA   rq   r<   r<   r=   rs   s   rt   zequals.__init__c                 O  s
   || j kS r@   rA   ry   r<   r<   r=   ru   u   rt   zequals.__call__Nrv   r<   r<   r<   r=   rz   r   rw   rz   c                  G  s   t jtt|  S r@   )r   
SequentialfilterrC   )modulesr<   r<   r=   r{   x      r{   c                   @     e Zd Zdd ZdS )Identityc                 O  s   |S r@   r<   )rr   rG   rH   rI   r<   r<   r=   forward|   rK   zIdentity.forwardNr8   r9   r:   r   r<   r<   r<   r=   r   {       r   #B;c                 C  s   | j |d S )Nmin)clamplog)rG   epsr<   r<   r=   r      r~   r   c                 C  s   t | jj S r@   )torchfinfodtypemax)r   r<   r<   r=   max_neg_value   r~   r   c                 C  s(   t | d|d} tj| ddd} t | dS )Nz... (g d) -> ... g dg   pdimz... g d -> ... (g d))r"   F	normalize)rG   groupsr<   r<   r=   l2norm   s   
r   c                 C  s   | |   | S r@   )tanh)rG   valuer<   r<   r=   	softclamp   r~   r   c                 C  sf   t |s
| j|dS d| j|j  }|jg |j|R  }| | j|d}|j|djdd}|| S )Nr   rU         ?r   )rC   meanndimreshapeshaperj   r   )rG   maskr   dims_appendrZ   r[   r<   r<   r=   masked_mean   s   r   r           padtuple[int, int]c                 C  sN   |dkr| S |dk r| d n| j | d }d| }tj| g ||R |dS )N)r   r   r   rU   r   )r   r   r   )rG   r   r   r   dims_from_rightzerosr<   r<   r=   
pad_at_dim   s
    r   c                 C  s   | ^}}|D ]}||B }q|S r@   r<   )masksheadbodyrestr<   r<   r=   	or_reduce   s   
r   c                 C  s|   t | gd\} }t |gd\}}| j}|  | } }tj|dd}| | jddd| }| | }t||d\}||S )Nzb *r   r   T)r   keepdim)r%   r   doubler   r   rj   r&   to)rR   ypacked_shape_r   unitparallelorthogr<   r<   r=   orthog_project   s   
r   cachereturnlist[tuple[Tensor, Tensor]]c                 C  s    g }| j D ]}||j q|S r@   )r,   append	cached_kv)r   
cached_kvsattn_intermediater<   r<   r=   get_cached_kvs   s   
r   rG   r   c                 C  s*   |s| j ddn| }|t| jdd S Nr   r   )softmaxr   rj   )rG   is_probprobr<   r<   r=   calc_entropy   s   r   r   pre_softmax_attnslist[Tensor]c                 C  sl   d}| D ]
}||j dd }qt|}t|dd}t|s$| | S ||  | jdd }|| S )Nr   r   r   zb h n -> b nrj   h㈵>r   )	logsumexpr   squarer$   rC   r   rj   r   )r   r   weightlseattnlossr<   r<   r=   calc_z_loss   s   	
r   c                 C  s2   t j| jd t| jrt j| jd d S d S )Nr   )r   init	constant_r   rC   r?   )layerr<   r<   r=   
init_zero_   s   
r   c                   s$   t  fdd| D }tt| |S )Nc                 3  s    | ]}  |V  qd S r@   )pop).0keyrE   r<   r=   	<genexpr>       zpick_and_pop.<locals>.<genexpr>)rW   dictzip)keysrE   valuesr<   r   r=   pick_and_pop   s   r   c                 C  sH   t  t  g}| D ]}t| |}t| }|| || |< q
t|S r@   )r   r   boolr6   rW   )condrE   
return_valr   matchindr<   r<   r=   group_dict_by_key   s   
r   c                 C  s
   | | S r@   )
startswith)prefixstrr<   r<   r=   string_begins_with   rt   r   c                 C  s   t tt| |S r@   )r   r   r   )r   rE   r<   r<   r=   group_by_key_prefix   r~   r   c                   s:   t tt| |\}}t|   fdd| D }||fS )Nc                   s   i | ]\}}| d  |qS r@   r<   )r   r   r   
prefix_lenr<   r=   
<dictcomp>  s    z+groupby_prefix_and_trim.<locals>.<dictcomp>)r   r   r   rN   items)r   rE   kwargs_with_prefixrI   kwargs_without_prefixr<   r   r=   groupby_prefix_and_trim  s   r   c                 C  s   g | j | jR ^}}}}tj|||d}t|r%t|}|| |}d| }	tdt|	| }
|j	|
ddj
}t||d}t|d}| ||f } t|rq|jdd}t||	  }t|
|dt|dk }|||f |@ }| |fS )Ndevicer   rU   r   zb -> b 1r   )r   r   r   randnrC   r   masked_fillr   r6   topkindicesr   r"   rj   ceil)seqr   dropoutbnr   r   r5   
mask_value	keep_probnum_keepkeep_indicesbatch_indices
seq_countsseq_keep_counts	keep_maskr<   r<   r=   dropout_seq	  s"   
r  c                   @  r   )ReluSquaredc                 C  s   t |d S Nr   )r   relurr   rR   r<   r<   r=   r   &  rd   zReluSquared.forwardNr   r<   r<   r<   r=   r	  %  r   r	  c                      $   e Zd Z fddZdd Z  ZS )SoLUc                   s   t    t|| _d S r@   )superrs   	LayerNormnorm)rr   r   	__class__r<   r=   rs   *  s   
zSoLU.__init__c                 C  s   |j dd| }| |S r   )r   r  rr   rR   	activatedr<   r<   r=   r   .     
zSoLU.forwardr8   r9   r:   rs   r   __classcell__r<   r<   r  r=   r  )  s    r  c                      .   e Zd Zd fdd	Zdd Zdd Z  ZS )	TokenEmbeddingFc                   s"   t    || _t||| _d S r@   )r  rs   l2norm_embedr   	Embeddingemb)rr   r   
num_tokensr  r  r<   r=   rs   5  s   
zTokenEmbedding.__init__c                 C  s    |  | }| jrt|S |S r@   )r  longr  r   )rr   rR   	token_embr<   r<   r=   r   :  s   zTokenEmbedding.forwardc                 C  s2   | j rtjj| jjdd d S tj| jj d S )Nr   std)r  r   r   normal_r  r   kaiming_normal_rr   r<   r<   r=   init_>  s   zTokenEmbedding.init_F)r8   r9   r:   rs   r   r&  r  r<   r<   r  r=   r  4  s    r  c                      .   e Zd Zd fdd	Z			d	ddZ  ZS )
AbsolutePositionalEmbeddingFc                   s:   t    |s|d nd| _|| _|| _t||| _d S )N      r   )r  rs   scalemax_seq_lenr  r   r  r  )rr   r   r,  r  r  r<   r=   rs   G  s
   
z$AbsolutePositionalEmbedding.__init__Nr   c                 C  s   |j d |j}}|| jksJ d| d| j t|s%t||d| }t|r3||d  jdd}| |}|| j }| jrDt	|S |S )NrU   z(you are passing in a sequence length of zE but your absolute positional embedding has a max sequence length of r   .Nr   r   )
r   r   r,  rC   r   r   r  r+  r  r   )rr   rR   posseq_start_posoffsetseq_lenr   pos_embr<   r<   r=   r   N  s    

z#AbsolutePositionalEmbedding.forwardr'  NNr   r  r<   r<   r  r=   r)  F  s    
r)  c                      r(  )
ScaledSinusoidalEmbedding'  c                   sh   t    t|dsJ ttd|d  | _|d }t|	 | }||  }| j
d|dd d S )Nr   rU   r*  inv_freqF
persistent)r  rs   r\   r   	Parameterr   onesr+  r   floatregister_buffer)rr   r   thetahalf_dimfreq_seqr6  r  r<   r=   rs   c  s   

z"ScaledSinusoidalEmbedding.__init__Nr   c                 C  sn   |j d |j}}t|st||d| }t|r||d  }td|| j}t| | fdd}|| j	 S )NrU   r   r-  i, j -> i jr   r   )
r   r   rC   r   r   r6  r   sincosr+  )rr   rR   r.  r/  r0  r1  r   r  r<   r<   r=   r   m  s   
z!ScaledSinusoidalEmbedding.forward)r5  r3  r  r<   r<   r  r=   r4  b  s    r4  c                      s@   e Zd Zd fdd	Zeddd	Zed
d Zdd Z  Z	S )RelativePositionBiasF          c                   s4   t    || _|| _|| _|| _t||| _d S r@   )	r  rs   r+  causalnum_bucketsmax_distancer   r  relative_attention_bias)rr   r+  rG  rH  rI  headsr  r<   r=   rs     s   
zRelativePositionBias.__init__Tc           	      C  s   d}|  }|s|d }||dk   | 7 }t|}n	t|t|}|d }||k }|t| | t||  ||     }t|t	||d }|t
|||7 }|S )Nr   r   rU   )r  r   absr   
zeros_liker   r;  mathr   	full_likewhere)	relative_positionrG  rH  rI  retr   	max_exactis_smallval_if_larger<   r<   r=   _relative_position_bucket  s    &z.RelativePositionBias._relative_position_bucketc                 C     t |  jS r@   next
parametersr   r%  r<   r<   r=   r        zRelativePositionBias.devicec           
      C  sp   | j }t|| |tj|d}t|tj|d}td||}| j|| j| j| j	d}| 
|}t|d}	|	| j S )Nr   r   zj, i -> i j)rG  rH  rI  i j h -> h i j)r   r   r   r  einxsubtractrV  rG  rH  rI  rJ  r"   r+  )
rr   ijr   q_posk_posrel_pos	rp_bucketr   r?   r<   r<   r=   r     s   


zRelativePositionBias.forward)FrD  rE  rF  )TrD  rE  )
r8   r9   r:   rs   staticmethodrV  propertyr   r   r  r<   r<   r  r=   rC    s    
rC  c                      s0   e Zd ZdZ			d fdd	Zdd Z  ZS )	CoPEz8
    Appendix B of https://arxiv.org/abs/2405.18719
    F皙?c                   sh   t    || _tt||| _|rtj||dddnd | _	|| _
|| _|s*d S | dt| d S )NrU   Fr>   	positions)r  rs   max_posr   r9  r   r   r2  Conv2dtalking_headssoft_onehotsoft_onehot_tempr<  r   )rr   r   rK  rk  rn  rm  ro  r  r<   r=   rs     s   
	zCoPE.__init__c                 C  s,  t | jr.|jdd  \}}||||| d  }| |}||t|j	j
 }| }|djddd}|j| jd d}td|| j}| jrntd|| j }	tj|	 | j dd}
td|
|}|S |  }|  }|d|}|d|}|| }|| |d|   }|S )	NrU   r   r   r   zb h n d, p d -> b h n pr@  zb h i j p, b h i p -> b h i j)rC   rm  r   new_onestriu_r   r   r   r   r   r   sigmoidflipcumsumr   rk  r   r2  rn  r^  r_  rj  rL  r   r   ro  r   r  floorgather)rr   queryattn_logitsr`  ra  causal_maskgatesr.  
logits_intdiff_possoft_onehot_poscope_pos_embpos_ceil	pos_floorlogits_ceillogits_floorwr<   r<   r=   r     s*   

zCoPE.forward)FFri  r8   r9   r:   __doc__rs   r   r  r<   r<   r  r=   rh    s    rh  c                      s8   e Zd Zddd fdd
Zedd Zdd Z  ZS )	DynamicPositionBiasF)log_distancer  c                  s   t    |dksJ d|| _tg | _| jttd||r$t	|nd t
  t|d D ]}| jtt|||rCt	|nd t
  q1| jt|| d S )NrU   zAdepth for dynamic position bias MLP must be greater or equal to 1)r  rs   r  r   mlpr   r{   r   Linearr  SiLUrange)rr   r   rK  rX   r  r  r   r  r<   r=   rs     s    



zDynamicPositionBias.__init__c                 C  rW  r@   rX  r%  r<   r<   r=   r     r[  zDynamicPositionBias.devicec                 C  s   || j }}t|| ||d}t||d}td||}||d 7 }t| d ||d }t|d}| jrDt|t	|
 d  }| jD ]}	|	|}qG|| }
t|
d}
|
S )Nr   r@  rU   z... -> ... 1r]  )r   r   r^  r_  r;  r"   r  r   signr   rL  r  )rr   r`  ra  r   r   
seq_arangecontext_aranger   r.  r   r?   r<   r<   r=   r     s   



zDynamicPositionBias.forward)r8   r9   r:   rs   rg  r   r   r  r<   r<   r  r=   r    s
    
r  c                      sR   e Zd Z		dd fddZedd Zedd	 Z	ddddZdd Z	  Z
S )AlibiPositionalBiasNslopeslist[int] | Nonec                   s^   t    || _t||| _tt|| |}t|d}| jd|dd | jdd dd d S )Nz
h -> h 1 1r  Fr7  r?   )	r  rs   rK  rF   total_headsr   _get_slopesr"   r<  )rr   rK  r  r  rI   r  r<   r=   rs     s   

zAlibiPositionalBias.__init__c                 C  rW  r@   )rY  buffersr   r%  r<   r<   r=   r   /  r[  zAlibiPositionalBias.devicec                 C  s\   dd }t |  r|| S dt t |  }|||d| dd d d | |   S )Nc                   s6   ddt | d       fddt| D S )Nr      c                   s   g | ]} |  qS r<   r<   r   r`  ratiostartr<   r=   
<listcomp>8      zRAlibiPositionalBias._get_slopes.<locals>.get_slopes_power_of_2.<locals>.<listcomp>)rN  log2r  )r   r<   r  r=   get_slopes_power_of_25  s   z>AlibiPositionalBias._get_slopes.<locals>.get_slopes_power_of_2r   r   )rN  r  
is_integerrw  )rK  r  closest_power_of_2r<   r<   r=   r  3  s
   *zAlibiPositionalBias._get_slopespos_ir   pos_jr*   c                 C  sn   | j | j}}t||}td||  }|jdkr t|d}|| j }||j	d  }t
|d|fdd}|S )Nz... j, ... i -> ... i jr  zb i j -> b 1 i jr   r   )r  r   rF   r^  r_  rL  r   r"   r  r   r   )rr   r  r  hr   r?   num_heads_unalibiedr<   r<   r=   forward_custom_pos@  s   



z&AlibiPositionalBias.forward_custom_posc           	      C  s   | j | j}}t| jr*| jjd |kr*| jjd |kr*| jd| d | d f S t|| ||d}t||d}td||  }|| j	 }||jd  }t
|d|fdd}| jd	|d
d | jS )Nr   rp  .r   zj, i -> 1 i jr  r   r   r?   Fr7  )r  r   rC   r?   r   r   r^  r_  rL  r  r   r<  )	rr   r`  ra  r  r   r  r  r?   r  r<   r<   r=   r   S  s   *
zAlibiPositionalBias.forwardNN)r  r  r@   )r  r   r  r*   )r8   r9   r:   rs   rg  r   rf  r  r  r   r  r<   r<   r  r=   r    s    

r  c                      0   e Zd ZdZ			d	 fdd	Zdd Z  ZS )
DataDependentAlibiz, https://openreview.net/forum?id=q2Lnyegkr8 T      @r   c                   sZ   t    || _t|||rdnd }t|tdt | _tj	
|j| || _d S )NrU   r   zb n h -> b h n)r  rs   rG  r   r  r{   r!   
LogSigmoidto_forget_gatesr   r   r?   post_log_scale)rr   r   rK  rG  	bias_initr  linearr  r<   r=   rs   g  s   

zDataDependentAlibi.__init__c                 C  sn   | j  }| || j }|jdd}|r|jddd\}}td||}|r5td||}| |  }|S )Nr   r   r   rU   zb h i, b h j -> b h i jzb h j, b h i -> b h i j)	rG  r  r  rv  chunkr^  r_  triltriu)rr   rR   bidirectionalforget_gatesforget_gates_reversedr<   r<   r=   r   ~  s   zDataDependentAlibi.forward)Tr  r   r  r<   r<   r  r=   r  d  s    r  c                      r  )
PerRowDataDependentAlibiz same as data dependent alibi from forgetting transformer, but the forgetting gates are also derived by a queries and keys with a small head dimension TrF  r   c                   sZ   t    |sJ d|d | _tj||| d dd}t|tdd|d| _|| _d S )Nzbidirectional not supported yetr*  r   Fr>   zb n (qk h d) -> qk b h n d)qkrE   )	r  rs   r+  r   r  r{   r!   r  r  )rr   r   rK  rG  dim_headr  r  r  r<   r=   rs     s   


z!PerRowDataDependentAlibi.__init__c                 C  s   |  |\}}td||| j }t|| j }|jd }tj||ftj	|j
d }||d}|jdd}|jdd}|jdd}|S )	Nz... i d, ... j d -> ... i jrp  r\  r   )r   )dimsr   r   )r  r   r+  r   
logsigmoidr  r   r   r:  r   r   r  r   ru  rv  )rr   rR   qkr  r   r{  r<   r<   r=   r     s   
z PerRowDataDependentAlibi.forward)TrF  r   r  r<   r<   r  r=   r    s    r  c                      sF   e Zd Z					d fdd	Zdd Zed	dd
dddZ  ZS )RotaryEmbeddingF   r   r5  c           	        s   t    ||||d   9 }d|td|d |   }| d| |dks)J || _|s6| dd  d S td|dd|  d|  }|| _| d| d S )Nr   r   r   r6  r+  g?gffffff?)r  rs   r   r;  r<  interpolation_factor
scale_base)	rr   r   use_xposr  r  basebase_rescale_factorr6  r+  r  r<   r=   rs     s   
	zRotaryEmbedding.__init__c                 C  s   | j j}t||d}| |S )Nr   )r6  r   r   r   )rr   r1  r   rG   r<   r<   r=   forward_from_seq_len  s   
z$RotaryEmbedding.forward_from_seq_lencudaenabledr   c                 C  s   |  d }|jdkrt|d}td|| j| j| j }t||fdd}t|d}t	| j
s5|dfS ||d  | j }| j
t|d	 }t||fdd}t|d}||fS )
NrU   n -> 1 nb i , j -> b i jr   r   ... d r -> ... (d r)r   r   z... n -> ... n 1)r   r   r"   r   r   type_asr6  r  r   rC   r+  r  )rr   rG   r0  rk  freqspowerr+  r<   r<   r=   r     s   




zRotaryEmbedding.forward)Fr  r   r5  r   r   )r8   r9   r:   rs   r  r   r   r  r<   r<   r  r=   r    s    
r  c                 C  s:   t | ddd} | jdd\}}t| |fdd} t | dS )Nz... (d r) -> ... d rr   rr   r   r  )r"   unbindr   )rR   x1x2r<   r<   r=   rotate_half  s   
r  r  r  c                 C  s   |j d | j d | j}}}|d d | d d d f }t|r/|d d | d d d f n|}| jdkrI|jdkrIt|d}t|rIt|d}| dd |f | d|d f } }| |  | t| |  |  } t| |fdd}|	|S )Nr   rp     r  zb n d -> b 1 n d.r   )
r   r   r   r   r"   rB  r  rA  r   type)rG   r  r+  rot_dimr1  
orig_dtypet_unrotatedoutr<   r<   r=   apply_rotary_pos_emb  s   (

"$
r  c                      s<   e Zd ZdZ		d fdd	Zeddddd	d
Z  ZS )PolarEmbeddingz" https://arxiv.org/abs/2509.10534 Fr5  c                   sf   t    d|td| |   }| d| tt|d|| _	|r1| j	
dtj d d S d S )Nr   r   r6  rU          r   )r  rs   r   r;  r<  r   r9  r   r   learned_biasuniform_rN  pi)rr   r   rK  bias_uniform_initr  r6  r  r<   r=   rs     s   
zPolarEmbedding.__init__r  r  r   c                 C  sT   |  d }|jdkrt|d}td|| j| j}| jdt	j
 d}||fS )NrU   r  r  r  r   )r   r   r"   r   r   r  r6  r  r   rN  r  )rr   rG   r0  rk  r  r?   r<   r<   r=   r   %  s   

zPolarEmbedding.forward)Fr5  r  )r8   r9   r:   r  rs   r   r   r  r<   r<   r  r=   r    s    
r  c                 C  sp   |j d | j d | j}}}|d d | d f }|  } t| } t| |  | |  fdd}||S )Nr   rp  r   )	r   r   r;  r   softplusr   rB  rA  r  )rG   r  r  r1  r  r  r<   r<   r=   apply_polar_pos_emb2  s   
 
r  c                      r  )Scalec                   s   t    || _|| _d S r@   )r  rs   r   rf   )rr   r   rf   r  r<   r=   rs   A  s   

zScale.__init__c                   sL    j |fi |} fdd}t|ts||S ||d g|dd  R S )Nc                   s
   |  j  S r@   r   r_   r%  r<   r=   r`   H  s   
 zScale.forward.<locals>.<lambda>r   rU   )rf   rV   rW   )rr   rR   rI   r  scale_fnr<   r%  r=   r   F  s
   
zScale.forwardr  r<   r<   r  r=   r  @      r  c                      (   e Zd Z	d fdd	Zdd Z  ZS )r  Fc                   sN   t    || _tj|dd| _tt|| _	tj
| j	dt|  dS )z
        bias-less layernorm has been shown to be more stable. most newer models have moved towards rmsnorm, also bias-less
        Felementwise_affiner   N)r  rs   unit_offsetr   r  lnr9  r   r:  gammar   r   r;  rr   r   r  r  r<   r=   rs   P  s
   
zLayerNorm.__init__c                 C  s"   |  |}| jt| j }|| S r@   )r  r  r;  r  )rr   rR   normedr  r<   r<   r=   r   _  s   
zLayerNorm.forwardr'  r  r<   r<   r  r=   r  O  s    r  c                      r  )AdaptiveLayerNormNc                   sD   t    t||}tj|dd| _t||| _tj	| jj
 d S )NFr  )r  rs   rF   r   r  r  LinearNoBiasto_gammar   zeros_r   rr   r   dim_conditionr  r<   r=   rs   e  s
   

zAdaptiveLayerNorm.__init__c                C  s4   |j dkr
t|d}| |}| |}||d  S )Nr   b d -> b 1 dr   )r   r"   r  r  rr   rR   	conditionr  r  r<   r<   r=   r   q  s
   



zAdaptiveLayerNorm.forwardr@   r  r<   r<   r  r=   r  d      r  c                      r  )	ScaleNormFc                   sH   t    || _|d | _ttd| _tj	
| jdt|  d S )N      ?rU   r   r  rs   r  r+  r   r9  r   r   r   r   r   r;  r  r  r<   r=   rs   z  
   

zScaleNorm.__init__c                 C  (   | j t| j }tj|dd| j | S r   r   r;  r  r   r   r+  rr   rR   r  r<   r<   r=   r        zScaleNorm.forwardr'  r  r<   r<   r  r=   r  y  r  r  c                      r  )RMSNormFc                   sH   t    || _|d | _tt|| _tj	
| jdt|  d S )Nr  r   r  r  r  r<   r=   rs     r  zRMSNorm.__init__c                 C  r  r   r  r   r<   r<   r=   r     r  zRMSNorm.forwardr'  r  r<   r<   r  r=   r    r  r  c                      r  )AdaptiveRMSNormNc                   s>   t    |d | _t||}t||| _tj| jj	 d S Nr  )
r  rs   r+  rF   r  r  r   r   r  r   r  r  r<   r=   rs     s
   


zAdaptiveRMSNorm.__init__c                C  s>   |j dkr
t|d}tj|dd}| |}|| j |d  S )Nr   r  r   r   r   )r   r"   r   r   r  r+  r  r<   r<   r=   r     s
   


zAdaptiveRMSNorm.forwardr@   r  r<   r<   r  r=   r    r  r  c                      r  )SimpleRMSNormc                   s   t    |d | _d S r  )r  rs   r+  )rr   r   rI   r  r<   r=   rs     s   
zSimpleRMSNorm.__init__c                 C  s   t j|dd| j S r   )r   r   r+  r  r<   r<   r=   r        zSimpleRMSNorm.forwardr  r<   r<   r  r=   r    s    r  c                      r  )MultiheadRMSNormc                   s.   t    t|| _tt|d|| _d S ri   )	r  rs   r  rmsnormr   r9  r   r   r  )rr   r   rK  r  r<   r=   rs     s   

zMultiheadRMSNorm.__init__c                 C  s   |  || jd  S Nr   )r  r  r  r<   r<   r=   r     r  zMultiheadRMSNorm.forwardr  r<   r<   r  r=   r    r  r  c                      s2   e Zd ZdZ				d	 fdd	Zdd Z  ZS )
DynamicTanhz" https://arxiv.org/abs/2503.10622 r   r   Fc                   s   t    tt|| _tt|| _tt	|| _
|r#|nd| _t|| _tj| j|r4dn| tj| jdt|  d S )Nr   r   r   )r  rs   r   r9  r   pre_tanh_scaler   r:  r  r   betapre_tanh_scale_offsetr;  gamma_offsetr   r   )rr   r   
init_alphar  r  r  r  r<   r=   rs     s   

zDynamicTanh.__init__c                 C  s.   | j | j }| j| j }||  | | j S r@   )r  r  r  r  r   r  )rr   rR   r  r  r<   r<   r=   r     s   zDynamicTanh.forward)r   r   r   Fr  r<   r<   r  r=   r
    s    r
  c                      r  )
Derfz" https://arxiv.org/abs/2512.10938 r  r   Fc                   sl   t    |r	dnd}tt|| | _tt|| _tt|| | _	tt
|| _|| _d S )Nr   r   )r  rs   r   r9  r   alphasr   r:  r  r   r  scale_offset)rr   r   r  	init_biasr  r  r  r<   r=   rs     s   

zDerf.__init__c                 C  s6   || j | j  | j }t|}|| j| j  | j S r@   )r  r  r  r   erfr  r  r  r<   r<   r=   r     s   
zDerf.forward)r  r   Fr  r<   r<   r  r=   r    s    r  c                      s.   e Zd Zd	 fdd	Zdd Zdd Z  ZS )
ResidualFr   c                   s.   t    |rtt|nd | _|| _d S r@   )r  rs   r   r9  r   r:  residual_scalescale_residual_constant)rr   r   scale_residualr  rI   r  r<   r=   rs     s   

zResidual.__init__c                 C     ||t  fS r@   r   rr   residualr<   r<   r=   prepare  rT   zResidual.preparec                 K  s0   t | jr
|| j }| jdkr|| j }|| S ri   )rC   r  r  )rr   rR   r  rI   r<   r<   r=   r     s
   



zResidual.forward)Fr   r8   r9   r:   rs   r  r   r  r<   r<   r  r=   r        r  c                      r  )		GRUGatingFc                   s<   t    t||| _|rtt|| _d S d | _d S r@   )	r  rs   r   GRUCellgrur9  r   r:  r  )rr   r   r  rI   r  r<   r=   rs     s   
$zGRUGating.__init__c                 C  r  r@   r  r  r<   r<   r=   r    rT   zGRUGating.preparec                 K  s6   t | jr
|| j }| t|dt|d}||S )Nzb n d -> (b n) d)rC   r  r#  r"   
reshape_as)rr   rR   r  rI   gated_outputr<   r<   r=   r     s   


zGRUGating.forwardr'  r  r<   r<   r  r=   r!    r   r!     c                 C  sR   | j }|  } | jdd} t|D ]}tj| ddd} tj| ddd} q| |S )Nrp  r   rU   r   r   )r   r;  r   r  r   r   r   )rG   itersr   r   r<   r<   r=   sinkhorn#  s   
r(  c                      s4   e Zd Zddd fdd
Zdd Zdd	 Z  ZS )
HyperConnectionrU      )num_input_viewssinkhorn_itersc                  s   t    tj|dd| _|| _|| _tt	|| _
t||f}d||| ddf< tt|t|gdd| _tt||| | _tt	dd | _|| _tt|| _tt	dd | _|| _dS )	z
        https://arxiv.org/abs/2409.19606
        Appendix J - Algorithm 2, Dynamic only

        https://arxiv.org/abs/2512.24880
        "Manifold constrained" mixing matrices
        Fr>   r   NrU   r   r<   {Gz?)r  rs   r   r  r  num_residual_streamslayer_indexr9  r   r:  static_betar   r   eyestatic_alphadynamic_alpha_fndynamic_alpha_scaler+  dynamic_beta_fndynamic_beta_scaler,  )rr   r   r/  r.  r+  r,  rI   init_alpha0r  r<   r=   rs   0  s   

zHyperConnection.__init__c                 C  s\  | j }| j}t|d| jd}| |}|| j }|| j }|| j }|dd |f |d|d f }}	| }t|	d|d}	t|	| j	}	t|	d}	t
||	fdd}|| j  d	 }
|
| j }|| j }| d	 }td
||}|dkr|ddd d f |ddd d d f }}n|dd |d d f |d|d d d f }}t|d}||t|dfS )Nz(b s) n d -> b n s dr  .z... (s1 s2) -> ... s1 s2)s2z... s1 s2 -> ... (s1 s2)r   r   r   z... s t, ... s d -> ... t drU   r   z... v d -> v ... d)r  )r+  r.  r"   r  r3  r4  r2  rt  r(  r,  r   r5  r6  r0  r   r   )rr   	residualsviewsstreamsr  	wc_weightdynamic_alphar  alpha_inputalpha_residual	dc_weightdynamic_betar  mix_hbranch_inputr<   r<   r=   r  Y  s.   



"


,.
zHyperConnection.preparec                C  s   t d||| }t|dS )Nzb n d, b n s -> b n s dzb n s d -> (b s) n d)r   r"   )rr   rR   r:  r  r<   r<   r=   r     r  zHyperConnection.forwardr  r<   r<   r  r=   r)  /  s    )*r)  c                      s,   e Zd Z			d fdd	Zdd Z  ZS )DynamicLIMerU   Tc                   sb   t    || _|dk| _t|rt|nd t||| td|d|r)tj	ddnt
 | _d S )NrU   z&... (views layers) -> views ... layers)r;  r   r   )r  rs   
num_layersmultiple_viewsr{   r  r   r  r!   SoftmaxReLU
to_weights)rr   r   rF  	num_viewsr  use_softmaxr  r<   r=   rs     s   



zDynamicLIMe.__init__c                 C  sh   t |st|}|jd | jksJ d| j dt|j d| |}td||}| jr/|S t|dS )Nr   zexpected hiddens to have z layers but received z) instead (first dimension must be layers)zl b n d, v b n l -> v b n dz1 ... -> ...)	r   r   r   rF  rW   rJ  r   rG  r"   )rr   rR   r)   weightsr  r<   r<   r=   r     s   .

zDynamicLIMe.forward)rU   TTr  r<   r<   r  r=   rE    s    rE  c                 C  sL   |dkr| S t || jd }t|r| |d  d} t| || fdddS )Nr   rU   r-  r   rp  r   r   )r   r   rC   r   r   )rG   amountr   r<   r<   r=   shift  s   rP  c                      r  )ShiftTokensc                   s   t    || _t|| _d S r@   )r  rs   rf   rW   shifts)rr   rR  rf   r  r<   r=   rs     s   
zShiftTokens.__init__c           	        s   | dd  | j}t|}|jd | }|j|dd}|d | ||d  }} fddt||D }tg ||R dd}| j|fi |S )Nr   r   r   c                   s   g | ]	}t |d  iqS r   )rP  )r   rH   rS  r<   r=   r    ra   z'ShiftTokens.forward.<locals>.<listcomp>)getrR  rN   r   splitr   r   rf   )	rr   rR   rI   rR  segmentsfeats_per_shiftsplittedsegments_to_shiftr   r<   rS  r=   r     s   zShiftTokens.forwardr  r<   r<   r  r=   rQ    r  rQ  c                      s&   e Zd Zd fddZdd Z  ZS )FoldAxiallyrf   r   c                   s   t    || _|| _d S r@   )r  rs   rf   	axial_dim)rr   r[  rf   r  r<   r=   rs     s   

zFoldAxially.__init__c           
      O  s   | j dkr| j|g|R i |S |jd | j }}t|| | }t|d|| fdd}t|d|d}| j|g|R i |}t|\^}}}	t|d|d}|d d d |f }t|g|R |	}|S )NrU   r   r   z*b (n axial_dim) ... -> (b axial_dim) n ...)r[  z*(b axial_dim) n ... -> b (n axial_dim) ...)	r[  rf   r   rN  r   r   r"   r   r   )
rr   rR   rH   rI   r1  r[  next_multipler  rest_out	tree_specr<   r<   r=   r     s   
zFoldAxially.forwardrf   r   r  r<   r<   r  r=   rZ    s    	rZ  c                      ,   e Zd Z		d	d
 fddZdd Z  ZS )
LayerScaler   Frf   r   c                   sD   t    || _|| _tt|| _tj	
| j|t|  d S r@   )r  rs   r  rf   r   r9  r   r   r  r   r   r;  )rr   rf   r   
init_valuer  r  r<   r=   rs     s
   
zLayerScale.__init__c                 K  sL   | j |fi |}| jt| j }t|tr|| S |^}}|| g|R S r@   )rf   r  r;  r  rV   r   )rr   rR   rI   r  r  r   r<   r<   r=   r     s   
zLayerScale.forward)r   Fr_  r  r<   r<   r  r=   ra    s
    ra  c                      r`  )AdaptiveLayerScaleNr  rf   r   c                   sN   t    || _t||}t||| _tj| jj	 tj
| jj| d S r@   )r  rs   rf   rF   r   r  r  r   r  r   r   r?   )rr   rf   r   r  init_bias_valuer  r<   r=   rs     s   

zAdaptiveLayerScale.__init__c                K  s^   |j dkr
t|d}| j|fi |}| | }t|tr#|| S |^}}|| g|R S )Nr   r  )r   r"   rf   r  rt  rV   r   )rr   rR   r  rI   r  r  r   r<   r<   r=   r   !  s   


zAdaptiveLayerScale.forward)Nr  r_  r  r<   r<   r  r=   rc    s
    rc  c                      s&   e Zd Z fddZdddZ  ZS )ConcatCombinec                   s$   t    || _t|d || _d S r
  )r  rs   prev_layer_indr  combine)rr   r   rf  r  r<   r=   rs   1  s   
zConcatCombine.__init__prev_layersr   c                 C  s$   || j  }t||fdd}| |S r   )rf  r   rg  )rr   rR   rh  skipconcatted_skipr<   r<   r=   r   6  s   

zConcatCombine.forward)rh  r   r  r<   r<   r  r=   re  0  s    re  c                      s*   e Zd Z	dd	 fddZdd Z  ZS )
GLUF
activationr   c                   sF   t    || _t||d | _|rtt|| _	d S d| _	d S )Nr   r   )
r  rs   actr   r  projr9  r   r:  	mult_bias)rr   dim_indim_outrl  ro  r  r<   r=   rs   >  s   
$zGLU.__init__c                 C  s,   |  |jddd\}}|| | | j S )Nr   r   r   )rn  r  rm  ro  )rr   rR   gater<   r<   r=   r   J  s   zGLU.forwardr'  )rl  r   r  r<   r<   r  r=   rk  =  s    rk  c                      sL   e Zd Z													d fdd	Zdd Z	dd	d
Z  ZS )FeedForwardNr  Fr   c                   s   t    t|| }t||}t||sJ t|	r t|	}n|r&t }n|r-t|}n|r4t	
 }nt	 }|rCt||||d}nt	t	j||| d|}t	j||| d}t||
rat|nd t	|||dkrpt	|nd | _|r|t| d S d S )N)ro  r>   r   )r  rs   r6   rF   rm   rC   r   r	  r  r   r  GELUrk  r{   r  r  Dropoutffr   )rr   r   rq  multgluglu_mult_biasswishrelu_squaredsolucustom_activationpost_act_lnr   sublayer_dropoutno_biaszero_init_output	inner_dimrl  proj_inproj_outr  r<   r=   rs   O  s<   





zFeedForward.__init__c                 C  s0   g }|   D ]}t|tjsq||j q|S r@   )r}   rV   r   r  r   r   )rr   rM  mr<   r<   r=   muon_parameters  s   zFeedForward.muon_parametersc                 C  s   |  |}t|r|| }|S r@   )rv  rC   )rr   rR   
deep_embedr  r<   r<   r=   r     s   
zFeedForward.forward)Nr  FFFFFNFr   r   FFr@   )r8   r9   r:   rs   r  r   r  r<   r<   r  r=   rs  N  s$    9rs  c                H      s  e Zd Zedddddddddddddddddddddddddddddd	ddddddddddddddddde dd
dddddddddddddddded	d	d	ddfEd( fddZe 	d)d*ddZd d! Z																					d+d,d&d'Z
  ZS )-	AttentionNrF  Fr   r   rU   
   r   T   g      I@g      .@)enable_flashenable_mathenable_mem_efficientcustom_attn_fnCallable | Nonehybrid_moduleModule | Nonehybrid_mask_kwarg
str | Nonehybrid_fold_axial_dim
int | Nonedata_dependent_alibi_kwargsr   attend_sdp_kwargscG           T        s2  t    t||}G|d | _|| _|| _|| _t|)r"|(r"J dt|*|}*t|)|})|(r0dn|)})t||)s9J |)| _	||) | _
|| }H||) }I|*|) }J|*| }Kd | _d | _d | _|}L|G}M|?rnt|@sfJ t||@| _|@}L|Ar~t|BsvJ t||B| _|B}Mt|Crt|.rJ d|C}.||)|C  }It|||C | _td|Cd| _|?| _|A| _t|L|H| _t|M|I| _t|M|J| _td|d| _td|d| _td|*d| _td| _|>| _|<| _|=| _d | _|rt||K| _|rt j!nt j"| _#tj$%| jj&d	 tj$%| jj'd
 d | _(|r#t||| _(tj$%| j(j&d	 tj$%| j(j'd
 || _)|| _*|| _+|| _,d | _-| _.|rQ|rQt/t01|d|| _-t/t01|)d|| _.|r^t||s^J d|rl|| dkrlJ d|rut2||dnd | _3d }N|3r|sJ d|rJ dt4|||4|6|5d}Nd | _5|/r|0st6nt7}Ot8|||d}P|0r|Pj9|1d |Od5i |P|2| _5t:d5i d|d|d|d|d|	d|d|d|d|d|d|r|n| jd|d |d!|d"|d#| d$|!d%|"d&|#d'|,d(|-d)|d*|7d+|8d,|Nd-|Dd.|Ed/|F| _;|
| _<|
r-t/t01d|dd| _=|| _>|| _?|d	krNt/t0@|)||| _At/t0@|)||| _B|9r`tCt||tD td0ntEd1| _F|| _G|:| _H|;| _I|KtJdtK|:tK|; 9 }Kd }Qd }RtLtM|$}$t|$rt|&rtN|&|$d2}$|'rt||nd }QtOt2||dt2||dg}R|$| _P|R| _Q|Q| _R|%| _St|+|}+|rtCt|K|+d tT nt|K|+| _U|d3krtV|nd | _Wt|.|}.d	|.  k r|ksJ  J |.|k }S|Sr|)|k rJ d4|.| _X|! | _Y|rtZ| jU d S d S )6Nr*  zreither attn_one_kv_head is set to True (in which case kv_heads is set to 1), or attn_kv_heads is set, but not bothrU   zJ`rotate_num_heads` cannot be set when multi-latent attention is being usedb n (h d) -> b h n dr  r   zb h n d -> b n (h d)r   r  zDdimension per attention head must be divisible by the qk norm groupsr   zcthe group dimension may be too small (2 was too small in my tests, but 4 still works, surprisingly))rK  z&CoPE was designed for causal attentionz&CoPE is not flash attention compatible)r   rK  rk  rm  rn  )r   rK  rG  )r  rK  rG  pre_talking_headspost_talking_headspre_scale_post_talking_headsr   sparse_topksparse_topk_straight_throughhardqk_normr+  l2_distancert  gumbel_softmaxgumbel_softmax_tempgumbel_softmax_hard	selective
cog_signedr  add_zero_kvhead_learned_sinkflashsoftclamp_logitslogit_softclamp_valuecopeonnxable
sdp_kwargsflash_pack_seqb n h -> b h n 1r  )r[  rf   r   zqgrouped query attention not compatible with partial rotate heads (decoupled rope for multi-latent attention), yetr<   )[r  rs   rF   r+  rK  rG  max_attend_pastrC   r\   kv_headsr   to_latent_qto_latent_kvto_rotateable_kr  r!   split_rotateable_k_headsuse_latent_quse_latent_kvto_qto_kto_vsplit_q_headssplit_k_headssplit_v_headsmerge_headsqkv_receive_diff_residualslaserlaser_softclamp_value	to_v_gater   r  r   silurt  to_v_gate_activationr   r   r   r?   to_v_head_gater  qk_norm_groupsqk_norm_scaleqk_norm_dim_scaleqk_norm_q_scaleqk_norm_k_scaler9  r   r:  r  value_rmsnormrh  data_dependent_alibir  r  r   updater   attend
head_scalehead_scale_paramsr  
num_mem_kvr   mem_kmem_vr{   Sigmoidro   to_value_residual_mixattn_on_attnorthog_projected_values orthog_projected_values_per_headr   r6   rh   r   rZ  r   r  hybrid_norms
hybrid_mixr  rk  to_outru  r  rotate_num_headscan_cache_kvr   )Trr   r   r  dim_contextrK  rG  r  r  r  r  r  r  r  r  r   r  on_attngate_value_headsswiglu_valuesgate_valuesr  r  r  r  r  r  r  r  r  rt  r  r  r  r  r  r  r  r  r  hybrid_learned_mixone_kv_headr  value_dim_headrq  r  r  r  r  data_dependent_alibi_per_row%data_dependent_alibi_per_row_dim_headr  use_copecope_max_poscope_soft_onehot_poscope_talking_headsr  r  learned_value_residual_mixr  r  r  r  r  r  dim_latent_qr  dim_latent_kvlatent_rope_subheadsr  r  r  dim_kvq_dimk_dimv_dimout_dimdim_q_inputdim_kv_inputr  	dda_klass
dda_kwargsr  r  is_partial_rotate_headsr  r<   r=   rs     s  
M





	
!




,
 zAttention.__init__d   pre_softmax_attnTensor | Intermediatesc           	      C  s|   t |s|j}t|dd}|| jdd }| jj}| jj}|jd |	 }}t
|d|| d}|| || dS )	zN proposed by the Moonshot AI team as a solution for Muon training instability zb h i j -> hr   r   rq  r   zh -> (h expand))expandN)r   r  r$   r   sqrtr  r   r  r   numelr#   mul_)	rr   r  tauattn_logit_maxesqk_weight_scaleq_weightk_weightqk_dimrK  r<   r<   r=   qk_clip_  s   
zAttention.qk_clip_c                 C  s   t | j | j S r@   )r	   r  rZ  r  r%  r<   r<   r=   r    s   zAttention.muon_parametersr   Intermediates | Noneadditional_key_valuestuple[Tensor, Tensor] | Nonec           b   
     s
  |j d |j d | j| j| j| j|jt|| j| jf
\
 }}}}}}}}|r-|r-J d|rC|j	dkr;|j d dks=J |\}}} nt
||}!||!|!}}} t|r\|| }| | } t|rrt||gd\}}"t|| gd\} }#d }$| jr|| |}|r|rJ t| j}%| |}&|%r| |}'| |'}$t|r|j\}(})t|(|&fdd}&t|)rt|)|$fdd}$|r|&|$f}*|& }} | |}+| |},| | }-| |+}+| |,},| |-}-t|$rt|,|$fdd},|-}.t|r| |}/||-|/}-| jrtt| jd	}0t|0|+|,f\}+},| j }1|+| j! }+|,| j" },t#| j$|-}-|st|ri|j\}2}3t|rDt%|,|"d
\}4},t%|-|"d
\}5}-t|2|,fdd},t|3|-fdd}-t|rit|4|,fdd},t|5|-fdd}-|rt|rv|j d nd}6|,d|6d d d f |-d|6d d d f f}*t|r| j&}7|7k }8|\}9}:t|:r|:|:d fnd\};}<|8r|+d d d |7 f |+d d |7 d f }=}+|,d d d |7 f |,d d |7 d f }>},t'|+|9|;}+|r|	\}9}:t|:r|:|:d fnd\}#}<t'|,|9|<},|8rt|=|+fdd}+t|>|,fdd},t|
r(|
\}9}?t(|+|9}+t(|,|9|? },|}@t|@ss|ss|}@t|@s>t|rst|rs||j d }A}6t|s[t)|@|6dfddd}@nt|@skt)|d|Afddd}@nt||@fdd}@t*dd |+|,fD \}B}C|dkrt* fdd| j+| j,fD \}D}E| jrt|D}D|D| j" }Dt|D|,fdd},t|E|-fdd}-t|@rt)|@| jdfddd}@t|r>|,j d }A|\}F}G|Fj d |Fj d }H}I|H|krt-|HsJ t*fdd|,|-|F|GfD \},}-}F}Gt|F|,fdd},t|G|-fdd}-t|@st|r>t|s&t)|@|Idfddd}@nt|@s6t)|d|Afddd}@nt||@fdd}@t.|+}Jg }Kd }Lt|@rVt/|@d}@|K0|@  t|rd|j	  krkdkspJ d J d|j	dkr|t/|d}n|j	dkrt/|d}|K0|  t| j1rt2|C|B |C|d}Mt2|C|d}Nt34d|M|N}O|O| j1k}Pt)|P|dfddd}P|K0|P t5|Kdkrt6|K }Lt|rt|rJ t|rt7|t8sJ d|9|}n||B|C}t)||df}t| j:r| :|}t)||df}| j;rt<|-| j=}-|-> }-| j?|+|,|-|L|||d\}Q}R| j;r,t@|Q}Q|.|R_A|r7|Q| jB }Qt| jCrK| C|}St3Dd |SE |Q}Qt| jFrtG }T| jHsct| jIrc| jI|i}Td!}Ut|rvt|jJrv|jJ}V|Vf}U| jF|g|UR i |T}WtK|W\^}X}Y}#|Xj	dkrt/|Xd"d#}Xt5|YdkrtL|Y}Z|Z|R_J| jM\}[}\|[|Q}Q|\|X}Xt| jNr| N|}]t/|]d$}]|Q|X|]E }Qnd%|Q|X  }Q| O|Q}Qt| jPr| P|}^|Q| Q|^ }Q| jRs| jSr/g }_tT|.d&| jUd'}`| jRrtV|Q|`}a|_0|a | jSr)t/|`d(d#}`t/|Qd(d#}QtV|Q|`}at/|ad)}a|_0|a t|_dd}Q| W|Q}Qt#| jX|Q}Qt|rMt|sMt3Yd*||Qd+}Q|sR|QS |*|R_|Q|RfS ),Nr   rU   zEqkv receiving different sequences can only be used for self attentionr  r  b * drp  r   )r   zb h * d.g      )r   r   r   TrN  c                 s  s    | ]}|j d  V  qdS )rp  N)r   r   rG   r<   r<   r=   r     r   z$Attention.forward.<locals>.<genexpr>c                 3  s    | ]
}t |d  dV  qdS )zh n d -> b h n dr   N)r#   r  r  r<   r=   r     s    c                 3  s(    | ]}t |d  |jd  dV  qdS )zb h ... -> b (r h) ...rU   r  N)r#   r   r  r  r<   r=   r     s   & zb j -> b 1 1 jr   zNattention mask must have greater than 2 dimensions but less than or equal to 4zi j -> 1 1 i jzh i j -> 1 h i jr   zi, j -> 1 1 i jF)r   r   z5only alibi allowed for custom positions at the moment)r   	attn_bias	prev_attnflash_pack_seq_kwargszb n h, b h n d ->b h n dr<   r  r  r  r  zb h n d -> b n (g h d)r   zb n (h d) -> b n h dzb n h d -> b n (h d)zb n, b n d, -> b n dr   )Zr   rK  r  r  r  r   rC   r  r  r   rF   r%   r  r  r  r  r  r   r   r  r  r  r  r  r  r  lerpr  r   r   r  rk   r  r  r  rh   r  r&   r  r  r  r   rW   r  r  r\   r   r"   r   r  r   r^  r_  rN   r   rV   r  r  r  r  r   r  expr  r   r   r  r  multiplyrt  r  r   rG  r  hybrid_hiddenr   rP   r  r  r  r  r  r  r  r#   r   r   r  r  rP  )brr   rR   contextr   context_mask	attn_maskrd  r  rotary_pos_embcontext_rotary_pos_embpolar_pos_embr.  r  memmem_maskreturn_intermediatesr   value_residualr
  additional_key_value_maskkv_input_residualr  r   kv_hr  r  r   has_contextr  is_multi_latent_attnq_inputk_inputv_inputkv_inputmem_packed_shaper   k_sub_headsneeds_k_sub_headslatent_kv_inputrotateable_kcached_latent_kvmaybe_cached_k_sub_headsr   r  r  vorig_valuesvalue_residual_mix	qk_l2normr+  ckcvmkmvmem_lenr  partial_rotate_headsr  
xpos_scaleq_xpos_scalek_xpos_scaleq_restk_restr?   
input_maskr1  r`  ra  r  r  added_kadded_vadded_kv_headsadded_kv_lenr   r   final_attn_maskrange_qrange_kdistmax_attend_past_maskr  intermediates	head_gatehybrid_forward_kwargshybrid_forward_argshybrid_hiddenshybrid_outputs
hybrid_outrest_hybrid_outsr  out_normhybrid_out_normmixr|  orthog_projected
v_for_proj	projectedr<   )r   r  r=   r     s  J



















,
..




"




&




*























zAttention.forward)r  r  r  r  r  r  r  r  r  r   r  r   )r  )r  r  )NNNNNNNNNNNNNFNNNNNN)r   r	  r
  r  )r8   r9   r:   DEFAULT_DIM_HEADr   rs   r   no_gradr  r  r   r  r<   r<   r  r=   r    s      \r  c                H      s  e Zd Zdddddddddddddddddddddddddd	dddddd
dddddddddddddddddddddddddddddddddddde e ddfHd, fddZ	d-d.ddZdd Z																														d/d0d*d+Z  ZS )1AttentionLayersNrF  Fr   Tr  rD  rE  r   r  r   g      >@r   rU   r*  custom_layerstuple[str, ...] | Nonelayers_execute_ordertuple[int, ...] | Nonerel_pos_kwargsr   residual_fn_kwargscJ           |   	     s
  t    |p|}td|J\}K}Jtd|J\}L}Jtd|J\}M}J|Ldt}N|Ldd}O|Ldd}Pt|Jdks?J d	|J  |_|_t	g _
|_|N_|A|>O }A|@dksZJ |@d
k}Q|@_|@d
krntt|@|nd _|Qrw|0rwJ |Ao{|> }Rt	g _|Ar|Rs|>sJ t|:|p|p|$_t||Nd }||NksJ d| d|N |Ir|dk rtd t||$sJ d|r|sJ d|Pr|sJ d|rt|||!| |"dnd _|$rt|N||%nd _t|||OsJ d||ksJ d|Ldd}St|||sJ dd _|r&|SrJ dtdG|Nd ||||d|F_n7|rA|Sr0J dtdG|d ||||d|F_n|r]t||}||ksRJ dt dG||d|F_|.sg|4rgJ d |._!|4_"|+_#|,_$|Sr|+s||,rJ d!|_%t|||	|
|||sJ d"d}Tt||}d
}U|r|}U|rt&}VnA|rt'}Vn;|rt(}Vn5|	r|.sJ d#t)t*|d$}Vn$|
rt+}Vn|rd%}Tt)t,||U d&}Vn|rd%}Tt)t-||U d&}Vnt.}Vt)|V|}W|Ts|rt)|Wd%d'}W|T_/|_0|r|sd(}Xn|r|rd)}Xnd*}X|-rd+|X }Xt|;|s!J d }Yd}Z|;r0t)t1||<d,}Yn|r>t)t2|||U d-}Yd%}Z|Z_3t4|YrR|ZsR|rRt)|Yd%d'}Y|TpV|Z_5t6 _7j5rq|rqt8t9|||U t: _7|7ri |Ld.d%i}Li |Kd.d%i}Kt4|(rt4|'rt4|rJ d/|&rt;g t<t4|'|*|)frJ |&rt4|sJ d0t4|(rJ t=t>t|X| }(d
}d
 t4|'r|'}[nt4|*r0|t|X }\d
|*  k r|\ksJ d1 J d1t=t?t@d2|X}X|\|* }]|\d d3 }^|^|^|]  |] }_t|X|_ksJ d4|Xd+|_t|X   }`|`|] }a|ad+|\t|a   }[n3t4|)rR|)dkr?|)|ksCJ d5d6|) |X||)   d+|)  }[nt4|s[J d7|X| }[t|X |[_At|(t=t>t|[_BtCfd8d9jBD sJ ttDt?tEd:|[_Ft|tjB}|_GtH|8t|[_I|9_JtH|3t|[}3|5_K|6_L|.r|/r|W nt6 _M|=_NjG  }b|=r|bdkrJ d; fd<d9t>|bD }ct	g _O|B_P|BrtjQ||dd=nd _R|CrtjQ|d
dd=nd _S|D_Td%}dd%}e|E|DM }EtUtVjA|3D ]/\}f\}g}htW|f }i|f  }j|ftjAd
 k}k|gd:ko9|Ao9|do8|> }l|gd:krV|EoD|d }mtX|f|||l|m|#d>|L}nd}dn5|gd?krltX|fd@|ii |L|M}nd}en|gd2krtY|fi |K}n|-s~|nntZd|n}nnt[dA|g |hdkr|hd
 }o|s|h nd}pt\t>|p|o|n}nt4|Yr|Y|n}nd }q|>r|fd
 }r|lrd3nd
}st]||r|s|?dB}q|Qrt)t^|@|HdC}t|gd:kr|Rrt)|td3dD}tn|0rt_}tnt`}t|t|f|f|1|2dE|G}ud }v|io|jjGd k}wjNr|wrta||cb }v|.r|W nd }x|4r|W nd }y|.s"|W nd }zt	|x|y|zg}{jOc|v jc|q j
ct	|{|n|ug qtCdFd9 d D _ed S )HNff_attn_cross_attn_r  r  Fr  r   zunrecognized kwargs passed in rU   r   zrotary emb dim z8 must be less than or equal to attention head dimension rD  zNwhen training language model, rotary embedding dimension should be at least 32zQeither rotary positional embedding or polar positional embedding can be turned onz:rotary xpos is not compatible with bidirectional attentionz:block masking only tested for rotary positional embeddings)r  r  r  r  zyou can only choose one of Alibi positional bias, data dependent Alibi (forgetting transformers), dynamic tanh, or T5 relative positional biaszXnumber of relative position buckets must be less than the relative position max distancer  zFyou can only choose up to one of t5, alibi, or dynamic positional biasz?flash attention not compatible with t5 relative positional biasr  )r+  rG  rK  rH  rI  z;flash attention not compatible with dynamic positional biasr  )r   rK  r  rX   r  zAnumber of ALiBi heads must be less than the total number of heads)rK  r  z3sandwich norm cannot be used when not using prenormz9flash attention is not compatible with residual attentionzcyou can only use either scalenorm, rmsnorm, adaptive layernorm, adaptive rmsnorm, or simple rmsnormz*dynamic tanh norm only tested for pre-norm)r  T)r  )r  )acf)rd  re  )rc  re  )re  )r   rb  )r   r  r  zUdepth should not be passed in if using custom layers and custom layer execution orderz7depth must be passed in with `weight_tie_layers` = Truezpar ratio out of rangere  r  z(default block is too large for par_ratioz2sandwich coefficient should be less than the depthrc  z4`depth` must be passed in for `Decoder` or `Encoder`c                   s   g | ]	}|t  jk qS r<   )rN   layer_typesr  r%  r<   r=   r  	  ra   z,AttentionLayers.__init__.<locals>.<listcomp>rc  z7must have depth of at least 2 for unet skip connectionsc                   s   g | ]}|  qS r<   r<   r  )len_default_blockr<   r=   r  
      r>   )rK  rG  r  r  r  rd  rK  zinvalid layer type )rK  rL  )r.  r,  )r+  )r/  r  r  c                 S  s   g | ]
}t |tr|jqS r<   )rV   r  r  )r   moduler<   r<   r=   r  u
      r<   )fr  rs   r   rT  rW  rN   r   r   rG  r   layers
attn_headsattn_dim_headr.  r   r9  r   r   
stream_emblayer_integratorsrF   disable_abs_pos_embr   warningrm   r  r  r  r  rd  rC  r  r  pre_normsandwich_normresidual_attncross_residual_attncross_attendr  r  r  r   r
  r  r  r  r  norm_need_conditionr  ra  rc  post_branch_fn_needs_conditionrC   need_conditionr   adaptive_mlpr{   r  r  anyrk   rW   r  r|   rx   rg  r\  alllistrz   num_attn_layersrX   rY   layer_dropoutscross_attn_tokens_dropoutsoftclamp_outputsoftclamp_output_value
final_norm
unet_skipsskip_combinesreinject_inputr  reinject_input_projlearned_reinject_input_gateadd_value_residual	enumerater   r\   r  rs  r  	ExceptionrQ  rE  r)  r!  r  re  r   r   r}   r  )|rr   r   rX   rK  rG  rw  
only_crossuse_scalenormuse_rmsnormuse_dynamic_tanhuse_derfdynamic_tanh_init_alphause_simple_rmsnormuse_adaptive_layernormuse_adaptive_rmsnormuse_adaptive_layerscalenorm_add_unit_offsetr  adaptive_condition_mlp adaptive_condition_mlp_expansionalibi_pos_biasalibi_num_headsrel_pos_biasrel_pos_num_bucketsrel_pos_max_distancedynamic_pos_biasdynamic_pos_bias_log_distancedynamic_pos_bias_mlp_depthdynamic_pos_bias_normr  rotary_emb_dimrotary_xposrotary_interpolation_factorrotary_xpos_scale_baserotary_base_rescale_factorr  r  polar_bias_uniform_initweight_tie_layersrZ  r\  sandwich_coef	par_ratioru  rv  macaronrs  pre_norm_has_final_normgate_residualr  r  shift_tokensrt  r  r  zero_init_branch_outputlayer_dropoutr  rq  use_layerscalelayerscale_init_valuer  integrate_layerslayer_integrate_use_softmaxr.  r  r  r  r  r  r^  r_  hyper_conn_sinkhorn_itersverboserI   	ff_kwargsattn_kwargscross_attn_kwargsr  r  r  has_hyper_connectionshyper_conn_produce_diff_views
flash_attnrx  dim_condition_mult
norm_classnorm_fndefault_blockpost_branch_fnry  rg  	par_depthpar_attn	depth_cut	par_width	par_blockpar_head	num_skipsskip_indicesis_first_self_attnis_first_cross_attnr   
layer_typelayer_shift_tokensblock_begin	block_indis_last_layerlayer_qkv_receives_diff_view self_attn_learned_value_residualr   shift_range_uppershift_range_lowerlayer_integratenum_layer_hiddenslayer_integrate_num_viewresidual_fnr  skip_combineis_latter_halfpre_branch_normpost_branch_normpost_main_normnormsr  )rh  rr   r=   rs     s  
M
 


""


&$

(








zAttentionLayers.__init__      Y@rI  r(   c                 C  sd   | j | jf}dd t| j | jD }|j}t|t|ksJ t||D ]\}}|j||d q$d S )Nc                 S  s"   g | ]\\}}}}|d v r|qS )rc  rd  r<   )r   r   r   r  r<   r<   r=   r  
  s   " z1AttentionLayers.attn_qk_clip_.<locals>.<listcomp>r  )rl  rg  r   r,   rN   r  )rr   rI  r  layer_and_layer_typesattn_layersattn_intermeds
attn_layer
attn_interr<   r<   r=   attn_qk_clip_w
  s   zAttentionLayers.attn_qk_clip_c                 C  s8   g }|   D ]}t|ttfsq|t|  q|S r@   )r}   rV   r  rs  extendr~  r  )rr   paramsr  r<   r<   r=   r  
  s   zAttentionLayers.muon_parametersr/  r*   r   LayerIntermediates | Nonedeep_embeds_and_ids"tuple[nn.Parameter, Tensor] | Noneself_attn_additional_kv7LayerIntermediates | list[tuple[Tensor, Tensor]] | Noneself_attn_kv_residualscross_attn_kv_residualsc            V        sJ	  | j t|A rJ dt|| jA rJ dt|r&t|s"t|r&J dt|r2t|r2J dt|
s?t|r=|jnd}
t|ro|jd | jksYJ d| j d|jd  |jd	v s`J |jd
krjt|d}| |}t	 } | j
r{| j|d t	 }!| jr|!j|d g }"g }#g }$d }%d }&t|r| nd g| j }t|r| nd g| j }t|	rt|jd |
 |jtjd}'|'|	d k}(t|r||(@ }n|(}t	 })t| jrt|st|d }*t|*r|*jd nd}+t|st|jd |+ |
 |jd|+ }| |}t|r| j sJ | |},|)j||,d t| jr@t|s@t|s;t|jd |
 |jd}| |}d}-g }.t|r| jrRt|rTJ |j}-t|rf|d d d df }|dkr|d d | d f }t|r|\}/}0|0d d | d f }0|/|0f}|j}.|jd }1t|.}2g }/t|r|\}/}0|/|0 }3t|3d}/t|/}4| j}5|5dk}6|6rtd|| j}| j| j| j| j | j!f}7t" | j# t$ fdd|7D }7t|r|rt%|t&rt'|}|rt(|}t)dd t|7D }8||8 d  }d g|8t*|  | }tt"|d}9d }:| j+r1t|r+J | ,|}:nt|rC|jdkr>|nt|d}:t|:rYt| j-rY| -|. };|:|; }:g }<td}=td}>t|rw|jdkrst|d}t|}=t|r|jdkrt|d}t|}>d }?d }@t/t0|7 D ]\}A\}B}C\}D}E}F}G}H|At*| jd k}I|<1| t|Cr|C||<}| j2r|Gdkrt3 |Gk rːq|Bdkr|r|"1| |r|4dnd }J|r|4dnd }K|Bdkr| j2r| j5dkrt6||| j5\}}|F7|\}}L}M|#1| t|Hr|H||#}|D\}N}O}P| jrBt8t9|Nfi | }Nt8t9|Ofi | }Ot8t9|Pfi | }Pt|:rK||: }t|Nrb|N|}|Bdkrbt|Jrb|N|J}Jt9|Efi |!}Ed }Qd }R| j:rt|?rz|?j;}Qt|@r|@j;}R|Bdkr|E|fi d |d!|d"|d#| j<d$|d%|d&|d't=|9d d(|d)|%d*t=|2d d+|Jd,|Kd-|d.t=|=d d/|Qd0|d1d2\}S}Tn1|Bdkr|E|f||||&t=|2d t=|>d |Rd3|)|d2d4\}S}Tn|Bd5kr|E|t=|4d d6}St|?s|Bdkr|T}?t|@s|Bdkr|T}@t|Or"|O|S}S|F|S|Lfi |M}|Bd7v r;|r;|B|T_>|$1|T |BdkrH| j?rH|Tj@}%n|BdkrT| jArT|Tj@}&t|Pr]|P|}q|rg|#1| | jBrqtC|| jD}| jE}U| jrt8t9|Ufi | }U|6rtF|d8d9|5d:}|U|}|s|S t&|"||$|#|1|- d;}$||$fS )<Nz8context must be passed in if cross_attend is set to TruezIcondition needs to be passed in if using adaptive layernorm or vice versaz9attn_mask or mask cannot be used with flash block maskingz4context_mask cannot be used with flash block maskingr   r   z expected condition dimension of z but received >   r   r  r   r  )r  rp  r   r   r-  rU   r   )r  r  zb n l d -> l b n dzb n d, s d -> (b s) n dc                 3  s&    | ] t  fd dD V  qdS )c                 3  s    | ]} | V  qd S r@   r<   r  layer_variabler<   r=   r   S  s    z4AttentionLayers.forward.<locals>.<genexpr>.<genexpr>N)rW   )r   r\  r  r=   r   S  s   $ z*AttentionLayers.forward.<locals>.<genexpr>c                 S  s   g | ]}|d kqS rf  r<   )r   r  r<   r<   r=   r  _  ri  z+AttentionLayers.forward.<locals>.<listcomp>r<   r  r@   z... ->  1 ...r   rc  rd  r   r  r  rd  r.  r  r  r
  r   r  r   r  r  r  r!  r  r  r  T)r  r   r  r  r   r!  r  )r  r  re  )r  r  z(b s) n d -> b n drj   r8  )r)   r+   r,   r-   r7   )Grw  rC   rz  r7   r   r  r   r"   r{  r   rx  r  ry  copyr  r   r   r   r  r  rP   r  rG  r,   iterr.  r^  addro  rg  r  rl  r  rp  rF   r\  rW   rV   r(   r   rc   rj   rN   r  r  r  rt  r  r   r   trainingr   r   r  r  r  rh   r   r  r   rd  rY  r  ru  r  rv  r  r   r  r  r$   )Vrr   rR   r  r   r  r  self_attn_kv_maskr/   	mem_masksr/  seq_pos_offsetr   input_not_include_cache	cache_agereturn_hiddensr  r  r.  context_posr  r  r  additional_kv_maskdetach_additional_kvroute_additional_kv_to_topr  in_attn_condr\  r  r  r  flash_pack_seq_context_kwargsnorm_kwargsblock_forward_kwargsr)   r-   rI  r  prev_cross_attnr  left_pad_maskcross_attn_rotary_pos_emb	maybe_memr8  r  prev_cache_length
attn_cachedeep_embeds	token_idsnext_cache_lengthiter_attn_cachedeep_embeds_across_depthdeep_embeds_iterr<  is_multistreamlayer_variablesnum_self_attnsiter_self_attn_kv
inp_injectinp_inject_gateskip_hiddensself_attn_kv_residuals_itercross_attn_kv_residuals_iterfirst_self_attn_interfirst_cross_attn_interr   r  r  r  blockr  r  layer_integratoris_last	layer_memlayer_mem_maskinner_residualresidual_kwargsrs  r  r  maybe_self_attn_value_residualmaybe_cross_attn_value_residualr  interr  r<   r  r=   r   
  s  & ,






 


















&















<




zAttentionLayers.forward)rZ  r[  r\  r]  r^  r   r_  r   r  rI  r(   )NNNNNNNNNNFrU   FNNNNNNNNFTNNNNNNN)r/  r*   r   r  r  r  r  r  r\  r]  r  r*   r  r*   )	r8   r9   r:   r   rs   r  r  r   r  r<   r<   r  r=   rY    s       JrY  c                         e Zd Z fddZ  ZS )Encoderc                   *   d|vsJ dt  jdddi| d S )NrG  zcannot set causality on encoderFr<   r  rs   rr   rI   r  r<   r=   rs        zEncoder.__init__r8   r9   r:   rs   r  r<   r<   r  r=   r,        r,  c                      r+  )Decoderc                   r-  )NrG  cannot set causality on decoderTr<   r.  r/  r  r<   r=   rs     r0  zDecoder.__init__r1  r<   r<   r  r=   r3    r2  r3  c                      s0   e Zd Z fddZddd fdd
Z  ZS )PrefixDecoderc                   r-  )NrG  r4  Fr<   r.  r/  r  r<   r=   rs     r0  zPrefixDecoder.__init__N)r  prefix_attn_lenc                  s   |j d |j d |j}}}tj||f|tjdd}	|	 }
t|rAt|tr2tj	|f||d}t
||dt|dk }|
|B }
t|rI|
|@ }
t j|g|R d|
i|S )Nr   rU   r  r   zb -> b 1 1 1r  )r   r   r   r:  r   r  rC   rV   r6   fullr   r"   r  r   )rr   rR   r  r6  rH   rI   r   r   r   r{  forwarded_maskprefix_maskr  r<   r=   r   !  s   
zPrefixDecoder.forwardr  r<   r<   r  r=   r5    s
    r5  c                      r+  )CrossAttenderc                   s   t  jdddd| d S )NT)rw  r  r<   r.  r/  r  r<   r=   rs   ;  s   zCrossAttender.__init__r1  r<   r<   r  r=   r:  :  r2  r:  c                	      s>   e Zd Zdddddddde f	d fdd	Zdd
dZ  ZS )AttentionPoolrU   NFrF  r'   r  r   c                   s   t    t||}t|	d}	|	r|dkrJ t||dk}|s&|dks&J tt||d | _|rN|r:J ddd |
 D }
t	|||||d| _
ntd	||||d|
| _
|| _|	| _d S )
NFrU   r-  zZresidual already in effect when doing a full cross attention based transformer for poolingc                 S  s   i | ]
\}}d | |qS )ra  r<   )r   r  r0  r<   r<   r=   r   Y  rk  z*AttentionPool.__init__.<locals>.<dictcomp>)r   cross_attn_dim_contextrX   rK  rn  )r   r  rK  r  r<   )r  rs   rF   r   r9  r   r   queriesr   r:  poolerr  add_residualsqueeze_output)rr   r   num_pooled_tokensr  r?  rX   rK  r  use_transformer_blocksr@  r  r  r<   r=   rs   ?  s   



zAttentionPool.__init__c                 C  sL   |j d }t| jd|d}| j|||d}| jr|| }| jr$t|d}|S )Nr   n d -> b n dr  )r  b 1 d -> b d)r   r#   r=  r>  r?  r@  r"   )rr   r  r   batchr=  pooledr<   r<   r=   r   b  s   

zAttentionPool.forward)r  r   r@   )r8   r9   r:   r   rs   r   r  r<   r<   r  r=   r;  >  s    #r;  c                      s:   e Zd Zddddddd fd	d
Z		dddZ  ZS )ViTransformerWrapperr  NFr   r   )channelsnum_classespost_emb_normnum_register_tokensemb_dropoutr  r,  c                  s   t    t||sJ d|j}	|| d }
||d  }|| _ttd|
|	| _	|dk}|| _
|r>tt||	| _tt|t||	t|	| _|rTt|	nt | _t|| _|| _t|rnt|	|| _d S t | _d S )Nz4image dimensions must be divisible by the patch sizer   rU   r   )r  rs   r\   r   
patch_sizer   r9  r   r   pos_embeddinghas_register_tokensregister_tokensr{   r  r  patch_to_embeddingr   rJ  ru  r   r  rC   mlp_head)rr   
image_sizerM  r  rH  rI  rJ  rK  rL  r   num_patches	patch_dimrO  r  r<   r=   rs   r  s(   

&zViTransformerWrapper.__init__c                 C  s   |j d | j}}t|d||d}| |}|j d }|| jd d d |f  }| |}| |}| jrFt| j	d|d}t
||fd\}}	| |}
| jrVt|
|	d\}
}t||s]J t| jrd|rf|
S |
jdd	}| |}|su|S ||
fS )
Nr   z&b c (h p1) (w p2) -> b (h w) (p1 p2 c))p1p2rU   rC  r  r  rp  r   )r   rM  r"   rQ  rN  rJ  r   rO  r#   rP  r%   r  r&   rm   rC   rR  r   )rr   imgreturn_embeddingsreturn_logits_and_embeddingsr   r   rR   r   r  psembedr   rF  r5   r<   r<   r=   r     s*   





zViTransformerWrapper.forward)r  r,  )FFr  r<   r<   r  r=   rG  q  s    ,rG  c                $      s   e Zd Ze dddddddddddddddddd	ddddddddddddddddd
#d$ fddZdd Z	d%d&ddZdd Zddddddddddddddde ddd	ddde e fd'd"d#Z  Z	S )(TransformerWrapperNr   r   FrU   Tr  r   g-C6?)#embed_num_tokensemb_dimmax_mem_lenshift_mem_downrL  rJ  num_memory_tokens memory_tokens_interspersed_everytie_embedding
logits_dimreturn_only_embednum_output_headsuse_abs_pos_embscaled_sinu_pos_embr  	recyclingtrain_max_recycle_stepsemb_frac_gradientattn_z_loss_weightaverage_pool_embeduse_cls_tokennum_cls_tokens	attn_poolrA  attn_pool_depthdim_pooled_tokenssqueeze_out_last_dimr   mixture_of_softmaxmixture_of_softmax_ksigsoftmax_logitsff_deep_embed	to_logitsadd_continuous_pred_headr  r  rY  r^  dict[str, int]r   TokenEmbedding | Nonery  r  c       &   )        sn  t    |j |j}'t _|_|_|_|_	|_
|_t|s0t||d}|_|dkp=|o<|j  }(|(rFtd_n|rNt_nt||d_d _t|dkrmtfdd| D _d _|#r}tt||' _|_|	rtnt _ t!|_" krt# nt _$|_%&  |dksJ t'||sJ |_(|rt)  nd _*|_+|r|rJ d _,|rtt-| _,tj.j/j,dd d _0|rt1t|  ||j%j2j%j3d_0|_4| _5d _6d _7| r"|dksJ t8t)  |! t9d	|!d
_6t) |!_7|"_:t||dk_;|r6d _<n7|rKt=|tsCJ dfdd_<n"|dkr`t> fddt?|D _<nt|$sjt) n|$_<|%_@|%rt8t)  tA t)  _Bt|
d}
|
_C|
dkrttD|
 _E|_F|_GjCdko| oj%jH_H|(_I|&_Jd S )N)r  r   c                   s$   i | ]\}}| d t | qS )_embed)r   r  )r   namer  )r_  r<   r=   r     s   $ z/TransformerWrapper.__init__.<locals>.<dictcomp>g{Gz?r!  )r   r  rA  rX   rK  r  rU   z... (k d) -> ... k d)r  z0can only tie embedding if using `TokenEmbedding`c                   s   |  j jj  S r@   )r   r  r   rG   r_   r%  r<   r=   r`   i      z-TransformerWrapper.__init__.<locals>.<lambda>c                   s   g | ]}t  qS r<   )r  )r   r   )r   re  r<   r=   r  k      z/TransformerWrapper.__init__.<locals>.<listcomp>)Kr  rs   r   rX   rF   r_  r  rp  r,  r`  ra  r  rC   r  r   rq  ro   r2  r4  r)  embedsrN   r   r   rx  r   r9  r   r:  rl  r  r   rJ  ru  rL  r  project_embr  r&  rm   rj  r  recycled_projrk  	cls_tokenr   r   r#  rq  r;  rm  rn  rn  output_is_log_prob
to_mixturecombine_mixturer{   r!   rw  has_multiple_headsry  rV   r   r  rz  r  to_next_embed_predrb  r   r3   rc  rt  r   can_cache_kv_outside_max_seq_lenr  ))rr   r  r,  r  r^  r_  r`  ra  rL  rJ  rb  rc  rd  re  rf  rg  rh  ri  r  rj  rk  rl  rm  rn  ro  rp  rq  rA  rr  rs  rt  r   ru  rv  rw  rx  ry  rz  r  rX   no_abs_pos_embr  )r   r_  re  rr   r=   rs     s   
*
$



 


zTransformerWrapper.__init__c                 C  sJ   t | jdr| j  | jr!t| jts#tjj	| jj
jdd d S d S d S )Nr&  r   r!  )hasattrr   r&  r  rV   r2  ro   r   r   r#  r  r   r%  r<   r<   r=   r&    s   
zTransformerWrapper.init_r  rI  r(   c                 C  s   | j j||d d S )Nr  )r  r  )rr   rI  r  r<   r<   r=   r    s   z TransformerWrapper.attn_qk_clip_c                 C  s
   | j  S r@   )r  r  r%  r<   r<   r=   r    rt   z"TransformerWrapper.muon_parameters	embed_idsdict[str, Tensor]r   r  c           I   
     sh  t |j}tst|sJ |j|jd dftjdjd jd jjjdkj	|f\ }}}}}|	|
B |B |B |B } |tj
 B |B }d}!t|rZ|rZ|j}!t|oc|jtjk}"|"soj|||!dn|}#jfi ||# tjt|dkA rJ dtjrt|tjksJ | D ]\}$}%|$ d}&|&jv sJ j|& |%}'|' qt|rÈ| t|r|jdd  \}(|(jd ksJ dt|fd	d
t|st|rt | fdd}t | fdd}t||fdd
}|dk r(|dksJ |  d|   })d }*tjrAj|f}*tjrhtjd d}+t|+gd\},t|rhtj|jdfdd}|rڈjd	 }-j}.t|.r|.dksJ tj t!sJ dt"#|. |. }/t$d|/ fd	ddt%d|.dtj&djd d}0t|0fd\}1t|.st|rt$||dfddd}t|.rt%d dj'rt|r|d j' |j'd  }2}3g |3|2}t(d/i |||!||d}j)s+t|r|dksJ dj f|||||*dd|\}4}5nkt |j*r8t+j,d nd }t|rE|dksIJ dt-|D ]H}6|6dk}7|6|d k}8|8r^t.ntj/}9|9 ' |7so0|4 nd}:j |: f||||dd|\}4}5W d    n	1 sw   Y  qM|4|rt|.rt%d|.| dt1|1d\}0|0|5_&t|.rt%d dd d d |-f |5_2|)|5_3j4rt5|ddtjrt1|,d\};|;|5_2jd dkrt%d j j6 }<tj7o|<}=tj7r|s|<rj7|d!}>|>|5_8d }?tj9r2:j;dd
}?9|sPj<rGt=fd"d#j
D }@n	j
fi }@j>r\|@|@? @  }@t|?rtAd$d%d& |@j;dd
}AtBd'|A|?}Bt@|B}@W d    n	1 sw   Y  jCrt=d(d# tD|@D }@j<stE|@}@|r|@f}Cn|r|5f}Cn|r}Cn|=r|@|5_F|>}Cn|@}C|rԈjGsJ H}D|C|D|)ff}C|rtI|@|5_Jd}|rd)d* |5jKD }EtL|E|d+|5_Md}|	r|5jN}Ft|rd,d* tO||FD n|F}Gfd-d*|GD }G|s|C|GfS |G|5_P|r#|C|5fS |
r2d.d* |5jKD }H|C|HfS |CS )0Nr   )r   rU   )r.  r/  r0  z:`embed_num_tokens` must be defined on `TransformerWrapper`r}  r   zJprepended embeddings need to have same dimensions as text model dimensionsrp  r   c                        t j ft jdS Nr  r   r:  r   r<   )r   r   r   r<   r=   r`     r  z,TransformerWrapper.forward.<locals>.<lambda>c                     r  r  r  r<   )r   r   prepend_seqr<   r=   r`     r  z... -> b ...r  r  Tr   zonly for decoderr   rN  zb (n m) d -> (b n) m d)r  rC  z(b n) m d -> b (n m) d)r.  r  r/  r  z you did not train with recycling)r   r/   r  r   r  r  zV`recycle_steps` must be provided on forward if recycling is turned on and not training)r   r/   r  r   r  )r   r   rD  rS  c                 3  s     | ]}|fi  V  qd S r@   r<   )r   rf   )to_logits_kwargsrR   r<   r=   r     s    z-TransformerWrapper.forward.<locals>.<genexpr>r  Fr  z... k d, ... k -> ... dc                 s  s,    | ]}|j d  dkrt|dn|V  qdS )r   rU   z... 1 -> ...N)r   r"   r  r<   r<   r=   r     s   * c                 S     g | ]}|j qS r<   )r  r  r<   r<   r=   r    r  z.TransformerWrapper.forward.<locals>.<listcomp>)r   c                 S  s   g | ]}t |d dqS )rp  r   )r   )r   pairr<   r<   r=   r    r  c                   s*   g | ]}|d  j  dddf  qS r-  )r`  r^   r  r%  r<   r=   r    s   * c                 S  r  r<   )post_softmax_attnr  r<   r<   r=   r    r  r<   )QrF   r  rC   	new_emptyr   r   r  r   rb  rl  ry  r7   r   r2  r   r  rN   r   rJ  r   r^   rL  r  rx  r  r#   r%   r   r   rp  rc  rV   r  r3  rN  r   r   r"   r3   ra  r   rj  r  r   rk  r  r   rX  r  r&   r0   initial_embedrn  r   rG  rq  r2   r  r  r   r  rW   rw  rt  r   r   r   rt  rY   rP   r5   rz  r  r   r4   r,   r   r.   r)   r   r/   )Irr   rR   rY  rZ  r  #return_embeddings_and_intermediatesreturn_logit_entropiesreturn_next_embed_predr   return_memsreturn_attnr/   r  recycle_stepsr.  prepend_embedsprepend_maskr  
sum_embedsreturn_attn_z_lossrm  r/  r   r  token_emb_kwargsr  rI   r  num_memshas_memory_tokensrl  	orig_maskr  r  external_pos_embr2  r~  embed_id	embed_keyr\  prepend_dim
init_embeddeep_embed_and_ids
cls_tokenscls_packed_shapemem_seq	mem_everynext_seq_lenr  r)  mems_lmems_rattendedrI  r`  
first_step	last_stepr  maybe_recycledr0   
is_encoderreturn_pooled_tokensr2   r  r5   r   mosr  next_embed_outr   r)   new_mems	attn_mapsr<   )r   r   r   r  rr   r  rR   r=   r     sP  <














& (










"zTransformerWrapper.forward)r  rY  r^  r{  r   r|  ry  r  r)  r*  )r  r  r   r  )
r8   r9   r:   r   rs   r&  r  r  r   r  r<   r<   r  r=   r]    s     Or]  c                      sD   e Zd Zddddd fdd
Ze dd	d
ZdddZ  ZS )XTransformerFir   r   )tie_token_embignore_index	pad_valuer  c             	     sF  t    td|\}}td|\}}d|vrd|vsJ dtddg|}	|dd|	d< |d	d |	d	< |d
d|	d
< |dd|	d< tddg|}
|dd|
d< |d
d|
d
< |dd|
d< || _tdi |	dtdd|i|d| _tdi |
dt	d|dd|i| _
|r| jj| j
_t| j
||d| _
d S )Nenc_dec_r   zEdimension of either encoder or decoder must be set with `dim` keywordr  r,  rL  r   rb  ri  Frh  T)rf  r  r  )r   rw  )r  r  r<   )r  rs   r   r   r   r  r]  r,  encoderr3  decoderr   r    )rr   r   r  r  r  r  rI   
enc_kwargs
dec_kwargsenc_transformer_kwargsdec_transformer_kwargsr  r<   r=   rs     s8   


zXTransformer.__init__Nc                 K  s.   | j |||dd}| jj||f||d|S )NT)r   r  rY  r  r  )r  r  generate)rr   seq_inseq_out_startr1  r   r  rI   	encodingsr<   r<   r=   r  #  s   zXTransformer.generatec                 C  st   | j ||||dd}t|rt|rt||jd dfddd}| jr0| jdkr0t||| j\}}| j|||d}|S )NT)r   r  r  rY  rp  r   r   rN  r  )r  rC   r   r   r  r  r  r  )rr   srctgtr   r  src_prepend_embedsencr  r<   r<   r=   r   (  s   zXTransformer.forwardr  )NNN)	r8   r9   r:   rs   r   rX  r  r   r  r<   r<   r  r=   r    s    ,r  r@   r   )r   ri   )r   r   )r   r   )r   r(   r   r   r'  )rG   r   r	  )r   r   )r&  )
__future__r   typingr   rN  r  r   r   r   	functoolsr   r   	itertoolsr	   collectionsr
   
contextlibr   dataclassesr   	packagingr   r   	torch.ampr   torch.nn.functionalr   
functionalr   r   r   r   r   r   r   r   torch.utils._pytreer   r   r   torch.nnr   r   r   logurur   x_transformers.attendr   r   %x_transformers.autoregressive_wrapperr    r^  einops.layers.torchr!   einopsr"   r#   r$   r%   r&   rW  r(   r  r  rC   rF   rJ   rP   rS   rY   r\   rc   rh   rm   ro   rx   rz   r{   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r	  r  r  r)  r4  rC  rh  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r
  r  r  r!  r(  r)  rE  rP  rQ  rZ  ra  rc  re  rk  rs  r  rY  r,  r3  r5  r:  r;  rG  r]  r  r<   r<   r<   r=   <module>   s   (






-=3G,.:

 
	
Z
*' S            f3Q    8