import torch
import torch.nn.functional as F
from torch import nn

from ..util import get_logger
from .adaln_zero import AdaLNZero

logger = get_logger()

try:
    from flash_attn import flash_attn_func, flash_attn_with_kvcache

    FLASH_ATTN_AVAILABLE = True
except ImportError:
    FLASH_ATTN_AVAILABLE = False


def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0) -> torch.Tensor:
    """Precompute the complex rotary (RoPE) frequency table for `end` positions."""
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
    t = torch.arange(end, device=freqs.device, dtype=torch.float32)
    freqs = torch.outer(t, freqs)
    # Unit-magnitude complex exponentials: one rotation angle per (position, pair).
    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
    return freqs_cis


def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
    """Reshape `freqs_cis` so it broadcasts over the batch and head dims of `x`."""
    ndim = x.ndim
    assert 0 <= 1 < ndim
    assert freqs_cis.shape == (x.shape[1], x.shape[-1])
    shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
    return freqs_cis.view(*shape)


def apply_rotary_emb(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
    """Apply rotary embeddings to `x` of shape (bsz, seqlen, n_heads, head_dim)."""
    x_ = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
    freqs_cis = reshape_for_broadcast(freqs_cis, x_)
    x_out = torch.view_as_real(x_ * freqs_cis).flatten(3)
    return x_out.type_as(x)
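
# A minimal sketch (comments only, shapes assumed for illustration) of how the
# rotary helpers compose; the layout matches what Attention.forward passes in:
#
#   freqs_cis = precompute_freqs_cis(dim=64, end=1024)   # (1024, 32), complex64
#   q = torch.randn(2, 16, 8, 64)                        # (bsz, seqlen, n_heads, head_dim)
#   q_rot = apply_rotary_emb(q, freqs_cis[:16])          # same shape, positions 0..15 rotated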


class Attention(nn.Module):
    def __init__(
        self,
        dim: int,
        n_heads: int,
        dropout: float,
        window_size: int | None,
        qkv_bias: bool = False,
        proj_bias: bool = False,
        use_flash_attention: bool = False,
        causal: bool = False,
    ):
        super().__init__()
        self.n_heads = n_heads
        self.head_dim = dim // n_heads
        self.wq = nn.Linear(dim, n_heads * self.head_dim, bias=qkv_bias)
        self.wk = nn.Linear(dim, n_heads * self.head_dim, bias=qkv_bias)
        self.wv = nn.Linear(dim, n_heads * self.head_dim, bias=qkv_bias)
        self.wo = nn.Linear(n_heads * self.head_dim, dim, bias=proj_bias)
        self.scale = self.head_dim**-0.5
        self.dropout = dropout
        self.use_local_attention = window_size is not None
        if self.use_local_attention:
            assert window_size % 2 == 1, "Window size must be odd for local attention."
            self.window_per_side = window_size // 2
        self.use_flash_attention = use_flash_attention
        self.causal = causal

    def create_mask(
        self, bsz: int, seqlen: int, mask: torch.Tensor | None, device: torch.device
    ) -> torch.Tensor | None:
        """Create attention mask combining provided mask and local attention constraints"""
        if not self.use_local_attention and not self.causal and mask is None:
            return None
        attn_mask = torch.ones((seqlen, seqlen), dtype=torch.bool, device=device)
        if self.causal:
            attn_mask = torch.tril(attn_mask)
        if self.use_local_attention:
            # Keep only the band of width `window_per_side` on each side of the diagonal.
            attn_mask = torch.triu(attn_mask, diagonal=-self.window_per_side)
            attn_mask = torch.tril(attn_mask, diagonal=self.window_per_side)
        attn_mask = attn_mask.unsqueeze(0).expand(bsz, -1, -1)
        if mask is not None:
            assert (
                mask.shape[-2] == seqlen and mask.shape[-1] == seqlen
            ), "Mask must be square and match sequence length."
            if mask.dim() == 2:
                mask = mask.unsqueeze(0).expand(bsz, -1, -1)
            attn_mask = attn_mask & mask
        attn_mask = attn_mask.unsqueeze(1).expand(-1, self.n_heads, -1, -1)
        return attn_mask

    def forward(
        self,
        x: torch.Tensor,
        freqs_cis: torch.Tensor | None = None,
        mask: torch.Tensor | None = None,
        return_kv: bool = False,
    ) -> torch.Tensor | tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        """Forward pass for multi-head attention.
        Args:
            x (torch.Tensor): Input tensor of shape (bsz, seqlen, dim).
            freqs_cis (torch.Tensor, optional): Precomputed rotary frequencies.
            mask (torch.Tensor, optional): Attention mask.
            return_kv (bool): Whether to return KV pairs for caching.
        Returns:
            output (torch.Tensor): Output tensor of shape (bsz, seqlen, dim).
            new_kv (tuple, optional): KV pairs if return_kv is True.
        """
        bsz, seqlen, _ = x.shape
        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
        xq = xq.view(bsz, seqlen, self.n_heads, self.head_dim)
        xk = xk.view(bsz, seqlen, self.n_heads, self.head_dim)
        xv = xv.view(bsz, seqlen, self.n_heads, self.head_dim)

        if freqs_cis is not None:
            xq = apply_rotary_emb(xq, freqs_cis[:seqlen])
            xk = apply_rotary_emb(xk, freqs_cis[:seqlen])

        if self.use_flash_attention and FLASH_ATTN_AVAILABLE:
            assert mask is None, "Flash attention does not support arbitrary masking."
            window_size = (
                (self.window_per_side, self.window_per_side) if self.use_local_attention else (-1, -1)
            )
            output = flash_attn_func(
                xq,
                xk,
                xv,
                dropout_p=self.dropout if self.training else 0.0,
                softmax_scale=self.scale,
                window_size=window_size,
                causal=self.causal,
            )
        else:
            attn_mask = self.create_mask(bsz, seqlen, mask, x.device)
            output = F.scaled_dot_product_attention(
                xq.transpose(1, 2),
                xk.transpose(1, 2),
                xv.transpose(1, 2),
                attn_mask=attn_mask,
                dropout_p=self.dropout if self.training else 0.0,
                scale=self.scale,
            ).transpose(1, 2)

        output = output.contiguous().view(bsz, seqlen, -1)
        output = self.wo(output)
        if return_kv:
            return output, (xk, xv)
        return output

    def forward_with_cache(
        self,
        x: torch.Tensor,
        kv_cache: tuple[torch.Tensor, torch.Tensor],
        freqs_cis: torch.Tensor,
        start_pos: int,
    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        """
        Forward pass with KV cache for efficient inference. Only used for inference.

        Args:
            x (torch.Tensor): Input tensor for the current step. Shape: (bsz, 1, dim)
            kv_cache: A tuple of (key_cache, value_cache) from previous steps.
            freqs_cis (torch.Tensor): Precomputed rotary frequencies.
            start_pos (int): The starting position of the new token in the sequence.

        Returns:
            output (torch.Tensor): Output tensor after attention. Shape: (bsz, 1, dim)
            new_kv (tuple): Updated KV cache including the new key and value.
        """
        bsz, seqlen, _ = x.shape
        assert seqlen == 1, "KV cache method is designed for single-token generation."
        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
        xq = xq.view(bsz, seqlen, self.n_heads, self.head_dim)
        xk = xk.view(bsz, seqlen, self.n_heads, self.head_dim)
        xv = xv.view(bsz, seqlen, self.n_heads, self.head_dim)

        # Rotate only the new position; cached keys were rotated when first computed.
        xq = apply_rotary_emb(xq, freqs_cis[start_pos : start_pos + seqlen])
        xk = apply_rotary_emb(xk, freqs_cis[start_pos : start_pos + seqlen])

        k_cache, v_cache = kv_cache
        xk = torch.cat([k_cache, xk], dim=1)
        xv = torch.cat([v_cache, xv], dim=1)
        new_kv = (xk, xv)

        if self.use_flash_attention and FLASH_ATTN_AVAILABLE:
            output = flash_attn_func(xq, xk, xv, softmax_scale=self.scale)
        else:
            output = F.scaled_dot_product_attention(
                xq.transpose(1, 2),
                xk.transpose(1, 2),
                xv.transpose(1, 2),
                scale=self.scale,
            ).transpose(1, 2)

        output = output.contiguous().view(bsz, seqlen, -1)
        return self.wo(output), new_kv


class FeedForward(nn.Module):
    def __init__(self, dim: int, hidden_dim: int, multiple_of: int, ffn_dim_multiplier: float | None):
        super().__init__()
        # SwiGLU sizing: shrink to 2/3, apply the optional multiplier, then round
        # up to the nearest multiple of `multiple_of`.
        hidden_dim = int(2 * hidden_dim / 3)
        if ffn_dim_multiplier is not None:
            hidden_dim = int(ffn_dim_multiplier * hidden_dim)
        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
        self.w3 = nn.Linear(dim, hidden_dim, bias=False)

    def forward(self, x):
        return self.w2(F.silu(self.w1(x)) * self.w3(x))
edededB dededef fddZ				ddejdejdB dejdB dejdB dede	ejejf dB dedB deje	eje	ejejf f B fddZ
  ZS )TransformerBlockNFr
   r3   r6   r7   r5   rw   rx   r4   norm_epsadanorm_condition_dimr8   use_adaln_zeror9   c              
      s   t    t||||||||d| _t|d| ||d| _|| _| jr?|
d us+J dt||
|	dd| _t||
|	dd| _	d S t
j||	d| _t
j||	d| _	d S )N)r
   r3   r4   r5   r8   r6   r7   r9      )r
   rv   rw   rx   3condition_dim must be provided when using AdaLNZeroTepsreturn_gater   )r<   r=   r2   	attentionru   feed_forwardr   r   attention_normffn_normr   	LayerNorm)rG   r
   r3   r6   r7   r5   rw   rx   r4   r   r   r8   r   r9   rH   r   r   r=     s0   
zTransformerBlock.__init__r   r   rL   	conditionrW   rg   rh   r(   c                 C   s  | j r|dusJ d| j||d\}}	n| |}d}
|dur0|dur0| j||||\}}
n|r>| j|||dd\}}
n| |||}| j rO||	|  }n|| }| j r`| j||d\}}n| |}| |}| j rt|||  }n|| }|
dur||
fS |S )a  
        Forward pass for a single Transformer block.
        Args:
            x (torch.Tensor): Input tensor of shape (bsz, seqlen, dim).
            freqs_cis (torch.Tensor, optional): Precomputed rotary frequencies.
            mask (torch.Tensor, optional): Attention mask.
            condition (torch.Tensor, optional): Conditioning tensor for AdaLNZero.
            return_kv (bool): Whether to return KV pairs for caching.
            kv_cache (tuple, optional): KV cache for efficient inference.
            start_pos (int, optional): Starting position for KV cache.
        Returns:
            out (torch.Tensor): Output tensor of shape (bsz, seqlen, dim).
            new_kv (tuple, optional): New KV pairs if return_kv is True or kv_cache is provided.
        N/condition must be provided when using AdaLNZeror   TrW   )r   r   r   rm   r   r   )rG   r   r   rL   r   rW   rg   rh   attn_normed	attn_gaterl   attn_outh
ffn_normedffn_gateffn_outoutr   r   r   rf   9  s.   


zTransformerBlock.forward)NFFF)NFNN)rn   ro   rp   rq   rP   r   r=   r   rr   rs   rf   rt   r   r   rH   r   r~     sj    	
2	r~   c                )       sV  e Zd Z														
							d1dededededededB dededB dededededededB dedB dedB dedededef( fdd Zed!efd"d#Zd$e	j
fd%d&Zd$e	j
fd'd(Z					d2d)ejd*ejdB d+ejdB d,ed-eeejejf  dB d.edB d!ejeejeeejejf  f B fd/d0Z  ZS )3Transformer       FN   皙?h㈵>T    A   r
   n_layersr3   r6   r7   r5   rw   rx   r4   r   use_rope
rope_thetamax_seq_len	input_dim
output_dimr   r8   r   use_xavier_initr9   c                    s  t    || _|| _|| _|| _t | _t	|D ]}| j
t||||||	|||
||||d q| jrG|d us=J dt|||
dd| _ntj||
d| _|d urYt||nt | _|d urht||nt | _|d urs|n|| _|rt|| |d || _td| d| d	| d
| d| jj 
 nd | _|d urtd|  | jrtd|  |rtd |rtd | | j | | j d S d S )N)r
   r3   r5   rw   rx   r4   r6   r7   r   r   r8   r   r9   r   Fr   r   r   zUsing RoPE with theta=z, max_seq_len=z, dim=z
, n_heads=z, freqs_cis shape=z'Using local attention with window size z0Using AdaLNZero conditioning with condition_dim=z@Using Flash Attention for memory-efficient attention computationz-Using Xavier initialization for linear layers)r<   r=   r
   r3   r   r   r   
ModuleListlayersrangeappendr~   r   normr   r?   Identity
input_projoutput_projoutput_dim_r   r   loggerdebugr$   apply_init_weights_init_adaln_zero)rG   r
   r   r3   r6   r7   r5   rw   rx   r4   r   r   r   r   r   r   r   r8   r   r   r9   layer_idrH   r   r   r=   ~  sl   



zTransformer.__init__r(   c                 C   s   | j S r|   )r   )rG   r   r   r   r     s   zTransformer.output_dimmodulec                 C   s>   t |tjrtj|j |jd urtj|j d S d S d S r|   )
isinstancer   r?   initxavier_normal_weightr;   zeros_rG   r   r   r   r   r     s   
zTransformer._init_weightsc                 C   s:   t |trtj|jd j tj|jd j d S d S )Nr   )r   r   r   r   r   condition_projr   r;   r   r   r   r   r     s   
zTransformer._init_adaln_zeror   rL   r   rW   rg   rh   c              	   C   st  |j \}}}	| jr|dusJ d| jdurR|dur|d n|}
|
| jj d krFtd|
 d| jj d  d t| j| j |
d | j| _| j	|j
| _| j}nd}| |}g }t| jD ]:\}}|dur|dur||||||| |d	\}}|| q`|r|||||d
d\}}|| q`|||||}q`| jr| j||d\}}n| |}| |}|r||fS |S )a  
        Forward pass for the Transformer model.
        Args:
            x (torch.Tensor): Input tensor of shape (bsz, seqlen, input_dim).
            mask (torch.Tensor, optional): Attention mask.
            condition (torch.Tensor, optional): Conditioning tensor for AdaLNZero.
            return_kv (bool): Whether to return KV pairs for caching.
            kv_cache (list, optional): List of KV caches for each layer for efficient inference.
            start_pos (int, optional): Starting position for KV cache.
        Returns:
            output (torch.Tensor): Output tensor of shape (bsz, seqlen, output_dim).
            new_kv_list (list, optional): List of new KV pairs for each layer if return_kv is True or kv_cache is provided.
        Nr   r   r   zInput sequence length z! exceeds precomputed RoPE length z. Recomputing freqs_cis.r   )rg   rh   Tr   r   )r$   r   r   r   warningr   r
   r3   r   tor   r   r%   r   r   r   r   )rG   r   rL   r   rW   rg   rh   rJ   rK   _dimexpected_lenr   new_kv_listr   layerrl   ra   re   r   r   r   rf     s<   



zTransformer.forward)r   r   r   FFNr   Nr   r   Tr   r   NNNFFTF)NNFNN)rn   ro   rp   rq   rP   r   r=   propertyr   r   Moduler   r   r   rr   listrs   rf   rt   r   r   rH   r   r   }  s    	