import math

import torch
from importlib.util import find_spec
from pathlib import Path
from typing import List, Optional, Tuple, Union

# Module-level handles to the compiled WKV CUDA kernels, set by the loader
# functions below.
wkv_kernel_encoder = None
wkv_kernel_decoder = None


class WKVLinearAttentionEncoder(torch.autograd.Function):
    """WKVLinearAttention function definition."""
    @staticmethod
    def forward(
        ctx,
        time_decay: torch.Tensor,
        time_first: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
    ) -> torch.Tensor:
        """WKVLinearAttention function forward pass.

        Args:
            time_decay: Channel-wise time decay vector. (D_att)
            time_first: Channel-wise time first vector. (D_att)
            key: Key tensor. (B, U, D_att)
            value: Value tensor. (B, U, D_att)

        Returns:
            out: Weighted Key-Value tensor. (B, U, D_att)

        """
        batch, length, dim = key.size()

        assert length <= wkv_kernel_encoder.context_size, (
            f"Cannot process key of length {length} while context_size "
            f"is ({wkv_kernel_encoder.context_size}). Limit should be increased."
        )
        assert batch * dim % min(dim, 32) == 0, (
            f"batch size ({batch}) by dimension ({dim}) should be "
            f"a multiple of {min(dim, 32)}"
        )

        ctx.input_dtype = key.dtype

        # The kernel expects contiguous float32 inputs; the raw time_decay
        # parameter is negated and exponentiated so it acts as a decay rate.
        time_decay = -torch.exp(time_decay.float().contiguous())
        time_first = time_first.float().contiguous()
        key = key.float().contiguous()
        value = value.float().contiguous()

        out = torch.empty_like(key, memory_format=torch.contiguous_format)
        wkv_kernel_encoder.forward(batch, length, dim, time_decay, time_first, key, value, out)

        ctx.save_for_backward(time_decay, time_first, key, value, out)

        return out
tj||
ftj|j|jd}tj||
ftj|j|jd}tj|tjd}tj|tjd}t	
|||||| ||||
 tj|dd}tj|dd}||||fS a  WKVLinearAttention function backward pass.

        Args:
            grad_output: Output gradient. (B, U, D_att)

        Returns:
            grad_time_decay: Gradient for channel-wise time decay vector. (D_att)
            grad_time_first: Gradient for channel-wise time first vector. (D_att)
            grad_key: Gradient for key tensor. (B, U, D_att)
            grad_value: Gradient for value tensor. (B, U, D_att)

        )r   r   devicer   r   )r&   )saved_tensorsr   r   r   emptyr   r   r.   r   r   backwardr   sumr#   r+   r   r   r   r   output
grad_dtyper$   _r&   grad_time_decaygrad_time_firstgrad_key
grad_valuer(   r(   r)   r1   B   H   z"WKVLinearAttentionEncoder.backwardN__name__
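
# The following reference implementation is NOT part of the original module: it
# is a minimal pure-PyTorch sketch of the recurrence the fused WKV CUDA kernel
# above evaluates, mirroring the log-space max trick used by
# SelfAttention.wkv_linear_attention further below. It is meant for
# understanding and CPU debugging, not for training speed.
def naive_wkv(
    time_decay: torch.Tensor,
    time_first: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
) -> torch.Tensor:
    """Sequential WKV reference. key/value: (B, U, D_att), vectors: (D_att)."""
    batch, length, dim = key.size()

    time_decay = -torch.exp(time_decay.float())
    time_first = time_first.float()

    num = key.new_zeros(batch, dim, dtype=torch.float32)
    den = key.new_zeros(batch, dim, dtype=torch.float32)
    max_state = key.new_full((batch, dim), -1e38, dtype=torch.float32)

    out = torch.empty_like(key)

    for t in range(length):
        k_t = key[:, t].float()
        v_t = value[:, t].float()

        # Output at step t: the current token receives the `time_first` bonus.
        max_for_out = torch.maximum(max_state, k_t + time_first)
        e1 = torch.exp(max_state - max_for_out)
        e2 = torch.exp(k_t + time_first - max_for_out)
        out[:, t] = ((e1 * num + e2 * v_t) / (e1 * den + e2)).to(key.dtype)

        # State update: the accumulated state decays by exp(time_decay) per step.
        new_max = torch.maximum(max_state + time_decay, k_t)
        e1 = torch.exp(max_state + time_decay - new_max)
        e2 = torch.exp(k_t - new_max)
        num = e1 * num + e2 * v_t
        den = e1 * den + e2
        max_state = new_max

    return out
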
__module____qualname____doc__staticmethodr   Tensortensorr    r   r1   r(   r(   r(   r)   r	      (    -r	   c                   @   r   )WKVLinearAttentionDecoderr
   r   r   r   r   r   c           	   	   C   s   |  \}}}|tjksJ d| dtj d|| t|d dks3J d| d| dt|d |j| _t|	 
  }|	 
 }|	 
 }|	 
 }tj|tjd	}t||||| | ||||| |S r   )r   wkv_kernel_decoderr   
wkv_kernelr   r   r   r   r   r   r   r   r   r    r!   r"   r(   r(   r)   r       r*   z!WKVLinearAttentionDecoder.forwardr+   c                 C   r,   r-   )r/   r   r   r   r0   r   r   r.   r   rF   r1   r   r2   r3   r(   r(   r)   r1      r;   z"WKVLinearAttentionDecoder.backwardNr<   r(   r(   r(   r)   rE      rD   rE   r   r   c                       ddl m} tdurtj| krdS tddu rtdtj s$tdt	t
 jd   fdd	d
D }dddddd|  g}|d|  |d|da| t_dS )JLoad WKV CUDA kernel.

    Args:
        context_size: Context size.

    r   loadNninja}Ninja package was not found. WKV kernel module can't be loaded for training. Please, 'pip install ninja' in your environment.jCUDA is currently a requirement for WKV kernel loading. Please set your devices properly and launch again.cuda_encoderc                       g | ]} | qS r(   r(   .0fkernel_folderr(   r)   
<listcomp>
      z+load_encoder_wkv_kernel.<locals>.<listcomp>z
wkv_op.cppzwkv_cuda.cu
-res-usage--maxrregcount 60--use_fast_math-O3-Xptxas -O3-DTmax=encoder_wkv_Tnamesourcesverboseextra_cuda_cflags)torch.utils.cpp_extensionrK   r   r   r   ImportErrorr   cudais_availabler   __file__resolveparentr   rK   kernel_fileskernel_cflagsr(   rT   r)   load_encoder_wkv_kernel   6   


def load_decoder_wkv_kernel(context_size: int) -> None:
    """Load WKV CUDA kernel.

    Args:
        context_size: Context size.

    """
    from torch.utils.cpp_extension import load

    global wkv_kernel_decoder

    if wkv_kernel_decoder is not None and wkv_kernel_decoder.context_size == context_size:
        return

    if find_spec("ninja") is None:
        raise ImportError(
            "Ninja package was not found. WKV kernel module can't be loaded for "
            "training. Please, 'pip install ninja' in your environment."
        )

    if not torch.cuda.is_available():
        raise ImportError(
            "CUDA is currently a requirement for WKV kernel loading. "
            "Please set your devices properly and launch again."
        )

    kernel_folder = Path(__file__).resolve().parent / "cuda_decoder"
    kernel_files = [kernel_folder / f for f in ["wkv_op.cpp", "wkv_cuda.cu"]]

    kernel_cflags = [
        "-res-usage",
        "--maxrregcount 60",
        "--use_fast_math",
        "-O3",
        "-Xptxas -O3",
        f"-DTmax={context_size}",
    ]

    wkv_kernel_decoder = load(
        name=f"decoder_wkv_{context_size}",
        sources=kernel_files,
        verbose=True,
        extra_cuda_cflags=kernel_cflags,
    )
    wkv_kernel_decoder.context_size = context_size
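
# Illustrative helper, not part of the original module: both WKV kernels must
# be loaded once (CUDA and ninja are required) before
# WKVLinearAttentionEncoder / WKVLinearAttentionDecoder can be applied. The
# default context sizes below are placeholder assumptions; pick values at
# least as large as the longest sequence you expect to process.
def _warmup_wkv_kernels(encoder_context_size: int = 512, decoder_context_size: int = 512) -> None:
    load_encoder_wkv_kernel(encoder_context_size)
    load_decoder_wkv_kernel(decoder_context_size)
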
Zdededededdf
ddZe	 dej
dej
dej
dej
deej
ej
ej
f deej
eej
ej
ej
f f fddZ  ZS )SelfAttention  SelfAttention module definition.

    Args:
        size: Input/Output size.
        attention_size: Attention hidden size.
        context_size: Context size for WKV kernel.
        block_id: Block index.
        num_blocks: Number of blocks in the architecture.

    r   attention_sizeblock_iddropout_rate
num_blocksr   Nc                    s  t    tjd| _tjt|| _tjt|| _	tjtdd|| _
tjtdd|| _tjtdd|| _tjj||dd| _tjj||dd| _tjj||dd| _tjj||dd| _|| _| |||| tjj|d| _dS )!Construct a SelfAttention object.)r   r      r{   T)bias)pN)super__init__r   nn	ZeroPad2d
time_shift	Parameterr0   r   r   time_mix_keytime_mix_valuetime_mix_receptanceLinearproj_key
proj_valueproj_receptanceproj_outputrw   reset_parametersDropoutdropout)selfr   rv   rw   rx   ry   	__class__r(   r)   r   W  s   
	zSelfAttention.__init__c           
         s0  ||d  d||  }t dd|}t|D ]}|| |dd|f< q fddt D }t j|| jj| jjd}t jdd t D | jj| jjdd }	t  : || j_	t 
| jtd	 |	 | j_	t ||| j_	t ||d	  | j_	t |d| | j_	W d
   d
S 1 sw   Y  d
S )zReset module parameters.

        Args:
            size: Block size.
            attention_size: Attention hidden size.
            block_id: Block index.
            num_blocks: Number of blocks in the architecture.

        r{   g      ?r   c                    s,   g | ]}d d| d  dd     qS )   r{   gffffff?g?r(   )rR   hrv   ratio_0_to_1r(   r)   rV     s    z2SelfAttention.reset_parameters.<locals>.<listcomp>)r   r.   c                 S   s   g | ]
}|d  d d  qS )r{      r(   )rR   ir(   r(   r)   rV     s    g      ?g333333?N)r   onesrangerC   r   r   r.   r   no_graddata	ones_likemathlogpowr   r   r   )
r   r   rv   rw   ry   ratio_1_to_almost0time_weightr   decay_speedzigzagr(   r   r)   r   u  s6   
	"zSelfAttention.reset_parametersr   r   r   r   statec                 C   s   |\}}}t | }t ||| }	t ||	 }
t || |	 }|
| ||  }|
| | }t ||| }t || | }
t || }|| }|
| ||  |
| | |g}||fS )a  Compute WKV with state (i.e.: for inference).

        Args:
            time_decay: Channel-wise time decay vector. (D_att)
            time_first: Channel-wise time first vector. (D_att)
            key: Key tensor. (B, 1, D_att)
            value: Value tensor. (B, 1, D_att)
            state: Decoder hidden states. [3 x (B, D_att)]

        Returns:
            output: Weighted Key-Value. (B, 1, D_att)
            state: Decoder hidden states. [3 x (B, 1, D_att)]

        )r   r   maximum)r   r   r   r   r   r   	num_state	den_state	max_statemax_for_outpute1e2	numeratordenominatormax_for_statewkvr(   r(   r)   wkv_linear_attention  s   
z"SelfAttention.wkv_linear_attention)r=   r>   r?   r@   intr   r   r   r   r   rB   r   r   __classcell__r(   r(   r   r)   rt   K  sP    

class DecoderSelfAttention(SelfAttention):
    """SelfAttention module definition.

    Args:
        size: Input/Output size.
        attention_size: Attention hidden size.
        context_size: Context size for WKV kernel.
        block_id: Block index.
        num_blocks: Number of blocks in the architecture.

    """

    def __init__(
        self,
        size: int,
        attention_size: int,
        context_size: int,
        block_id: int,
        dropout_rate: float,
        num_blocks: int,
    ) -> None:
        """Construct a SelfAttention object."""
        super().__init__(size, attention_size, block_id, dropout_rate, num_blocks)

    def forward(
        self, x: torch.Tensor, state: Optional[List[torch.Tensor]] = None
    ) -> Tuple[torch.Tensor, Optional[List[torch.Tensor]]]:
        """Compute time mixing.

        Args:
            x: SelfAttention input sequences. (B, U, size)
            state: Decoder hidden states. [5 x (B, 1, D_att, N)]

        Returns:
            x: SelfAttention output sequences. (B, U, size)

        """
        shifted_x = self.time_shift(x) if state is None else state[0][..., self.block_id]

        key = x * self.time_mix_key + shifted_x * (1 - self.time_mix_key)
        value = x * self.time_mix_value + shifted_x * (1 - self.time_mix_value)
        receptance = x * self.time_mix_receptance + shifted_x * (1 - self.time_mix_receptance)

        key = self.proj_key(key)
        value = self.proj_value(value)
        receptance = torch.sigmoid(self.proj_receptance(receptance))

        if state is not None:
            state[0][..., self.block_id] = x

            wkv, att_state = self.wkv_linear_attention(
                self.time_decay,
                self.time_first,
                key,
                value,
                tuple(s[..., self.block_id] for s in state[1:4]),
            )

            state[1][..., self.block_id] = att_state[0]
            state[2][..., self.block_id] = att_state[1]
            state[3][..., self.block_id] = att_state[2]
        else:
            wkv = WKVLinearAttentionDecoder.apply(self.time_decay, self.time_first, key, value)

        wkv = self.dropout(wkv)
        x = self.proj_output(receptance * wkv)

        return x, state
zDecoderSelfAttention.forwardNr=   r>   r?   r@   r   r   r   r   rB   r   r   r   r    r   r(   r(   r   r)   r     2    r   c                       r   )EncoderSelfAttentionru   r   rv   r   rw   rx   ry   r   Nc                    r   r   r   r   r   r(   r)   r   &  r   zEncoderSelfAttention.__init__r   r   c           	         r   )
r   Nr{   .c                 3   r   r   r   r   r   r(   r)   r   T  r   z/EncoderSelfAttention.forward.<locals>.<genexpr>r   r   r   r   )r   rw   r   r   r   r   r   r   r   r   r   r   r   r   r	   r   r   r   r   r(   r   r)   r    3  r   zEncoderSelfAttention.forwardr   r   r(   r(   r   r)   r     r   r   )r   r   pathlibr   importlib.utilr   typingr   r   r   r   r   rF   autogradFunctionr	   rE   r   ro   rs   r   Modulert   r   r   r(   r(   r(   r)   <module>   s   oo.. I
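
# Minimal CPU smoke test (illustrative, not part of the original module). A
# single stateful decoding step goes through the pure-PyTorch
# wkv_linear_attention path, so no CUDA kernel build is needed; all sizes are
# arbitrary placeholders.
if __name__ == "__main__":
    batch, size, attention_size, num_blocks = 2, 8, 8, 2

    attention = DecoderSelfAttention(
        size,
        attention_size,
        context_size=64,
        block_id=0,
        dropout_rate=0.0,
        num_blocks=num_blocks,
    )
    state = init_rwkv_state(batch, size, attention_size, num_blocks)

    x = torch.randn(batch, 1, size)
    out, state = attention(x, state=state)

    print(out.shape)  # torch.Size([2, 1, 8])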