o
    i4                     @   s  d Z ddlZddlmZ ddlmZ ddlmZmZ ddl	Z	ddl	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZmZ ddlmZ eeZdadd ZG dd de	jj Z!d,ddZ"d,ddZ#G dd de
j$Z%G dd de
j$Z&G dd deZ'eG dd deZ(eedd G d!d" d"eZ)eed#d G d$d% d%eZ*eG d&d' d'e(Z+ed(d G d)d* d*e(eZ,g d+Z-dS )-zPyTorch RWKV model.    N)	dataclass)Path)OptionalUnion)nn   )GenerationMixin)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringis_bitsandbytes_availableis_ninja_availableis_torch_cuda_availablelogging   )
RwkvConfigc                    s   ddl m} tt jjjd d   fdddD }td ur'tj| kr'd S t	d|  d	 d
dddddd|  g}|d|  |t
 t
jk|da| t_d S )Nr   )loadkernelsrwkvc                    s   g | ]} | qS  r   ).0fkernel_folderr   Z/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/rwkv/modeling_rwkv.py
<listcomp>4       z(load_wkv_cuda_kernel.<locals>.<listcomp>)z
wkv_op.cppzwkv_cuda.cuzwkv_cuda_bf16.cuz2Loading CUDA kernel for RWKV at context length of .z
-res-usagez--maxrregcount 60z--use_fast_mathz-O3z-Xptxas -O3z--extra-device-vectorizationz-DTmax=wkv_)namesourcesverboseextra_cuda_cflags)torch.utils.cpp_extensionr   r   __file__resolveparentrwkv_cuda_kernelmax_seq_lengthloggerinfor   get_verbosityDEBUG)context_lengthload_kernelcuda_kernel_filesflagsr   r   r   load_wkv_cuda_kernel.   s*   	
r2   c                   @   s(   e Zd ZedddZedddZdS )	RwkvLinearAttentionNFc              	   C   s  |  \}}}	|tjkrtd| dtj d||	 t|	d dkr4td| d|	 dt|	d d	|j| _|jjd
ksP|jjd
ksP|jjd
ksP|jjd
krTtdt	
|   }|jt	jkrp| }| }| }| }| }| }t	j|t	jd}
|s|d ur|d u rt	j||	dt	j|jt	jd}|d d d d df  d8  < nt	jdd |D dd }|jt	jkrtj}ntj}||||||
| n|jt	jkrtjntj}||||||
 | |||||
 |d urdd t	j|dddD }|
| j|fS )NzCannot process a batch with z+ tokens at the same time, use a maximum of z with this model.    r   zThe product of batch size (z) and hidden size (z") needs to be a round multiple of r   cudazUCalling the CUDA kernel for wkv attention requires all tensors to be on CUDA devices.memory_formatr   )dtypedevicer7      籡*Gc                 S      g | ]}| d qS r:   )	unsqueezer   sr   r   r   r   }       z/RwkvLinearAttention.forward.<locals>.<listcomp>)dimc                 S   r<   r=   )squeezer?   r   r   r   r      rA   )sizer(   r)   
ValueErrorminr8   input_dtyper9   typetorchexpfloat
contiguousfloat16
empty_likecontiguous_formatzerosfloat32catbfloat16forward_with_state_bf16forward_with_stateforward_bf16forwardsave_for_backwardchunkto)ctx
time_decay
time_firstkeyvaluestatereturn_state
batch_sizeseq_lenhidden_sizeoutputforward_funcr   r   r   rW   O   sd   
 zRwkvLinearAttention.forwardc                 C   s   | j }| j\}}}}}tj|tj|tjkrtjntjd}	tj|tjd}
tj|tjd}tj|tjd}|tjkr>| }|tjkrFt	j
nt	j}||||||| |	|
||
 |	||
|||||d d fS )N)r7   r8   r6   )rG   saved_tensorsrI   rN   rO   rS   rQ   rM   rK   r(   backward_bf16backwardrL   rZ   )r[   g_outputg_staterG   r\   r]   r^   r_   re   g_time_decayg_time_firstg_keyg_valuebackward_funcr   r   r   ri      s@   
zRwkvLinearAttention.backwardNFN)__name__
__module____qualname__staticmethodrW   ri   r   r   r   r   r3   N   s
    >r3   Fc                 C   s  |  \}}}t|}|d u r=tj|d d df tjd}	tj|d d df tjd}
tj|d d df tjdd }n|\}	}
}t|  } t|D ]p}|d d |f  }|d d |f }t||| }t|| }t|| | }||	 ||  }||
 | }|| |j	|d d |f< t||  |}t||  | }t|| }||	 ||  }	||
 | }
|}qL|s|d ur|	|
|g}||fS )Nr   )r8   r;   )
rD   rI   
zeros_likerQ   rJ   rangerK   maximumrZ   r8   )r\   r]   r^   r_   r`   ra   _
seq_lengthre   	num_state	den_state	max_statecurrent_indexcurrent_keycurrent_valuemax_for_outpute1e2	numeratordenominatormax_for_stater   r   r   rwkv_linear_attention_cpu   s4   
"

r   c                 C   s`   t dd | |||fD }|ddk}td u s|s|r&t| |||||dS t| |||||S )Nc                 s   s    | ]	}|j jd kV  qdS )r5   N)r9   rH   )r   tr   r   r   	<genexpr>       z(rwkv_linear_attention.<locals>.<genexpr>r   r`   ra   )anyrD   r(   r   r3   apply)r\   r]   r^   r_   r`   ra   no_cuda	one_tokenr   r   r   rwkv_linear_attention   s
   r   c                       s2   e Zd Zd
 fdd	ZdddZddd	Z  ZS )RwkvSelfAttentionr   c                    sD  t    || _td uotj|jk}t r0t r0|s0zt|j W n t	y/   t
d Y nw || _|j}|jd ur>|jn|}|| _tt|| _tt|| _ttdd|| _ttdd|| _ttdd|| _td| _tj||dd| _tj||dd| _tj||dd| _tj||dd| _d S )Nz9Could not load the custom CUDA kernel for RWKV attention.r   r   r   r   Fbias)super__init__configr(   r)   r.   r   r   r2   	Exceptionr*   r+   layer_idrd   attention_hidden_sizer   	ParameterrI   emptyr\   r]   time_mix_keytime_mix_valuetime_mix_receptance	ZeroPad2d
time_shiftLinearr^   r_   
receptancere   )selfr   r   kernel_loadedrd   r   	__class__r   r   r      s0   
zRwkvSelfAttention.__init__Nc                 C   s  | ddkr|d ur|d d d d d | jf }n| |}|d ur7|d d d d d | jf |d d df< || j |d| j   }|| j |d| j   }|| j |d| j   }| |}| |}t	| 
|}|d ur|d d df |d d d d d | jf< ||||fS Nr   r   r   )rD   r   r   r   r   r   r^   r_   rI   sigmoidr   )r   hiddenr`   shiftedr^   r_   r   r   r   r   extract_key_value  s   
(

(z#RwkvSelfAttention.extract_key_valueFc           	         s    j ||d\}}}}|d urt fdd|dd  D nd }t j j||||d\}}|d urb|d |d d d d d  jf< |d |d d d d d  jf< |d |d	 d d d d  jf<  || |fS )
Nr`   c                 3   s(    | ]}|d d d d  j f V  qd S rr   r   r?   r   r   r   r   #  s   & z,RwkvSelfAttention.forward.<locals>.<genexpr>r:   r   r   r   r      )r   tupler   r\   r]   r   re   )	r   r   r`   	use_cacher   r^   r_   layer_stater   r   r   r   rW   !  s   *
	   zRwkvSelfAttention.forwardr   rr   rq   )rs   rt   ru   r   r   rW   __classcell__r   r   r   r   r      s    
r   c                       s(   e Zd Zd fdd	ZdddZ  ZS )	RwkvFeedForwardr   c                    s   t    || _|| _|j}|jd ur|jnd|j }td| _t	t
dd|| _t	t
dd|| _tj||dd| _tj||dd| _tj||dd| _d S )Nr   r   r   Fr   )r   r   r   r   rd   intermediate_sizer   r   r   r   rI   r   r   r   r   r^   r   r_   )r   r   r   rd   r   r   r   r   r   6  s   
zRwkvFeedForward.__init__Nc                 C   s
  | ddkr|d ur|d d d d d | jf }n| |}|d ur7|d d d d d | jf |d d df< || j |d| j   }|| j |d| j   }tt| |}| 	|}t
| |}|d ur|d d df |d d d d d | jf< || |fS r   )rD   r   r   r   r   rI   squarerelur^   r_   r   r   )r   r   r`   r   r^   r   r_   r   r   r   rW   G  s   
(
(zRwkvFeedForward.forwardr   rr   rs   rt   ru   r   rW   r   r   r   r   r   r   5  s    r   c                       s&   e Zd Z fddZdddZ  ZS )	RwkvBlockc                    sv   t    || _|| _|dkrtj|j|jd| _tj|j|jd| _	tj|j|jd| _
t||| _t||| _d S )Nr   )eps)r   r   r   r   r   	LayerNormrd   layer_norm_epsilonpre_lnln1ln2r   	attentionr   feed_forward)r   r   r   r   r   r   r   \  s   
zRwkvBlock.__init__NFc                 C   s|   | j dkr
| |}| j| |||d\}}|| }| j| ||d\}}|| }||f}|r8||f7 }|S |d7 }|S )Nr   )r`   r   r   rr   )r   r   r   r   r   r   )r   r   r`   r   output_attentionsr   r   outputsr   r   r   rW   j  s   


zRwkvBlock.forward)NFFr   r   r   r   r   r   [  s    r   c                   @   s@   e Zd ZU eed< dZdgZddgZdZdZ	de
jfdd	Zd
S )RwkvPreTrainedModelr   r   r   r\   r]   Tmodulec                    s  t |tr|j}|jj}|jj|j ||d  d||  }tjfddt	D |j
j|j
jd}|ddddf } fddt	 D }tj||jj|jjd}tjdd t	 D |jj|jjdd	 }||j_t|jtd
 | |j_t|||j
_t||d
  |j_t|d	| |j_dS t |tr|j}|jj}|jjd||  }tjfddt	D |j
j|j
jd}|ddddf }t|||j
_t|||j_dS t |tjr5|jjj}d}	d}
|jdur|jj  |d |d krt|d |d  }	|d | jjkr&|d | jjkr&d	}
|	|
9 }	tjj |j|	d dS t |tj!rZ|jjj}dtt"|d |d  }	tjj |j|	d dS t |tj#rp|jj$d |jj  dS dS )zInitialize the weights.r   g      ?c                       g | ]}|  qS r   r   r   ird   r   r   r     r   z5RwkvPreTrainedModel._init_weights.<locals>.<listcomp>r8   r9   Nc                    s,   g | ]}d d| d  dd     qS )   r   gffffff?g?r   )r   h)r   ratio_0_to_1r   r   r     s    c                 S   s   g | ]
}|d  d d  qS )r   r   r   r   r   r   r   r     s    g      ?g333333?c                    r   r   r   r   r   r   r   r     r   r   )gaing-C6?)%
isinstancer   r   r   num_hidden_layersrd   r   rI   tensorrx   r   r8   r9   r\   r]   data	ones_likemathlogpowr   r   r   r   r   weightshaper   zero_sqrt
vocab_sizeinitorthogonal_	Embeddingmaxr   fill_)r   r   r   r   ratio_1_to_almost0time_weightdecay_speedzigzagr   r   scaler   )r   rd   r   r   _init_weights  s~   
	


$
z!RwkvPreTrainedModel._init_weightsN)rs   rt   ru   r   __annotations__base_model_prefix_no_split_modules_keep_in_fp32_modulessupports_gradient_checkpointing_is_statefulr   Moduler   r   r   r   r   r   }  s   
 r   z+
    Class for the RWKV model outputs.
    )custom_introc                   @   sn   e Zd ZU dZdZeej ed< dZ	ee
ej  ed< dZeeejdf  ed< dZeeejdf  ed< dS )
RwkvOutputa  
    state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.
    Nlast_hidden_stater`   .hidden_states
attentions)rs   rt   ru   __doc__r   r   rI   FloatTensorr   r`   listr   r   r   r   r   r   r   r     s   
 r   zK
    Base class for causal language model (or autoregressive) outputs.
    c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeej  ed< dZeeejdf  ed< dZeeejdf  ed< dS )	RwkvCausalLMOutputap  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.
    Nlosslogitsr`   .r   r   )rs   rt   ru   r   r   r   rI   r   r   r   r`   r   r   r   r   r   r   r   r   r     s   
 
r   c                       s   e Zd Z fddZdd Zdd Ze								ddeej	 d	eej	 d
eej
 deeej
  dee dee dee dee deeef fddZdd Zdd Z  ZS )	RwkvModelc                    sd   t    t j j| _t fddt j	D | _
t j| _d| _d| _|   d S )Nc                    s   g | ]}t  |d qS )r   )r   )r   idxr   r   r   r     s    z&RwkvModel.__init__.<locals>.<listcomp>F)r   r   r   r   r   rd   
embeddings
ModuleListrx   r   blocksr   ln_outlayers_are_rescaledgradient_checkpointing	post_initr   r   r   r   r   r     s    zRwkvModel.__init__c                 C      | j S rr   r  r   r   r   r   get_input_embeddings     zRwkvModel.get_input_embeddingsc                 C   
   || _ d S rr   r
  r   new_embeddingsr   r   r   set_input_embeddings     
zRwkvModel.set_input_embeddingsN	input_idsattention_maskinputs_embedsr`   r   r   output_hidden_statesreturn_dictreturnc	                    s  |dur|n| j j}|dur|n| j j}|dur|n| js!| j jnd}|dur)|n| j j}|dur6td | j| jkr@| 	  |durL durLt
d|du rX du rXt
d du ra| | |r|du r d| j j| j jf fddtd	D }|d
  d8  < | jr| jr|rtd d} }	|rdnd}
|rdnd}t| jD ]4\}}||	|||d\}	}}| jr| j jdkr|d | j j dkr|	d }	|r||	f }|r|
|f }
q| |	}	|r||	f }|stdd |	|||
fD S t|	|||
dS )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the last state is returned and can be used to quickly generate the next logits.
        NFz<`attention_mask` was passed, but it is unused in this model.zDYou cannot specify both input_ids and inputs_embeds at the same timez5You have to specify either input_ids or inputs_embedsr   c                    s0   g | ]}t j|d kr jnt j jdqS )r   r   )rI   rP   r8   rQ   r9   r   r  r   r   r   r   I  s    z%RwkvModel.forward.<locals>.<listcomp>   r   gꌠ9Y>)FzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...r   )r`   r   r   r   r:   c                 s   s    | ]	}|d ur|V  qd S rr   r   )r   xr   r   r   r   t  r   z$RwkvModel.forward.<locals>.<genexpr>)r   r`   r   r   )r   r   r  trainingr   use_return_dictr*   warning_oncer  _rescale_layersrE   r  rD   rd   r   rx   r  	enumerater  rescale_everyr  r   r   )r   r  r  r  r`   r   r   r  r  r   all_self_attentionsall_hidden_statesr   blockr   r   r  r   rW     sn   





zRwkvModel.forwardc                 C   sx  | j | j kr	d S | jjdkrt  t| jD ]\}}| jrA|jj	j
dt|| jj   |jjj
dt|| jj   qt|jj	j
drl|jj	j
jdt|| jj   |jjj
jdt|| jj   qt|jj	j
dr| |jj	| | |jj| q|jj	j
dt|| jj   |jjj
dt|| jj   qW d    n1 sw   Y  | j | _ d S )Nr   r:   SCBquant_state)r  r  r   r   rI   no_gradr  r  r   re   r   mul_intr   r_   hasattrr$  div_ _bnb_4bit_dequantize_and_rescale)r   block_idr#  r   r   r   r  }  s&   
 ""$ "zRwkvModel._rescale_layersc                 C   st   t  stdddl}|j|jj|jj}|dt	|| j
j   |jj|ddd|j}t|d| dS )	z
        Perform the dequantization and rescaling of the weights of a given layer. After that operation the layer will
        be quantized again.
        z/Please install bitsandbytes to use this method.r   Nr:   cpuF)requires_gradr   )r   ImportErrorbitsandbytes
functionaldequantize_4bitr   r   r%  r*  r(  r   r   r   
Params4bitrZ   r9   setattr)r   target_layerr,  bnbdequant_weightsquant_weightr   r   r   r+    s   z*RwkvModel._bnb_4bit_dequantize_and_rescale)NNNNNNNN)rs   rt   ru   r   r  r  r   r   rI   
LongTensorr   r   boolr   r   r   rW   r  r+  r   r   r   r   r   r     sD    	

ir   z
    The RWKV Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                       s   e Zd ZdgZ fddZdd Zdd Zdd	d
Ze									dde	e
j de	e
j de	e
j de	ee
j  de	e
j de	e de	e de	e de	e deeef fddZ  ZS )RwkvForCausalLMzhead.weightc                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NFr   )
r   r   r   r   r   r   rd   r   headr  r  r   r   r   r     s   
zRwkvForCausalLM.__init__c                 C   r	  rr   r<  r   r   r   r   get_output_embeddings  r  z%RwkvForCausalLM.get_output_embeddingsc                 C   r  rr   r=  r  r   r   r   set_output_embeddings  r  z%RwkvForCausalLM.set_output_embeddingsNc           	      K   sv   |d ur|d d df  d}|d ur|d u rd|i}nd|i}||d< ||d< | D ]\}}||vr8|||< q,|S )Nr   r  r  r`   r   )r>   items)	r   r  r`   r  r   kwargsmodel_inputsr^   r_   r   r   r   prepare_inputs_for_generation  s   
z-RwkvForCausalLM.prepare_inputs_for_generationr  r  r  r`   labelsr   r   r  r  r  c
              	   K   s   |	dur|	n| j j}	| j|||||||	d}|d }| |}d}|dur3| j||fd| j ji|
}|	sI|f|dd  }|durG|f| S |S t|||j|j|j	dS )aJ  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        use_cache (`bool`, *optional*):
            If set to `True`, the last state is returned and can be used to quickly generate the next logits.
        N)r  r`   r   r   r  r  r   r   r   )r   r   r`   r   r   )
r   r  r   r<  loss_functionr   r   r`   r   r   )r   r  r  r  r`   rD  r   r   r  r  rA  rwkv_outputsr   r   r   re   r   r   r   rW     s@   %	
zRwkvForCausalLM.forward)NNN)	NNNNNNNNN)rs   rt   ru   _tied_weights_keysr   r>  r?  rC  r   r   rI   r9  r   r   r:  r   r   r   rW   r   r   r   r   r   r;    sJ    
	

r;  )r;  r   r   rq   ).r   r   dataclassesr   pathlibr   typingr   r   rI   r   
generationr   modeling_layersr	   modeling_utilsr
   utilsr   r   r   r   r   r   configuration_rwkvr   
get_loggerrs   r*   r(   r2   autogradFunctionr3   r   r   r   r   r   r   r   r   r   r   r;  __all__r   r   r   r   <module>   sR    
 
j
,F&"T .r