"""PyTorch RWKV model."""

import math
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...generation import GenerationMixin
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    auto_docstring,
    is_bitsandbytes_available,
    is_ninja_available,
    is_torch_cuda_available,
    logging,
)
from .configuration_rwkv import RwkvConfig


logger = logging.get_logger(__name__)

rwkv_cuda_kernel = None


def load_wkv_cuda_kernel(context_length):
    from torch.utils.cpp_extension import load as load_kernel

    global rwkv_cuda_kernel

    kernel_folder = Path(__file__).resolve().parent.parent.parent / "kernels" / "rwkv"
    cuda_kernel_files = [kernel_folder / f for f in ["wkv_op.cpp", "wkv_cuda.cu", "wkv_cuda_bf16.cu"]]

    # Only load the kernel if it's not been loaded yet or if we changed the context length
    if rwkv_cuda_kernel is not None and rwkv_cuda_kernel.max_seq_length == context_length:
        return

    logger.info(f"Loading CUDA kernel for RWKV at context length of {context_length}.")

    flags = [
        "-res-usage",
        "--maxrregcount 60",
        "--use_fast_math",
        "-O3",
        "-Xptxas -O3",
        "--extra-device-vectorization",
        f"-DTmax={context_length}",
    ]
    rwkv_cuda_kernel = load_kernel(
        name=f"wkv_{context_length}",
        sources=cuda_kernel_files,
        verbose=(logging.get_verbosity() == logging.DEBUG),
        extra_cuda_cflags=flags,
    )
    rwkv_cuda_kernel.max_seq_length = context_length

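# Note: the compiled kernel is specialized for a fixed maximum sequence length (the `-DTmax` flag above) and cached
# in the module-level `rwkv_cuda_kernel`; requesting a different context length triggers a fresh compilation.
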
class RwkvLinearAttention(torch.autograd.Function):
    @staticmethod
    def forward(ctx, time_decay, time_first, key, value, state=None, return_state=False):
        batch_size, seq_len, hidden_size = key.size()
        if seq_len > rwkv_cuda_kernel.max_seq_length:
            raise ValueError(
                f"Cannot process a batch with {seq_len} tokens at the same time, use a maximum of "
                f"{rwkv_cuda_kernel.max_seq_length} with this model."
            )
        if batch_size * hidden_size % min(hidden_size, 32) != 0:
            raise ValueError(
                f"The product of batch size ({batch_size}) and hidden size ({hidden_size}) needs to be a round "
                f"multiple of {min(hidden_size, 32)}."
            )

        ctx.input_dtype = key.dtype

        if (
            time_decay.device.type != "cuda"
            or time_first.device.type != "cuda"
            or key.device.type != "cuda"
            or value.device.type != "cuda"
        ):
            raise ValueError("Calling the CUDA kernel for wkv attention requires all tensors to be on CUDA devices.")

        time_decay = -torch.exp(time_decay.float().contiguous())
        if key.dtype == torch.float16:
            time_first = time_first.float()
            key = key.float()
            value = value.float()
        time_first = time_first.contiguous()
        key = key.contiguous()
        value = value.contiguous()
        # The CUDA kernel will fill this tensor.
        output = torch.empty_like(key, memory_format=torch.contiguous_format)
        if return_state or state is not None:
            if state is None:
                state = torch.zeros(
                    batch_size,
                    hidden_size,
                    3,
                    dtype=torch.float32,
                    device=key.device,
                    memory_format=torch.contiguous_format,
                )
                state[:, :, 2] -= 1e38
            else:
                state = torch.cat([s.unsqueeze(2) for s in state], dim=2).contiguous()
            if key.dtype == torch.bfloat16:
                forward_func = rwkv_cuda_kernel.forward_with_state_bf16
            else:
                forward_func = rwkv_cuda_kernel.forward_with_state
            forward_func(time_decay, time_first, key, value, output, state)
        else:
            forward_func = rwkv_cuda_kernel.forward_bf16 if key.dtype == torch.bfloat16 else rwkv_cuda_kernel.forward
            forward_func(time_decay, time_first, key, value, output)

        ctx.save_for_backward(time_decay, time_first, key, value, output)

        if state is not None:
            state = [s.squeeze(2) for s in torch.chunk(state, 3, dim=2)]

        return output.to(ctx.input_dtype), state

    @staticmethod
    # g stands for grad
    def backward(ctx, g_output, g_state=None):
        input_dtype = ctx.input_dtype

        time_decay, time_first, key, value, output = ctx.saved_tensors
        # The CUDA kernel will fill those tensors.
        g_time_decay = torch.empty_like(
            time_decay,
            memory_format=torch.contiguous_format,
            dtype=torch.bfloat16 if input_dtype == torch.bfloat16 else torch.float32,
        )
        g_time_first = torch.empty_like(time_first, memory_format=torch.contiguous_format)
        g_key = torch.empty_like(key, memory_format=torch.contiguous_format)
        g_value = torch.empty_like(value, memory_format=torch.contiguous_format)

        if input_dtype == torch.float16:
            g_output = g_output.float()
        backward_func = rwkv_cuda_kernel.backward_bf16 if input_dtype == torch.bfloat16 else rwkv_cuda_kernel.backward
        backward_func(
            time_decay,
            time_first,
            key,
            value,
            output,
            g_output.contiguous(),
            g_time_decay,
            g_time_first,
            g_key,
            g_value,
        )

        return (
            g_time_decay.to(input_dtype),
            g_time_first.to(input_dtype),
            g_key.to(input_dtype),
            g_value.to(input_dtype),
            None,
            None,
        )

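# In the notation of the RWKV-4 paper, writing w = exp(time_decay) (so the per-step log-space decay applied below is
# -w) and u = time_first, the WKV operator computed by the loop below is, per channel and per timestep t:
#
#     wkv_t = ( sum_{i < t} e^{k_i - (t - 1 - i) * w} * v_i  +  e^{u + k_t} * v_t )
#             / ( sum_{i < t} e^{k_i - (t - 1 - i) * w}       +  e^{u + k_t} )
#
# The running sums are kept in a max-shifted form (num_state, den_state, max_state), in the same spirit as a
# streaming log-sum-exp, so the exponentials stay bounded even for long sequences.
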
tj|d d df tjdd }n|\}	}
}t|  } t|D ]p}|d d |f  }|d d |f }t||| }t|| }t|| | }||	 ||  }||
 | }|| |j	|d d |f< t||  |}t||  | }t|| }||	 ||  }	||
 | }
|}qL|s|d ur|	|
|g}||fS )Nr   )r8   r;   )
rE   rJ   
zeros_likerR   rK   rangerL   maximumr[   r8   )r]   r^   r_   r`   ra   rb   _
seq_lengthrf   	num_state	den_state	max_statecurrent_indexcurrent_keycurrent_valuemax_for_outpute1e2	numeratordenominatormax_for_stater   r   r   rwkv_linear_attention_cpu   s4   
"

r   c                 C   s`   t dd | |||fD }|ddk}td u s|s|r&t| |||||dS t| |||||S )Nc                 s   s    | ]	}|j jd kV  qdS )r5   N)r9   rI   )r   tr   r   r   	<genexpr>       z(rwkv_linear_attention.<locals>.<genexpr>r   ra   rb   )anyrE   r(   r   r3   apply)r]   r^   r_   r`   ra   rb   no_cuda	one_tokenr   r   r   rwkv_linear_attention   s
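# A quick shape sanity check for the CPU path (illustrative sketch; the shapes below are arbitrary, not a
# requirement of any checkpoint):
#
#     key = torch.randn(1, 4, 8)      # (batch_size, seq_len, hidden_size)
#     value = torch.randn(1, 4, 8)
#     out, state = rwkv_linear_attention_cpu(torch.zeros(8), torch.zeros(8), key, value, return_state=True)
#     # out: (1, 4, 8); state: three (1, 8) tensors (numerator, denominator, running max)
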
class RwkvSelfAttention(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        self.config = config
        kernel_loaded = rwkv_cuda_kernel is not None and rwkv_cuda_kernel.max_seq_length == config.context_length
        if is_ninja_available() and is_torch_cuda_available() and not kernel_loaded:
            try:
                load_wkv_cuda_kernel(config.context_length)
            except Exception:
                logger.info("Could not load the custom CUDA kernel for RWKV attention.")
        self.layer_id = layer_id
        hidden_size = config.hidden_size
        attention_hidden_size = (
            config.attention_hidden_size if config.attention_hidden_size is not None else hidden_size
        )
        self.attention_hidden_size = attention_hidden_size

        self.time_decay = nn.Parameter(torch.empty(attention_hidden_size))
        self.time_first = nn.Parameter(torch.empty(attention_hidden_size))

        self.time_mix_key = nn.Parameter(torch.empty(1, 1, hidden_size))
        self.time_mix_value = nn.Parameter(torch.empty(1, 1, hidden_size))
        self.time_mix_receptance = nn.Parameter(torch.empty(1, 1, hidden_size))

        self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))
        self.key = nn.Linear(hidden_size, attention_hidden_size, bias=False)
        self.value = nn.Linear(hidden_size, attention_hidden_size, bias=False)
        self.receptance = nn.Linear(hidden_size, attention_hidden_size, bias=False)
        self.output = nn.Linear(attention_hidden_size, hidden_size, bias=False)

    def extract_key_value(self, hidden, state=None):
        # Mix hidden with the previous timestep to produce key, value, receptance
        if hidden.size(1) == 1 and state is not None:
            shifted = state[1][:, :, self.layer_id]
        else:
            shifted = self.time_shift(hidden)
            if state is not None:
                shifted[:, 0] = state[1][:, :, self.layer_id]
        key = hidden * self.time_mix_key + shifted * (1 - self.time_mix_key)
        value = hidden * self.time_mix_value + shifted * (1 - self.time_mix_value)
        receptance = hidden * self.time_mix_receptance + shifted * (1 - self.time_mix_receptance)

        key = self.key(key)
        value = self.value(value)
        receptance = torch.sigmoid(self.receptance(receptance))
        if state is not None:
            state[1][:, :, self.layer_id] = hidden[:, -1]
        return receptance, key, value, state

    def forward(self, hidden, state=None, use_cache=False):
        receptance, key, value, state = self.extract_key_value(hidden, state=state)
        layer_state = tuple(s[:, :, self.layer_id] for s in state[2:]) if state is not None else None
        rwkv, layer_state = rwkv_linear_attention(
            self.time_decay,
            self.time_first,
            key,
            value,
            state=layer_state,
            return_state=use_cache,
        )

        if layer_state is not None:
            state[2][:, :, self.layer_id] = layer_state[0]
            state[3][:, :, self.layer_id] = layer_state[1]
            state[4][:, :, self.layer_id] = layer_state[2]

        return self.output(receptance * rwkv), state

class RwkvFeedForward(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        self.config = config
        self.layer_id = layer_id
        hidden_size = config.hidden_size
        intermediate_size = (
            config.intermediate_size if config.intermediate_size is not None else 4 * config.hidden_size
        )

        self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))
        self.time_mix_key = nn.Parameter(torch.empty(1, 1, hidden_size))
        self.time_mix_receptance = nn.Parameter(torch.empty(1, 1, hidden_size))

        self.key = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.receptance = nn.Linear(hidden_size, hidden_size, bias=False)
        self.value = nn.Linear(intermediate_size, hidden_size, bias=False)

    def forward(self, hidden, state=None):
        if hidden.size(1) == 1 and state is not None:
            shifted = state[0][:, :, self.layer_id]
        else:
            shifted = self.time_shift(hidden)
            if state is not None:
                shifted[:, 0] = state[0][:, :, self.layer_id]
        key = hidden * self.time_mix_key + shifted * (1 - self.time_mix_key)
        receptance = hidden * self.time_mix_receptance + shifted * (1 - self.time_mix_receptance)

        key = torch.square(torch.relu(self.key(key)))
        value = self.value(key)
        receptance = torch.sigmoid(self.receptance(receptance))

        if state is not None:
            state[0][:, :, self.layer_id] = hidden[:, -1]

        return receptance * value, state


class RwkvBlock(GradientCheckpointingLayer):
    def __init__(self, config, layer_id):
        super().__init__()
        self.config = config
        self.layer_id = layer_id

        if layer_id == 0:
            self.pre_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)

        self.ln1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        self.ln2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)

        self.attention = RwkvSelfAttention(config, layer_id)
        self.feed_forward = RwkvFeedForward(config, layer_id)

    def forward(self, hidden, state=None, use_cache=False, output_attentions=False):
        if self.layer_id == 0:
            hidden = self.pre_ln(hidden)

        attention, state = self.attention(self.ln1(hidden), state=state, use_cache=use_cache)
        hidden = hidden + attention

        feed_forward, state = self.feed_forward(self.ln2(hidden), state=state)
        hidden = hidden + feed_forward

        outputs = (hidden, state)
        if output_attentions:
            outputs += (attention,)
        else:
            outputs += (None,)

        return outputs

dS )	RwkvPreTrainedModelr   r   r]   r^   Tc                    s"  t |tr|j}|jj}|jj|j ||d  d||  }tjfddt	D |j
j|j
jd}|ddddf } fddt	 D }tj||jj|jjd}tjdd t	 D |jj|jjdd	 }t : ||j_t|jtd
 | |j_t|||j
_t||d
  |j_t|d	| |j_W d   dS 1 sw   Y  dS t |tr|j}|jj}|jjd||  }tjfddt	D |j
j|j
jd}|ddddf }t  t|||j
_t|||j_W d   dS 1 sw   Y  dS dS )zInitialize the weights.r   g      ?c                       g | ]}|  qS r   r   r   ire   r   r   r     r   z5RwkvPreTrainedModel._init_weights.<locals>.<listcomp>r8   r9   Nc                    s,   g | ]}d d| d  dd     qS )   r   gffffff?g?r   )r   h)r   ratio_0_to_1r   r   r     s    c                 S   s   g | ]
}|d  d d  qS )r   r   r   r   r   r   r   r     s    g      ?g333333?c                    r   r   r   r   r   r   r   r     r   )
isinstancer   r   r   num_hidden_layersre   r   rJ   tensorry   r   r8   r9   r]   r^   no_graddata	ones_likemathlogpowr   r   r   )r   moduler   r   ratio_1_to_almost0time_weightdecay_speedzigzagr   )r   re   r   r   _init_weights  s`   

	"
$z!RwkvPreTrainedModel._init_weightsN)rt   ru   rv   r   config_classbase_model_prefix_no_split_modules_keep_in_fp32_modulessupports_gradient_checkpointing_is_statefulr   r   r   r   r   r   ~  s    r   z+
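# Note: the layer-dependent scheme above mirrors the initialization of the reference RWKV implementation: per-channel
# decay speeds spread from -5 to 3 with a layer-dependent curvature, and the exponent of the time-mix initialization
# decays from 1 in the first layer towards 0 in the last one.
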
@dataclass
@auto_docstring(
    custom_intro="""
    Class for the RWKV model outputs.
    """
)
class RwkvOutput(ModelOutput):
    r"""
    state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    state: Optional[list[torch.FloatTensor]] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None

@dataclass
@auto_docstring(
    custom_intro="""
    Base class for causal language model (or autoregressive) outputs.
    """
)
class RwkvCausalLMOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    state: Optional[list[torch.FloatTensor]] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None

@auto_docstring
class RwkvModel(RwkvPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
        self.blocks = nn.ModuleList([RwkvBlock(config, layer_id=idx) for idx in range(config.num_hidden_layers)])
        self.ln_out = nn.LayerNorm(config.hidden_size)

        self.layers_are_rescaled = False
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings

    def set_input_embeddings(self, new_embeddings):
        self.embeddings = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        state: Optional[list[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, RwkvOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model had been given `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the last state is returned and can be used to quickly generate the next logits.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False)
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if attention_mask is not None:
            logger.warning_once("`attention_mask` was passed, but it is unused in this model.")

        if self.training == self.layers_are_rescaled:
            self._rescale_layers()

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embeddings(input_ids)

        if use_cache and state is None:
            shape = (inputs_embeds.size(0), self.config.hidden_size, self.config.num_hidden_layers)
            state = [
                torch.zeros(
                    *shape, dtype=inputs_embeds.dtype if i <= 1 else torch.float32, device=inputs_embeds.device
                )
                for i in range(5)
            ]
            state[4] -= 1e30

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        hidden_states = inputs_embeds

        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        for idx, block in enumerate(self.blocks):
            hidden_states, state, attentions = block(
                hidden_states, state=state, use_cache=use_cache, output_attentions=output_attentions
            )
            if (
                self.layers_are_rescaled
                and self.config.rescale_every > 0
                and (idx + 1) % self.config.rescale_every == 0
            ):
                hidden_states = hidden_states / 2

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if output_attentions:
                all_self_attentions = all_self_attentions + (attentions,)

        hidden_states = self.ln_out(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(x for x in [hidden_states, state, all_hidden_states, all_self_attentions] if x is not None)

        return RwkvOutput(
            last_hidden_state=hidden_states,
            state=state,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def _rescale_layers(self):
        # Layers should be rescaled for inference only.
        if self.layers_are_rescaled == (not self.training):
            return
        if self.config.rescale_every > 0:
            with torch.no_grad():
                for block_id, block in enumerate(self.blocks):
                    if self.training:
                        block.attention.output.weight.mul_(2 ** int(block_id // self.config.rescale_every))
                        block.feed_forward.value.weight.mul_(2 ** int(block_id // self.config.rescale_every))
                    else:
                        # Deal with quantization statistics
                        if hasattr(block.attention.output.weight, "SCB"):
                            block.attention.output.weight.SCB.div_(2 ** int(block_id // self.config.rescale_every))
                            block.feed_forward.value.weight.SCB.div_(2 ** int(block_id // self.config.rescale_every))
                        elif hasattr(block.attention.output.weight, "quant_state"):
                            self._bnb_4bit_dequantize_and_rescale(block.attention.output, block_id)
                            self._bnb_4bit_dequantize_and_rescale(block.feed_forward.value, block_id)
                        else:
                            block.attention.output.weight.div_(2 ** int(block_id // self.config.rescale_every))
                            block.feed_forward.value.weight.div_(2 ** int(block_id // self.config.rescale_every))

        self.layers_are_rescaled = not self.training

    def _bnb_4bit_dequantize_and_rescale(self, target_layer, block_id):
        r"""
        Perform the dequantization and rescaling of the weights of a given layer. After that operation the layer will
        be quantized again.
        """
        if not is_bitsandbytes_available():
            raise ImportError("Please install bitsandbytes to use this method.")
        import bitsandbytes as bnb

        dequant_weights = bnb.functional.dequantize_4bit(target_layer.weight.data, target_layer.weight.quant_state)

        dequant_weights.div_(2 ** int(block_id // self.config.rescale_every))

        # re-quantize the model:
        # we need to put it first on the CPU then back to the device
        # this will create an overhead :/
        # We set requires_grad=False as we cannot compute gradients on top of 4bit parameters anyway and to avoid
        # bugs with bnb
        quant_weight = bnb.nn.Params4bit(dequant_weights.to("cpu"), requires_grad=False).to(dequant_weights.device)
        setattr(target_layer, "weight", quant_weight)

@auto_docstring(
    custom_intro="""
    The RWKV Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """
)
class RwkvForCausalLM(RwkvPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.rwkv = RwkvModel(config)
        self.head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.head

    def set_output_embeddings(self, new_embeddings):
        self.head = new_embeddings

    def prepare_inputs_for_generation(self, input_ids, state=None, inputs_embeds=None, use_cache=None, **kwargs):
        # only the last token for `input_ids` if the state is passed along
        if state is not None:
            input_ids = input_ids[:, -1].unsqueeze(-1)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and state is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs["state"] = state
        model_inputs["use_cache"] = use_cache
        return model_inputs

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        state: Optional[list[torch.FloatTensor]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[tuple, RwkvCausalLMOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model had been given `state_input_ids + input_ids` as context).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
            are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
        use_cache (`bool`, *optional*):
            If set to `True`, the last state is returned and can be used to quickly generate the next logits.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        rwkv_outputs = self.rwkv(
            input_ids,
            inputs_embeds=inputs_embeds,
            state=state,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = rwkv_outputs[0]

        logits = self.head(hidden_states)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, vocab_size=self.config.vocab_size, **kwargs)

        if not return_dict:
            output = (logits,) + rwkv_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return RwkvCausalLMOutput(
            loss=loss,
            logits=logits,
            state=rwkv_outputs.state,
            hidden_states=rwkv_outputs.hidden_states,
            attentions=rwkv_outputs.attentions,
        )


__all__ = ["RwkvForCausalLM", "RwkvModel", "RwkvPreTrainedModel"]
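# Example of stateful, token-by-token decoding (illustrative sketch; uses the same public checkpoint as the example
# above). Feeding only the new token together with the returned state is designed to be equivalent to re-running the
# full concatenated sequence:
#
#     import torch
#     from transformers import AutoTokenizer, RwkvForCausalLM
#
#     tokenizer = AutoTokenizer.from_pretrained("RWKV/rwkv-4-169m-pile")
#     model = RwkvForCausalLM.from_pretrained("RWKV/rwkv-4-169m-pile")
#
#     inputs = tokenizer("The quick brown fox", return_tensors="pt")
#     with torch.no_grad():
#         outputs = model(inputs["input_ids"], use_cache=True)
#         next_token = outputs.logits[:, -1].argmax(dim=-1, keepdim=True)
#         # Only the new token is passed; the recurrent state replaces the usual KV cache.
#         outputs = model(next_token, state=outputs.state, use_cache=True)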