o
    ei}                     @   s  d Z ddlZddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZmZmZmZmZmZmZ ddlmZ eeZdadd ZG dd dejjZ d+ddZ!d+ddZ"G dd dej#Z$G dd dej#Z%G dd deZ&eG dd deZ'eeddG d d! d!eZ(eed"dG d#d$ d$eZ)eG d%d& d&e'Z*ed'dG d(d) d)e'e
Z+g d*Z,dS ),zPyTorch RWKV model.    N)	dataclass)nn   )initialization)GenerationMixin)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringis_bitsandbytes_availableis_kernels_availableis_ninja_availableis_torch_cuda_availablelogging   )
RwkvConfigc                 C   s,   t  stdddlm} |da| t_d S )NzFkernels is not installed, please install it with `pip install kernels`r   )
get_kernelzkernels-community/rwkv)r   ImportErrorintegrations.hub_kernelsr   rwkv_cuda_kernelmax_seq_length)context_lengthr    r   d/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/rwkv/modeling_rwkv.pyload_wkv_cuda_kernel-   s
   
r   c                   @   s(   e Zd ZedddZedddZdS )	RwkvLinearAttentionNFc              	   C   s  |  \}}}	|tjkrtd| dtj d||	 t|	d dkr4td| d|	 dt|	d d	|j| _|jjd
ksP|jjd
ksP|jjd
ksP|jjd
krTtdt	
|   }|jt	jkrp| }| }| }| }| }| }t	j|t	jd}
|s|d ur|d u rt	j||	dt	j|jt	jd}|d d d d df  d8  < nt	jdd |D dd }|jt	jkrtj}ntj}||||||
| n|jt	jkrtjntj}||||||
 | |||||
 |d urdd t	j|dddD }|
| j|fS )NzCannot process a batch with z+ tokens at the same time, use a maximum of z with this model.    r   zThe product of batch size (z) and hidden size (z") needs to be a round multiple of .cudazUCalling the CUDA kernel for wkv attention requires all tensors to be on CUDA devices.memory_formatr   )dtypedevicer       籡*Gc                 S      g | ]}| d qS r#   )	unsqueeze.0sr   r   r   
<listcomp>g       z/RwkvLinearAttention.forward.<locals>.<listcomp>)dimc                 S   r%   r&   )squeezer(   r   r   r   r+   t   r,   )sizer   r   
ValueErrorminr!   input_dtyper"   typetorchexpfloat
contiguousfloat16
empty_likecontiguous_formatzerosfloat32catbfloat16forward_with_state_bf16forward_with_stateforward_bf16forwardsave_for_backwardchunkto)ctx
time_decay
time_firstkeyvaluestatereturn_state
batch_sizeseq_lenhidden_sizeoutputforward_funcr   r   r   rB   9   sd   
 zRwkvLinearAttention.forwardc                 C   s   | j }| j\}}}}}tj|tj|tjkrtjntjd}	tj|tjd}
tj|tjd}tj|tjd}|tjkr>| }|tjkrFt	j
nt	j}||||||| |	|
||
 |	||
|||||d d fS )N)r    r!   r   )r2   saved_tensorsr4   r9   r:   r>   r<   r8   r6   r   backward_bf16backwardr7   rE   )rF   g_outputg_stater2   rG   rH   rI   rJ   rP   g_time_decayg_time_firstg_keyg_valuebackward_funcr   r   r   rT   x   s@   
zRwkvLinearAttention.backwardNFN)__name__
__module____qualname__staticmethodrB   rT   r   r   r   r   r   8   s
    >r   Fc                 C   s  |  \}}}t|}|d u r=tj|d d df tjd}	tj|d d df tjd}
tj|d d df tjdd }n|\}	}
}t|  } t|D ]p}|d d |f  }|d d |f }t||| }t|| }t|| | }||	 ||  }||
 | }|| |j	|d d |f< t||  |}t||  | }t|| }||	 ||  }	||
 | }
|}qL|s|d ur|	|
|g}||fS )Nr   )r!   r$   )
r/   r4   
zeros_liker<   r5   ranger6   maximumrE   r!   )rG   rH   rI   rJ   rK   rL   _
seq_lengthrP   	num_state	den_state	max_statecurrent_indexcurrent_keycurrent_valuemax_for_outpute1e2	numeratordenominatormax_for_stater   r   r   rwkv_linear_attention_cpu   s4   
"

rs   c                 C   s`   t dd | |||fD }|ddk}td u s|s|r&t| |||||dS t| |||||S )Nc                 s   s    | ]	}|j jd kV  qdS )r   N)r"   r3   )r)   tr   r   r   	<genexpr>       z(rwkv_linear_attention.<locals>.<genexpr>r   rK   rL   )anyr/   r   rs   r   apply)rG   rH   rI   rJ   rK   rL   no_cuda	one_tokenr   r   r   rwkv_linear_attention   s
   r|   c                       s2   e Zd Zd
 fdd	ZdddZddd	Z  ZS )RwkvSelfAttentionr   c                    sD  t    || _td uotj|jk}t r0t r0|s0zt|j W n t	y/   t
d Y nw || _|j}|jd ur>|jn|}|| _tt|| _tt|| _ttdd|| _ttdd|| _ttdd|| _td| _tj||dd| _tj||dd| _tj||dd| _tj||dd| _d S )Nz9Could not load the custom CUDA kernel for RWKV attention.r   r   r   r   Fbias)super__init__configr   r   r   r   r   r   	Exceptionloggerinfolayer_idrO   attention_hidden_sizer   	Parameterr4   emptyrG   rH   time_mix_keytime_mix_valuetime_mix_receptance	ZeroPad2d
time_shiftLinearrI   rJ   
receptancerP   )selfr   r   kernel_loadedrO   r   	__class__r   r   r      s0   
zRwkvSelfAttention.__init__Nc                 C   s  | ddkr|d ur|d d d d d | jf }n| |}|d ur7|d d d d d | jf |d d df< || j |d| j   }|| j |d| j   }|| j |d| j   }| |}| |}t	| 
|}|d ur|d d df |d d d d d | jf< ||||fS Nr   r   r   )r/   r   r   r   r   r   rI   rJ   r4   sigmoidr   )r   hiddenrK   shiftedrI   rJ   r   r   r   r   extract_key_value   s   
(

(z#RwkvSelfAttention.extract_key_valueFc           	         s    j ||d\}}}}|d urt fdd|dd  D nd }t j j||||d\}}|d urb|d |d d d d d  jf< |d |d d d d d  jf< |d |d	 d d d d  jf<  || |fS )
NrK   c                 3   s(    | ]}|d d d d  j f V  qd S r]   r   r(   r   r   r   ru     s   & z,RwkvSelfAttention.forward.<locals>.<genexpr>r#   rw   r   r   r      )r   tupler|   rG   rH   r   rP   )	r   r   rK   	use_cacher   rI   rJ   layer_staterwkvr   r   r   rB     s   *
	   zRwkvSelfAttention.forwardr   r]   r\   )r^   r_   r`   r   r   rB   __classcell__r   r   r   r   r}      s    
r}   c                       s(   e Zd Zd fdd	ZdddZ  ZS )	RwkvFeedForwardr   c                    s   t    || _|| _|j}|jd ur|jnd|j }td| _t	t
dd|| _t	t
dd|| _tj||dd| _tj||dd| _tj||dd| _d S )Nr   r~   r   Fr   )r   r   r   r   rO   intermediate_sizer   r   r   r   r4   r   r   r   r   rI   r   rJ   )r   r   r   rO   r   r   r   r   r      s   
zRwkvFeedForward.__init__Nc                 C   s
  | ddkr|d ur|d d d d d | jf }n| |}|d ur7|d d d d d | jf |d d df< || j |d| j   }|| j |d| j   }tt| |}| 	|}t
| |}|d ur|d d df |d d d d d | jf< || |fS r   )r/   r   r   r   r   r4   squarerelurI   rJ   r   r   )r   r   rK   r   rI   r   rJ   r   r   r   rB   1  s   
(
(zRwkvFeedForward.forwardr   r]   r^   r_   r`   r   rB   r   r   r   r   r   r     s    r   c                       s&   e Zd Z fddZdddZ  ZS )	RwkvBlockc                    sv   t    || _|| _|dkrtj|j|jd| _tj|j|jd| _	tj|j|jd| _
t||| _t||| _d S )Nr   )eps)r   r   r   r   r   	LayerNormrO   layer_norm_epsilonpre_lnln1ln2r}   	attentionr   feed_forward)r   r   r   r   r   r   r   F  s   
zRwkvBlock.__init__NFc                 C   s|   | j dkr
| |}| j| |||d\}}|| }| j| ||d\}}|| }||f}|r8||f7 }|S |d7 }|S )Nr   )rK   r   r   r]   )r   r   r   r   r   r   )r   r   rK   r   output_attentionsr   r   outputsr   r   r   rB   T  s   


zRwkvBlock.forward)NFFr   r   r   r   r   r   E  s    r   c                   @   sH   e Zd ZU eed< dZdgZddgZdZdZ	e
 dejfdd	Zd
S )RwkvPreTrainedModelr   r   r   rG   rH   Tmodulec              	      s  t |tr|j}|jj}|jj|j ||d  d||  }tjfddt	D |j
j|j
jd}|ddddf } fddt	 D }tj||jj|jjd}tjdd t	 D |jj|jjdd	 }t|j| t|jt|jtd
 |  t|j
t|| t|jt||d
   t|jt|d	|  dS t |tr|j}|jj}|jjd||  }tjfddt	D |j
j|j
jd}|ddddf }t|j
t|| t|jt|| dS t |tjrI|jj}d}	d}
|jdurt|j |d |d kr't|d |d  }	|d | jjkr;|d | jjkr;d	}
|	|
9 }	tj |j|	d dS t |tj!rl|jj}dtt"|d |d  }	tj |j|	d dS t |tj#rt$|j t|j dS dS )zInitialize the weights.r   g      ?c                       g | ]}|  qS r   r   r)   irO   r   r   r+   }      z5RwkvPreTrainedModel._init_weights.<locals>.<listcomp>r!   r"   Nc                    s,   g | ]}d d| d  dd     qS )   r   gffffff?g?r   )r)   h)r   ratio_0_to_1r   r   r+     s    c                 S   s   g | ]
}|d  d d  qS )r   r   r   r   r   r   r   r+     s    g      ?g333333?c                    r   r   r   r   r   r   r   r+     r   r   )gaing-C6?)%
isinstancer}   r   r   num_hidden_layersrO   r   r4   tensorrc   r   r!   r"   rG   rH   initcopy_	ones_likemathlogpowr   r   r   r   r   weightshaper   zeros_sqrt
vocab_sizeorthogonal_	Embeddingmaxr   ones_)r   r   r   r   ratio_1_to_almost0time_weightdecay_speedzigzagr   r   scaler   )r   rO   r   r   _init_weightsp  s~   
	$
$z!RwkvPreTrainedModel._init_weightsN)r^   r_   r`   r   __annotations__base_model_prefix_no_split_modules_keep_in_fp32_modulessupports_gradient_checkpointing_is_statefulr4   no_gradr   Moduler   r   r   r   r   r   g  s   
 r   z+
    Class for the RWKV model outputs.
    )custom_introc                   @   sn   e Zd ZU dZdZejdB ed< dZe	ej dB ed< dZ
eejdf dB ed< dZeejdf dB ed< dS )
RwkvOutputa  
    state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.
    Nlast_hidden_staterK   .hidden_states
attentions)r^   r_   r`   __doc__r   r4   FloatTensorr   rK   listr   r   r   r   r   r   r   r     s   
 r   zK
    Base class for causal language model (or autoregressive) outputs.
    c                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
ej dB ed< dZeejdf dB ed< dZeejdf dB ed< dS )	RwkvCausalLMOutputap  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.
    NlosslogitsrK   .r   r   )r^   r_   r`   r   r   r4   r   r   r   rK   r   r   r   r   r   r   r   r   r     s   
 
r   c                       s   e Zd Z fddZdd Zdd Ze								ddejdB d	ejdB d
ej	dB de
ej	 dB dedB dedB dedB dedB deeB fddZdd Zdd Z  ZS )	RwkvModelc                    sd   t    t j j| _t fddt j	D | _
t j| _d| _d| _|   d S )Nc                    s   g | ]}t  |d qS )r   )r   )r)   idxr   r   r   r+     s    z&RwkvModel.__init__.<locals>.<listcomp>F)r   r   r   r   r   rO   
embeddings
ModuleListrc   r   blocksr   ln_outlayers_are_rescaledgradient_checkpointing	post_initr   r   r   r   r   r     s    zRwkvModel.__init__c                 C      | j S r]   r   r   r   r   r   get_input_embeddings     zRwkvModel.get_input_embeddingsc                 C   
   || _ d S r]   r   r   new_embeddingsr   r   r   set_input_embeddings     
zRwkvModel.set_input_embeddingsN	input_idsattention_maskinputs_embedsrK   r   r   output_hidden_statesreturn_dictreturnc	                    s  |dur|n| j j}|dur|n| j j}|dur|n| js!| j jnd}|dur)|n| j j}|dur6td | j| jkr@| 	  |durL durLt
d|du rX du rXt
d du ra| | |r|du r d| j j| j jf fddtd	D }|d
  d8  < | jr| jr|rtd d} }
|rdnd}|rdnd}t| jD ]4\}}||
|||d\}
}}| jr| j jdkr|d | j j dkr|
d }
|r||
f }|r||f }q| |
}
|r||
f }|stdd |
|||fD S t|
|||dS )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the last state is returned and can be used to quickly generate the next logits.
        NFz<`attention_mask` was passed, but it is unused in this model.zDYou cannot specify both input_ids and inputs_embeds at the same timez5You have to specify either input_ids or inputs_embedsr   c                    s0   g | ]}t j|d kr jnt j jdqS )r   r   )r4   r;   r!   r<   r"   r   r  r   r   r   r+   5  s    z%RwkvModel.forward.<locals>.<listcomp>   r   gꌠ9Y>)FzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...r   )rK   r   r   r   r#   c                 s   s    | ]	}|d ur|V  qd S r]   r   )r)   xr   r   r   ru   `  rv   z$RwkvModel.forward.<locals>.<genexpr>)r   rK   r   r   )r   r   r  trainingr   use_return_dictr   warning_oncer   _rescale_layersr0   r   r/   rO   r   rc   r   	enumerater   rescale_everyr   r   r   )r   r  r  r  rK   r   r   r  r  kwargsr   all_self_attentionsall_hidden_statesr   blockr   r   r  r   rB     sn    





zRwkvModel.forwardc                 C   sx  | j | j kr	d S | jjdkrt  t| jD ]\}}| jrA|jj	j
dt|| jj   |jjj
dt|| jj   qt|jj	j
drl|jj	j
jdt|| jj   |jjj
jdt|| jj   qt|jj	j
dr| |jj	| | |jj| q|jj	j
dt|| jj   |jjj
dt|| jj   qW d    n1 sw   Y  | j | _ d S )Nr   r#   SCBquant_state)r   r  r   r  r4   r   r  r   r   rP   r   mul_intr   rJ   hasattrr  div_ _bnb_4bit_dequantize_and_rescale)r   block_idr  r   r   r   r  i  s&   
 ""$ "zRwkvModel._rescale_layersc                 C   st   t  stdddl}|j|jj|jj}|dt	|| j
j   |jj|ddd|j}t|d| dS )	z
        Perform the dequantization and rescaling of the weights of a given layer. After that operation the layer will
        be quantized again.
        z/Please install bitsandbytes to use this method.r   Nr#   cpuF)requires_gradr   )r   r   bitsandbytes
functionaldequantize_4bitr   datar  r  r  r   r  r   
Params4bitrE   r"   setattr)r   target_layerr  bnbdequant_weightsquant_weightr   r   r   r    s   z*RwkvModel._bnb_4bit_dequantize_and_rescale)NNNNNNNN)r^   r_   r`   r   r   r   r
   r4   
LongTensorr   r   boolr   r   rB   r  r  r   r   r   r   r   r     sD    	jr   z
    The RWKV Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                       s   e Zd ZddiZ fddZdd Zdd Ze																			
ddej	d	B dej	d	B dej
d	B deej
 d	B dej	d	B ded	B ded	B ded	B ded	B deejB deeB fddZ  ZS )RwkvForCausalLMzhead.weightzrwkv.embeddings.weightc                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NFr   )
r   r   r   r   r   r   rO   r   headr   r   r   r   r   r     s   
zRwkvForCausalLM.__init__c                 C   r   r]   r,  r   r   r   r   get_output_embeddings  r   z%RwkvForCausalLM.get_output_embeddingsc                 C   r   r]   r-  r   r   r   r   set_output_embeddings  r  z%RwkvForCausalLM.set_output_embeddingsNr   r  r  r  rK   labelsr   r   r  r  logits_to_keepr  c              	   K   s   |	dur|	n| j j}	| j|||||||	d}|d }t|
tr%t|
 dn|
}| |dd|ddf }d}|durI| jd||| j jd|}|	s_|f|dd  }|dur]|f| S |S t	|||j
|j|jdS )aJ  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        use_cache (`bool`, *optional*):
            If set to `True`, the last state is returned and can be used to quickly generate the next logits.
        N)r  rK   r   r   r  r  r   )r   r0  r   r   )r   r   rK   r   r   r   )r   r  r   r   r  slicer,  loss_functionr   r   rK   r   r   )r   r  r  r  rK   r0  r   r   r  r  r1  r  rwkv_outputsr   slice_indicesr   r   rP   r   r   r   rB     s4   &
zRwkvForCausalLM.forward)
NNNNNNNNNr   )r^   r_   r`   _tied_weights_keysr   r.  r/  r
   r4   r)  r   r   r*  r  Tensorr   r   rB   r   r   r   r   r   r+    sN    	
r+  )r+  r   r   r\   )-r   r   dataclassesr   r4   r    r   r   
generationr   modeling_layersr   modeling_utilsr   utilsr	   r
   r   r   r   r   r   configuration_rwkvr   
get_loggerr^   r   r   r   autogradFunctionr   rs   r|   r   r}   r   r   r   r   r   r   r+  __all__r   r   r   r   <module>   sP   $	

j
,F&"U /Y