o
    ei!                    @   s"  d Z ddlZddlZddlmZ ddlmZmZmZ ddlm	Z
 ddlmZ ddlmZmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZmZmZmZmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z' e$(e)Z*dej+de,de,fddZ-G dd dej.Z/G dd dej0Z1G dd deZ2G dd deZ3G dd dej0Z4G d d! d!ej0Z5e#G d"d# d#e!Z6G d$d% d%e6Z7G d&d' d'e6Z8e#G d(d) d)e6Z9e#d*d+G d,d- d-e6eZ:e#d.d+G d/d0 d0e6Z;e#G d1d2 d2e6Z<G d3d4 d4e6Z=G d5d6 d6e6eZ>g d7Z?dS )8zPyTorch MVP model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutput)PreTrainedModel)auto_docstringloggingtorch_compilable_check   )	MvpConfig	input_idspad_token_iddecoder_start_token_idc                 C   sh   |  | j}| ddddf  |ddddf< ||dddf< |du r*td||dk| |S )z1
    Shift input ids one token to the right.
    Nr   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
ValueErrormasked_fill_)r   r   r   shifted_input_ids r'   b/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/mvp/modeling_mvp.pyshift_tokens_right.   s   (r)   c                       sP   e Zd ZdZdedef fddZ	ddejd	ed
ejdB f fddZ  Z	S )MvpLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    num_embeddingsembedding_dimc                    s   d| _ t || j  | d S N   )offsetsuper__init__)selfr+   r,   	__class__r'   r(   r1   D   s   z&MvpLearnedPositionalEmbedding.__init__r   Nr   past_key_values_lengthposition_idsc                    s\   |du r |j dd \}}tj||| tj| jjd|d}n|d}t 	|| j
 S )z3`input_ids' shape is expected to be [bsz x seqlen].Nr.   )dtypedevicer    r   )r"   torcharangelongweightr8   expand	unsqueezer0   forwardr/   )r2   r   r5   r6   bszseq_lenr3   r'   r(   r?   J   s   
z%MvpLearnedPositionalEmbedding.forward)r   N)
__name__
__module____qualname____doc__intr1   r9   Tensorr?   __classcell__r'   r'   r3   r(   r*   ?   s    r*   c                       s   e Zd ZdZ				ddedededB d	edB d
edB dedB f fddZ						ddej	dej	dB de
dB dej	dB dej	dB dedej	dB deej	ej	dB eej	 dB f fddZ  ZS )MvpAttentionz=Multi-headed attention from 'Attention Is All You Need' paper        FTN	embed_dim	num_headsdropout
is_decoderbias	layer_idxc                    s   t    || _|| _|| _|| | _| j| | jkr'td| j d| d| jd | _|| _|| _	t
j|||d| _t
j|||d| _t
j|||d| _t
j|||d| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩rO   )r0   r1   rK   rL   rM   head_dimr$   scalingrN   rP   r   Lineark_projv_projq_projout_proj)r2   rK   rL   rM   rN   rO   rP   r3   r'   r(   r1   ]   s$   
	

zMvpAttention.__init__hidden_stateskey_value_statespast_key_valuesattention_maskattn_promptoutput_attentionscache_positionreturnc                 C   s  |du}|  \}	}
}| || j }d}|dur1t|tr/|j| j}|r+|j}n|j	}n|}|r5|n|}|rN|durN|rN|j
| j j}|j
| j j}nJ| |}| |}||	d| j| jdd}||	d| j| jdd}|dur|s||nd}|||| jd|i\}}|rt|trd|j| j< |durtj|d |	ddd|gdd	}tj|d |	ddd|gdd	}|durt|	d|
|d  d|j}tj||gdd	}|	| j d| jf}||	|
| j| jdd}|j| }|j| }|j| }| d}t||dd}|  |	| j |
|fkr0td
|	| j |
|f d|   |dure|  |	d|
|fkrPtd|	d|
|f d|   ||	| j|
|| }||	| j |
|}tjj|dd	}|r||	| j|
|}||	| j |
|}nd}tjj || j | j!d}t||}|  |	| j |
| jfkrtd|	| j|
| jf d|   ||	| j|
| j}|dd}||	|
| j"}| #|}||fS )z#Input shape: Batch x Time x ChannelNFr    r   r.   r_   Tr   dimz$Attention weights should be of size z	, but is z!Attention mask should be of size ptrainingz `attn_output` should be of size )$sizerW   rS   
isinstancer   
is_updatedgetrP   cross_attention_cacheself_attention_cachelayerskeysvaluesrU   rV   viewrL   rR   	transposeupdater9   catr=   zerostor8   reshapebmmr$   r   
functionalsoftmaxrM   re   rK   rX   )r2   rY   rZ   r[   r\   r]   r^   r_   is_cross_attentionr@   tgt_len_query_statesrh   curr_past_key_valuescurrent_states
key_statesvalue_statesprompt_mask
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputr'   r'   r(   r?   z   s   


"""





zMvpAttention.forward)rJ   FTN)NNNNFN)rB   rC   rD   rE   rF   floatboolr1   r9   rG   r	   tupler?   rH   r'   r'   r3   r(   rI   Z   sT     	rI   c                       s^   e Zd Zdef fddZ	ddejdejdejded	B d
eejejd	B f f
ddZ	  Z
S )MvpEncoderLayerconfigc                    s   t    |j| _t| j|j|jd| _t	| j| _
|j| _t|j | _|j| _t| j|j| _t|j| j| _t	| j| _d S )N)rK   rL   rM   )r0   r1   d_modelrK   rI   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normrM   r   activation_functionactivation_fnactivation_dropoutrT   encoder_ffn_dimfc1fc2final_layer_normr2   r   r3   r'   r(   r1      s   
zMvpEncoderLayer.__init__FrY   r\   self_attn_promptr^   Nr`   c                 C   s   |}| j ||||d\}}tjj|| j| jd}|| }| |}|}| | |}tjj|| j| jd}| 	|}tjj|| j| jd}|| }| 
|}|jtjkrut| sct| rut|jjd }tj|| |d}||fS )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
                `(2, encoder_attention_heads, pro_len, head_dim)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rY   r\   r]   r^   rc   i  )minmax)r   r   rw   rM   re   r   r   r   r   r   r   r7   r9   float16isinfanyisnanfinfor   clamp)r2   rY   r\   r   r^   residualr   clamp_valuer'   r'   r(   r?     s2   



zMvpEncoderLayer.forwardF)rB   rC   rD   r   r1   r9   FloatTensorr   r   r?   rH   r'   r'   r3   r(   r      s    r   c                       s   e Zd Zddef fddZ									ddejdejdB d	ejdB d
ejdB dejdB dejdB dedB dedB dedB dejdB de	ej
e	ej
ej
f dB f fddZ  ZS )MvpDecoderLayerNr   c                    s   t    |j| _t| j|j|jd|d| _|j| _t	|j
 | _|j| _t| j| _t| j|j|jd|d| _t| j| _t| j|j| _t|j| j| _t| j| _d S )NT)rK   rL   rM   rN   rP   )rM   rN   rP   )r0   r1   r   rK   rI   decoder_attention_headsr   r   rM   r   r   r   r   r   r   r   encoder_attnencoder_attn_layer_normrT   decoder_ffn_dimr   r   r   )r2   r   rP   r3   r'   r(   r1   0  s0   
zMvpDecoderLayer.__init__FTrY   r\   encoder_hidden_statesencoder_attention_maskr   cross_attn_promptr[   r^   	use_cacher_   r`   c                 C   s  |}| j ||||||
d\}}tjj|| j| jd}|| }| |}d}|durL|}| j||||||d\}}tjj|| j| jd}|| }| |}|}| | 	|}tjj|| j
| jd}| |}tjj|| j| jd}|| }| |}|f}|r|||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
                `(2, decoder_attention_heads, pro_len, head_dim)`.
            cross_attn_prompt (`torch.FloatTensor`): prompt of cross attention of shape
                `(2, decoder_attention_heads, pro_len, head_dim)`.
            past_key_values (`Cache`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rY   r[   r\   r]   r^   r_   rc   N)rY   rZ   r\   r]   r[   r^   )r   r   rw   rM   re   r   r   r   r   r   r   r   r   )r2   rY   r\   r   r   r   r   r[   r^   r   r_   r   self_attn_weightscross_attn_weightsoutputsr'   r'   r(   r?   L  sJ   





zMvpDecoderLayer.forwardN)	NNNNNNFTN)rB   rC   rD   r   r1   r9   rG   r	   r   r   r   r?   rH   r'   r'   r3   r(   r   /  sD    	
r   c                       sH   e Zd ZdZdedededef fddZdejd	ejfd
dZ	  Z
S )MvpClassificationHeadz-Head for sentence-level classification tasks.	input_dim	inner_dimnum_classespooler_dropoutc                    s8   t    t||| _tj|d| _t||| _d S )Nrd   )r0   r1   r   rT   denseDropoutrM   rX   )r2   r   r   r   r   r3   r'   r(   r1     s   
zMvpClassificationHead.__init__rY   r`   c                 C   s6   |  |}| |}t|}|  |}| |}|S r   )rM   r   r9   tanhrX   )r2   rY   r'   r'   r(   r?     s   




zMvpClassificationHead.forward)rB   rC   rD   rE   rF   r   r1   r9   rG   r?   rH   r'   r'   r3   r(   r     s    r   c                       s:   e Zd ZdZ fddZdejdeej fddZ  Z	S )	MvpPromptz)Layer-wise prompt for encoder or decoder.c              	      s   t    |j| _|| _|| _|j| | _tj|j	d| _	t
|j|j| _tt|j|jt t|j|d |j | _d S )Nr   r.   )r0   r1   prompt_length
num_layersrL   r   rR   r   r   rM   	Embeddingprompt_embedding
SequentialrT   prompt_mid_dimGELUprompt_trans)r2   r   r   rL   r3   r'   r(   r1     s   

zMvpPrompt.__init__
prompt_idsr`   c                 C   sN   |  | |}|| j| jd | j| j}| |}|g d	d}|S )Nr.   )r   r.   r   r   )
r   r   ro   r   r   rL   rR   rM   permutesplit)r2   r   promptr'   r'   r(   r?     s
   
zMvpPrompt.forward)
rB   rC   rD   rE   r1   r9   rG   r   r?   rH   r'   r'   r3   r(   r     s    "r   c                       s:   e Zd ZU eed< dZdZ fddZedd Z	  Z
S )MvpPreTrainedModelr   modelTc                    s*   t  | t|trt|j d S d S r   )r0   _init_weightsrg   MvpForConditionalGenerationinitzeros_final_logits_bias)r2   moduler3   r'   r(   r     s   
z MvpPreTrainedModel._init_weightsc                 C   s>   | j j}tjg ddddd|gg| jd}|||d}|S )N)r      
      r.   r         r.   r8   )r\   r   )r   r   r9   tensorr8   ne)r2   	pad_tokenr   dummy_inputsr'   r'   r(   r     s   "zMvpPreTrainedModel.dummy_inputs)rB   rC   rD   r   __annotations__base_model_prefixsupports_gradient_checkpointingr   propertyr   rH   r'   r'   r3   r(   r     s   
 r   c                       s   e Zd ZdZddedejdB dedB f fddZ						dd	e	j
dB d
e	jdB de	jdB dedB dedB dedB deeB fddZ  ZS )
MvpEncodera  
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`MvpEncoderLayer`].

    Args:
        config: MvpConfig
        embed_tokens (nn.Embedding): output embedding
        use_prompt (bool): whether to use prompt
    NFr   embed_tokens
use_promptc                    s   t     j| _ j| _ j} j| _ j| _	 j
r!t|nd| _t j|| j| _t j|| _t fddt jD | _t|| _|| _|r] j| _t  j j| _d| _|    d S )N      ?c                    s   g | ]}t  qS r'   )r   ).0r{   r   r'   r(   
<listcomp>   s    z'MvpEncoder.__init__.<locals>.<listcomp>F)!r0   r1   rM   encoder_layerdrop	layerdropr   r   padding_idxmax_position_embeddingsmax_source_positionsscale_embeddingmathsqrtembed_scaler   r   
vocab_sizer   r*   embed_positions
ModuleListrangeencoder_layersrl   r   layernorm_embeddingr   r   r   r   r   gradient_checkpointing	post_init)r2   r   r   r   rK   r3   r   r(   r1     s0    zMvpEncoder.__init__r   r\   inputs_embedsr^   output_hidden_statesreturn_dictr`   c                 K   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur*|dur*td|dur<|}|j}	|d|	d }n|durT| dd }	|dddddf }ntd|du rd| || j	 }| 
|}
||
 }| |}tjj|| j| jd}| jrt| j| j}| |}|durt| j ||d}|rdnd}|rdnd}t| jD ]=\}}|r||f }d}| jrtg }|| jk rd	}|rd
}n|||| jr|| nd|d}|d }|r||d f }q|r||f }|stdd |||fD S t|||dS )a8  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzDYou cannot specify both input_ids and inputs_embeds at the same timer    z5You have to specify either input_ids or inputs_embedsrc   )r   r   r\   r'   FT)NN)r   r^   r   r   c                 s       | ]	}|d ur|V  qd S r   r'   r   vr'   r'   r(   	<genexpr>  s    z%MvpEncoder.forward.<locals>.<genexpr>last_hidden_staterY   
attentions)r   r^   r   use_return_dictr$   r"   ro   rf   r   r   r   r   r   rw   rM   re   r   r9   r:   r   rt   r8   r   r   	enumeraterl   randr   r   r   )r2   r   r\   r   r^   r   r   kwargsinputinput_shape	embed_posrY   r   r   encoder_statesall_attentionsidxencoder_layerto_dropdropout_probabilitylayer_outputsr'   r'   r(   r?     st   (






zMvpEncoder.forwardNF)NNNNNN)rB   rC   rD   rE   r   r   r   r   r1   r9   
LongTensorrG   r   r   r   r?   rH   r'   r'   r3   r(   r     s0    &
#	r   c                       s   e Zd ZdZddededB f fddZ											ddejdB d	ej	dB d
ej
dB dejdB dedB dej
dB dedB dedB dedB dedB dej	dB deeB fddZ  ZS )
MvpDecoderz
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MvpDecoderLayer`]

    Args:
        config: MvpConfig
        embed_tokens (nn.Embedding): output embedding
        use_prompt (bool): whether to use prompt
    Fr   r   Nc                    s   t     j| _ j| _ j| _ j| _ j	rt
 jnd| _t j j| j| _t j j| _t fddt jD | _t j| _|| _|rg j| _t  j j| _t  j j| _d| _ | !  d S )Nr   c                    s   g | ]}t  |d qS ))rP   )r   )r   ir   r'   r(   r     s    z'MvpDecoder.__init__.<locals>.<listcomp>F)"r0   r1   rM   decoder_layerdropr   r   r   r   max_target_positionsr   r   r   r   r   r   r   r   r   r*   r   r   r   decoder_layersrl   r   r   r   r   r   r   r   r   r   r   )r2   r   r   r3   r   r(   r1     s8    zMvpDecoder.__init__r   r\   r   r   r[   r   r   r^   r   r   r_   r`   c                 K   s>  |dur|n| j j}|	dur|	n| j j}	|dur|n| j j}|
dur$|
n| j j}
|dur4|dur4td|durF|}|j}|d|d }n|dur^| dd }|dddddf }ntd|du rn| 	|| j
 }| jr}| jr}|r}td d}|r|du r|dus| j jrtt| j dt| j dnt| j d}|dur| nd}|du rtj|||jd	  |jd
}t| j ||||d}|dur|durt| j |||d}| ||}|| }| |}tjj|| j| jd}| jrt| j| j}|  |}| !|}|	rdnd}|rdnd}|r|durdnd}t"| j#D ]V\}}|	r2||f7 }| jrCt$g }|| j%k rCq&|||||| jrP|| nd| jrY|| nd||||d
}|d }|r{||d	 f7 }|dur{||d f7 }q&|	r||f7 }|
st&dd |||||fD S t'|||||dS )aU  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer    zEYou have to specify either decoder_input_ids or decoder_inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r   r   r   )r   r   r\   r_   r[   )r   r   r\   r   rc   r'   )r   r   r   r[   r^   r   r_   r.   c                 s   r   r   r'   r   r'   r'   r(   r   d  s    z%MvpDecoder.forward.<locals>.<genexpr>)r  r[   rY   r  cross_attentions)(r   r^   r   r   r  r$   r"   ro   rf   r   r   r   re   loggerwarning_onceis_encoder_decoderr   r
   get_seq_lengthr9   r:   r8   r   r   r   r   r   rw   rM   r   r   rt   r   r   r  rl   r  r   r   r   )r2   r   r\   r   r   r[   r   r   r^   r   r   r_   r  r  r  r5   	positionsrY   r   r   r   all_hidden_statesall_self_attnsall_cross_attentionsr  decoder_layerr  r  r'   r'   r(   r?     s   A
	






zMvpDecoder.forwardr   )NNNNNNNNNNN)rB   rC   rD   rE   r   r   r1   r9   r  rG   r   r	   r   r   r?   rH   r'   r'   r3   r(   r    sN    	$	
r  c                       s   e Zd ZdgZdddZdef fddZdd Zd	d
 Zdd Z	e
													ddejdB dejdB dejdB dejdB deej dB dedB dejdB dejdB dedB dedB dedB dedB dejdB deeB fddZ  ZS )MvpModelr   zshared.weight)zencoder.embed_tokens.weightzdecoder.embed_tokens.weightr   c                    s\   t  | |j|j}}|j| _t||j|| _t	||j| _
t||j| _|   d S r   )r0   r1   r   r   r   r   r   r   sharedr   encoderr  decoderr   )r2   r   r   r   r3   r'   r(   r1   z  s   zMvpModel.__init__c                 C   s   | j S r   )r#  r2   r'   r'   r(   get_input_embeddings  s   zMvpModel.get_input_embeddingsc                 C   s   || _ | j | j_| j | j_d S r   )r#  r$  r   r%  r2   valuer'   r'   r(   set_input_embeddings  s   
zMvpModel.set_input_embeddingsc                 C   sF   | j sJ d| d | jjd | jjd | jjd d S )NzHIf you want to use lightweight tuning, make sure that `use_prompt=True`.FT)r   requires_grad_r$  r   r%  r   r&  r'   r'   r(   set_lightweight_tuning  s
   
zMvpModel.set_lightweight_tuningNr   r\   decoder_input_idsdecoder_attention_maskencoder_outputsr[   r   decoder_inputs_embedsr   r^   r   r   r_   r`   c                 K   sD  |du r|du r|du rt dt|| jj| jj}|
dur |
n| jj}
|dur*|n| jj}|	dur4|	n| jj}	|dur>|n| jj}|du rR| j	||||
||d}n$|rvt
|tsvt|d t|dkrg|d ndt|dkrr|d ndd}| j|||d ||||	|
|||d}|s|| S t|j|j|j|j|j|j|j|jd	S )
a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        NzIf no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.)r   r\   r   r^   r   r   r   r   r.   r   )r   r\   r   r   r[   r   r   r^   r   r   r_   )r  r[   decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater   encoder_attentions)r$   r)   r   r   r   r^   r   r   r  r$  rg   r   lenr%  r   r  r[   rY   r  r  )r2   r   r\   r-  r.  r/  r[   r   r0  r   r^   r   r   r_   r  decoder_outputsr'   r'   r(   r?     sj   +	zMvpModel.forwardNNNNNNNNNNNNN)rB   rC   rD   "_keys_to_ignore_on_load_unexpected_tied_weights_keysr   r1   r'  r*  r,  r   r9   r  rG   listr   r	   r   r   r   r?   rH   r'   r'   r3   r(   r"  r  sh    	
r"  ze
    The MVP Model with a language modeling head. Can be used for various text generation tasks.
    )custom_introc                !       s6  e Zd ZddiZdef fddZ	d$ded	edB d
edej	f fddZ
deddfddZdd Ze														d%dejdB dejdB dejdB dejdB deej dB dedB dejdB dejdB dejdB dedB dedB dedB dedB dejdB deeB fd d!Zdejfd"d#Z  ZS )&r   lm_head.weightzmodel.shared.weightr   c                    sX   t  | t|| _| dtd| jjjf t	j
|j| jjjdd| _|   d S )Nr   r   FrQ   )r0   r1   r"  r   register_bufferr9   rs   r#  r+   r   rT   r   lm_headr   r   r3   r'   r(   r1     s
   
z$MvpForConditionalGeneration.__init__NTnew_num_tokenspad_to_multiple_ofmean_resizingr`   c                    s   t  |||}| | |S r   )r0   resize_token_embeddings_resize_final_logits_bias)r2   r?  r@  rA  new_embeddingsr3   r'   r(   rB    s   
z3MvpForConditionalGeneration.resize_token_embeddingsc                 C   sj   | j jd }||kr| j d d d |f }ntjd|| f| j jd}tj| j |gdd}| d| d S )Nr    r   r   ra   r   )r   r"   r9   rs   r8   rr   r=  )r2   r?  old_num_tokensnew_bias
extra_biasr'   r'   r(   rC    s   z5MvpForConditionalGeneration._resize_final_logits_biasc                 C      | j   | jd d S r  r   r,  r>  r+  r&  r'   r'   r(   r,  '     
z2MvpForConditionalGeneration.set_lightweight_tuningr   r\   r-  r.  r/  r[   r   r0  labelsr   r^   r   r   r_   c                 K   s  |dur|n| j j}|	dur)|
rtd d}
|du r)|du r)t|	| j j| j j}| j|||||||||
||||d}| |d | j	 }d}|	dur\t
 }||d| j j|	d}|sr|f|dd  }|durp|f| S |S t|||j|j|j|j|j|j|jd	S )	a\  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example of summarization:

        Fine-tuning a model
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp")

        >>> inputs = tokenizer(
        ...     "Summarize: You may want to stick it to your boss and leave your job, but don't do it if these are your reasons.",
        ...     return_tensors="pt",
        ... )
        >>> labels = tokenizer("Bad Reasons To Quit Your Job", return_tensors="pt")["input_ids"]

        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     generated_ids = model.generate(**inputs)

        >>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        ```
        NzJThe `use_cache` argument is changed to `False` since `labels` is provided.F)r\   r-  r/  r.  r[   r   r0  r   r^   r   r   r_   r   r    r   	losslogitsr[   r1  r2  r  r3  r   r4  )r   r  r  warningr)   r   r   r   r>  r   r   ro   r   r   r[   r1  r2  r  r3  r   r4  )r2   r   r\   r-  r.  r/  r[   r   r0  rK  r   r^   r   r   r_   r  r   	lm_logitsmasked_lm_lossloss_fctoutputr'   r'   r(   r?   +  sV   J
z#MvpForConditionalGeneration.forwardc                 C   s   t || jj| jjS r   )r)   r   r   r   )r2   rK  r'   r'   r(   %prepare_decoder_input_ids_from_labels  s   zAMvpForConditionalGeneration.prepare_decoder_input_ids_from_labels)NT)NNNNNNNNNNNNNN)rB   rC   rD   r9  r   r1   rF   r   r   r   rB  rC  r,  r   r9   r  rG   r:  r   r	   r   r   r?   rT  rH   r'   r'   r3   r(   r     s~    
		
zr   z
    Mvp model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c                       s   e Zd Zdef fddZdd Ze												ddejdB dej	dB d	ejdB d
ejdB de
ej dB dejdB dejdB dejdB dedB dedB dedB dedB deeB fddZ  ZS )MvpForSequenceClassificationr   c                    sB   t  j|fi | t|| _t|j|j|j|j| _| 	  d S r   )
r0   r1   r"  r   r   r   
num_labelsclassifier_dropoutclassification_headr   )r2   r   r  r3   r'   r(   r1     s   
z%MvpForSequenceClassification.__init__c                 C   rH  r  )r   r,  rX  r+  r&  r'   r'   r(   r,    rJ  z3MvpForSequenceClassification.set_lightweight_tuningNr   r\   r-  r.  r/  r   r0  rK  r   r^   r   r   r`   c                 K   s"  |dur|n| j j}|durd}	|du r!|dur!td| jj | j||||||||	|
||d}|d }|| j j|j	}t
t|d dkd ||ddf |dd|ddddddf }| |}d}|dur| j jdu r| j jdkrd	| j _n| j jdkr|jtjks|jtjkrd
| j _nd| j _| j jd	krt }| j jdkr|| | }n,|||}n&| j jd
krt }||d| j j|d}n| j jdkrt }|||}|s|f|dd  }|dur|f| S |S t|||j|j|j|j|j |j!|j"d	S )a
  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Example of single-label classification:

        Fine-tuning a model on `num_labels` classes
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForSequenceClassification

        >>> num_labels = 2  # for example, this is a binary classification task
        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForSequenceClassification.from_pretrained("RUCAIBox/mvp", num_labels=num_labels)

        >>> inputs = tokenizer("Classify: Hello, my dog is cute", return_tensors="pt")
        >>> labels = torch.tensor(1)  # the real label for inputs

        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax()
        ```
        NFz8Passing input embeddings is currently not supported for 
r\   r-  r.  r/  r   r0  r   r^   r   r   r   r   z7All examples must have the same number of <eos> tokens.r    
regressionsingle_label_classificationmulti_label_classificationrL  )#r   r  NotImplementedErrorr4   rB   r   eqeos_token_idrt   r8   r   r9   unique_consecutivesumnumelro   rf   rX  problem_typerV  r7   r;   rF   r   squeezer   r   r   r[   r1  r2  r  r3  r   r4  )r2   r   r\   r-  r.  r/  r   r0  rK  r   r^   r   r   r  r   rY   eos_masksentence_representationrN  rM  rR  rS  r'   r'   r(   r?     s~   E$

$

z$MvpForSequenceClassification.forward)NNNNNNNNNNNN)rB   rC   rD   r   r1   r,  r   r9   r  rG   r:  r   r   r   r   r?   rH   r'   r'   r3   r(   rU    sV    	
rU  c                       s   e Zd Z fddZdd Ze													ddejdB dejdB dejdB d	ejdB d
e	ej
 dB dejdB dejdB dej
dB dej
dB dedB dedB dedB dedB deeB fddZ  ZS )MvpForQuestionAnsweringc                    sB   t  | d|_|j| _t|| _t|j|j| _| 	  d S r-   )
r0   r1   rV  r"  r   r   rT   hidden_size
qa_outputsr   r   r3   r'   r(   r1   S  s   
z MvpForQuestionAnswering.__init__c                 C   rH  r  )r   r,  ri  r+  r&  r'   r'   r(   r,  _  rJ  z.MvpForQuestionAnswering.set_lightweight_tuningNr   r\   r-  r.  r/  start_positionsend_positionsr   r0  r   r^   r   r   r`   c                 K   st  |dur|n| j j}|dur|durd}
| j|||||||	|
|||d}|d }| |}|jddd\}}|d }|d }d}|dur|durt| dkr[|d}t| dkrh|d}|d}|	d|}|	d|}t
|d}|||}|||}|| d	 }|s||f|dd  }|dur|f| S |S t||||j|j|j|j|j|j|jd

S )a
  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.

        Example:

        Fine-tuning a model for extrative question answering, and our model also supports generative question answering
        using `BartForConditionalGeneration`
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForQuestionAnswering

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForQuestionAnswering.from_pretrained("RUCAIBox/mvp")

        >>> inputs = tokenizer(
        ...     "Answer the following question: Who was Jim Henson? [SEP] Jim Henson was a nice puppet",
        ...     return_tensors="pt",
        ... )
        >>> target_start_index = torch.tensor([18])
        >>> target_end_index = torch.tensor([19])

        >>> loss = model(**inputs, start_positions=target_start_index, end_positions=target_end_index).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> answer_start_index = outputs.start_logits.argmax()
        >>> answer_end_index = outputs.end_logits.argmax()

        >>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
        >>> predict_answer = tokenizer.decode(predict_answer_tokens)
        ```
        NFrY  r   r   r    ra   )ignore_indexr.   )
rM  start_logits
end_logitsr[   r1  r2  r  r3  r   r4  )r   r  r   ri  r   rd  
contiguousr5  rf   r   r   r   r[   r1  r2  r  r3  r   r4  )r2   r   r\   r-  r.  r/  rj  rk  r   r0  r   r^   r   r   r  r   sequence_outputrN  rm  rn  
total_lossignored_indexrR  
start_lossend_lossrS  r'   r'   r(   r?   c  sj   K







zMvpForQuestionAnswering.forwardr7  )rB   rC   rD   r1   r,  r   r9   rG   r  r:  r   r   r   r   r?   rH   r'   r'   r3   r(   rg  Q  s\    	
rg  c                       s(   e Zd ZdZ fddZdd Z  ZS )MvpDecoderWrapperz
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
    c                    s"   t  | t|| _|   d S r   )r0   r1   r  r%  r   r   r3   r'   r(   r1     s   
zMvpDecoderWrapper.__init__c                 O   s   | j |i |S r   )r%  )r2   argsr  r'   r'   r(   r?     s   zMvpDecoderWrapper.forward)rB   rC   rD   rE   r1   r?   rH   r'   r'   r3   r(   ru    s    ru  c                       s   e Zd ZddiZ fddZdd Zdd Zd	d
 Ze													dde	j
dB de	jdB de	jdB de	jdB dedB de	jdB de	j
dB dedB dedB dedB dedB de	jdB dee	jB deeB fddZ  ZS )MvpForCausalLMr<  z!model.decoder.embed_tokens.weightc                    sD   d|_ d|_t | t|| _tj|j|j	dd| _
|   d S )NTFrQ   )rN   r  r0   r1   ru  r   r   rT   rh  r   r>  r   r   r3   r'   r(   r1      s   
zMvpForCausalLM.__init__c                 C   s
   | j jjS r   r   r%  r   r&  r'   r'   r(   r'    s   
z#MvpForCausalLM.get_input_embeddingsc                 C   s   || j j_d S r   rx  r(  r'   r'   r(   r*    s   z#MvpForCausalLM.set_input_embeddingsc                 C   rH  r  rI  r&  r'   r'   r(   r,    rJ  z%MvpForCausalLM.set_lightweight_tuningNr   r   r\   r   r   r[   r   rK  r   r^   r   r   r_   logits_to_keepr`   c                 K   s  |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}| jj||||||||	|
|d
}|d }t|tr=t| dn|}| 	|dd|ddf }d}|durdt
 }||d| j j|d}|sz|f|dd  }|durx|f| S |S t|||j|j|j|jdS )ap  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MvpForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForCausalLM.from_pretrained("RUCAIBox/mvp")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> list(logits.shape)
        [1, 8, 50267]
        ```N)
r   r\   r   r   r[   r   r   r^   r   r   r   r    r   )rM  rN  r[   rY   r  r  )r   r^   r   r  r   r%  rg   rF   slicer>  r   ro   r   r   r[   rY   r  r  )r2   r   r\   r   r   r[   r   rK  r   r^   r   r   r_   ry  r  r   rY   slice_indicesrN  rM  rR  rS  r'   r'   r(   r?     sD   (zMvpForCausalLM.forward)NNNNNNNNNNNNr   )rB   rC   rD   r9  r1   r'  r*  r,  r   r9   r  rG   r   r	   r   rF   r   r   r?   rH   r'   r'   r3   r(   rw    sb    	
rw  )rw  r   rg  rU  r"  r   )@rE   r   r9   r   torch.nnr   r   r    r   r   activationsr   cache_utilsr	   r
   r   
generationr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   r   configuration_mvpr   
get_loggerrB   r  rG   rF   r)   r   r*   ModulerI   r   r   r   r   r   r   r  r"  r   rU  rg  ru  rw  __all__r'   r'   r'   r(   <module>   sb   $	
 ?n $ l  " " l