"""PyTorch MVP model."""

import copy
import math
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
    Seq2SeqQuestionAnsweringModelOutput,
    Seq2SeqSequenceClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from .configuration_mvp import MvpConfig


logger = logging.get_logger(__name__)


def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.
    """
    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
    shifted_input_ids[:, 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")
    # replace possible -100 values in labels by `pad_token_id`
    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

    return shifted_input_ids


class MvpLearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # the embedding ids are offset by 2, and the embedding size is adjusted accordingly
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0, position_ids: torch.Tensor = None):
        """`input_ids' shape is expected to be [bsz x seqlen]."""
        if position_ids is None:
            bsz, seq_len = input_ids.shape[:2]
            position_ids = torch.arange(
                past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
            ).expand(bsz, -1)
        else:
            position_ids = position_ids.unsqueeze(0)

        return super().forward(position_ids + self.offset)


class MvpAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        attn_prompt: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""
        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states) * self.scaling
        if is_cross_attention and past_key_value is not None:
            # reuse the cached cross-attention keys and values
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)
        else:
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            past_key_value = (key_states, value_states)

        # prepend the layer-wise prompt to the keys and values
        if attn_prompt is not None:
            key_states = torch.cat([attn_prompt[0].expand(bsz, -1, -1, -1), key_states], dim=2)
            value_states = torch.cat([attn_prompt[1].expand(bsz, -1, -1, -1), value_states], dim=2)
            if attention_mask is not None:
                prompt_mask = torch.zeros(bsz, 1, tgt_len, attn_prompt[0].size(1)).to(attention_mask.device)
                attention_mask = torch.cat([prompt_mask, attention_mask], dim=-1)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                    f" {layer_head_mask.size()}"
                )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if output_attentions:
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped, past_key_value


class MvpEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: MvpConfig):
        super().__init__()
        self.embed_dim = config.d_model
        self.self_attn = MvpAttention(
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
        )
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        attention_mask: torch.FloatTensor,
        layer_head_mask: torch.FloatTensor,
        self_attn_prompt: torch.FloatTensor,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
                `(2, encoder_attention_heads, pro_len, head_dim)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states
        hidden_states, attn_weights, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            attn_prompt=self_attn_prompt,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        residual = hidden_states
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        if hidden_states.dtype == torch.float16 and (
            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
        ):
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class MvpDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: MvpConfig):
        super().__init__()
        self.embed_dim = config.d_model

        self.self_attn = MvpAttention(
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
        )
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.encoder_attn = MvpAttention(
            self.embed_dim,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
        )
        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
        self_attn_prompt: Optional[torch.Tensor] = None,
        cross_attn_prompt: Optional[torch.Tensor] = None,
        past_key_value: Optional[tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
                `(2, decoder_attention_heads, pro_len, head_dim)`.
            cross_attn_prompt (`torch.FloatTensor`): prompt of cross attention of shape
                `(2, decoder_attention_heads, pro_len, head_dim)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        # Self Attention: cached self-attention key/value states are at positions 1, 2 of past_key_value
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=self_attn_past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            attn_prompt=self_attn_prompt,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Cross-Attention Block
        cross_attn_present_key_value = None
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            residual = hidden_states

            # cached cross-attention key/value states are at positions 3, 4 of past_key_value
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                attn_prompt=cross_attn_prompt,
                past_key_value=cross_attn_past_key_value,
                output_attentions=output_attentions,
            )
            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
            hidden_states = residual + hidden_states
            hidden_states = self.encoder_attn_layer_norm(hidden_states)

            present_key_value = present_key_value + cross_attn_present_key_value

        # Fully Connected
        residual = hidden_states
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


class MvpClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float):
        super().__init__()
        self.dense = nn.Linear(input_dim, inner_dim)
        self.dropout = nn.Dropout(p=pooler_dropout)
        self.out_proj = nn.Linear(inner_dim, num_classes)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.dense(hidden_states)
        hidden_states = torch.tanh(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.out_proj(hidden_states)
        return hidden_states


class MvpPrompt(nn.Module):
    """Layer-wise prompt for encoder or decoder."""

    def __init__(self, config, num_layers, num_heads):
        super().__init__()
        self.prompt_length = config.prompt_length
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.head_dim = config.d_model // num_heads
        self.dropout = nn.Dropout(p=config.dropout)
        self.prompt_embedding = nn.Embedding(config.prompt_length, config.d_model)
        self.prompt_trans = nn.Sequential(
            nn.Linear(config.d_model, config.prompt_mid_dim),
            nn.GELU(),
            nn.Linear(config.prompt_mid_dim, num_layers * 2 * config.d_model),
        )

    def forward(self, prompt_ids: torch.Tensor) -> tuple[torch.Tensor]:
        prompt = self.prompt_trans(self.prompt_embedding(prompt_ids))
        prompt = prompt.view(self.prompt_length, self.num_layers * 2, self.num_heads, self.head_dim)
        prompt = self.dropout(prompt)
        prompt = prompt.permute([1, 2, 0, 3]).split(2)
        return prompt


@auto_docstring
class MvpPreTrainedModel(PreTrainedModel):
    config_class = MvpConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        std = self.config.init_std
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    @property
    def dummy_inputs(self):
        pad_token = self.config.pad_token_id
        input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
        dummy_inputs = {
            "attention_mask": input_ids.ne(pad_token),
            "input_ids": input_ids,
        }
        return dummy_inputs


class MvpEncoder(MvpPreTrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`MvpEncoderLayer`].

    Args:
        config: MvpConfig
        embed_tokens (nn.Embedding): output embedding
        use_prompt (bool): whether to use prompt
    """

    def __init__(
        self, config: MvpConfig, embed_tokens: Optional[nn.Embedding] = None, use_prompt: Optional[bool] = False
    ):
        super().__init__(config)

        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop

        embed_dim = config.d_model
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_position_embeddings
        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0

        if embed_tokens is not None:
            self.embed_tokens = embed_tokens
        else:
            self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)

        self.embed_positions = MvpLearnedPositionalEmbedding(config.max_position_embeddings, embed_dim)
        self.layers = nn.ModuleList([MvpEncoderLayer(config) for _ in range(config.encoder_layers)])
        self.layernorm_embedding = nn.LayerNorm(embed_dim)

        self.use_prompt = use_prompt
        if use_prompt:
            self.prompt_length = config.prompt_length
            self.self_attn_prompt = MvpPrompt(config, config.encoder_layers, config.encoder_attention_heads)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        r"""
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices (1 for tokens that are **not masked**,
                0 for tokens that are **masked**).
            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules (1 indicates the head is **not masked**,
                0 indicates the head is **masked**).
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can directly pass an embedded representation.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input = input_ids
            input_shape = input.shape
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input = inputs_embeds[:, :, -1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        embed_pos = self.embed_positions(input)

        hidden_states = inputs_embeds + embed_pos
        hidden_states = self.layernorm_embedding(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        # layer-wise prompt
        if self.use_prompt:
            prompt_ids = torch.arange(self.prompt_length).to(self.device)
            self_attn_prompt = self.self_attn_prompt(prompt_ids)

        # expand attention_mask: [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        if attention_mask is not None:
            attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        # check if head_mask has a correct number of layers specified if desired
        if head_mask is not None:
            if head_mask.size()[0] != len(self.layers):
                raise ValueError(
                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
                    f" {head_mask.size()[0]}."
                )

        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            # add LayerDrop
            to_drop = False
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:  # skip the layer
                    to_drop = True

            if to_drop:
                layer_outputs = (None, None)
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    self_attn_prompt=(self_attn_prompt[idx] if self.use_prompt else None),
                    output_attentions=output_attentions,
                )
                hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class MvpDecoder(MvpPreTrainedModel):
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MvpDecoderLayer`]

    Args:
        config: MvpConfig
        embed_tokens (nn.Embedding): output embedding
        use_prompt (bool): whether to use prompt
    """

    def __init__(
        self, config: MvpConfig, embed_tokens: Optional[nn.Embedding] = None, use_prompt: Optional[bool] = False
    ):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.decoder_layerdrop
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0

        if embed_tokens is not None:
            self.embed_tokens = embed_tokens
        else:
            self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)

        self.embed_positions = MvpLearnedPositionalEmbedding(config.max_position_embeddings, config.d_model)
        self.layers = nn.ModuleList([MvpDecoderLayer(config) for _ in range(config.decoder_layers)])
        self.layernorm_embedding = nn.LayerNorm(config.d_model)

        self.use_prompt = use_prompt
        if use_prompt:
            self.prompt_length = config.prompt_length
            self.self_attn_prompt = MvpPrompt(config, config.decoder_layers, config.decoder_attention_heads)
            self.cross_attn_prompt = MvpPrompt(config, config.decoder_layers, config.decoder_attention_heads)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        r"""
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices (1 = not masked, 0 = masked).
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the
                cross-attention of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids.
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules.
            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules in the decoder.
            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*):
                Pre-computed key and value hidden-states of the self-attention and cross-attention blocks, used to
                speed up sequential decoding. If used, the user can optionally input only the last
                `decoder_input_ids` of shape `(batch_size, 1)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can directly pass an embedded representation.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            input = input_ids
            input_shape = input.shape
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            input = inputs_embeds[:, :, -1]
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        # past_key_values_length
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input) * self.embed_scale

        attention_mask = _prepare_4d_causal_attention_mask(
            attention_mask, input_shape, inputs_embeds, past_key_values_length
        )

        # expand encoder attention mask: [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            encoder_attention_mask = _prepare_4d_attention_mask(
                encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
            )

        # embed positions
        positions = self.embed_positions(input, past_key_values_length)

        hidden_states = inputs_embeds + positions
        hidden_states = self.layernorm_embedding(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        # layer-wise prompt
        if self.use_prompt:
            prompt_ids = torch.arange(self.prompt_length).to(self.device)
            self_attn_prompt = self.self_attn_prompt(prompt_ids)
            cross_attn_prompt = self.cross_attn_prompt(prompt_ids)

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
        next_decoder_cache = () if use_cache else None

        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
            if attn_mask is not None:
                if attn_mask.size()[0] != len(self.layers):
                    raise ValueError(
                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                        f" {attn_mask.size()[0]}."
                    )
        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            # add LayerDrop
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                cross_attn_layer_head_mask=(
                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
                ),
                self_attn_prompt=(self_attn_prompt[idx] if self.use_prompt else None),
                cross_attn_prompt=(cross_attn_prompt[idx] if self.use_prompt else None),
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )
            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )


@auto_docstring
class MvpModel(MvpPreTrainedModel):
    _keys_to_ignore_on_load_unexpected = ["final_logits_bias"]
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config: MvpConfig):
        super().__init__(config)

        padding_idx, vocab_size = config.pad_token_id, config.vocab_size
        self.use_prompt = config.use_prompt
        self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)

        self.encoder = MvpEncoder(config, self.shared, config.use_prompt)
        self.decoder = MvpDecoder(config, self.shared, config.use_prompt)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, value):
        self.shared = value
        self.encoder.embed_tokens = self.shared
        self.decoder.embed_tokens = self.shared

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def set_lightweight_tuning(self):
        assert self.use_prompt, "If you want to use lightweight tuning, make sure that `use_prompt=True`."

        self.requires_grad_(False)
        self.encoder.self_attn_prompt.requires_grad_(True)
        self.decoder.self_attn_prompt.requires_grad_(True)
        self.decoder.cross_attn_prompt.requires_grad_(True)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[list[torch.FloatTensor]] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, Seq2SeqModelOutput]:
        r"""
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary. Mvp uses the `eos_token_id` as the starting
            token for `decoder_input_ids` generation. If `past_key_values` is used, optionally only the last
            `decoder_input_ids` have to be input. If no `decoder_input_ids` is provided, the model creates this
            tensor by shifting the `input_ids` to the right for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
            also be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder (1 indicates the head is
            **not masked**, 0 indicates the head is **masked**).
        """
        # different to other models, Mvp automatically creates decoder_input_ids from
        # input_ids if no decoder_input_ids are provided
        if decoder_input_ids is None and decoder_inputs_embeds is None:
            if input_ids is None:
                raise ValueError(
                    "If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. "
                    "Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
                )

            decoder_input_ids = shift_tokens_right(
                input_ids, self.config.pad_token_id, self.config.decoder_start_token_id
            )

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        # wrap a user-supplied tuple in a BaseModelOutput when return_dict=True
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The MVP Model with a language modeling head. Can be used for various text generation tasks.
    """
)
class MvpForConditionalGeneration(MvpPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight", "lm_head.weight"]

    def __init__(self, config: MvpConfig):
        super().__init__(config)
        self.model = MvpModel(config)
        self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
        self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.model.get_encoder()

    def get_decoder(self):
        return self.model.get_decoder()

    def resize_token_embeddings(
        self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, mean_resizing: bool = True
    ) -> nn.Embedding:
        new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
        self._resize_final_logits_bias(new_num_tokens)
        return new_embeddings

    def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
        old_num_tokens = self.final_logits_bias.shape[-1]
        if new_num_tokens <= old_num_tokens:
            new_bias = self.final_logits_bias[:, :new_num_tokens]
        else:
            extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
            new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
        self.register_buffer("final_logits_bias", new_bias)

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_lightweight_tuning(self):
        self.model.set_lightweight_tuning()
        self.lm_head.requires_grad_(False)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[list[torch.FloatTensor]] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, Seq2SeqLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example of summarization:

        Fine-tuning a model
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp")

        >>> inputs = tokenizer(
        ...     "Summarize: You may want to stick it to your boss and leave your job, but don't do it if these are your reasons.",
        ...     return_tensors="pt",
        ... )
        >>> labels = tokenizer("Bad Reasons To Quit Your Job", return_tensors="pt")["input_ids"]

        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     generated_ids = model.generate(**inputs)

        >>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            if use_cache:
                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
            use_cache = False
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return Seq2SeqLMOutput(
            loss=masked_lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            # cached cross-attention states don't have to be reordered -> they are always the same
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
                + layer_past[2:],
            )
        return reordered_past


@auto_docstring(
    custom_intro="""
    Mvp model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    """
)
class MvpForSequenceClassification(MvpPreTrainedModel):
    _tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight"]

    def __init__(self, config: MvpConfig, **kwargs):
        super().__init__(config, **kwargs)
        self.model = MvpModel(config)
        self.classification_head = MvpClassificationHead(
            config.d_model,
            config.d_model,
            config.num_labels,
            config.classifier_dropout,
        )

        # Initialize weights and apply final processing
        self.post_init()

    def set_lightweight_tuning(self):
        self.model.set_lightweight_tuning()
        self.classification_head.requires_grad_(False)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, Seq2SeqSequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Example of single-label classification:

        Fine-tuning a model on `num_labels` classes
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForSequenceClassification

        >>> num_labels = 2  # for example, this is a binary classification task
        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForSequenceClassification.from_pretrained("RUCAIBox/mvp", num_labels=num_labels)

        >>> inputs = tokenizer("Classify: Hello, my dog is cute", return_tensors="pt")
        >>> labels = torch.tensor(1)  # the real label for inputs

        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax()
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        if input_ids is None and inputs_embeds is not None:
            raise NotImplementedError(
                f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
            )

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]  # last hidden state

        eos_mask = input_ids.eq(self.config.eos_token_id).to(hidden_states.device)

        if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:
            raise ValueError("All examples must have the same number of <eos> tokens.")
        sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[
            :, -1, :
        ]
        logits = self.classification_head(sentence_representation)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.config.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.config.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )


@auto_docstring
class MvpForQuestionAnswering(MvpPreTrainedModel):
    _tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight"]

    def __init__(self, config):
        super().__init__(config)

        config.num_labels = 2
        self.num_labels = config.num_labels

        self.model = MvpModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def set_lightweight_tuning(self):
        self.model.set_lightweight_tuning()
        self.qa_outputs.requires_grad_(False)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[list[torch.FloatTensor]] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, Seq2SeqQuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for the position (index) of the start of the labelled span for computing the token classification
            loss. Positions outside of the sequence are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for the position (index) of the end of the labelled span for computing the token classification
            loss. Positions outside of the sequence are not taken into account for computing the loss.

        Example:

        Fine-tuning a model for extrative question answering, and our model also supports generative question answering
        using `BartForConditionalGeneration`
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForQuestionAnswering

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForQuestionAnswering.from_pretrained("RUCAIBox/mvp")

        >>> inputs = tokenizer(
        ...     "Answer the following question: Who was Jim Henson? [SEP] Jim Henson was a nice puppet",
        ...     return_tensors="pt",
        ... )
        >>> target_start_index = torch.tensor([18])
        >>> target_end_index = torch.tensor([19])

        >>> loss = model(**inputs, start_positions=target_start_index, end_positions=target_end_index).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> answer_start_index = outputs.start_logits.argmax()
        >>> answer_end_index = outputs.end_logits.argmax()

        >>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
        >>> predict_answer = tokenizer.decode(predict_answer_tokens)
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if start_positions is not None and end_positions is not None:
            use_cache = False

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[1:]
            return ((total_loss,) + output) if total_loss is not None else output

        return Seq2SeqQuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )


class MvpDecoderWrapper(MvpPreTrainedModel):
    """
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
    """

    def __init__(self, config):
        super().__init__(config)
        self.decoder = MvpDecoder(config)

    def forward(self, *args, **kwargs):
        return self.decoder(*args, **kwargs)


class MvpForCausalLM(MvpPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        config = copy.deepcopy(config)
        config.is_decoder = True
        config.is_encoder_decoder = False
        super().__init__(config)
        self.model = MvpDecoderWrapper(config)

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.decoder.embed_tokens

    def set_input_embeddings(self, value):
        self.model.decoder.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model.decoder = decoder

    def get_decoder(self):
        return self.model.decoder

    def set_lightweight_tuning(self):
        self.model.set_lightweight_tuning()
        self.lm_head.requires_grad_(False)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MvpForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForCausalLM.from_pretrained("RUCAIBox/mvp", add_cross_attention=False)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> list(logits.shape)
        [1, 8, 50267]
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        logits = self.lm_head(outputs[0])

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past


__all__ = [
    "MvpForCausalLM",
    "MvpForConditionalGeneration",
    "MvpForQuestionAnswering",
    "MvpForSequenceClassification",
    "MvpModel",
    "MvpPreTrainedModel",
]