o
    iA                    @   s.  d Z ddlZddlmZmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZmZmZmZmZmZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( e$)e*Z+dej,de-de-fddZ.G dd dej/Z0G dd dej1Z2G dd deZ3G dd deZ4G dd  d ej1Z5G d!d" d"ej1Z6e#G d#d$ d$e!Z7G d%d& d&e7Z8G d'd( d(e7Z9e#G d)d* d*e7Z:e#d+d,G d-d. d.e7eZ;e#d/d,G d0d1 d1e7Z<e#G d2d3 d3e7Z=G d4d5 d5e7Z>G d6d7 d7e7eZ?g d8Z@dS )9zPyTorch MVP model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)_prepare_4d_attention_mask!_prepare_4d_causal_attention_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutput)PreTrainedModel)auto_docstringlogging)deprecate_kwarg   )	MvpConfig	input_idspad_token_iddecoder_start_token_idc                 C   sh   |  | j}| ddddf  |ddddf< ||dddf< |du r*td||dk| |S )z1
    Shift input ids one token to the right.
    Nr   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
ValueErrormasked_fill_)r   r   r    shifted_input_ids r(   a/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/mvp/modeling_mvp.pyshift_tokens_right3   s   (r*   c                       sP   e Zd ZdZdedef fddZ	ddejd	ed
eej f fddZ	  Z
S )MvpLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    num_embeddingsembedding_dimc                    s   d| _ t || j  | d S N   )offsetsuper__init__)selfr,   r-   	__class__r(   r)   r2   I   s   z&MvpLearnedPositionalEmbedding.__init__r   Nr   past_key_values_lengthposition_idsc                    s\   |du r |j dd \}}tj||| tj| jjd|d}n|d}t 	|| j
 S )z3`input_ids' shape is expected to be [bsz x seqlen].Nr/   )dtypedevicer!   r   )r#   torcharangelongweightr9   expand	unsqueezer1   forwardr0   )r3   r   r6   r7   bszseq_lenr4   r(   r)   r@   O   s   
z%MvpLearnedPositionalEmbedding.forward)r   N)__name__
__module____qualname____doc__intr2   r:   Tensorr   r@   __classcell__r(   r(   r4   r)   r+   D   s    r+   c                       s   e Zd ZdZ				ddededee d	ee d
ee dee f fddZe	dddd							dde
jdee
j dee dee
j dee
j dee
j dedee
j dee
jee
j eee
j  f fddZ  ZS )MvpAttentionz=Multi-headed attention from 'Attention Is All You Need' paper        FTN	embed_dim	num_headsdropout
is_decoderbias	layer_idxc                    s   t    || _|| _|| _|| | _| j| | jkr'td| j d| d| jd | _|| _|| _	t
j|||d| _t
j|||d| _t
j|||d| _t
j|||d| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩rP   )r1   r2   rL   rM   rN   head_dimr%   scalingrO   rQ   r   Lineark_projv_projq_projout_proj)r3   rL   rM   rN   rO   rP   rQ   r4   r(   r)   r2   b   s$   
	

zMvpAttention.__init__past_key_valuepast_key_values4.58new_nameversionhidden_stateskey_value_statesattention_masklayer_head_maskattn_promptoutput_attentionscache_positionreturnc	                 C   s  |du}	|  \}
}}| || j }d}|dur1t|tr/|j| j}|	r+|j}n|j	}n|}|	r5|n|}|	rN|durN|rN|j
| j j}|j
| j j}nJ| |}| |}||
d| j| jdd}||
d| j| jdd}|dur|	s||nd}|||| jd|i\}}|	rt|trd|j| j< |durtj|d |
ddd|gdd	}tj|d |
ddd|gdd	}|durt|
d||d  d|j}tj||gdd	}|
| j d| jf}||
|| j| jdd}|j| }|j| }|j| }| d}t||dd}|  |
| j ||fkr0td
|
| j ||f d|   |dure|  |
d||fkrPtd|
d||f d|   ||
| j||| }||
| j ||}tjj|dd	}|dur|  | jfkrtd| jf d|   |dddd||
| j|| }||
| j ||}|r||
| j||}||
| j ||}nd}tjj || j | j!d}t||}|  |
| j || jfkrtd|
| j|| jf d|   ||
| j|| j}|dd}||
|| j"}| #|}||fS )z#Input shape: Batch x Time x ChannelNFr!   r   r/   rf   Tr   dimz$Attention weights should be of size z	, but is z!Attention mask should be of size z/Head mask for a single layer should be of size ptrainingz `attn_output` should be of size )$sizerX   rT   
isinstancer   
is_updatedgetrQ   cross_attention_cacheself_attention_cachelayerskeysvaluesrV   rW   viewrM   rS   	transposeupdater:   catr>   zerostor9   reshapebmmr%   r   
functionalsoftmaxrN   rl   rL   rY   )r3   r`   ra   r[   rb   rc   rd   re   rf   is_cross_attentionrA   tgt_len_query_statesro   curr_past_key_valuecurrent_states
key_statesvalue_statesprompt_mask
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputr(   r(   r)   r@      s   


"""





"
zMvpAttention.forward)rK   FTN)NNNNNFN)rC   rD   rE   rF   rG   r   floatboolr2   r   r:   rH   r
   tupler@   rI   r(   r(   r4   r)   rJ   _   s\    	
rJ   c                       sd   e Zd Zdef fddZ	ddejdejdejdejd	ee d
e	ejeej f fddZ
  ZS )MvpEncoderLayerconfigc                    s   t    |j| _t| j|j|jd| _t	| j| _
|j| _t|j | _|j| _t| j|j| _t|j| j| _t	| j| _d S )N)rL   rM   rN   )r1   r2   d_modelrL   rJ   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normrN   r	   activation_functionactivation_fnactivation_dropoutrU   encoder_ffn_dimfc1fc2final_layer_normr3   r   r4   r(   r)   r2     s   
zMvpEncoderLayer.__init__Fr`   rb   rc   self_attn_promptre   rg   c           	      C   s   |}| j |||||d\}}tjj|| j| jd}|| }| |}|}| | |}tjj|| j| jd}| 	|}tjj|| j| jd}|| }| 
|}|jtjkrvt| sdt| rvt|jjd }tj|| |d}||fS )a@  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
                `(2, encoder_attention_heads, pro_len, head_dim)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r`   rb   rc   rd   re   rj   i  )minmax)r   r   r~   rN   rl   r   r   r   r   r   r   r8   r:   float16isinfanyisnanfinfor   clamp)	r3   r`   rb   rc   r   re   residualr   clamp_valuer(   r(   r)   r@     s4   



zMvpEncoderLayer.forward)F)rC   rD   rE   r   r2   r:   FloatTensorr   r   r   r@   rI   r(   r(   r4   r)   r      s     r   c                !       s   e Zd Zddef fddZedddd											
	ddejdeej deej deej deej deej deej deej dee	 dee
 dee
 deej deejeeejejf  f fddZ  ZS )MvpDecoderLayerNr   c                    s   t    |j| _t| j|j|jd|d| _|j| _t	|j
 | _|j| _t| j| _t| j|j|jd|d| _t| j| _t| j|j| _t|j| j| _t| j| _d S )NT)rL   rM   rN   rO   rQ   )rN   rO   rQ   )r1   r2   r   rL   rJ   decoder_attention_headsr   r   rN   r	   r   r   r   r   r   r   encoder_attnencoder_attn_layer_normrU   decoder_ffn_dimr   r   r   )r3   r   rQ   r4   r(   r)   r2   D  s0   
zMvpDecoderLayer.__init__rZ   r[   r\   r]   FTr`   rb   encoder_hidden_statesencoder_attention_maskrc   cross_attn_layer_head_maskr   cross_attn_promptre   	use_cacherf   rg   c              	   C   s  |}| j ||	||||
|d\}}tjj|| j| jd}|| }| |}d}|durN|}| j||||||	|
d\}}tjj|| j| jd}|| }| |}|}| | 	|}tjj|| j
| jd}| |}tjj|| j| jd}|| }| |}|f}|
r|||f7 }|S )a1  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
                `(2, decoder_attention_heads, pro_len, head_dim)`.
            cross_attn_prompt (`torch.FloatTensor`): prompt of cross attention of shape
                `(2, decoder_attention_heads, pro_len, head_dim)`.
            past_key_values (`Cache`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r`   r[   rb   rc   rd   re   rf   rj   N)r`   ra   rb   rc   rd   r[   re   )r   r   r~   rN   rl   r   r   r   r   r   r   r   r   )r3   r`   rb   r   r   rc   r   r   r   r[   re   r   rf   r   self_attn_weightscross_attn_weightsoutputsr(   r(   r)   r@   `  sN   &
	

	


zMvpDecoderLayer.forwardN)NNNNNNNNFTN)rC   rD   rE   r   r2   r   r:   rH   r   r
   r   r   r   r@   rI   r(   r(   r4   r)   r   C  sR    	
r   c                       sH   e Zd ZdZdedededef fddZdejd	ejfd
dZ	  Z
S )MvpClassificationHeadz-Head for sentence-level classification tasks.	input_dim	inner_dimnum_classespooler_dropoutc                    s8   t    t||| _tj|d| _t||| _d S )Nrk   )r1   r2   r   rU   denseDropoutrN   rY   )r3   r   r   r   r   r4   r(   r)   r2     s   
zMvpClassificationHead.__init__r`   rg   c                 C   s6   |  |}| |}t|}|  |}| |}|S r   )rN   r   r:   tanhrY   )r3   r`   r(   r(   r)   r@     s   




zMvpClassificationHead.forward)rC   rD   rE   rF   rG   r   r2   r:   rH   r@   rI   r(   r(   r4   r)   r     s    r   c                       s:   e Zd ZdZ fddZdejdeej fddZ  Z	S )	MvpPromptz)Layer-wise prompt for encoder or decoder.c              	      s   t    |j| _|| _|| _|j| | _tj|j	d| _	t
|j|j| _tt|j|jt t|j|d |j | _d S )Nr   r/   )r1   r2   prompt_length
num_layersrM   r   rS   r   r   rN   	Embeddingprompt_embedding
SequentialrU   prompt_mid_dimGELUprompt_trans)r3   r   r   rM   r4   r(   r)   r2     s   

zMvpPrompt.__init__
prompt_idsrg   c                 C   sN   |  | |}|| j| jd | j| j}| |}|g d	d}|S )Nr/   )r   r/   r   r   )
r   r   rv   r   r   rM   rS   rN   permutesplit)r3   r   promptr(   r(   r)   r@     s
   
zMvpPrompt.forward)
rC   rD   rE   rF   r2   r:   rH   r   r@   rI   r(   r(   r4   r)   r     s    "r   c                   @   s2   e Zd ZU eed< dZdZdd Zedd Z	dS )	MvpPreTrainedModelr   modelTc                 C   s   | j j}t|tjr"|jjjd|d |jd ur |jj	  d S d S t|tj
rA|jjjd|d |jd urC|jj|j 	  d S d S d S )NrK   )meanstd)r   init_stdrn   r   rU   r=   datanormal_rP   zero_r   padding_idx)r3   moduler   r(   r(   r)   _init_weights  s   

z MvpPreTrainedModel._init_weightsc                 C   s>   | j j}tjg ddddd|gg| jd}|||d}|S )N)r      
      r/   r         r/   r9   )rb   r   )r   r   r:   tensorr9   ne)r3   	pad_tokenr   dummy_inputsr(   r(   r)   r     s   "zMvpPreTrainedModel.dummy_inputsN)
rC   rD   rE   r   __annotations__base_model_prefixsupports_gradient_checkpointingr   propertyr   r(   r(   r(   r)   r     s   
 r   c                       s   e Zd ZdZ	ddedeej dee f fddZ								dd	ee
j d
ee
j dee
j dee
j dee dee dee deeef fddZ  ZS )
MvpEncodera  
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`MvpEncoderLayer`].

    Args:
        config: MvpConfig
        embed_tokens (nn.Embedding): output embedding
        use_prompt (bool): whether to use prompt
    NFr   embed_tokens
use_promptc                    s   t     j| _ j| _ j} j| _ j| _	 j
r!t|nd| _|d ur,|| _n
t j|| j| _t j|| _t fddt jD | _t|| _|| _|re j| _t  j j| _d| _|    d S )N      ?c                    s   g | ]}t  qS r(   )r   ).0r   r   r(   r)   
<listcomp>(  s    z'MvpEncoder.__init__.<locals>.<listcomp>F)!r1   r2   rN   encoder_layerdrop	layerdropr   r   r   max_position_embeddingsmax_source_positionsscale_embeddingmathsqrtembed_scaler   r   r   
vocab_sizer+   embed_positions
ModuleListrangeencoder_layersrs   r   layernorm_embeddingr   r   r   r   r   gradient_checkpointing	post_init)r3   r   r   r   rL   r4   r   r)   r2     s4    zMvpEncoder.__init__r   rb   	head_maskinputs_embedsre   output_hidden_statesreturn_dictrg   c                 C   sb  |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur*|dur*td|dur<|}|j}	|d|	d }n|durT| dd }	|dddddf }ntd|du rd| || j	 }| 
|}
||
 }| |}tjj|| j| jd}| jrt| j| j}| |}|durt||j}|rdnd}|rdnd}|dur| d t| jkrtdt| j d	| d  d
t| jD ]G\}}|r||f }d}| jrtg }|| jk rd}|rd}n||||dur|| nd| jr|| nd|d}|d }|r||d f }q|r||f }|s*tdd |||fD S t|||dS )a~  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzDYou cannot specify both input_ids and inputs_embeds at the same timer!   z5You have to specify either input_ids or inputs_embedsrj   r(   r   z&The head_mask should be specified for  layers, but it is for .FT)NN)rc   r   re   r   c                 s       | ]	}|d ur|V  qd S r   r(   r   vr(   r(   r)   	<genexpr>  s    z%MvpEncoder.forward.<locals>.<genexpr>last_hidden_stater`   
attentions) r   re   r  use_return_dictr%   r#   rv   rm   r   r   r   r   r   r~   rN   rl   r   r:   r;   r   r{   r9   r   r   r8   lenrs   	enumeraterandr   r   r   )r3   r   rb   r  r  re   r  r  inputinput_shape	embed_posr`   r   r   encoder_statesall_attentionsidxencoder_layerto_dropdropout_probabilitylayer_outputsr(   r(   r)   r@   8  s|   .







zMvpEncoder.forwardNF)NNNNNNN)rC   rD   rE   rF   r   r   r   r   r   r2   r:   
LongTensorrH   r   r   r   r   r@   rI   r(   r(   r4   r)   r     sD    (
	r   c                       s   e Zd ZdZ	ddedeej dee f fddZ														dd	ee
j d
ee
j dee
j dee
j dee
j dee
j dee dee
j dee dee dee dee dee
j deeef fddZ  ZS )
MvpDecoderz
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MvpDecoderLayer`]

    Args:
        config: MvpConfig
        embed_tokens (nn.Embedding): output embedding
        use_prompt (bool): whether to use prompt
    NFr   r   r   c                    s   t     j| _ j| _ j| _ j| _ j	rt
 jnd| _|d ur*|| _nt j j| j| _t j j| _t fddt jD | _t j| _|| _|ro j| _t  j j| _t  j j| _d| _ | !  d S )Nr   c                    s   g | ]}t  |d qS ))rQ   )r   )r   ir   r(   r)   r     s    z'MvpDecoder.__init__.<locals>.<listcomp>F)"r1   r2   rN   decoder_layerdropr   r   r   r   max_target_positionsr   r   r   r   r   r   r   r   r   r+   r   r   r   decoder_layersrs   r   r   r   r   r   r   r   r   r   r  )r3   r   r   r   r4   r   r)   r2     s<    zMvpDecoder.__init__r   rb   r   r   r  cross_attn_head_maskr[   r  r   re   r  r  rf   rg   c                 C   s  |
dur|
n| j j}
|dur|n| j j}|	dur|	n| j j}	|dur$|n| j j}|dur4|dur4td|durF|}|j}|d|d }n|dur^| dd }|dddddf }ntd|du rn| 	|| j
 }| jr}| jr}|	r}td d}	|	r|du r|durtt| j dt| j dnt| j d}|	rt|trtd t|}|dur| nd	}t||||}|dur|durt||j|d d
}| ||}|| }| |}tjj|| j| jd}| jrt| j !| j"}| #|}| $|}|rdnd}|
rdnd}|
r|durdnd}t%||gddgD ]+\}}|durL| d	 t&| j'krLtd| dt&| j' d| d	  dq"t(| j'D ]j\}}|r_||f7 }| jrpt)g }|| j*k rpqS||||||dur~|| nd|dur|| nd| jr|| nd| jr|| nd||
|	|d}|d	 }|
r||d f7 }|dur||d f7 }qS|r||f7 }|stdd |||||fD S t+|||||dS )aE  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
                cross-attention on hidden heads. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer!   zEYou have to specify either decoder_input_ids or decoder_inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   )r   rj   r(   r  r$  zThe `z` should be specified for r  r  )	r   rc   r   r   r   r[   re   r   rf   r   r/   c                 s   r  r   r(   r	  r(   r(   r)   r    s    z%MvpDecoder.forward.<locals>.<genexpr>)r  r[   r`   r  cross_attentions),r   re   r  r   r  r%   r#   rv   rm   r   r   r   rl   loggerwarning_oncer   r   rn   r   from_legacy_cacheget_seq_lengthr   r   r8   r   r   r   r~   rN   r   r:   r;   r   r{   r9   r   r   zipr  rs   r  r  r   r   )r3   r   rb   r   r   r  r$  r[   r  r   re   r  r  rf   r  r  r6   	positionsr`   r   r   r   all_hidden_statesall_self_attnsall_cross_attentions	attn_mask	mask_namer  decoder_layerr  r  r(   r(   r)   r@     s   O










zMvpDecoder.forwardr  )NNNNNNNNNNNNN)rC   rD   rE   rF   r   r   r   r   r   r2   r:   r  rH   r   r
   r   r   r   r@   rI   r(   r(   r4   r)   r    sh    
*	

r  c                &       s&  e Zd ZdgZddgZdef fddZdd Zd	d
 Zdd Z	dd Z
e																d#deej deej deej deej deej deej deej deeej  dee deej deej dee dee dee dee deej d eeef f"d!d"Z  ZS )$MvpModelfinal_logits_biasencoder.embed_tokens.weightdecoder.embed_tokens.weightr   c                    sd   t  | |j|j}}|j| _t||j|| _t	|| j|j| _
t|| j|j| _|   d S r   )r1   r2   r   r   r   r   r   r   sharedr   encoderr  decoderr  )r3   r   r   r   r4   r(   r)   r2     s   zMvpModel.__init__c                 C      | j S r   )r6  r3   r(   r(   r)   get_input_embeddings     zMvpModel.get_input_embeddingsc                 C   s   || _ | j | j_| j | j_d S r   )r6  r7  r   r8  r3   valuer(   r(   r)   set_input_embeddings  s   
zMvpModel.set_input_embeddingsc                 C   r9  r   )r7  r:  r(   r(   r)   get_encoder  r<  zMvpModel.get_encoderc                 C   sF   | j sJ d| d | jjd | jjd | jjd d S )NzHIf you want to use lightweight tuning, make sure that `use_prompt=True`.FT)r   requires_grad_r7  r   r8  r   r:  r(   r(   r)   set_lightweight_tuning  s
   
zMvpModel.set_lightweight_tuningNr   rb   decoder_input_idsdecoder_attention_maskr  decoder_head_maskr$  encoder_outputsr[   r  decoder_inputs_embedsr   re   r  r  rf   rg   c                 C   sJ  |du r|du r|du rt dt|| jj| jj}|dur |n| jj}|dur*|n| jj}|dur4|n| jj}|dur>|n| jj}|du rS| j	||||
|||d}n$|rwt
|tswt|d t|dkrh|d ndt|dkrs|d ndd}| j|||d ||||	||||||d}|s|| S t|j|j|j|j|j|j|j|jd	S )
a*  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        NzIf no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.)r   rb   r  r  re   r  r  r   r   r/   r  )r   rb   r   r   r  r$  r[   r  r   re   r  r  rf   )r  r[   decoder_hidden_statesdecoder_attentionsr%  encoder_last_hidden_stater   encoder_attentions)r%   r*   r   r   r    re   r  r   r  r7  rn   r   r  r8  r   r  r[   r`   r  r%  )r3   r   rb   rC  rD  r  rE  r$  rF  r[   r  rG  r   re   r  r  rf   decoder_outputsr(   r(   r)   r@     sp   3
zMvpModel.forwardNNNNNNNNNNNNNNNN)rC   rD   rE   "_keys_to_ignore_on_load_unexpected_tied_weights_keysr   r2   r;  r?  r@  rB  r   r   r:   r  rH   listr   r
   r   r   r   r   r@   rI   r(   r(   r4   r)   r2    sx    	

r2  ze
    The MVP Model with a language modeling head. Can be used for various text generation tasks.
    )custom_introc                (       sn  e Zd Zg dZdef fddZdd Zdd Z	
d*dede	e de
dejf fddZdedd	fddZdd Ze																																		d+de	ej de	ej de	ej de	ej de	ej de	ej de	ej de	eej  de	e de	ej de	ej d e	ej d!e	e
 d"e	e
 d#e	e
 d$e	e
 d%e	ej deeef f$d&d'Zd ejfd(d)Z  ZS ),MvpForConditionalGeneration)r4  r5  lm_head.weightr   c                    sX   t  | t|| _| dtd| jjjf t	j
|j| jjjdd| _|   d S )Nr3  r   FrR   )r1   r2   r2  r   register_bufferr:   rz   r6  r,   r   rU   r   lm_headr  r   r4   r(   r)   r2   f  s
   
z$MvpForConditionalGeneration.__init__c                 C   
   | j  S r   )r   r@  r:  r(   r(   r)   r@  o     
z'MvpForConditionalGeneration.get_encoderc                 C   rV  r   )r   get_decoderr:  r(   r(   r)   rX  r  rW  z'MvpForConditionalGeneration.get_decoderNTnew_num_tokenspad_to_multiple_ofmean_resizingrg   c                    s   t  |||}| | |S r   )r1   resize_token_embeddings_resize_final_logits_bias)r3   rY  rZ  r[  new_embeddingsr4   r(   r)   r\  u  s   
z3MvpForConditionalGeneration.resize_token_embeddingsc                 C   sj   | j jd }||kr| j d d d |f }ntjd|| f| j jd}tj| j |gdd}| d| d S )Nr!   r   r   rh   r3  )r3  r#   r:   rz   r9   ry   rT  )r3   rY  old_num_tokensnew_bias
extra_biasr(   r(   r)   r]  |  s   z5MvpForConditionalGeneration._resize_final_logits_biasc                 C      | j   | jd d S r  r   rB  rU  rA  r:  r(   r(   r)   rB       
z2MvpForConditionalGeneration.set_lightweight_tuningr   rb   rC  rD  r  rE  r$  rF  r[   r  rG  labelsr   re   r  r  rf   c                 C   s  |dur|n| j j}|dur)|rtd d}|du r)|du r)t|| j j| j j}| j|f||||||||	|
||||||d}| |d | j	 }d}|durat
 }||d| j j|d}|sw|f|dd  }|duru|f| S |S t|||j|j|j|j|j|j|jd	S )	a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example of summarization:

        Fine-tuning a model
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp")

        >>> inputs = tokenizer(
        ...     "Summarize: You may want to stick it to your boss and leave your job, but don't do it if these are your reasons.",
        ...     return_tensors="pt",
        ... )
        >>> labels = tokenizer("Bad Reasons To Quit Your Job", return_tensors="pt")["input_ids"]

        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     generated_ids = model.generate(**inputs)

        >>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        ```
        NzJThe `use_cache` argument is changed to `False` since `labels` is provided.F)rb   rC  rF  rD  r  rE  r$  r[   r  rG  r   re   r  r  rf   r   r!   r   	losslogitsr[   rH  rI  r%  rJ  r   rK  )r   r  r&  warningr*   r   r    r   rU  r3  r   rv   r   r   r[   rH  rI  r%  rJ  r   rK  )r3   r   rb   rC  rD  r  rE  r$  rF  r[   r  rG  re  r   re   r  r  rf   r   	lm_logitsmasked_lm_lossloss_fctoutputr(   r(   r)   r@     s^   R
z#MvpForConditionalGeneration.forwardc                 C   s   t || jj| jjS r   )r*   r   r   r    )r3   re  r(   r(   r)   %prepare_decoder_input_ids_from_labels  s   zAMvpForConditionalGeneration.prepare_decoder_input_ids_from_labels)NT)NNNNNNNNNNNNNNNNN)rC   rD   rE   rO  r   r2   r@  rX  rG   r   r   r   r   r\  r]  rB  r   r:   r  rH   rP  r   r
   r   r   r   r@   rn  rI   r(   r(   r4   r)   rR  ^  s    			

 rR  z
    Mvp model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c                $       s   e Zd ZddgZdef fddZdd Ze															dd	ee	j
 d
ee	j dee	j
 dee	j
 dee	j dee	j dee	j deee	j  dee	j dee	j dee	j
 dee dee dee dee deeef f ddZ  ZS )MvpForSequenceClassificationr4  r5  r   c                    sB   t  j|fi | t|| _t|j|j|j|j| _| 	  d S r   )
r1   r2   r2  r   r   r   
num_labelsclassifier_dropoutclassification_headr  )r3   r   kwargsr4   r(   r)   r2     s   
z%MvpForSequenceClassification.__init__c                 C   rb  r  )r   rB  rr  rA  r:  r(   r(   r)   rB  )  rd  z3MvpForSequenceClassification.set_lightweight_tuningNr   rb   rC  rD  r  rE  r$  rF  r  rG  re  r   re   r  r  rg   c                 C   s.  |dur|n| j j}|durd}|du r!|	dur!td| jj | j|||||||||	|
||||d}|d }|| j j|j	}t
t|ddkrStd||ddf |dd|ddddddf }| |}d}|dur| j jdu r| j jdkrd	| j _n| j jdkr|jtjks|jtjkrd
| j _nd| j _| j jd	krt }| j jdkr|| | }n,|||}n&| j jd
krt }||d| j j|d}n| j jdkrt }|||}|s|f|dd  }|dur|f| S |S t|||j|j|j|j|j |j!|j"d	S )a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Example of single-label classification:

        Fine-tuning a model on `num_labels` classes
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForSequenceClassification

        >>> num_labels = 2  # for example, this is a binary classification task
        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForSequenceClassification.from_pretrained("RUCAIBox/mvp", num_labels=num_labels)

        >>> inputs = tokenizer("Classify: Hello, my dog is cute", return_tensors="pt")
        >>> labels = torch.tensor(1)  # the real label for inputs

        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax()
        ```
        NFz8Passing input embeddings is currently not supported for rb   rC  rD  r  rE  r$  rF  r  rG  r   re   r  r  r   r   z7All examples must have the same number of <eos> tokens.r!   
regressionsingle_label_classificationmulti_label_classificationrf  )#r   r  NotImplementedErrorr5   rC   r   eqeos_token_idr{   r9   r  r:   unique_consecutivesumr%   rv   rm   rr  problem_typerp  r8   r<   rG   r   squeezer   r   r   r[   rH  rI  r%  rJ  r   rK  )r3   r   rb   rC  rD  r  rE  r$  rF  r  rG  re  r   re   r  r  r   r`   eos_masksentence_representationrh  rg  rl  rm  r(   r(   r)   r@   -  s   M$

$

z$MvpForSequenceClassification.forward)NNNNNNNNNNNNNNN)rC   rD   rE   rO  r   r2   rB  r   r   r:   r  rH   rP  r   r   r   r   r   r@   rI   r(   r(   r4   r)   ro    sj    	

ro  c                &       s  e Zd ZddgZ fddZdd Ze																ddeej	 d	eej	 d
eej
 deej
 deej	 deej	 deej	 deeej  deej
 deej
 deej deej dee dee dee dee deeef f"ddZ  ZS )MvpForQuestionAnsweringr4  r5  c                    sB   t  | d|_|j| _t|| _t|j|j| _| 	  d S r.   )
r1   r2   rp  r2  r   r   rU   hidden_size
qa_outputsr  r   r4   r(   r)   r2     s   
z MvpForQuestionAnswering.__init__c                 C   rb  r  )r   rB  r  rA  r:  r(   r(   r)   rB    rd  z.MvpForQuestionAnswering.set_lightweight_tuningNr   rb   rC  rD  r  rE  r$  rF  start_positionsend_positionsr  rG  r   re   r  r  rg   c                 C   sz  |dur|n| j j}|	dur|
durd}| j||||||||||||||d}|d }| |}|jddd\}}|d }|d }d}|	dur|
durt|	 dkr^|	d}	t|
 dkrk|
d}
|d}|		d|}	|
	d|}
t
|d}|||	}|||
}|| d	 }|s||f|dd  }|dur|f| S |S t||||j|j|j|j|j|j|jd

S )a`  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        Fine-tuning a model for extrative question answering, and our model also supports generative question answering
        using `BartForConditionalGeneration`
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForQuestionAnswering

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForQuestionAnswering.from_pretrained("RUCAIBox/mvp")

        >>> inputs = tokenizer(
        ...     "Answer the following question: Who was Jim Henson? [SEP] Jim Henson was a nice puppet",
        ...     return_tensors="pt",
        ... )
        >>> target_start_index = torch.tensor([18])
        >>> target_end_index = torch.tensor([19])

        >>> loss = model(**inputs, start_positions=target_start_index, end_positions=target_end_index).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> answer_start_index = outputs.start_logits.argmax()
        >>> answer_end_index = outputs.end_logits.argmax()

        >>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
        >>> predict_answer = tokenizer.decode(predict_answer_tokens)
        ```
        NFrt  r   r   r!   rh   )ignore_indexr/   )
rg  start_logits
end_logitsr[   rH  rI  r%  rJ  r   rK  )r   r  r   r  r   r~  
contiguousr  rm   r   r   r   r[   rH  rI  r%  rJ  r   rK  )r3   r   rb   rC  rD  r  rE  r$  rF  r  r  r  rG  r   re   r  r  r   sequence_outputrh  r  r  
total_lossignored_indexrl  
start_lossend_lossrm  r(   r(   r)   r@     sp   S







zMvpForQuestionAnswering.forwardrM  )rC   rD   rE   rO  r2   rB  r   r   r:   rH   r  rP  r   r   r   r   r   r@   rI   r(   r(   r4   r)   r    sp    	

r  c                       s(   e Zd ZdZ fddZdd Z  ZS )MvpDecoderWrapperz
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
    c                    s   t  | t|| _d S r   )r1   r2   r  r8  r   r4   r(   r)   r2   u  s   zMvpDecoderWrapper.__init__c                 O   s   | j |i |S r   )r8  )r3   argsrs  r(   r(   r)   r@   y  s   zMvpDecoderWrapper.forward)rC   rD   rE   rF   r2   r@   rI   r(   r(   r4   r)   r  o  s    r  c                "       s  e Zd ZdgZ fddZdd Zdd Zdd	 Zd
d Zdd Z	e
														d deej deej deej deej deej deej dee deej deej dee dee dee dee deej deeef fddZ  ZS )!MvpForCausalLMrS  c                    sD   d|_ d|_t | t|| _tj|j|j	dd| _
|   d S )NTFrR   )rO   is_encoder_decoderr1   r2   r  r   r   rU   r  r   rU  r  r   r4   r(   r)   r2     s   
zMvpForCausalLM.__init__c                 C   s
   | j jjS r   r   r8  r   r:  r(   r(   r)   r;    rW  z#MvpForCausalLM.get_input_embeddingsc                 C   s   || j j_d S r   r  r=  r(   r(   r)   r?    s   z#MvpForCausalLM.set_input_embeddingsc                 C   s   || j _d S r   r   r8  )r3   r8  r(   r(   r)   set_decoder  s   zMvpForCausalLM.set_decoderc                 C   s   | j jS r   r  r:  r(   r(   r)   rX    s   zMvpForCausalLM.get_decoderc                 C   rb  r  rc  r:  r(   r(   r)   rB    rd  z%MvpForCausalLM.set_lightweight_tuningNr   rb   r   r   r  r$  r[   r  re  r   re   r  r  rf   rg   c                 C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| jj|||||||||
|||d}| |d }d}|	durNt }||d| j j	|	d}|sd|f|dd  }|durb|f| S |S t
|||j|j|j|jdS )a  
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MvpForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForCausalLM.from_pretrained("RUCAIBox/mvp", add_cross_attention=False)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> list(logits.shape)
        [1, 8, 50267]
        ```N)r   rb   r   r   r  r$  r[   r  r   re   r  r  r   r!   r   )rg  rh  r[   r`   r  r%  )r   re   r  r  r   r8  rU  r   rv   r   r   r[   r`   r  r%  )r3   r   rb   r   r   r  r$  r[   r  re  r   re   r  r  rf   r   rh  rg  rl  rm  r(   r(   r)   r@     sD   -zMvpForCausalLM.forward)NNNNNNNNNNNNNN)rC   rD   rE   rO  r2   r;  r?  r  rX  rB  r   r   r:   r  rH   r   r
   r   r   r   r   r@   rI   r(   r(   r4   r)   r  }  sl    	

r  )r  rR  r  ro  r2  r   )ArF   r   typingr   r   r:   r   torch.nnr   r   r   activationsr	   cache_utilsr
   r   r   
generationr   modeling_attn_mask_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   utils.deprecationr   configuration_mvpr   
get_loggerrC   r&  rH   rG   r*   r   r+   ModulerJ   r   r   r   r   r   r   r  r2  rR  ro  r  r  r  __all__r(   r(   r(   r)   <module>   sf   $	
 "Cw 5    1 - *v