o
    eiU                    @   sr  d Z ddlZddlZddlmZ ddlZddlmZmZ ddlm	Z	 ddl
mZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ eeZ dDddZ!dd Z"dDddZ#dd Z$eeddG dd deZ%eeddG dd deZ&eed dG d!d" d"eZ'eed dG d#d$ d$eZ(eG d%d& d&eZ)G d'd( d(ej*Z+G d)d* d*ej,Z-G d+d, d,ej,Z.G d-d. d.ej,Z/G d/d0 d0eZ0G d1d2 d2eZ1ed3dG d4d5 d5e)Z2ed6dG d7d8 d8e)Z3eG d9d: d:e)Z4ed;dG d<d= d=e)eZ5ed>dG d?d@ d@e)eZ6G dAdB dBe)Z7g dCZ8dS )EzRPyTorch ProphetNet model, ported from ProphetNet repo(fairsequery_states version).    N)	dataclass)Tensornn)	LayerNorm   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)GradientCheckpointingLayer)BaseModelOutput)PreTrainedModel)ModelOutputauto_docstringlogging   )ProphetNetConfigFc                 C   s,   |rt jj|  |dS t jj| |tjdS )Ndimr   dtype)r   
functionalsoftmaxfloattorchfloat32)hidden_stater   
onnx_trace r   p/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/prophetnet/modeling_prophetnet.pyr   %   s   r   c                 C   s   t j|| | f||dt |j }|  }t|D ]}|| jddd || | d  qd|dddddf< t j	||gddS )	z@
    This function computes the bias for the predict stream
    )devicer   r   F)wrapr   N   r   )
r   onesfinfomindetachclonerangefill_diagonal_triu_cat)sequence_lengthngramr!   r   
left_blockright_block
stream_idxr   r   r    ngram_attention_bias,   s    r2   c           	      C   s   | }d}|r | d } |t |t | |   }t |}n	t |t |}| d }t ||}|t | | t||  | |   }t 	|t 
|| d   }|t || | }|S )zo
    This function computes individual parts of the relative position buckets. For more detail, see paper.
    r   r#   r   )r   lt
zeros_likeintabsmaxlogr   mathr&   	ones_likewhere)	num_bucketsmax_distancerelative_positionsis_bidirectionalinv_relative_positionsrel_positions_bucket	max_exactis_smallval_if_larger   r   r    compute_relative_buckets=   s(   rE   c                 C   s   | dd|dd}|| d }tj|d |fdd d}|d|dd}|| d }t| ||dd}t| ||dd}||fS )zm
    This function computes both main and predict relative position buckets. For more detail, see paper.
    r   r   F)r?   )	unsqueezerepeatsizer   r,   rE   )r<   r=   position_idsmain_stream_relative_positions$predicting_stream_relative_positionsmain_relative_position_buckets!predict_relative_position_bucketsr   r   r    #compute_all_stream_relative_bucketsX   s   rO   zF
    Base class for sequence-to-sequence language models outputs.
    )custom_introc                   @   s  e Zd ZU dZdZejdB ed< dZejdB ed< dZ	ejdB ed< dZ
edB ed< dZeej dB ed< dZeej dB ed< dZeej dB ed	< dZeej dB ed
< dZeej dB ed< dZejdB ed< dZeej dB ed< dZeej dB ed< dS )ProphetNetSeq2SeqLMOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the main stream language modeling head (scores for each vocabulary token before
        SoftMax).
    logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
        SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the self-attention heads.
    encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    Nlosslogitslogits_ngrampast_key_valuesdecoder_hidden_statesdecoder_ngram_hidden_statesdecoder_attentionsdecoder_ngram_attentionscross_attentionsencoder_last_hidden_stateencoder_hidden_statesencoder_attentions)__name__
__module____qualname____doc__rR   r   FloatTensor__annotations__rS   rT   rU   r   rV   tuplerW   rX   rY   rZ   r[   r\   r]   r   r   r   r    rQ   o   s   
 rQ   z
    Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
    decoding.
    c                   @   s   e Zd ZU dZejed< dZejdB ed< dZe	dB ed< dZ
eej dB ed< dZeej dB ed< dZeej dB ed< dZeej dB ed	< dZeej dB ed
< dZejdB ed< dZeej dB ed< dZeej dB ed< dS )ProphetNetSeq2SeqModelOutputa  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
        Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size,ngram * decoder_sequence_length, config.vocab_size)`, *optional*):
        Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    last_hidden_stateNlast_hidden_state_ngramrU   rV   rW   rX   rY   rZ   r[   r\   r]   )r^   r_   r`   ra   r   rb   rc   rg   rU   r   rV   rd   rW   rX   rY   rZ   r[   r\   r]   r   r   r   r    re      s   
 
re   zs
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
    c                   @   s   e Zd ZU dZejed< dZejdB ed< dZe	dB ed< dZ
eej dB ed< dZeej dB ed< dZeej dB ed< dZeej dB ed	< dZeej dB ed
< dS )ProphetNetDecoderModelOutputa  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
        Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    hidden_states_ngram (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    rf   Nrg   rU   hidden_stateshidden_states_ngram
attentionsngram_attentionsrZ   )r^   r_   r`   ra   r   rb   rc   rg   rU   r   ri   rd   rj   rk   rl   rZ   r   r   r   r    rh      s   
 
rh   c                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	ejdB ed< dZ
edB ed< dZeej dB ed< dZeej dB ed< dZeej dB ed	< dZeej dB ed
< dZeej dB ed< dS )ProphetNetDecoderLMOutputa	  
    ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the main stream language modeling head (scores for each vocabulary token before
        SoftMax).
    logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
        SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    hidden_states_ngram (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    NrR   rS   rT   rU   ri   rj   rk   rl   rZ   )r^   r_   r`   ra   rR   r   rb   rc   rS   rT   rU   r   ri   rd   rj   rk   rl   rZ   r   r   r   r    rm      s   
 "rm   c                   @   s&   e Zd ZU eed< dZdZdd ZdS )ProphetNetPreTrainedModelconfig
prophetnetTc                 C   s   | j j}| j j}|d usJ d||j}|dd df  |ddd f< ||d< |d us2J d||dk| t|dk	 sGJ d	|S )
Nzself.model.config.decoder_start_token_id has to be defined. In ProphetNet it is usually set to the pad_token_id. See ProphetNet docs for more information.rF   r   ).r   z1self.model.config.pad_token_id has to be defined.r   z8Verify that `shifted_input_ids` has only positive values)
ro   decoder_start_token_idpad_token_id	new_zerosshaper(   masked_fill_r   allitem)self	input_idsrr   rs   shifted_input_idsr   r   r    _shift_right:  s   
 z&ProphetNetPreTrainedModel._shift_rightN)r^   r_   r`   r   rc   base_model_prefixsupports_gradient_checkpointingr|   r   r   r   r    rn   4  s
   
 rn   c                       sD   e Zd ZdZdeddf fddZd fdd	Z fd	d
Z  ZS )ProphetNetPositionalEmbeddingsa  
    This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting
    based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to
    the forward function.
    ro   returnNc                    s"   |j | _t |j |j|j d S N)max_position_embeddings
max_lengthsuper__init__hidden_sizers   ry   ro   	__class__r   r    r   X  s   z'ProphetNetPositionalEmbeddings.__init__c                    s   |d u s| j d u sJ d|d u r]|d ur6| dkr6| }|d | }tjdtj|dt| j |  }n'|d u rCtj|tj|d}tj|dd||  | j  }|d| j	d }t
 ||fS )NzCIf position_ids is pre-computed then padding_idx should not be set.r   r   )r   r   r   r!   r   )padding_idxget_seq_lengthr   r$   longr5   cumsumtype_asclampr   r   forward)ry   inputs_shaper!   attention_maskrU   rJ   prev_num_input_idsnum_input_idsr   r   r    r   \  s"   z&ProphetNetPositionalEmbeddings.forwardc                    s   t  |S r   )r   r   )ry   rJ   r   r   r    _forwardx  s   z'ProphetNetPositionalEmbeddings._forward)NNN)	r^   r_   r`   ra   r   r   r   r   __classcell__r   r   r   r    r   Q  s
    r   c                       s   e Zd ZdZddedededB f fddZ					dd	edB d
edB dedB de	dB de
jdB deeedB f fddZ  ZS )ProphetNetAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNro   num_attn_heads	layer_idxc                    s   t    |j}|j| _|j| _|| _|| | _|| _| j| |ks&J dt	||| _
t	||| _t	||| _t	||| _d S )Nzw`config.hidden_size` must be divisible by `config.num_encoder_attention_heads` and `config.num_decoder_attention_heads`)r   r   r   attention_dropoutdropoutr   head_dimr   r   Linearkey_proj
value_proj
query_projout_proj)ry   ro   r   r   r   r   r   r    r     s   

zProphetNetAttention.__init__Fkey_value_statesr   rU   output_attentionscache_positionr   c                 C   s  |  \}}}	|d u}
t|  |||	gks%J d|||	f d|   | || jd  }d}|d urMt|trK|j| j}|
rG|j	}n|j
}n|}|
rQ|n|}|
rj|d urj|rj|j| j j}|j| j j}nJ| |}| |}||d| j| jdd}||d| j| jdd}|d ur|
s|nd }|||| jd|i\}}|
rt|trd	|j| j< |||| j| jdd}| d}td
||dd}|| j||f}|  |krtd| d|   |d ur| dkrd }|| jd|f}|d ur|  |krtd| d|   |d ur|| }|r%|}nd }tjj|dd}tjj|| j| jd}td
||}|| j|| jf}|  |kr\td| d|   |dd|||	}| |}tjj|| j| jd}||fS )Nz Size of hidden states should be z	, but is       ?FrF   r   r#   r   Tzbsij,bsjk->bsikr   z#Attention weights should have size r   z Attention mask should have size r   ptrainingz `attn_output` should have shape , but is of shape ) rI   listr   r   
isinstancer
   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   viewr   	transposeupdater   einsum
ValueErrorr   r   r   r   r   r   r   reshaper   )ry   ri   r   r   rU   r   r   
batch_sizetgt_lenr   is_cross_attentionquery_statesr   curr_past_key_valuescurrent_states
key_statesvalue_statessrc_lenattn_weightsexpected_shapeattn_weights_reshaped
attn_probsattn_outputr   r   r    r     s~   	






zProphetNetAttention.forwardr   )NNNFN)r^   r_   r`   ra   r   r5   r   r   r   boolr   rd   r   r   r   r   r   r    r   |  s*     r   c                       s2   e Zd ZdZdedef fddZdd Z  ZS )ProphetNetFeedForwardzm
    This is the residual two feed-forward layer block based on the original Transformer implementation.
    ro   ffn_dimc                    sJ   t    t|j | _t|j|| _t||j| _	|j
| _
|j| _d S r   )r   r   r   activation_functionactivation_fnr   r   r   intermediateoutputactivation_dropoutr   )ry   ro   r   r   r   r    r     s   
zProphetNetFeedForward.__init__c                 C   sN   |  |}| |}tjj|| j| jd}| |}tjj|| j| jd}|S )Nr   )r   r   r   r   r   r   r   r   )ry   ri   r   r   r    r     s   


zProphetNetFeedForward.forward)	r^   r_   r`   ra   r   r5   r   r   r   r   r   r   r    r     s    r   c                       sf   e Zd Zddef fddZdd Zdd Z							dd	edB fd
dZdd Z	dd Z
  ZS )ProphetNetNgramSelfAttentionNro   c                    s   t    |j| _|j| _|j| _|j| _|j| _|j| _|j| j | _	|j
| _
|| _| j	| j |jks8J dt|j|j| _t|j|j| _t|j|j| _t|j|j| _t|j| j| j | _d| _d S )Nz6config.hidden_size must be divisible by num_attn_headsF)r   r   r   r<   relative_max_distancenum_decoder_attention_headsr   r   r   r   r.   r   r   r   r   r   r   r   relative_pos_embeddingsr   ry   ro   r   r   r   r    r     s&   

z%ProphetNetNgramSelfAttention.__init__c                 C   s    | ||| j| jdd S Nr   r#   )r   r   r   r   
contiguous)ry   tensorseq_lenr   r   r   r    _shape+  s    z#ProphetNetNgramSelfAttention._shapec                 C   s
   d| _ d S )NT)r   ry   r   r   r    prepare_for_onnx_export_.     
z5ProphetNetNgramSelfAttention.prepare_for_onnx_export_rU   c	           )         s  |  \}	}
}t|  |	|
|gks J d|	|
|f d|j | |}| |}| |}|| jd  }| ||
|	}| |d|	}| |d|	}|	| jd| jf}|j	| }|j	| }|j	| }|j
d| j dd}|j
d| j dd}|j
d| j dd}|j
d| j dd}|d |dd  }}|d |dd  }}|d |dd   }|d |dd  }|d urt|tr|j}n|}| | jd	|i\ |
d| j  }td
| dd}| ||||}|| }|d ur|| }t|d| jd|}tjj|| j| jd}td
|}|dd	|	d||}| |}t|d|	| j| j|| j} t fdd|D d}!tj|dd}"t fdd|D d}#td| |!f}$| !|"|$||}%|$|% }$|d ur|"ddddd}|#|$j$}|$| }$t|$d| jd|$}&tjj|&| j| jd}&td|&|#ddf}'|'dd}'|'	|	| j||}'| |'}'t ||'gd|	d|}(||	| j|d}tjj|(| j| jd}(|(||&fS )Nz#`hidden_states` should be of shape r   r   rF   r   r   r#   r   r   zbntc,bncs->bntsr   )r   r   r   c                    s   g | ]
}t  |gd qS r#   )r   r,   ).0key)main_key_statesr   r    
<listcomp>  s    z8ProphetNetNgramSelfAttention.forward.<locals>.<listcomp>c                    s"   g | ]}t  |gd d qS r   )r   r,   rG   )r   v_p)main_value_statesr   r    r     s   " zbnhtc,bnhsc->bnhts   zbnhts,bnhsc->bnhtc)%rI   r   ru   r   r   r   r   r   r   r   chunkr.   r   r
   r   r   r   r   r   r    get_main_relative_pos_embeddingsr   r   r   r   r   r   r   r   r   stackr   r,   #get_predict_relative_pos_embeddingspermutetor   ))ry   ri   rU   r   extended_predict_attention_maskrM   rN   rJ   r   r   ngram_sequence_lengthr   r   r   r   
proj_shapehidden_states_listquery_states_listkey_states_listvalue_states_listmain_hidden_stateshidden_states_predict_listmain_query_statespredict_query_states_listpredict_key_states_listpredict_value_states_listr   r-   main_attn_weightsmain_relative_pos_embeddingsmain_attn_probsmain_attn_outputpredict_query_statespredict_key_statespredict_hidden_statespredict_value_statespredict_attn_weightspredict_relative_pos_embeddingspredict_attn_probspredict_attn_outputr   r   )r   r   r    r   1  s   











z$ProphetNetNgramSelfAttention.forwardc                 C   sH  |j \}}}}|||||}|d u rK|j d d \}}	td|j d d dd||	d|j}
|
|d||	d }
t| j	| j
|
d}| |}||j d d | j	| jf }|dddd}||j d d d }|d| jd}|d|j d }| }|d|d}tj|d|d}||||d}|S )	Nr#   r   rF   r   Fr   )rF   r   index)ru   r   r   arangerG   rH   r   r!   rE   r<   r   r   r   r   r   r   rI   gather)ry   ri   r   rJ   rM   r   r   r   r   r-   r>   rel_pos_embeddingsr   r   r   r    r     s:   


z=ProphetNetNgramSelfAttention.get_main_relative_pos_embeddingsc                 C   sH  |j dd \}}|d u rJ|j d }|d d |d ks J dtd|dd||d|j}||d||d }t| j| j	|d}|
dd}| |}	|	|j d d | j| jf }	|	ddddd}	|	d| j}	|d}|| jd| jd}|d|d }tj|	d|d	}
|
|| j| j|d}
|
S )
Nr   r#   rF   r   zb`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)Fr   r   r  )ru   r   r  rG   rH   r   r!   rE   r<   r   r   r   r   r   r   r   r.   rI   r   r  )ry   ri   r   rJ   rN   r   r-   key_sequence_lengthr>   r  r  r   r   r    r     sN   





z@ProphetNetNgramSelfAttention.get_predict_relative_pos_embeddingsr   )NNNNNNN)r^   r_   r`   r   r   r   r   r   r   r   r   r   r   r   r   r    r     s"    
 $-r   c                       s8   e Zd ZdZdef fddZ	d	defddZ  ZS )
ProphetNetEncoderLayerz&
    Encoder block for Prophetnet
    ro   c                    sB   t    t||j| _t|j| _t||j	| _
t|j| _d S r   )r   r   r   num_encoder_attention_heads	self_attnr   r   self_attn_layer_normr   encoder_ffn_dimfeed_forwardfeed_forward_layer_normr   r   r   r    r   B  s
   
zProphetNetEncoderLayer.__init__Fr   c                 C   sR   | j |||d\}}| || }| |}| || }|f}|r'||f7 }|S )N)ri   r   r   )r  r  r  r  )ry   ri   r   r   attention_outputr   feed_forward_outputoutputsr   r   r    r   L  s   


zProphetNetEncoderLayer.forwardF)	r^   r_   r`   ra   r   r   r   r   r   r   r   r   r    r
  =  s    r
  c                       sd   e Zd ZdZddef fddZ											ddedB d	edB d
ejdB fddZ	  Z
S )ProphetNetDecoderLayerz&
    Decoder block for Prophetnet
    Nro   c                    sf   t    t||d| _t|j| _|jr$t||j	|d| _
t|j| _t||j| _t|j| _d S )Nr   )r   r   r   r  r   r   r  add_cross_attentionr   r   
cross_attncross_attn_layer_normr   decoder_ffn_dimr  r  r   r   r   r    r   k  s   
zProphetNetDecoderLayer.__init__TF	use_cacher   r   c              	   C   s   | j ||	|||||d\}}}| || }d }|d ur/| j||||	|d\}}| || }| |}| || }|f}|rG||||f7 }|S )N)ri   rU   r   r   rM   rN   rJ   )ri   r   r   rU   r   )r  r  r  r  r  r  )ry   ri   r   r\   encoder_attn_maskr   rM   rN   rJ   rU   r  r   r   ngram_attention_outputself_attn_weightsself_attn_weights_ngramcross_attn_weightsr  r  r  r   r   r    r   z  s4   	

zProphetNetDecoderLayer.forwardr   )NNNNNNNNTFN)r^   r_   r`   ra   r   r   r   r   r   r   r   r   r   r   r    r  f  s*    r  z=
    The standalone encoder part of the ProphetNetModel.
    c                       s   e Zd Zdef fddZdd Zdd Ze						dd	ej	dB d
ej	dB dej	dB de
dB de
dB de
dB deeB fddZ  ZS )ProphetNetEncoderro   c                    sl   t    tj j j jd| _t | _	t
 j| _t fddt jD | _d| _|   d S )Nr   c                    s   g | ]}t  qS r   )r
  )r   _ro   r   r    r     s    z.ProphetNetEncoder.__init__.<locals>.<listcomp>F)r   r   r   	Embedding
vocab_sizer   rs   word_embeddingsr   position_embeddingsr   embeddings_layer_norm
ModuleListr)   num_encoder_layersr   gradient_checkpointing	post_initr   r   r$  r    r     s   
 zProphetNetEncoder.__init__c                 C      | j S r   r'  r   r   r   r    get_input_embeddings     z&ProphetNetEncoder.get_input_embeddingsc                 C   
   || _ d S r   r/  ry   valuer   r   r    set_input_embeddings  r   z&ProphetNetEncoder.set_input_embeddingsNrz   r   inputs_embedsr   output_hidden_statesreturn_dictr   c                 K   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r*|du r*td|dur6|dur6td|durC|du rC| |}|durkd|ddddddf d| j jdd t	| j
j }||j
}nd}| |jdd |j\}	}
||	 }| |}tjj|| j j| jd}|rdnd}|rdnd}t| jD ]\}}|r||f }||||d	}|d
 }|r||d f }q|r||f }|stdd |||fD S t|||dS )a	  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetEncoder
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetEncoder.from_pretrained("patrickvonplaten/prophetnet-large-uncased-standalone")
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        ```Nz3Either input_ids or inputs_embeds has to be passed.z2Make sure to only pass input_ids or inputs_embeds.      ?r   r#   r   r   )r   r   r   c                 s       | ]	}|d ur|V  qd S r   r   r   vr   r   r    	<genexpr>      z,ProphetNetEncoder.forward.<locals>.<genexpr>)rf   ri   rk   )ro   r   r7  use_return_dictr   r'  rH   r  r   r%   r   r&   r   r(  ru   r!   r)  r   r   r   r   	enumerater   rd   r   )ry   rz   r   r6  r   r7  r8  kwargsextended_attention_maskr(  rJ   ri   r\   all_attentionsidxencoder_layerlayer_outputsr   r   r    r     sR   
*


zProphetNetEncoder.forward)NNNNNN)r^   r_   r`   r   r   r0  r5  r   r   r   r   rd   r   r   r   r   r   r   r    r!    s4    	r!  z=
    The standalone decoder part of the ProphetNetModel.
    c                       s   e Zd Zdef fddZdd Zdd Ze											dd	ej	dB d
ej	dB dej	dB dej	dB de
dB dej	dB dedB dedB dedB dedB dej	dB deeB fddZdd Zdd Zdd Z  ZS )ProphetNetDecoderro   c                    s   t     j| _ j| _ j| _ j| _ j| _tj	 j
 j jd| _t | _t	| j jd | _t fddt jD | _t j| _d| _|   d S )Nr"  c                    s   g | ]}t  |d qS )r  )r  )r   ir$  r   r    r   ,  s    z.ProphetNetDecoder.__init__.<locals>.<listcomp>F)r   r   r.   r<   r   r   r   max_target_positionsr   r%  r&  r   rs   r'  r   r(  ngram_embeddingsr*  r)   num_decoder_layersr   r   r)  r,  r-  r   r   r$  r    r     s   
zProphetNetDecoder.__init__c                 C   r.  r   r/  r   r   r   r    r0  4  r1  z&ProphetNetDecoder.get_input_embeddingsc                 C   r2  r   r/  r3  r   r   r    r5  7  r   z&ProphetNetDecoder.set_input_embeddingsNrz   r   r\   encoder_attention_maskrU   r6  r  r   r7  r8  r   r   c           "         s`  |dur|n| j j}|dur|n| j j}|	dur|	n| j j}	|
dur$|
n| j j}
|du r4|du r4td|dur@|dur@td|durM|du rM| |}|jdd \ }| jre| j	re|ret
d d}|r|du r|duss| j jrtt| j dt| j dnt| j d}|dur| nd}| j |f|j|d	\}}|dkrd
\}}n| |\}}| j|d || }| jj|dkr|ddksJ d fddt| jD }d}d}nfddt| jD }| ||}| ||}|dur!d|ddddddf d| j jdd t| jj  }|!|j}nd}t"|g| d}| j#r5| #|}t$j%j&|| j&| j	d}|	rEdnd}|	rS| j jdkrSdnd}|rZdnd}|radnd}|rm| j j'rmdnd}t(| j)D ]Z\}}|	r||ddd|f f7 }| j jdkr||dd|df f7 }|||||||||||||d}|d }|r||d f7 }||d f7 }| j j'r||d f7 }qt|	r||ddd|f f7 }| j jdkr||dd|df f7 }|ddd|f } | j jdkr|dd|df nd}!|
s$t*dd | |!||||||fD S t+| |!||||||dS )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetDecoder
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetDecoder.from_pretrained("microsoft/prophetnet-large-uncased", add_cross_attention=False)
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        ```NzGEither `decoder_input_ids` or `decoder_inputs_embeds` has to be passed.zFMake sure to only pass `decoder_input_ids` or `decoder_inputs_embeds`.r#   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr$  r   )r!   rU   )NNr   zOAt the moment `use_cache` is only supported for `decoder_input_ids` of length 1c                    s&   g | ]}|d      d d qS r   )rH   r   r.   r   rJ  predicting_stream_pos_embedr   r    r     s    z-ProphetNetDecoder.forward.<locals>.<listcomp>c                    s   g | ]
} |d    qS rM  r   rN  )rJ  rP  r   r    r     s    r9  r   r   )	r  r   rM   rN   rJ   rU   r  r   r   r   c                 s   r:  r   r   r;  r   r   r    r=    s    
z,ProphetNetDecoder.forward.<locals>.<genexpr>)rf   rg   rU   ri   rj   rk   rl   rZ   ),ro   r  r   r7  r?  r   r'  ru   r,  r   loggerwarning_onceis_encoder_decoderr
   r	   r   r(  r!   !compute_buffered_relative_bucketsr   rJ  weightrI   r)   r.   prepare_attention_maskprepare_predict_attention_maskrH   r   r   r%   r   r&   r   r,   r)  r   r   r   r  r@  r   rd   rh   )"ry   rz   r   r\   rL  rU   r6  r  r   r7  r8  r   rA  r-   past_key_values_lengthmain_stream_pos_embedrJ   rM   rN   ri   ngram_hidden_statesrB  r   extended_encoder_attention_maskall_main_stream_hidden_statesall_ngram_stream_hidden_statesall_main_stream_attnsall_ngram_stream_attnsall_cross_attnsrD  decoder_layerrF  rf   rg   r   rO  r    r   :  s   




*

&zProphetNetDecoder.forwardc              	   C   s   |j \}}td| j|jdd}t| j| j	|\}}|d d d |d |f |dd}t
|d d d |d |f |d d d || j| j| f gd|dd}||fS r   )ru   r   r  rI  r   r!   rH   rO   r<   r   r,   )ry   rJ   r   r-   main_relative_bucketspredict_relative_bucketsr   r   r    rT    s"   

$

z3ProphetNetDecoder.compute_buffered_relative_bucketsc                 C   s   |j d d \}}tj||ft|jj|j|jd}t|d}|d |d |f d d d d d d f || j	j
f|j  }|d ur]d|d d d d d d f  t| jj }|| }n|}||jS )Nr#   r   r   r9  )ru   r   fullr%   r   r&   r!   triuexpandro   r   r   )ry   ri   r   r   
seq_lengthcausal_maskextended_causal_maskrB  r   r   r    rV    s    (*
z(ProphetNetDecoder.prepare_attention_maskc           	      C   s   |j d d \}}t| j| j|j|j}tj|d d d |d |f |d d d || j| j| f gdd}|d d d d d d d d f || j	j
f|j  }|d urd|d d d d d d d f  t| jj }||| j	j
| j||f}tj|t|gdd}|| }n|}||jS )Nr#   rF   r   r9  )ru   r2   rI  r.   r!   r   r   r,   rf  ro   r   r%   r&   r4   r   )	ry   ri   r   r   rg  predict_causal_maskextended_predict_causal_maskrB  r   r   r   r    rW  &  s4   	,
z0ProphetNetDecoder.prepare_predict_attention_maskNNNNNNNNNNN)r^   r_   r`   r   r   r0  r5  r   r   r   r   r   rd   rh   r   rT  rV  rW  r   r   r   r   r    rG    sZ    	
 >rG  c                       s   e Zd ZdddZdef fddZdd Zdd	 Ze	
	
	
	
	
	
	
	
	
	
	
	
	
dde	j
d
B de	j
d
B de	j
d
B de	jd
B ded
B ded
B de	j
d
B de	j
d
B ded
B ded
B ded
B ded
B de	j
d
B deeB fddZ  ZS )ProphetNetModelword_embeddings.weight)zencoder.word_embeddings.weightdecoder.word_embeddings.weightro   c                    sd   t  | tj|j|j|jd| _t	|}d|_
t|| _t	|}d|_t|| _|   d S )Nr"  FT)r   r   r   r%  r&  r   rs   r'  copydeepcopyr  r!  encoder
is_decoderrG  decoderr-  )ry   ro   encoder_configdecoder_configr   r   r    r   Q  s   



zProphetNetModel.__init__c                 C   r.  r   r/  r   r   r   r    r0  `  r1  z$ProphetNetModel.get_input_embeddingsc                 C   s   || _ | j | j_ | j | j_ d S r   )r'  rr  rt  r3  r   r   r    r5  c  s   
z$ProphetNetModel.set_input_embeddingsNrz   r   decoder_input_idsdecoder_attention_maskencoder_outputsrU   r6  decoder_inputs_embedsr  r   r7  r8  r   r   c                 K   s   |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}|dur$|n| j j}|du r7| j||||
||d}| j|||d ||||
||	||d}|sO|| S t|j|j	|j
|j|j|j|j|j|j|j|jdS )a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            ProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetModel

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetModel.from_pretrained("microsoft/prophetnet-large-uncased")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

        >>> last_hidden_states = outputs.last_hidden_state  # main stream hidden states
        >>> last_hidden_states_ngram = outputs.last_hidden_state_ngram  # predict hidden states
        ```N)rz   r   r6  r   r7  r8  r   )rz   r   r\   rL  rU   r6  r   r7  r  r8  r   )rf   rg   rU   rV   rW   rX   rY   rZ   r[   r\   r]   )ro   r  r   r7  r?  rr  rt  re   rf   rg   rU   ri   rj   rk   rl   rZ   )ry   rz   r   rw  rx  ry  rU   r6  rz  r  r   r7  r8  r   rA  decoder_outputsr   r   r    r   h  sT   3
zProphetNetModel.forward)NNNNNNNNNNNNN)r^   r_   r`   _tied_weights_keysr   r   r0  r5  r   r   r   
BoolTensorrd   r   r   re   r   r   r   r   r   r    rm  J  sd    	
rm  zh
    The ProphetNet Model with a language modeling head. Can be used for sequence generation tasks.
    c                !       s  e Zd ZddiZdef fddZdd Ze														d!d	ej	dB d
ej	dB dej	dB dej
dB dej	dB dedB dej	dB dej	dB dej	dB dedB dedB dedB dedB dej	dB deeB fddZd"ddZdej	fddZd# fdd 	Z  ZS )$"ProphetNetForConditionalGenerationlm_head.weight!prophetnet.word_embeddings.weightro   c                    sH   t  | t|| _|j| _|j| _tj|j	|j
dd| _|   d S )NFbias)r   r   rm  rp   rs   r   disable_ngram_lossr   r   r   r&  lm_headr-  r   r   r   r    r     s   
z+ProphetNetForConditionalGeneration.__init__c                 C   s   | j jS r   )rp   r'  r   r   r   r    r0    s   z7ProphetNetForConditionalGeneration.get_input_embeddingsNrz   r   rw  rx  ry  rU   r6  rz  labelsr  r   r7  r8  r   r   c                 K   sr  |dur|n| j j}|	dur|du r|du r| |	}| j|||||||||
||||d}|dur4|jn|jdd \}}|d || j j|d}| |}|dddf }| j jdkrf|ddddf nd}| sp|	 }d}|	dur|| 
||	}|stdd ||fD }|dur|f| |dd  S ||dd  S t||||j|j|j|j|j|j|j|j|jd	S )
a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            ProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

        >>> logits_next_token = outputs.logits  # logits to predict next token as usual
        >>> logits_ngram_next_tokens = outputs.logits_ngram  # logits to predict 2nd, 3rd, ... next tokens
        ```N)rz   r   rw  rx  ry  rU   r6  rz  r  r   r7  r8  r   r#   r   rF   r   c                 s   r:  r   r   r;  r   r   r    r=  D  r>  z=ProphetNetForConditionalGeneration.forward.<locals>.<genexpr>)rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   )ro   r?  r|   rp   ru   r   r.   r  is_contiguousr   _compute_lossrd   rQ   rU   rV   rW   rX   rY   rZ   r[   r\   r]   )ry   rz   r   rw  rx  ry  rU   r6  rz  r  r  r   r7  r8  r   rA  r  r   r-   predicting_streamspredict_logitsrS   rT   rR   
all_logitsr   r   r    r     s\   8

$.z*ProphetNetForConditionalGeneration.forwardrq   c                 C     | | jj|d|d|}t| jjD ]}|dkr#| jr# n|||d d d d f< q|dd }t	j
j|d|ddtjd}t	j
j||ddd}| jjdkr|jddd	 }||d}	||	 }| }| jj|d }
d
| jj | |
|  }|S Nr   r   rF   r   mean)	reductiong        T)r   keepdimr9  rt   ro   r.   rI   fill_r)   r  r   r   r   r   log_softmaxr   r   r   nll_lossepssumner  ry   rS   r  ignore_indexexpend_targetsrH  lprobsrR   smooth_lossnon_masked_tokenseps_ir   r   r    r  V  (   $z0ProphetNetForConditionalGeneration._compute_lossc                 C   s
   |  |S r   )r|   )ry   r  r   r   r    %prepare_decoder_input_ids_from_labelsr  r   zHProphetNetForConditionalGeneration.prepare_decoder_input_ids_from_labelsc                    s   |d u r| j jS t j|dS )N)modality)rp   rr  r   get_encoder)ry   r  r   r   r    r  u  s   z.ProphetNetForConditionalGeneration.get_encoder)NNNNNNNNNNNNNNrq   r   )r^   r_   r`   r|  r   r   r0  r   r   r   r}  r   r   rd   rQ   r   r  r  r  r   r   r   r   r    r~    sl    	

qr~  zt
    The standalone decoder part of the ProphetNetModel with a lm head on top. The model can be used for causal
    c                       s   e Zd ZdddZdef fddZdd Zdd	 Ze	
	
	
	
	
	
	
	
	
	
	
dde	j
d
B de	j
d
B de	j
d
B de	j
d
B ded
B de	j
d
B de	j
d
B ded
B ded
B ded
B ded
B deeB fddZd ddZ	
	
	
	d! fdd	Z  ZS )"ProphetNetForCausalLMr  )r  z)prophetnet.decoder.word_embeddings.weightro   c                    s^   t |}d|_d|_t | t|| _|j| _	|j
| _
tj|j|jdd| _|   d S )NTFr  )rp  rq  rs  rS  r   r   ProphetNetDecoderWrapperrp   rs   r   r  r   r   r   r&  r  r-  r   r   r   r    r     s   

zProphetNetForCausalLM.__init__c                 C   s
   | j jjS r   rp   rt  r'  r   r   r   r    r0    r   z*ProphetNetForCausalLM.get_input_embeddingsc                 C   s   || j j_d S r   r  r3  r   r   r    r5    s   z*ProphetNetForCausalLM.set_input_embeddingsNrz   r   r\   rL  rU   r6  r  r  r   r7  r8  r   c                 K   s0  |dur|n| j j}| jj||||||||	|
|d
}|dur!|jn|jdd \}}|d || j j|d}| |}|dddf }| j jdkrS|ddddf nd}d}|dura| ||}|st	dd ||fD }|dur}|f| |dd  S ||dd  S t
||||j|j|j|j|j|jd		S )
a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetForCausalLM
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetForCausalLM.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits

        >>> # Model can also be used with EncoderDecoder framework
        >>> from transformers import BertTokenizer, EncoderDecoderModel, AutoTokenizer
        >>> import torch

        >>> tokenizer_enc = BertTokenizer.from_pretrained("google-bert/bert-large-uncased")
        >>> tokenizer_dec = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        ...     "google-bert/bert-large-uncased", "microsoft/prophetnet-large-uncased"
        ... )

        >>> ARTICLE = (
        ...     "the us state department said wednesday it had received no "
        ...     "formal word from bolivia that it was expelling the us ambassador there "
        ...     "but said the charges made against him are `` baseless ."
        ... )
        >>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids
        >>> labels = tokenizer_dec(
        ...     "us rejects charges against its ambassador in bolivia", return_tensors="pt"
        ... ).input_ids
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=labels[:, :-1], labels=labels[:, 1:])

        >>> loss = outputs.loss
        ```N)
rz   r   r\   rL  rU   r6  r  r   r7  r8  r#   r   rF   r   c                 s   r:  r   r   r;  r   r   r    r=    r>  z0ProphetNetForCausalLM.forward.<locals>.<genexpr>)	rR   rS   rT   rU   ri   rj   rk   rl   rZ   )ro   r?  rp   rt  ru   r   r.   r  r  rd   rm   rU   ri   rj   rk   rl   rZ   )ry   rz   r   r\   rL  rU   r6  r  r  r   r7  r8  rA  r  r   r-   r  r  rS   rT   rR   r  r   r   r    r     sF   ; 
$.zProphetNetForCausalLM.forwardrq   c                 C   r  r  r  r  r   r   r    r    r  z#ProphetNetForCausalLM._compute_lossFc                    s.   t  j|f||||d|}|dd  |S )N)rU   r   r  is_first_iterationr   )r   prepare_inputs_for_generationpop)ry   rz   rU   r   r  r  rA  model_inputsr   r   r    r     s   	z3ProphetNetForCausalLM.prepare_inputs_for_generationrl  r  )NNNF)r^   r_   r`   r|  r   r   r0  r5  r   r   r   r   r   rd   rm   r   r  r  r   r   r   r   r    r  |  sd    	

fr  c                       s6   e Zd ZdZddiZdef fddZdd Z  ZS )	r  z
    This is a wrapper class, so that [`ProphetNetForCausalLM`] can correctly be loaded from pretrained prophetnet
    classes.
    ro  rn  ro   c                    s:   t  | tj|j|j|jd| _t|| _	| 
  d S )Nr"  )r   r   r   r%  r&  r   rs   r'  rG  rt  r-  r   r   r   r    r   C  s   
z!ProphetNetDecoderWrapper.__init__c                 O   s   | j |i |S r   )rt  )ry   argsrA  r   r   r    r   L  s   z ProphetNetDecoderWrapper.forward)	r^   r_   r`   ra   r|  r   r   r   r   r   r   r   r    r  9  s    	r  )rG  r!  r  r~  rm  rn   r  )9ra   rp  r9   dataclassesr   r   r   r   torch.nnr   activationsr   cache_utilsr   r	   r
   
generationr   modeling_layersr   modeling_outputsr   modeling_utilsr   utilsr   r   r   configuration_prophetnetr   
get_loggerr^   rQ  r   r2   rE   rO   rQ   re   rh   rm   rn   r%  r   Moduler   r   r   r
  r  r!  rG  rm  r~  r  r  __all__r   r   r   r    <module>   s   


-+&.+y  3)Gf  /  , 9