o
    wilz                    @   s~  d Z ddlZddlZddlZddlmZ ddlmZmZ ddl	Z	ddl
Z	ddl	mZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ee Z!dDddZ"dd Z#dDddZ$dd Z%eeddG dd deZ&eeddG dd deZ'eed dG d!d" d"eZ(eed dG d#d$ d$eZ)eG d%d& d&eZ*G d'd( d(ej+Z,G d)d* d*ej-Z.G d+d, d,ej-Z/G d-d. d.ej-Z0G d/d0 d0eZ1G d1d2 d2eZ2ed3dG d4d5 d5e*Z3ed6dG d7d8 d8e*Z4eG d9d: d:e*Z5ed;dG d<d= d=e*eZ6ed>dG d?d@ d@e*eZ7G dAdB dBe*Z8g dCZ9dS )EzRPyTorch ProphetNet model, ported from ProphetNet repo(fairsequery_states version).    N)	dataclass)OptionalUnion)Tensornn)	LayerNorm   )ACT2FN)GenerationMixin)GradientCheckpointingLayer)BaseModelOutput)PreTrainedModel)ModelOutputauto_docstringlogging   )ProphetNetConfigFc                 C   s,   |rt jj|  |dS t jj| |tjdS )Ndimr   dtype)r   
functionalsoftmaxfloattorchfloat32)hidden_stater   
onnx_trace r   o/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/prophetnet/modeling_prophetnet.pyr   (   s   r   c                 C   s   t j|| | f||dt |j }|  }t|D ]}|| jddd || | d  qd|dddddf< t j	||gddS )	z@
    This function computes the bias for the predict stream
    )devicer   r   F)wrapr   N   r   )
r   onesfinfomindetachclonerangefill_diagonal_triu_cat)sequence_lengthngramr    r   
left_blockright_block
stream_idxr   r   r   ngram_attention_bias/   s    r1   c           	      C   s   | }d}|r | d } |t |t | |   }t |}n	t |t |}| d }t ||}|t | | t||  | |   }t 	|t 
|| d   }|t || | }|S )zo
    This function computes individual parts of the relative position buckets. For more detail, see paper.
    r   r"   r   )r   lt
zeros_likeintabsmaxlogr   mathr%   	ones_likewhere)	num_bucketsmax_distancerelative_positionsis_bidirectionalinv_relative_positionsrel_positions_bucket	max_exactis_smallval_if_larger   r   r   compute_relative_buckets@   s(   rD   c                 C   s   | dd|dd}|| d }tj|d |fdd d}|d|dd}|| d }t| ||dd}t| ||dd}||fS )zm
    This function computes both main and predict relative position buckets. For more detail, see paper.
    r   r   F)r>   )	unsqueezerepeatsizer   r+   rD   )r;   r<   position_idsmain_stream_relative_positions$predicting_stream_relative_positionsmain_relative_position_buckets!predict_relative_position_bucketsr   r   r   #compute_all_stream_relative_buckets[   s   rN   zF
    Base class for sequence-to-sequence language models outputs.
    )custom_introc                   @   s  e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeeej  ed< dZeeej  ed< dZeeej  ed< dZeeej  ed	< dZeeej  ed
< dZeeej  ed< dZeej ed< dZeeej  ed< dZeeej  ed< edd ZdS )ProphetNetSeq2SeqLMOutputa
	  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the main stream language modeling head (scores for each vocabulary token before
        SoftMax).
    logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
        SoftMax).
    past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
        num_attn_heads, decoder_sequence_length, embed_size_per_head)`).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the self-attention heads.
    encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    Nlosslogitslogits_ngrampast_key_valuesdecoder_hidden_statesdecoder_ngram_hidden_statesdecoder_attentionsdecoder_ngram_attentionscross_attentionsencoder_last_hidden_stateencoder_hidden_statesencoder_attentionsc                 C      t dt | jS Nzi`decoder_cross_attentions` is deprecated and will be removed soon. Please use `cross_attentions` instead.warningswarnFutureWarningrY   selfr   r   r   decoder_cross_attentions   
   z2ProphetNetSeq2SeqLMOutput.decoder_cross_attentions)__name__
__module____qualname____doc__rQ   r   r   FloatTensor__annotations__rR   rS   rT   tuplerU   rV   rW   rX   rY   rZ   r[   r\   propertyre   r   r   r   r   rP   r   s    
 rP   z
    Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
    decoding.
    c                   @   s   e Zd ZU dZejed< dZeej ed< dZ	ee
ej  ed< dZee
ej  ed< dZee
ej  ed< dZee
ej  ed< dZee
ej  ed	< dZee
ej  ed
< dZeej ed< dZee
ej  ed< dZee
ej  ed< edd ZdS )ProphetNetSeq2SeqModelOutputa  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
        Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size,ngram * decoder_sequence_length, config.vocab_size)`, *optional*):
        Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
    past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
        num_attn_heads, decoder_sequence_length, embed_size_per_head)`).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    last_hidden_stateNlast_hidden_state_ngramrT   rU   rV   rW   rX   rY   rZ   r[   r\   c                 C   r]   r^   r_   rc   r   r   r   re      rf   z5ProphetNetSeq2SeqModelOutput.decoder_cross_attentions)rg   rh   ri   rj   r   rk   rl   rq   r   rT   rm   rU   rV   rW   rX   rY   rZ   r[   r\   rn   re   r   r   r   r   ro      s   
 
ro   zs
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
    c                   @   s   e Zd ZU dZejed< dZeej ed< dZ	ee
ej  ed< dZee
ej  ed< dZee
ej  ed< dZee
ej  ed< dZee
ej  ed	< dZee
ej  ed
< dS )ProphetNetDecoderModelOutputa  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
        Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
    past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
        num_attn_heads, decoder_sequence_length, embed_size_per_head)`).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    hidden_states_ngram (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    rp   Nrq   rT   hidden_stateshidden_states_ngram
attentionsngram_attentionsrY   )rg   rh   ri   rj   r   rk   rl   rq   r   rT   rm   rs   rt   ru   rv   rY   r   r   r   r   rr      s   
 
rr   c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeeej  ed< dZeeej  ed< dZeeej  ed< dZeeej  ed	< dZeeej  ed
< dZeeej  ed< dS )ProphetNetDecoderLMOutputa	  
    ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the main stream language modeling head (scores for each vocabulary token before
        SoftMax).
    logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
        SoftMax).
    past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
        num_attn_heads, decoder_sequence_length, embed_size_per_head)`).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    hidden_states_ngram (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    NrQ   rR   rS   rT   rs   rt   ru   rv   rY   )rg   rh   ri   rj   rQ   r   r   rk   rl   rR   rS   rT   rm   rs   rt   ru   rv   rY   r   r   r   r   rw     s   
 #rw   c                   @   s(   e Zd ZeZdZdZdd Zdd ZdS )ProphetNetPreTrainedModel
prophetnetTc                 C   s   t |tjr |jjjd| jjd |jd ur|jj	  d S d S t |tj
rA|jjjd| jjd |jd urC|jj|j 	  d S d S d S )N        )meanstd)
isinstancer   Linearweightdatanormal_configinit_stdbiaszero_	Embeddingpadding_idx)rd   moduler   r   r   _init_weightsS  s   

z'ProphetNetPreTrainedModel._init_weightsc                 C   s   | j j}| j j}|d usJ d||j}|dd df  |ddd f< ||d< |d us2J d||dk| t|dk	 sGJ d	|S )
Nzself.model.config.decoder_start_token_id has to be defined. In ProphetNet it is usually set to the pad_token_id. See ProphetNet docs for more information.rE   r   ).r   z1self.model.config.pad_token_id has to be defined.r   z8Verify that `shifted_input_ids` has only positive values)
r   decoder_start_token_idpad_token_id	new_zerosshaper'   masked_fill_r   allitem)rd   	input_idsr   r   shifted_input_idsr   r   r   _shift_right]  s   
 z&ProphetNetPreTrainedModel._shift_rightN)	rg   rh   ri   r   config_classbase_model_prefixsupports_gradient_checkpointingr   r   r   r   r   r   rx   M  s    
rx   c                       sD   e Zd ZdZdeddf fddZd fdd	Z fd	d
Z  ZS )ProphetNetPositionalEmbeddingsa  
    This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting
    based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to
    the forward function.
    r   returnNc                    s"   |j | _t |j |j|j d S N)max_position_embeddings
max_lengthsuper__init__hidden_sizer   rd   r   	__class__r   r   r   {  s   z'ProphetNetPositionalEmbeddings.__init__c                    s   |d u s| j d u sJ d|d u r\|d ur5|d d jd }|d | }tjdtj|dt| j |  }n'|d u rBtj|tj|d}tj|dd||  | j  }|d| j	d }t
 ||fS )NzCIf position_ids is pre-computed then padding_idx should not be set.r   r"   r   )r   r   r   r    r   )r   r   r   r#   longr4   cumsumtype_asclampr   r   forward)rd   inputs_shaper    attention_maskrT   rI   prev_num_input_idsnum_input_idsr   r   r   r     s"   z&ProphetNetPositionalEmbeddings.forwardc                    s   t  |S r   )r   r   )rd   rI   r   r   r   _forward     z'ProphetNetPositionalEmbeddings._forward)NNN)	rg   rh   ri   rj   r   r   r   r   __classcell__r   r   r   r   r   t  s
    r   c                       s   e Zd ZdZdedef fddZdejdedefd	d
Z						dde
e de
e de
e de
ee  dedeee
e f fddZ  ZS )ProphetNetAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   num_attn_headsc                    s   t    |j}|j| _|j| _|| _|| | _| j| |ks#J dt||| _	t||| _
t||| _t||| _d S )Nzw`config.hidden_size` must be divisible by `config.num_encoder_attention_heads` and `config.num_decoder_attention_heads`)r   r   r   attention_dropoutdropoutr   head_dimr   r~   key_proj
value_proj
query_projout_proj)rd   r   r   r   r   r   r   r     s   

zProphetNetAttention.__init__tensorseq_lenbszc                 C       | ||| j| jdd S Nr   r"   viewr   r   	transpose
contiguous)rd   r   r   r   r   r   r   _shape      zProphetNetAttention._shapeNFkey_value_statesr   layer_head_maskpast_key_valueoutput_attentionsr   c                 C   s  |  \}}}	|d u}
t|  |||	gks%J d|||	f d|   | || jd  }|
r>|d ur>|d }|d }n+|
rU| | |d|}| | |d|}n| | |d|}| | |d|}|
ro||f}|| jd| jf}| |||j| }|j| }|j| }| d}t	
d||dd	}|| j||f}|  |krtd
| d|   |d ur| dkrd }|| jd|f}|d ur|  |krtd| d|   |d ur|| }|r|}nd }tjj|dd}|d ur+|  | jfksJ d| jf d|   |dddd||| j|| }|dddd| }tjj|| j| jd}t	
d||}|| j|| jf}|  |krXtd| d|   |dd|||	}| |}tjj|| j| jd}|||fS )Nz Size of hidden states should be 	, but is       ?r   r   rE   r"   zbsij,bsjk->bsikr   z#Attention weights should have size z Attention mask should have size r   /Head mask for a single layer should be of size ptrainingz `attn_output` should have shape , but is of shape )rH   listr   r   r   r   r   r   r   r   einsumr   
ValueErrorr   r   r   r   r   r   r   reshaper   )rd   rs   r   r   r   r   r   
batch_sizetgt_lenr   is_cross_attentionquery_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsexpected_shapeattn_weights_reshaped
attn_probsattn_outputr   r   r   r     s~   	








zProphetNetAttention.forward)NNNNF)rg   rh   ri   rj   r   r4   r   r   r   r   r   rm   boolr   r   r   r   r   r   r     s4    
r   c                       s2   e Zd ZdZdedef fddZdd Z  ZS )ProphetNetFeedForwardzm
    This is the residual two feed-forward layer block based on the original Transformer implementation.
    r   ffn_dimc                    sJ   t    t|j | _t|j|| _t||j| _	|j
| _
|j| _d S r   )r   r   r	   activation_functionactivation_fnr   r~   r   intermediateoutputactivation_dropoutr   )rd   r   r   r   r   r   r   %  s   
zProphetNetFeedForward.__init__c                 C   sN   |  |}| |}tjj|| j| jd}| |}tjj|| j| jd}|S )Nr   )r   r   r   r   r   r   r   r   )rd   rs   r   r   r   r   -  s   


zProphetNetFeedForward.forward)	rg   rh   ri   rj   r   r4   r   r   r   r   r   r   r   r      s    r   c                       sh   e Zd Zdef fddZdd Zdd Z							dd	eee	  fd
dZ
dd Zdd Z  ZS )ProphetNetNgramSelfAttentionr   c                    s   t    |j| _|j| _|j| _|j| _|j| _|j| _|j| j | _	|j
| _
| j	| j |jks5J dt|j|j| _t|j|j| _t|j|j| _t|j|j| _t|j| j| j | _d| _d S )Nz6config.hidden_size must be divisible by num_attn_headsF)r   r   r   r;   relative_max_distancenum_decoder_attention_headsr   r   r   r   r-   r   r~   r   r   r   r   relative_pos_embeddingsr   r   r   r   r   r   8  s$   

z%ProphetNetNgramSelfAttention.__init__c                 C   r   r   r   )rd   r   r   r   r   r   r   r   U  r   z#ProphetNetNgramSelfAttention._shapec                 C   s
   d| _ d S )NT)r   rc   r   r   r   prepare_for_onnx_export_X     
z5ProphetNetNgramSelfAttention.prepare_for_onnx_export_Nr   c	           *         st  |  \}	}
}t|  |	|
|gks J d|	|
|f d|j | |}| |}| |}|| jd  }| ||
|	}| |d|	}| |d|	}|	| jd| jf}|j	| }|j	| }|j	| }|j
d| j dd}|j
d| j dd}|j
d| j dd}|j
d| j dd}|d |dd  }}|d |dd  }}|d |dd   }|d |dd  }|d ur|d }tj| fdd |d }tj|fdd f}|
d| j  }td	| dd
}| ||||}|| }|d ur|| }t|d| jd|}|d ur6|  | jfks%J d| jf d|   |	dddd|	|	| jd| }tjj|| j| jd}td	|} | dd|	d||} | | } t|d	|	| j| j|| j}!t fdd|D d}"tj|dd}#tfdd|D d}$td|!|"f}%| |#|%||}&|%|& }%|d ur|dddd
d}||%j}|%| }%t|%d| jd|%}'|d ur|  | jfksJ d| jf d|   |	ddddd|' }'tjj|'| j| jd}'td|'|$ddf}(|(dd
}(|(|	| j||}(| |(}(t| |(gd	|	d|})|	|	| j|d}tjj|)| j| jd})|)||'|fS )Nz#`hidden_states` should be of shape r   r   rE   r   r   r"   r   zbntc,bncs->bntsr   )r   r   r   r   r   c                    s   g | ]
}t  |gd qS r"   )r   r+   ).0key)main_key_statesr   r   
<listcomp>  s    z8ProphetNetNgramSelfAttention.forward.<locals>.<listcomp>c                    s"   g | ]}t  |gd d qS r   )r   r+   rF   )r   v_p)main_value_statesr   r   r     s   " zbnhtc,bnhsc->bnhts   zbnhts,bnhsc->bnhtc) rH   r   r   r   r   r   r   r   r   r   chunkr-   r   r+   r   r    get_main_relative_pos_embeddingsr   r   r   r   r   r   r   r   r   r   stack#get_predict_relative_pos_embeddingspermutetor   )*rd   rs   r   r   r   extended_predict_attention_maskrL   rM   rI   r   ngram_sequence_lengthr   r   r   r   r   hidden_states_listquery_states_listkey_states_listvalue_states_listmain_hidden_stateshidden_states_predict_listmain_query_statespredict_query_states_listpredict_key_states_listpredict_value_states_listprev_main_key_statesprev_main_value_statesr,   main_attn_weightsmain_relative_pos_embeddingsmain_attn_probsmain_attn_outputpredict_query_statespredict_key_statespredict_hidden_statespredict_value_statespredict_attn_weightspredict_relative_pos_embeddingspredict_attn_probspredict_attn_outputr   r   )r   r   r   r   [  s   












z$ProphetNetNgramSelfAttention.forwardc                 C   sH  |j \}}}}|||||}|d u rK|j d d \}}	td|j d d dd||	d|j}
|
|d||	d }
t| j	| j
|
d}| |}||j d d | j	| jf }|dddd}||j d d d }|d| jd}|d|j d }| }|d|d}tj|d|d}||||d}|S )	Nr"   r   rE   r   Fr   )rE   r   index)r   r   r   arangerF   rG   r   r    rD   r;   r   r   r   r   r   r   rH   gather)rd   rs   r   rI   rL   r   r   r   r   r,   r=   rel_pos_embeddingsr  r   r   r   r     s:   


z=ProphetNetNgramSelfAttention.get_main_relative_pos_embeddingsc                 C   sH  |j dd \}}|d u rJ|j d }|d d |d ks J dtd|dd||d|j}||d||d }t| j| j	|d}|
dd}| |}	|	|j d d | j| jf }	|	ddddd}	|	d| j}	|d}|| jd| jd}|d|d }tj|	d|d	}
|
|| j| j|d}
|
S )
Nr   r"   rE   r   zb`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)Fr   r   r  )r   r   r  rF   rG   r   r    rD   r;   r   r   r   r   r   r   r   r-   rH   r   r  )rd   rs   r   rI   rM   r   r,   key_sequence_lengthr=   r  r  r   r   r   r   :  sN   





z@ProphetNetNgramSelfAttention.get_predict_relative_pos_embeddingsNNNNNNN)rg   rh   ri   r   r   r   r   r   rm   r   r   r   r   r   r   r   r   r   r   7  s"    

 3-r   c                       s8   e Zd ZdZdef fddZ	d	defddZ  ZS )
ProphetNetEncoderLayerz&
    Encoder block for Prophetnet
    r   c                    sB   t    t||j| _t|j| _t||j	| _
t|j| _d S r   )r   r   r   num_encoder_attention_heads	self_attnr   r   self_attn_layer_normr   encoder_ffn_dimfeed_forwardfeed_forward_layer_normr   r   r   r   r   {  s
   
zProphetNetEncoderLayer.__init__Fr   c           
      C   sV   | j ||||d\}}}| || }| |}| || }|f}	|r)|	|f7 }	|	S )N)rs   r   r   r   )r  r   r"  r#  )
rd   rs   r   r   r   attention_outputr   _feed_forward_outputoutputsr   r   r   r     s   

zProphetNetEncoderLayer.forwardF	rg   rh   ri   rj   r   r   r   r   r   r   r   r   r   r  v  s    r  c                       sR   e Zd ZdZdef fddZ												dded	efd
dZ  ZS )ProphetNetDecoderLayerz&
    Decoder block for Prophetnet
    r   c                    s^   t    t|| _t|j| _|jr t||j	| _
t|j| _t||j| _t|j| _d S r   )r   r   r   r  r   r   r   add_cross_attentionr   r   
cross_attncross_attn_layer_normr   decoder_ffn_dimr"  r#  r   r   r   r   r     s   

zProphetNetDecoderLayer.__init__NTF	use_cacher   c              
   C   s   |d ur
|d d nd }| j |||||||	|
d\}}}}| || }|d ur.|dd  nd }d }|d urO| j||||||d\}}}| || }|| }| |}| || }|f}|rg||||f7 }|rn||f7 }|S )Nr"   )rs   r   r   r   r   rL   rM   rI   )rs   r   r   r   r   r   )r  r   r,  r-  r"  r#  )rd   rs   r   r[   encoder_attn_maskr   cross_attn_layer_head_maskr   rL   rM   rI   r   r/  r   self_attn_past_key_valuengram_attention_outputself_attn_weightsself_attn_weights_ngrampresent_key_valuecross_attn_past_key_valuecross_attn_weightsr$  cross_attn_present_key_valuer&  r'  r   r   r   r     sB   


zProphetNetDecoderLayer.forward)NNNNNNNNNNTFr)  r   r   r   r   r*    s(    r*  z=
    The standalone encoder part of the ProphetNetModel.
    c                       s   e Zd Zddedejf fddZdd Zdd	 Ze								dd
e
ej de
ej de
ej de
ej de
e de
e de
e deeef fddZ  ZS )ProphetNetEncoderNr   word_embeddingsc                    sx   t    |dur|n
tj j j jd| _t | _	t
 j| _t fddt jD | _d| _|   dS )7  
        word_embeddings (`torch.nn.Embeddings` of shape `(config.vocab_size, config.hidden_size)`, *optional*):
            The word embedding parameters. This can be used to initialize [`ProphetNetEncoder`] with pre-defined word
            embeddings instead of randomly initialized word embeddings.
        Nr   c                       g | ]}t  qS r   )r  r   r%  r   r   r   r         z.ProphetNetEncoder.__init__.<locals>.<listcomp>F)r   r   r   r   
vocab_sizer   r   r<  r   position_embeddingsr   embeddings_layer_norm
ModuleListr(   num_encoder_layerslayersgradient_checkpointing	post_initrd   r   r<  r   rA  r   r     s   
 zProphetNetEncoder.__init__c                 C      | j S r   r<  rc   r   r   r   get_input_embeddings     z&ProphetNetEncoder.get_input_embeddingsc                 C   
   || _ d S r   rM  rd   valuer   r   r   set_input_embeddings  r   z&ProphetNetEncoder.set_input_embeddingsr   r   	head_maskinputs_embedsr   output_hidden_statesreturn_dictr   c                 C   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r*|du r*td|dur6|dur6td|durC|du rC| |}|durkd|ddddddf d| j jdd t	| j
j }||j
}nd}| |jdd |j\}	}
||	 }| |}tjj|| j j| jd}|rdnd}|rdnd}|dur| d	 t| jksJ d
t| j d| d	  dt| jD ](\}}|r||f }||||dur|| nd|d}|d	 }|r||d f }q|r||f }|stdd |||fD S t|||dS )a	  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetEncoder
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetEncoder.from_pretrained("patrickvonplaten/prophetnet-large-uncased-standalone")
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        ```Nz3Either input_ids or inputs_embeds has to be passed.z2Make sure to only pass input_ids or inputs_embeds.      ?r   r"   r   r   r   z&The head_mask should be specified for  layers, but it is for .)r   r   r   c                 s       | ]	}|d ur|V  qd S r   r   r   vr   r   r   	<genexpr>i      z,ProphetNetEncoder.forward.<locals>.<genexpr>)rp   rs   ru   )r   r   rV  use_return_dictr   r<  rG   r  r   r$   r   r%   r   rD  r   r    rE  r   r   r   r   rH   lenrH  	enumeraterm   r   )rd   r   r   rT  rU  r   rV  rW  extended_attention_maskrD  rI   rs   r[   all_attentionsidxencoder_layerlayer_outputsr   r   r   r     s\   
*


zProphetNetEncoder.forwardr   r  )rg   rh   ri   r   r   r   r   rN  rS  r   r   r   r   r   r   rm   r   r   r   r   r   r   r   r;    s:    
	r;  z=
    The standalone decoder part of the ProphetNetModel.
    c                       s  e Zd Zddedeej f fddZdd Zdd	 Z	e
												d d
eej deej deej deej deej deej deeeej   deej dee dee dee dee deeef fddZdd Zdd Zdd Z  ZS )!ProphetNetDecoderNr   r<  c                    s   t     j| _ j| _ j| _ j| _ j| _|dur |n
tj	 j
 j jd| _t | _t	| j jd| _t fddt jD | _t j| _d| _|   dS )r=  Nr>  c                    r?  r   )r*  r@  rA  r   r   r     rB  z.ProphetNetDecoder.__init__.<locals>.<listcomp>F)r   r   r-   r;   r   r   r   max_target_positionsr   r   rC  r   r   r<  r   rD  ngram_embeddingsrF  r(   num_decoder_layersrH  r   rE  rI  rJ  rK  r   rA  r   r   u  s    
 zProphetNetDecoder.__init__c                 C   rL  r   rM  rc   r   r   r   rN    rO  z&ProphetNetDecoder.get_input_embeddingsc                 C   rP  r   rM  rQ  r   r   r   rS    r   z&ProphetNetDecoder.set_input_embeddingsr   r   r[   encoder_attention_maskrT  cross_attn_head_maskrT   rU  r/  r   rV  rW  r   c           %         s  |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}|dur$|n| j j}|du r4|du r4td|dur@|dur@td|durM|du rM| |}|jdd \ }| j |f|j	|d\}}|durld\}}n| 
|\}}| j|d || }| jj|dur|ddksJ d fd	d
t| jD }d}d}nfdd
t| jD }| ||}| ||}|durd|ddddddf d| j jdd t| jj }||j}nd}t|g| d}| jr| |}tjj|| j| jd}|r	dnd}|r| j jdkrdnd}|
rdnd}|
r%dnd}|
r1| j j r1dnd}| j!rE| jrE|	rEt"#d d}	|	rJdnd}t$||gddgD ]+\}}|dur| d t%| j&ksJ d| dt%| j& d| d  dqUt'| j&D ]\}} |r||ddd|f f7 }| j jdkr||dd|df f7 }|dur|| nd}!| |||||dur|| nd|dur|| nd|||||!|	|
d}"|"d }|	r||"|
rdnd f7 }|
r||"d f7 }||"d f7 }| j j r||"d f7 }q|r2||ddd|f f7 }| j jdkr2||dd|df f7 }|ddd|f }#| j jdkrM|dd|df nd}$|sct(dd |#|$||||||fD S t)|#|$||||||dS )aY  
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetDecoder
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetDecoder.from_pretrained("microsoft/prophetnet-large-uncased", add_cross_attention=False)
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        ```NzGEither `decoder_input_ids` or `decoder_inputs_embeds` has to be passed.zFMake sure to only pass `decoder_input_ids` or `decoder_inputs_embeds`.r"   )r    rT   )NNr   zOAt the moment `use_cache` is only supported for `decoder_input_ids` of length 1c                    s&   g | ]}|d      d d qS r   )rG   r   r-   r   rj  predicting_stream_pos_embedr   r   r     s    z-ProphetNetDecoder.forward.<locals>.<listcomp>c                    s   g | ]
} |d    qS rn  r   ro  )rj  rq  r   r   r     s    rX  r   r   r   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FrT  rm  zThe `z` should be specified for rY  rZ  )
r1  r   r2  r   rL   rM   rI   r   r/  r   r   r   c                 s   r[  r   r   r\  r   r   r   r^  J  s    
z,ProphetNetDecoder.forward.<locals>.<genexpr>)rp   rq   rT   rs   rt   ru   rv   rY   )*r   r/  r   rV  r`  r   r<  r   rD  r    !compute_buffered_relative_bucketsr   rj  r   rH   r(   r-   prepare_attention_maskprepare_predict_attention_maskrG   r   r   r$   r   r%   r   r+   rE  r   r   r   r   r+  rI  loggerwarning_oncezipra  rH  rb  rm   rr   )%rd   r   r   r[   rl  rT  rm  rT   rU  r/  r   rV  rW  r,   main_stream_pos_embedrI   rL   rM   rs   ngram_hidden_statesrc  r   extended_encoder_attention_maskall_main_stream_hidden_statesall_ngram_stream_hidden_statesall_main_stream_attnsall_ngram_stream_attnsall_cross_attnspresent_key_values	attn_mask	mask_namere  decoder_layerr   rg  rp   rq   r   rp  r   r     s   $


*



&zProphetNetDecoder.forwardc              	   C   s   |j \}}td| j|jdd}t| j| j	|\}}|d d d |d |f |dd}t
|d d d |d |f |d d d || j| j| f gd|dd}||fS r   )r   r   r  ri  r   r    rG   rN   r;   r   r+   )rd   rI   r   r,   main_relative_bucketspredict_relative_bucketsr   r   r   rr  c  s"   

$

z3ProphetNetDecoder.compute_buffered_relative_bucketsc                 C   s   |j d d \}}tj||ft|jj|j|jd}t|d}|d |d |f d d d d d d f || j	j
f|j  }|d ur]d|d d d d d d f  t| jj }|| }n|}||jS )Nr"   r   r   rX  )r   r   fullr$   r   r%   r    triuexpandr   r   r   )rd   rs   r   r   
seq_lengthcausal_maskextended_causal_maskrc  r   r   r   rs  y  s    (*
z(ProphetNetDecoder.prepare_attention_maskc           	      C   s   |j d d \}}t| j| j|j|j}tj|d d d |d |f |d d d || j| j| f gdd}|d d d d d d d d f || j	j
f|j  }|d urd|d d d d d d d f  t| jj }||| j	j
| j||f}tj|t|gdd}|| }n|}||jS )Nr"   rE   r   rX  )r   r1   ri  r-   r    r   r   r+   r  r   r   r$   r%   r3   r   )	rd   rs   r   r   r  predict_causal_maskextended_predict_causal_maskrc  r   r   r   r   rt    s4   	,
z0ProphetNetDecoder.prepare_predict_attention_maskr   )NNNNNNNNNNNN)rg   rh   ri   r   r   r   r   r   rN  rS  r   r   r   rm   r   r   rr   r   rr  rs  rt  r   r   r   r   r   rh  o  s`    	

 Krh  c                $       s   e Zd ZddgZdef fddZdd Zdd	 Zd
d Zdd Z	dd Z
e															d#deej deej deej deej deej deej deej dee deeeej   deej deej dee dee dee dee d eeef f d!d"Z  ZS )$ProphetNetModelencoder.word_embeddings.weightdecoder.word_embeddings.weightr   c                    sx   t  | tj|j|j|jd| _t	|}d|_
d|_t|| j| _t	|}d|_d|_
t|| j| _|   d S )Nr>  FT)r   r   r   r   rC  r   r   r<  copydeepcopyis_encoder_decoderr/  r;  encoder
is_decoderrh  decoderrJ  )rd   r   encoder_configdecoder_configr   r   r   r     s   

zProphetNetModel.__init__c                 C   rL  r   rM  rc   r   r   r   rN    rO  z$ProphetNetModel.get_input_embeddingsc                 C   s   || _ | j | j_ | j | j_ d S r   )r<  r  r  rQ  r   r   r   rS    s   
z$ProphetNetModel.set_input_embeddingsc                 C   s4   | j jr| | jj| j | | jj| j d S d S r   )r   tie_word_embeddings_tie_or_clone_weightsr  r<  r  rc   r   r   r   _tie_weights  s   zProphetNetModel._tie_weightsc                 C   rL  r   )r  rc   r   r   r   get_encoder  rO  zProphetNetModel.get_encoderc                 C   rL  r   r  rc   r   r   r   get_decoder  rO  zProphetNetModel.get_decoderNr   r   decoder_input_idsdecoder_attention_maskrT  decoder_head_maskrm  encoder_outputsrT   rU  decoder_inputs_embedsr/  r   rV  rW  r   c                 C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur$|n| j j}|du r8| j||||
|||d}| j|||d ||||	|||||d}|sQ|| S t|j|j	|j
|j|j|j|j|j|j|j|jdS )a7  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            ProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetModel

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetModel.from_pretrained("microsoft/prophetnet-large-uncased")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

        >>> last_hidden_states = outputs.last_hidden_state  # main stream hidden states
        >>> last_hidden_states_ngram = outputs.last_hidden_state_ngram  # predict hidden states
        ```N)r   r   rT  rU  r   rV  rW  r   )r   r   r[   rl  rT  rm  rT   rU  r   rV  r/  rW  )rp   rq   rT   rU   rV   rW   rX   rY   rZ   r[   r\   )r   r/  r   rV  r`  r  r  ro   rp   rq   rT   rs   rt   ru   rv   rY   )rd   r   r   r  r  rT  r  rm  r  rT   rU  r  r/  r   rV  rW  decoder_outputsr   r   r   r     sX   9zProphetNetModel.forward)NNNNNNNNNNNNNNN)rg   rh   ri   _tied_weights_keysr   r   rN  rS  r  r  r  r   r   r   r   
BoolTensorrm   r   r   ro   r   r   r   r   r   r   r    sr    	

r  zh
    The ProphetNet Model with a language modeling head. Can be used for sequence generation tasks.
    c                &       s\  e Zd Zg dZdef fddZdd Zdd Zd	d
 Zdd Z	e
																d,deej deej deej deej deej deej deej deej deeeej   deej deej deej dee dee dee dee deeef f"dd Zd-d"d#Zdejfd$d%Zed&d' Zd(d) Zd*d+ Z  ZS )."ProphetNetForConditionalGeneration)r  r  lm_head.weightr   c                    sH   t  | t|| _|j| _|j| _tj|j	|j
dd| _|   d S )NFr   )r   r   r  ry   r   r   disable_ngram_lossr   r~   r   rC  lm_headrJ  r   r   r   r   r   Q  s   
z+ProphetNetForConditionalGeneration.__init__c                 C   rL  r   r  rc   r   r   r   get_output_embeddings\  rO  z8ProphetNetForConditionalGeneration.get_output_embeddingsc                 C   rP  r   r  rd   new_embeddingsr   r   r   set_output_embeddings_  r   z8ProphetNetForConditionalGeneration.set_output_embeddingsc                 C   s"   | j jr| | jj| j d S d S r   )r   r  r  ry   r<  r  rc   r   r   r   r  b  s   z/ProphetNetForConditionalGeneration._tie_weightsc                 C      | j jS r   )ry   r<  rc   r   r   r   rN  f     z7ProphetNetForConditionalGeneration.get_input_embeddingsNr   r   r  r  rT  r  rm  r  rT   rU  r  labelsr/  r   rV  rW  r   c                 C   sv  |dur|n| j j}|dur|du r|du r| |}| j|||||||||	|
|||||d}|dur6|jn|jdd \}}|d || j j|d}| |}|dddf }| j jdkrh|ddddf nd}| sr|	 }d}|dur~| 
||}|stdd ||fD }|dur|f| |dd  S ||dd  S t||||j|j|j|j|j|j|j|j|jd	S )
a	  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            ProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

        >>> logits_next_token = outputs.logits  # logits to predict next token as usual
        >>> logits_ngram_next_tokens = outputs.logits_ngram  # logits to predict 2nd, 3rd, ... next tokens
        ```N)r   r   r  r  rT  r  rm  r  rT   rU  r  r/  r   rV  rW  r"   r   rE   r   c                 s   r[  r   r   r\  r   r   r   r^    r_  z=ProphetNetForConditionalGeneration.forward.<locals>.<genexpr>)rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   )r   r`  r   ry   r   r   r-   r  is_contiguousr   _compute_lossrm   rP   rT   rU   rV   rW   rX   rY   rZ   r[   r\   )rd   r   r   r  r  rT  r  rm  r  rT   rU  r  r  r/  r   rV  rW  r'  r   r,   predicting_streamspredict_logitsrR   rS   rQ   
all_logitsr   r   r   r   i  s`   >

$.z*ProphetNetForConditionalGeneration.forwardr   c                 C     | | jj|d|d|}t| jjD ]}|dkr#| jr# n|||d d d d f< q|dd }t	j
j|d|ddtjd}t	j
j||ddd}| jjdkr|jddd	 }||d}	||	 }| }| jj|d }
d
| jj | |
|  }|S Nr   r   rE   r   r{   )	reductionrz   T)r   keepdimrX  r   r   r-   rH   fill_r(   r  r   r   r   r   log_softmaxr   r   r   nll_lossepssumner{   rd   rR   r  ignore_indexexpend_targetsilprobsrQ   smooth_lossnon_masked_tokenseps_ir   r   r   r    (   $z0ProphetNetForConditionalGeneration._compute_lossc                 C   s
   |  |S r   )r   )rd   r  r   r   r   %prepare_decoder_input_ids_from_labels  r   zHProphetNetForConditionalGeneration.prepare_decoder_input_ids_from_labelsc                    sB   d}| D ]}|t  fdd|d d D |dd   f7 }q|S )Nr   c                 3   $    | ]}| d  |jV  qdS r   Nindex_selectr   r    r   
past_statebeam_idxr   r   r^  	     " zDProphetNetForConditionalGeneration._reorder_cache.<locals>.<genexpr>r"   rm   rT   r  reordered_past
layer_pastr   r  r   _reorder_cache  s   
z1ProphetNetForConditionalGeneration._reorder_cachec                 C   r  r   )ry   r  rc   r   r   r   r    r  z.ProphetNetForConditionalGeneration.get_encoderc                 C   r  r   ry   r  rc   r   r   r   r    r  z.ProphetNetForConditionalGeneration.get_decoder)NNNNNNNNNNNNNNNNr   )rg   rh   ri   r  r   r   r  r  r  rN  r   r   r   r   r  rm   r   r   rP   r   r  r  staticmethodr  r  r  r   r   r   r   r   r  I  s    	


y

r  zt
    The standalone decoder part of the ProphetNetModel with a lm head on top. The model can be used for causal
    c                        sB  e Zd Zg dZdef fddZdd Zdd Zd	d
 Zdd Z	dd Z
dd Zdd Ze													d+deej deej deej deej deej deej deeeej   deej deej dee dee dee d ee d!eeef fd"d#Zd,d%d&Z				d-d'd(Zed)d* Z  ZS ).ProphetNetForCausalLM)z!prophetnet.word_embeddings.weightz)prophetnet.decoder.word_embeddings.weightr  r   c                    s^   t |}d|_d|_t | t|| _|j| _	|j
| _
tj|j|jdd| _|   d S )NTFr  )r  r  r  r  r   r   ProphetNetDecoderWrapperry   r   r   r  r   r~   r   rC  r  rJ  r   r   r   r   r   !  s   

zProphetNetForCausalLM.__init__c                 C   s
   | j jjS r   ry   r  r<  rc   r   r   r   rN  1  r   z*ProphetNetForCausalLM.get_input_embeddingsc                 C   s   || j j_d S r   r  rQ  r   r   r   rS  4  s   z*ProphetNetForCausalLM.set_input_embeddingsc                 C   rL  r   r  rc   r   r   r   r  7  rO  z+ProphetNetForCausalLM.get_output_embeddingsc                 C   rP  r   r  r  r   r   r   r  :  r   z+ProphetNetForCausalLM.set_output_embeddingsc                 C   s$   | j jr| | jjj| j d S d S r   )r   r  r  ry   r  r<  r  rc   r   r   r   r  =  s   z"ProphetNetForCausalLM._tie_weightsc                 C   s   || j _d S r   r  )rd   r  r   r   r   set_decoderA  r   z!ProphetNetForCausalLM.set_decoderc                 C   r  r   r  rc   r   r   r   r  D  r  z!ProphetNetForCausalLM.get_decoderNr   r   r[   rl  rT  rm  rT   rU  r  r/  r   rV  rW  r   c                 C   s4  |dur|n| j j}| jj|||||||||
|||d}|dur#|jn|jdd \}}|d || j j|d}| |}|dddf }| j jdkrU|ddddf nd}d}|	durc| ||	}|st	dd ||fD }|dur|f| |dd  S ||dd  S t
||||j|j|j|j|j|jd		S )
a	  
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetForCausalLM
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetForCausalLM.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits

        >>> # Model can also be used with EncoderDecoder framework
        >>> from transformers import BertTokenizer, EncoderDecoderModel, AutoTokenizer
        >>> import torch

        >>> tokenizer_enc = BertTokenizer.from_pretrained("google-bert/bert-large-uncased")
        >>> tokenizer_dec = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        ...     "google-bert/bert-large-uncased", "microsoft/prophetnet-large-uncased"
        ... )

        >>> ARTICLE = (
        ...     "the us state department said wednesday it had received no "
        ...     "formal word from bolivia that it was expelling the us ambassador there "
        ...     "but said the charges made against him are `` baseless ."
        ... )
        >>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids
        >>> labels = tokenizer_dec(
        ...     "us rejects charges against its ambassador in bolivia", return_tensors="pt"
        ... ).input_ids
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=labels[:, :-1], labels=labels[:, 1:])

        >>> loss = outputs.loss
        ```N)r   r   r[   rl  rT  rm  rT   rU  r/  r   rV  rW  r"   r   rE   r   c                 s   r[  r   r   r\  r   r   r   r^    r_  z0ProphetNetForCausalLM.forward.<locals>.<genexpr>)	rQ   rR   rS   rT   rs   rt   ru   rv   rY   )r   r`  ry   r  r   r   r-   r  r  rm   rw   rT   rs   rt   ru   rv   rY   )rd   r   r   r[   rl  rT  rm  rT   rU  r  r/  r   rV  rW  r'  r   r,   r  r  rR   rS   rQ   r  r   r   r   r   G  sJ   A 
$.zProphetNetForCausalLM.forwardr   c                 C   r  r  r  r  r   r   r   r    r  z#ProphetNetForCausalLM._compute_lossc                 K   s<   |d u r
| |j}|r|d d dd f }|||||dS )NrE   )r   r   rT  rT   r/  )new_onesr   )rd   r   rT   r   rT  r/  kwargsr   r   r   prepare_inputs_for_generation  s   z3ProphetNetForCausalLM.prepare_inputs_for_generationc                    s.   d}| D ]}|t  fdd|D f7 }q|S )Nr   c                 3   r  r  r  r  r  r   r   r^    r  z7ProphetNetForCausalLM._reorder_cache.<locals>.<genexpr>r  r  r   r  r   r    s   z$ProphetNetForCausalLM._reorder_cache)NNNNNNNNNNNNNr  )NNNN)rg   rh   ri   r  r   r   rN  rS  r  r  r  r  r  r   r   r   r   rm   r   r   rw   r   r  r  r  r  r   r   r   r   r   r    sz    	


n
r  c                       s6   e Zd ZdZdef fddZdd Zdd Z  ZS )	r  z
    This is a wrapper class, so that [`ProphetNetForCausalLM`] can correctly be loaded from pretrained prophetnet
    classes.
    r   c                    s@   t  | tj|j|j|jd| _t|| jd| _	| 
  d S )Nr>  rM  )r   r   r   r   rC  r   r   r<  rh  r  rJ  r   r   r   r   r     s   z!ProphetNetDecoderWrapper.__init__c                 C   s   |  | j| j  d S r   )r  r<  r  rN  rc   r   r   r   r    s   z%ProphetNetDecoderWrapper._tie_weightsc                 O   s   | j |i |S r   r  )rd   argsr  r   r   r   r   	  s   z ProphetNetDecoderWrapper.forward)	rg   rh   ri   rj   r   r   r  r   r   r   r   r   r   r    s
    	r  )rh  r;  r  r  r  rx   r(  ):rj   r  r8   r`   dataclassesr   typingr   r   r   torch.utils.checkpointr   r   torch.nnr   activationsr	   
generationr
   modeling_layersr   modeling_outputsr   modeling_utilsr   utilsr   r   r   configuration_prophetnetr   
get_loggerrg   ru  r   r1   rD   rN   rP   ro   rr   rw   rx   r   r   Moduler   r   r   r  r*  r;  rh  r  r  r  r  __all__r   r   r   r   <module>   s   


75'/&+   A+Tu  C  H ^