o
    	۷i{                    @   s  d Z ddlZddlZddlZddlmZ ddlmZmZ ddl	Z	ddl	m
Z
mZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddl m!Z! ddl"m#Z# e$e%Z&dFddZ'dd Z(dFddZ)dd Z*eeddG dd deZ+eeddG d d! d!eZ,eed"dG d#d$ d$eZ-eed"dG d%d& d&eZ.eG d'd( d(eZ/G d)d* d*ej0Z1G d+d, d,ej2Z3G d-d. d.ej2Z4G d/d0 d0ej2Z5G d1d2 d2eZ6G d3d4 d4eZ7ed5dG d6d7 d7e/Z8ed8dG d9d: d:e/Z9eG d;d< d<e/Z:ed=dG d>d? d?e/eZ;ed@dG dAdB dBe/eZ<G dCdD dDe/Z=g dEZ>dS )GzRPyTorch ProphetNet model, ported from ProphetNet repo(fairsequery_states version).    N)	dataclass)OptionalUnion)Tensornn)	LayerNorm   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)GradientCheckpointingLayer)BaseModelOutput)PreTrainedModel)ModelOutputauto_docstringlogging)deprecate_kwarg   )ProphetNetConfigFc                 C   s,   |rt jj|  |dS t jj| |tjdS )Ndimr   dtype)r   
functionalsoftmaxfloattorchfloat32)hidden_stater   
onnx_trace r"   h/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/prophetnet/modeling_prophetnet.pyr   )   s   r   c                 C   s   t j|| | f||dt |j }|  }t|D ]}|| jddd || | d  qd|dddddf< t j	||gddS )	z@
    This function computes the bias for the predict stream
    )devicer   r   F)wrapr   N   r   )
r   onesfinfomindetachclonerangefill_diagonal_triu_cat)sequence_lengthngramr$   r   
left_blockright_block
stream_idxr"   r"   r#   ngram_attention_bias0   s    r5   c           	      C   s   | }d}|r | d } |t |t | |   }t |}n	t |t |}| d }t ||}|t | | t||  | |   }t 	|t 
|| d   }|t || | }|S )zo
    This function computes individual parts of the relative position buckets. For more detail, see paper.
    r   r&   r   )r   lt
zeros_likeintabsmaxlogr   mathr)   	ones_likewhere)	num_bucketsmax_distancerelative_positionsis_bidirectionalinv_relative_positionsrel_positions_bucket	max_exactis_smallval_if_larger"   r"   r#   compute_relative_bucketsA   s(   rH   c                 C   s   | dd|dd}|| d }tj|d |fdd d}|d|dd}|| d }t| ||dd}t| ||dd}||fS )zm
    This function computes both main and predict relative position buckets. For more detail, see paper.
    r   r   F)rB   )	unsqueezerepeatsizer   r/   rH   )r?   r@   position_idsmain_stream_relative_positions$predicting_stream_relative_positionsmain_relative_position_buckets!predict_relative_position_bucketsr"   r"   r#   #compute_all_stream_relative_buckets\   s   rR   zF
    Base class for sequence-to-sequence language models outputs.
    )custom_introc                   @   s  e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZee ed< dZeeej  ed< dZeeej  ed< dZeeej  ed	< dZeeej  ed
< dZeeej  ed< dZeej ed< dZeeej  ed< dZeeej  ed< edd ZdS )ProphetNetSeq2SeqLMOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the main stream language modeling head (scores for each vocabulary token before
        SoftMax).
    logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
        SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the self-attention heads.
    encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    Nlosslogitslogits_ngrampast_key_valuesdecoder_hidden_statesdecoder_ngram_hidden_statesdecoder_attentionsdecoder_ngram_attentionscross_attentionsencoder_last_hidden_stateencoder_hidden_statesencoder_attentionsc                 C      t dt | jS Nzi`decoder_cross_attentions` is deprecated and will be removed soon. Please use `cross_attentions` instead.warningswarnFutureWarningr]   selfr"   r"   r#   decoder_cross_attentions   
   z2ProphetNetSeq2SeqLMOutput.decoder_cross_attentions)__name__
__module____qualname____doc__rU   r   r   FloatTensor__annotations__rV   rW   rX   r
   rY   tuplerZ   r[   r\   r]   r^   r_   r`   propertyri   r"   r"   r"   r#   rT   s   s    
 rT   z
    Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
    decoding.
    c                   @   s   e Zd ZU dZejed< dZeej ed< dZ	ee
 ed< dZeeej  ed< dZeeej  ed< dZeeej  ed< dZeeej  ed	< dZeeej  ed
< dZeej ed< dZeeej  ed< dZeeej  ed< edd ZdS )ProphetNetSeq2SeqModelOutputa  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
        Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size,ngram * decoder_sequence_length, config.vocab_size)`, *optional*):
        Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    last_hidden_stateNlast_hidden_state_ngramrX   rY   rZ   r[   r\   r]   r^   r_   r`   c                 C   ra   rb   rc   rg   r"   r"   r#   ri      rj   z5ProphetNetSeq2SeqModelOutput.decoder_cross_attentions)rk   rl   rm   rn   r   ro   rp   ru   r   rX   r
   rY   rq   rZ   r[   r\   r]   r^   r_   r`   rr   ri   r"   r"   r"   r#   rs      s   
 
rs   zs
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
    c                   @   s   e Zd ZU dZejed< dZeej ed< dZ	ee
 ed< dZeeej  ed< dZeeej  ed< dZeeej  ed< dZeeej  ed	< dZeeej  ed
< dS )ProphetNetDecoderModelOutputa  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
        Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    hidden_states_ngram (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    rt   Nru   rX   hidden_stateshidden_states_ngram
attentionsngram_attentionsr]   )rk   rl   rm   rn   r   ro   rp   ru   r   rX   r
   rw   rq   rx   ry   rz   r]   r"   r"   r"   r#   rv      s   
 
rv   c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZee ed< dZeeej  ed< dZeeej  ed< dZeeej  ed	< dZeeej  ed
< dZeeej  ed< dS )ProphetNetDecoderLMOutputa	  
    ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the main stream language modeling head (scores for each vocabulary token before
        SoftMax).
    logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
        SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    hidden_states_ngram (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    NrU   rV   rW   rX   rw   rx   ry   rz   r]   )rk   rl   rm   rn   rU   r   r   ro   rp   rV   rW   rX   r
   rw   rq   rx   ry   rz   r]   r"   r"   r"   r#   r{     s   
 "r{   c                   @   s.   e Zd ZU eed< dZdZdd Zdd ZdS )	ProphetNetPreTrainedModelconfig
prophetnetTc                 C   s   t |tjr |jjjd| jjd |jd ur|jj	  d S d S t |tj
rA|jjjd| jjd |jd urC|jj|j 	  d S d S d S )N        )meanstd)
isinstancer   Linearweightdatanormal_r}   init_stdbiaszero_	Embeddingpadding_idx)rh   moduler"   r"   r#   _init_weightsP  s   

z'ProphetNetPreTrainedModel._init_weightsc                 C   s   | j j}| j j}|d usJ d||j}|dd df  |ddd f< ||d< |d us2J d||dk| t|dk	 sGJ d	|S )
Nzself.model.config.decoder_start_token_id has to be defined. In ProphetNet it is usually set to the pad_token_id. See ProphetNet docs for more information.rI   r   ).r   z1self.model.config.pad_token_id has to be defined.r   z8Verify that `shifted_input_ids` has only positive values)
r}   decoder_start_token_idpad_token_id	new_zerosshaper+   masked_fill_r   allitem)rh   	input_idsr   r   shifted_input_idsr"   r"   r#   _shift_rightZ  s   
 z&ProphetNetPreTrainedModel._shift_rightN)	rk   rl   rm   r   rp   base_model_prefixsupports_gradient_checkpointingr   r   r"   r"   r"   r#   r|   J  s   
 
r|   c                       sD   e Zd ZdZdeddf fddZd fdd	Z fd	d
Z  ZS )ProphetNetPositionalEmbeddingsa  
    This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting
    based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to
    the forward function.
    r}   returnNc                    s"   |j | _t |j |j|j d S N)max_position_embeddings
max_lengthsuper__init__hidden_sizer   rh   r}   	__class__r"   r#   r   x  s   z'ProphetNetPositionalEmbeddings.__init__c                    s   |d u s| j d u sJ d|d u r]|d ur6| dkr6| }|d | }tjdtj|dt| j |  }n'|d u rCtj|tj|d}tj|dd||  | j  }|d| j	d }t
 ||fS )NzCIf position_ids is pre-computed then padding_idx should not be set.r   r   )r   r   r   r$   r   )r   get_seq_lengthr   r'   longr8   cumsumtype_asclampr   r   forward)rh   inputs_shaper$   attention_maskrX   rM   prev_num_input_idsnum_input_idsr   r"   r#   r   |  s"   z&ProphetNetPositionalEmbeddings.forwardc                    s   t  |S r   )r   r   )rh   rM   r   r"   r#   _forward     z'ProphetNetPositionalEmbeddings._forward)NNN)	rk   rl   rm   rn   r   r   r   r   __classcell__r"   r"   r   r#   r   q  s
    r   c                       s   e Zd ZdZddededee f fddZedd	d
d						ddee	 dee	 dee	 d	ee
 dee deej	 dee	ee	 f fddZ  ZS )ProphetNetAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNr}   num_attn_heads	layer_idxc                    s   t    |j}|j| _|j| _|| _|| | _|| _| j| |ks&J dt	||| _
t	||| _t	||| _t	||| _d S )Nzw`config.hidden_size` must be divisible by `config.num_encoder_attention_heads` and `config.num_decoder_attention_heads`)r   r   r   attention_dropoutdropoutr   head_dimr   r   r   key_proj
value_proj
query_projout_proj)rh   r}   r   r   r   r   r"   r#   r     s   

zProphetNetAttention.__init__past_key_valuerX   4.58new_nameversionFkey_value_statesr   layer_head_maskoutput_attentionscache_positionr   c                 C   sd  |  \}}	}
|d u}t|  ||	|
gks%J d||	|
f d|   | || jd  }d}|d urMt|trK|j| j}|rG|j	}n|j
}n|}|rQ|n|}|rj|d urj|rj|j| j j}|j| j j}nJ| |}| |}||d| j| jdd}||d| j| jdd}|d ur|s|nd }|||| jd|i\}}|rt|trd	|j| j< |||	| j| jdd}| d}td
||dd}|| j|	|f}|  |krtd| d|   |d ur| dkrd }|| jd|f}|d ur|  |krtd| d|   |d ur|| }|r%|}nd }tjj|dd}|d urf|  | jfksKJ d| jf d|   |dddd||| j|	| }|dddd| }tjj|| j| jd}td
||}|| j|	| jf}|  |krtd| d|   |dd||	|
}| |}tjj|| j| jd}||fS )Nz Size of hidden states should be 	, but is       ?FrI   r   r&   r   Tzbsij,bsjk->bsikr   z#Attention weights should have size r   z Attention mask should have size r   /Head mask for a single layer should be of size ptrainingz `attn_output` should have shape , but is of shape ) rL   listr   r   r   r   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   viewr   	transposeupdater   einsum
ValueErrorr   r   r   r   r   r   r   reshaper   )rh   rw   r   r   r   rX   r   r   
batch_sizetgt_lenr   is_cross_attentionquery_statesr   curr_past_key_valuecurrent_states
key_statesvalue_statessrc_lenattn_weightsexpected_shapeattn_weights_reshaped
attn_probsattn_outputr"   r"   r#   r     s   








zProphetNetAttention.forwardr   )NNNNFN)rk   rl   rm   rn   r   r8   r   r   r   r   r
   boolr   rq   r   r   r"   r"   r   r#   r     s2     	r   c                       s2   e Zd ZdZdedef fddZdd Z  ZS )ProphetNetFeedForwardzm
    This is the residual two feed-forward layer block based on the original Transformer implementation.
    r}   ffn_dimc                    sJ   t    t|j | _t|j|| _t||j| _	|j
| _
|j| _d S r   )r   r   r	   activation_functionactivation_fnr   r   r   intermediateoutputactivation_dropoutr   )rh   r}   r   r   r"   r#   r   (  s   
zProphetNetFeedForward.__init__c                 C   sN   |  |}| |}tjj|| j| jd}| |}tjj|| j| jd}|S )Nr   )r   r   r   r   r   r   r   r   )rh   rw   r"   r"   r#   r   0  s   


zProphetNetFeedForward.forward)	rk   rl   rm   rn   r   r8   r   r   r   r"   r"   r   r#   r   #  s    r   c                       sv   e Zd Zddef fddZdd Zdd Zed	d
dd								dd
ee	 fddZ
dd Zdd Z  ZS )ProphetNetNgramSelfAttentionNr}   c                    s   t    |j| _|j| _|j| _|j| _|j| _|j| _|j| j | _	|j
| _
|| _| j	| j |jks8J dt|j|j| _t|j|j| _t|j|j| _t|j|j| _t|j| j| j | _d| _d S )Nz6config.hidden_size must be divisible by num_attn_headsF)r   r   r   r?   relative_max_distancenum_decoder_attention_headsr   r   r   r   r1   r   r   r   r   r   r   r   relative_pos_embeddingsr!   rh   r}   r   r   r"   r#   r   ;  s&   

z%ProphetNetNgramSelfAttention.__init__c                 C   s    | ||| j| jdd S Nr   r&   )r   r   r   r   
contiguous)rh   tensorseq_lenr   r"   r"   r#   _shapeY  s    z#ProphetNetNgramSelfAttention._shapec                 C   s
   d| _ d S )NT)r!   rg   r"   r"   r#   prepare_for_onnx_export_\     
z5ProphetNetNgramSelfAttention.prepare_for_onnx_export_r   rX   r   r   c
           *         sf  |  \}
}}t|  |
||gks J d|
||f d|j | |}| |}| |}|| jd  }| |||
}| |d|
}| |d|
}|
| jd| jf}|j	| }|j	| }|j	| }|j
d| j dd}|j
d| j dd}|j
d| j dd}|j
d| j dd}|d |dd  }}|d |dd  }}|d |dd   }|d |dd  }|d urt|tr|j}n|}| | jd	|	i\ |d| j  }td
| dd}| ||||}|| }|d ur|| }t|d| jd|}|d ur0|  | jfksJ d| jf d|   |dddd||
| jd| }tjj|| j| jd}td
|} | dd	|
d||} | | } t|d|
| j| j|| j}!t fdd|D d}"tj|dd}#t fdd|D d}$td|!|"f}%| !|#|%||}&|%|& }%|d ur|"ddddd}|#|%j$}|%| }%t|%d| jd|%}'|d ur|  | jfksJ d| jf d|   |ddddd|' }'tjj|'| j| jd}'td|'|$ddf}(|(dd}(|(	|
| j||}(| |(}(t | |(gd|
d|})||
| j|d}tjj|)| j| jd})|)||'fS )Nz#`hidden_states` should be of shape r   r   rI   r   r   r&   r   r   zbntc,bncs->bntsr   )r   r!   r   r   r   c                    s   g | ]
}t  |gd qS r&   )r   r/   ).0key)main_key_statesr"   r#   
<listcomp>  s    z8ProphetNetNgramSelfAttention.forward.<locals>.<listcomp>c                    s"   g | ]}t  |gd d qS r   )r   r/   rJ   )r   v_p)main_value_statesr"   r#   r    s   " zbnhtc,bnhsc->bnhts   zbnhts,bnhsc->bnhtc)%rL   r   r   r   r   r   r   r   r   r   chunkr1   r   r   r   r   r   r   r   r    get_main_relative_pos_embeddingsr   r!   r   r   r   r   r   r   r   r   stackr/   #get_predict_relative_pos_embeddingspermutetor   )*rh   rw   rX   r   r   extended_predict_attention_maskrP   rQ   rM   r   r   ngram_sequence_lengthr   r   r   r   
proj_shapehidden_states_listquery_states_listkey_states_listvalue_states_listmain_hidden_stateshidden_states_predict_listmain_query_statespredict_query_states_listpredict_key_states_listpredict_value_states_listr   r0   main_attn_weightsmain_relative_pos_embeddingsmain_attn_probsmain_attn_outputpredict_query_statespredict_key_statespredict_hidden_statespredict_value_statespredict_attn_weightspredict_relative_pos_embeddingspredict_attn_probspredict_attn_outputr   r"   )r  r  r#   r   _  s   














z$ProphetNetNgramSelfAttention.forwardc                 C   sH  |j \}}}}|||||}|d u rK|j d d \}}	td|j d d dd||	d|j}
|
|d||	d }
t| j	| j
|
d}| |}||j d d | j	| jf }|dddd}||j d d d }|d| jd}|d|j d }| }|d|d}tj|d|d}||||d}|S )	Nr&   r   rI   r   Fr   )rI   r   index)r   r   r   arangerJ   rK   r  r$   rH   r?   r   r   r   r
  r   r   rL   gather)rh   rw   r   rM   rP   r   r   r   r   r0   rA   rel_pos_embeddingsr  r"   r"   r#   r    s:   


z=ProphetNetNgramSelfAttention.get_main_relative_pos_embeddingsc                 C   sH  |j dd \}}|d u rJ|j d }|d d |d ks J dtd|dd||d|j}||d||d }t| j| j	|d}|
dd}| |}	|	|j d d | j| jf }	|	ddddd}	|	d| j}	|d}|| jd| jd}|d|d }tj|	d|d	}
|
|| j| j|d}
|
S )
Nr   r&   rI   r   zb`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)Fr  r   r%  )r   r   r'  rJ   rK   r  r$   rH   r?   r   r   r   r   r   r
  r   r1   rL   r   r(  )rh   rw   r   rM   rQ   r   r0   key_sequence_lengthrA   r)  r"  r"   r"   r#   r	  A  sN   





z@ProphetNetNgramSelfAttention.get_predict_relative_pos_embeddingsr   )NNNNNNNN)rk   rl   rm   r   r   r   r   r   r   r
   r   r  r	  r   r"   r"   r   r#   r   :  s&     5-r   c                       s8   e Zd ZdZdef fddZ	d	defddZ  ZS )
ProphetNetEncoderLayerz&
    Encoder block for Prophetnet
    r}   c                    sB   t    t||j| _t|j| _t||j	| _
t|j| _d S r   )r   r   r   num_encoder_attention_heads	self_attnr   r   self_attn_layer_normr   encoder_ffn_dimfeed_forwardfeed_forward_layer_normr   r   r"   r#   r     s
   
zProphetNetEncoderLayer.__init__Fr   c           	      C   sT   | j ||||d\}}| || }| |}| || }|f}|r(||f7 }|S )N)rw   r   r   r   )r-  r.  r0  r1  )	rh   rw   r   r   r   attention_outputr   feed_forward_outputoutputsr"   r"   r#   r     s   


zProphetNetEncoderLayer.forwardF)	rk   rl   rm   rn   r   r   r   r   r   r"   r"   r   r#   r+  }  s    r+  c                	       sv   e Zd ZdZddef fddZedddd												
		ddee dee dee	j
 fddZ  ZS )ProphetNetDecoderLayerz&
    Decoder block for Prophetnet
    Nr}   c                    sf   t    t||d| _t|j| _|jr$t||j	|d| _
t|j| _t||j| _t|j| _d S )Nr   )r   r   r   r-  r   r   r.  add_cross_attentionr   r   
cross_attncross_attn_layer_normr   decoder_ffn_dimr0  r1  r   r   r"   r#   r     s   
zProphetNetDecoderLayer.__init__r   rX   r   r   TF	use_cacher   r   c              
   C   s   | j |||||||	|
d\}}}| || }d }|d ur1| j||||||d\}}| || }| |}| || }|f}|rI||||f7 }|S )N)rw   rX   r   r   r  rP   rQ   rM   )rw   r   r   r   rX   r   )r-  r.  r9  r:  r0  r1  )rh   rw   r   r_   encoder_attn_maskr   cross_attn_layer_head_maskr  rP   rQ   rM   rX   r<  r   r   ngram_attention_outputself_attn_weightsself_attn_weights_ngramcross_attn_weightsr2  r3  r4  r"   r"   r#   r     s8   


zProphetNetDecoderLayer.forwardr   )NNNNNNNNNNTFN)rk   rl   rm   rn   r   r   r   r   r   r   r   r   r   r"   r"   r   r#   r6    s0    r6  z=
    The standalone encoder part of the ProphetNetModel.
    c                       s   e Zd Zddedeej f fddZdd Zdd	 Z	e
							dd
eej deej deej deej dee dee dee deeef fddZ  ZS )ProphetNetEncoderNr}   word_embeddingsc                    sx   t    |dur|n
tj j j jd| _t | _	t
 j| _t fddt jD | _d| _|   dS )7  
        word_embeddings (`torch.nn.Embeddings` of shape `(config.vocab_size, config.hidden_size)`, *optional*):
            The word embedding parameters. This can be used to initialize [`ProphetNetEncoder`] with pre-defined word
            embeddings instead of randomly initialized word embeddings.
        Nr   c                    s   g | ]}t  qS r"   )r+  )r   _r}   r"   r#   r  
  s    z.ProphetNetEncoder.__init__.<locals>.<listcomp>F)r   r   r   r   
vocab_sizer   r   rD  r   position_embeddingsr   embeddings_layer_norm
ModuleListr,   num_encoder_layersr   gradient_checkpointing	post_initrh   r}   rD  r   rH  r#   r     s   
 zProphetNetEncoder.__init__c                 C      | j S r   rD  rg   r"   r"   r#   get_input_embeddings     z&ProphetNetEncoder.get_input_embeddingsc                 C   
   || _ d S r   rR  rh   valuer"   r"   r#   set_input_embeddings  r   z&ProphetNetEncoder.set_input_embeddingsr   r   	head_maskinputs_embedsr   output_hidden_statesreturn_dictr   c                 C   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r*|du r*td|dur6|dur6td|durC|du rC| |}|durkd|ddddddf d| j jdd t	| j
j }||j
}nd}| |jdd |j\}	}
||	 }| |}tjj|| j j| jd}|rdnd}|rdnd}|dur| d	 t| jksJ d
t| j d| d	  dt| jD ](\}}|r||f }||||dur|| nd|d}|d	 }|r||d f }q|r||f }|stdd |||fD S t|||dS )a	  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetEncoder
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetEncoder.from_pretrained("patrickvonplaten/prophetnet-large-uncased-standalone")
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        ```Nz3Either input_ids or inputs_embeds has to be passed.z2Make sure to only pass input_ids or inputs_embeds.      ?r   r&   r   r"   r   z&The head_mask should be specified for  layers, but it is for .)r   r   r   c                 s       | ]	}|d ur|V  qd S r   r"   r   vr"   r"   r#   	<genexpr>h      z,ProphetNetEncoder.forward.<locals>.<genexpr>)rt   rw   ry   )r}   r   r[  use_return_dictr   rD  rK   r,  r   r(   r   r)   r  rJ  r   r$   rK  r   r   r   r   rL   lenr   	enumeraterq   r   )rh   r   r   rY  rZ  r   r[  r\  extended_attention_maskrJ  rM   rw   r_   all_attentionsidxencoder_layerlayer_outputsr"   r"   r#   r     s\   
*


zProphetNetEncoder.forwardr   )NNNNNNN)rk   rl   rm   r   r   r   r   r   rS  rX  r   r   r   r   r   rq   r   r   r   r"   r"   r   r#   rC    s:    
	rC  z=
    The standalone decoder part of the ProphetNetModel.
    c                        s  e Zd Zd dedeej f fddZdd Zdd	 Z	e
													d!d
eej deej deej deej deej deej dee deej dee dee dee dee deej deeef fddZdd Zdd Zdd Z  ZS )"ProphetNetDecoderNr}   rD  c                    s   t     j| _ j| _ j| _ j| _ j| _|dur |n
tj	 j
 j jd| _t | _t	| j jd| _t fddt jD | _t j| _d| _|   dS )rE  NrF  c                    s   g | ]}t  |d qS )r7  )r6  )r   irH  r"   r#   r    s    z.ProphetNetDecoder.__init__.<locals>.<listcomp>F)r   r   r1   r?   r   r   r   max_target_positionsr   r   rI  r   r   rD  r   rJ  ngram_embeddingsrL  r,   num_decoder_layersr   r   rK  rN  rO  rP  r   rH  r#   r   t  s$   
zProphetNetDecoder.__init__c                 C   rQ  r   rR  rg   r"   r"   r#   rS    rT  z&ProphetNetDecoder.get_input_embeddingsc                 C   rU  r   rR  rV  r"   r"   r#   rX    r   z&ProphetNetDecoder.set_input_embeddingsr   r   r_   encoder_attention_maskrY  cross_attn_head_maskrX   rZ  r<  r   r[  r\  r   r   c           %         s  |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}|dur$|n| j j}|du r4|du r4td|dur@|dur@td|durM|du rM| |}|jdd \ }| jre| j	re|	ret
d d}	|	r|du r|dur|tt| j dt| j dnt| j d}|	rt|trt
d t|}|dur| nd	}| j |f|j|d
\}}|d	krd\}}n| |\}}| j|d || }| jj|d	kr|ddksJ d fddt| jD }d}d}nfddt| jD }| ||}| ||}|dur.d|ddddddf d| j jdd t | j!j" }|#|j!}nd}t$|g| d}| j%rB| %|}t&j'j(|| j(| j	d}|rRdnd}|r`| j jd	kr`dnd}|
rgdnd}|
rndnd}|
rz| j j)rzdnd}t*||gddgD ]+\}}|dur| d	 t+| j,ksJ d| dt+| j, d| d	  dqt-| j,D ]n\} }!|r||ddd|f f7 }| j jd	kr||dd|df f7 }|!|||||dur||  nd|dur||  nd||||||	|
|d}"|"d	 }|
r#||"d f7 }||"d f7 }| j j)r#||"d f7 }q|rI||ddd|f f7 }| j jd	krI||dd|df f7 }|ddd|f }#| j jd	krd|dd|df nd}$|sztdd |#|$||||||fD S t.|#|$||||||dS )aY  
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetDecoder
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetDecoder.from_pretrained("microsoft/prophetnet-large-uncased", add_cross_attention=False)
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        ```NzGEither `decoder_input_ids` or `decoder_inputs_embeds` has to be passed.zFMake sure to only pass `decoder_input_ids` or `decoder_inputs_embeds`.r&   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FrH  zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   )r$   rX   )NNr   zOAt the moment `use_cache` is only supported for `decoder_input_ids` of length 1c                    s&   g | ]}|d      d d qS r   )rK   r   r1   r   rp  predicting_stream_pos_embedr"   r#   r    s    z-ProphetNetDecoder.forward.<locals>.<listcomp>c                    s   g | ]
} |d    qS rt  r"   ru  )rp  rw  r"   r#   r    s    r]  r   r"   rY  rs  zThe `z` should be specified for r^  r_  )r=  r   r>  r  rP   rQ   rM   rX   r<  r   r   r   c                 s   r`  r   r"   ra  r"   r"   r#   rc  U  s    
z,ProphetNetDecoder.forward.<locals>.<genexpr>)rt   ru   rX   rw   rx   ry   rz   r]   )/r}   r<  r   r[  re  r   rD  r   rN  r   loggerwarning_oncer   r   r   rq   from_legacy_cacher   rJ  r$   !compute_buffered_relative_bucketsr   rp  r   rL   r,   r1   prepare_attention_maskprepare_predict_attention_maskrK   r   r   r(   r   r)   r  r/   rK  r   r   r   r8  ziprf  r   rg  rv   )%rh   r   r   r_   rr  rY  rs  rX   rZ  r<  r   r[  r\  r   r0   past_key_values_lengthmain_stream_pos_embedrM   rP   rQ   rw   ngram_hidden_statesrh  r  extended_encoder_attention_maskall_main_stream_hidden_statesall_ngram_stream_hidden_statesall_main_stream_attnsall_ngram_stream_attnsall_cross_attns	attn_mask	mask_namerj  decoder_layerrl  rt   ru   r"   rv  r#   r     s  %





*



&zProphetNetDecoder.forwardc              	   C   s   |j \}}td| j|jdd}t| j| j	|\}}|d d d |d |f |dd}t
|d d d |d |f |d d d || j| j| f gd|dd}||fS r   )r   r   r'  ro  r  r$   rK   rR   r?   r   r/   )rh   rM   r   r0   main_relative_bucketspredict_relative_bucketsr"   r"   r#   r{  n  s"   

$

z3ProphetNetDecoder.compute_buffered_relative_bucketsc                 C   s   |j d d \}}tj||ft|jj|j|jd}t|d}|d |d |f d d d d d d f || j	j
f|j  }|d ur]d|d d d d d d f  t| jj }|| }n|}||jS )Nr&   r   r   r]  )r   r   fullr(   r   r)   r$   triuexpandr}   r   r  )rh   rw   r   r   
seq_lengthcausal_maskextended_causal_maskrh  r"   r"   r#   r|    s    (*
z(ProphetNetDecoder.prepare_attention_maskc           	      C   s   |j d d \}}t| j| j|j|j}tj|d d d |d |f |d d d || j| j| f gdd}|d d d d d d d d f || j	j
f|j  }|d urd|d d d d d d d f  t| jj }||| j	j
| j||f}tj|t|gdd}|| }n|}||jS )Nr&   rI   r   r]  )r   r5   ro  r1   r$   r   r   r/   r  r}   r   r(   r)   r7   r  )	rh   rw   r   r   r  predict_causal_maskextended_predict_causal_maskrh  r  r"   r"   r#   r}    s4   	,
z0ProphetNetDecoder.prepare_predict_attention_maskr   NNNNNNNNNNNNN)rk   rl   rm   r   r   r   r   r   rS  rX  r   r   r   r
   r   r   rq   rv   r   r{  r|  r}  r   r"   r"   r   r#   rm  n  sf    	

 Urm  c                &       s  e Zd ZddgZdef fddZdd Zdd	 Zd
d Zdd Z	e
																d"deej deej deej deej deej deej deej dee dee deej deej dee dee dee dee deej deeef f"d d!Z  ZS )#ProphetNetModelencoder.word_embeddings.weightdecoder.word_embeddings.weightr}   c                    sx   t  | tj|j|j|jd| _t	|}d|_
d|_t|| j| _t	|}d|_d|_t|| j| _|   d S )NrF  FT)r   r   r   r   rI  r   r   rD  copydeepcopyr<  tie_encoder_decoderrC  encoder
is_decoderrm  decoderrO  )rh   r}   encoder_configdecoder_configr   r"   r#   r     s   

zProphetNetModel.__init__c                 C   rQ  r   rR  rg   r"   r"   r#   rS    rT  z$ProphetNetModel.get_input_embeddingsc                 C   s   || _ | j | j_ | j | j_ d S r   )rD  r  r  rV  r"   r"   r#   rX    s   
z$ProphetNetModel.set_input_embeddingsc                 C   s4   | j jr| | jj| j | | jj| j d S d S r   )r}   tie_word_embeddings_tie_or_clone_weightsr  rD  r  rg   r"   r"   r#   _tie_weights  s   zProphetNetModel._tie_weightsc                 C   rQ  r   )r  rg   r"   r"   r#   get_encoder  rT  zProphetNetModel.get_encoderNr   r   decoder_input_idsdecoder_attention_maskrY  decoder_head_maskrs  encoder_outputsrX   rZ  decoder_inputs_embedsr<  r   r[  r\  r   r   c                 C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur$|n| j j}|du r8| j||||
|||d}| j|||d ||||	||||||d}|sR|| S t|j|j	|j
|j|j|j|j|j|j|j|jdS )a7  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            ProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetModel

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetModel.from_pretrained("microsoft/prophetnet-large-uncased")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

        >>> last_hidden_states = outputs.last_hidden_state  # main stream hidden states
        >>> last_hidden_states_ngram = outputs.last_hidden_state_ngram  # predict hidden states
        ```N)r   r   rY  rZ  r   r[  r\  r   )r   r   r_   rr  rY  rs  rX   rZ  r   r[  r<  r\  r   )rt   ru   rX   rY   rZ   r[   r\   r]   r^   r_   r`   )r}   r<  r   r[  re  r  r  rs   rt   ru   rX   rw   rx   ry   rz   r]   )rh   r   r   r  r  rY  r  rs  r  rX   rZ  r  r<  r   r[  r\  r   decoder_outputsr"   r"   r#   r     sZ   :zProphetNetModel.forward)NNNNNNNNNNNNNNNN)rk   rl   rm   _tied_weights_keysr   r   rS  rX  r  r  r   r   r   r   
BoolTensorrq   r
   r   r   rs   r   r   r"   r"   r   r#   r    sv    	

r  zh
    The ProphetNet Model with a language modeling head. Can be used for sequence generation tasks.
    c                (       sB  e Zd Zg dZdef fddZdd Zdd Ze																																		d'd
e	e
j de	e
j de	e
j de	e
j de	e
j de	e
j de	e
j de	e
j de	e de	e
j de	e
j de	e
j de	e de	e de	e de	e de	e
j deeef f$ddZd(dd Zde
jfd!d"Zd#d$ Zd%d& Z  ZS ))"ProphetNetForConditionalGeneration)r  r  lm_head.weightr}   c                    sH   t  | t|| _|j| _|j| _tj|j	|j
dd| _|   d S )NFr   )r   r   r  r~   r   r   disable_ngram_lossr   r   r   rI  lm_headrO  r   r   r"   r#   r   [  s   
z+ProphetNetForConditionalGeneration.__init__c                 C   s"   | j jr| | jj| j d S d S r   )r}   r  r  r~   rD  r  rg   r"   r"   r#   r  f  s   z/ProphetNetForConditionalGeneration._tie_weightsc                 C      | j jS r   )r~   rD  rg   r"   r"   r#   rS  j     z7ProphetNetForConditionalGeneration.get_input_embeddingsNr   r   r  r  rY  r  rs  r  rX   rZ  r  labelsr<  r   r[  r\  r   r   c                 C   s  |dur|n| j j}|dur|du r|du r| |}| jdi d|d|d|d|d|d|d|d	|d
|	d|
d|d|d|d|d|d|}|durX|jn|jdd \}}|d || j j|d}| |}|dddf }| j jdkr|ddddf nd}| s|	 }d}|dur| 
||}|stdd ||fD }|dur|f| |dd  S ||dd  S t||||j|j|j|j|j|j|j|j|jdS )a	  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            ProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

        >>> logits_next_token = outputs.logits  # logits to predict next token as usual
        >>> logits_ngram_next_tokens = outputs.logits_ngram  # logits to predict 2nd, 3rd, ... next tokens
        ```Nr   r   r  r  rY  r  rs  r  rX   rZ  r  r<  r   r[  r\  r   r&   r   rI   r   c                 s   r`  r   r"   ra  r"   r"   r#   rc    rd  z=ProphetNetForConditionalGeneration.forward.<locals>.<genexpr>)rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   r"   )r}   re  r   r~   r   r   r1   r  is_contiguousr   _compute_lossrq   rT   rX   rY   rZ   r[   r\   r]   r^   r_   r`   )rh   r   r   r  r  rY  r  rs  r  rX   rZ  r  r  r<  r   r[  r\  r   r4  r   r0   predicting_streamspredict_logitsrV   rW   rU   
all_logitsr"   r"   r#   r   m  s   ?

	

$.z*ProphetNetForConditionalGeneration.forwardr   c                 C     | | jj|d|d|}t| jjD ]}|dkr#| jr# n|||d d d d f< q|dd }t	j
j|d|ddtjd}t	j
j||ddd}| jjdkr|jddd	 }||d}	||	 }| }| jj|d }
d
| jj | |
|  }|S Nr   r   rI   r   r   )	reductionr   T)r   keepdimr]  r   r}   r1   rL   fill_r,   r  r   r   r   r   log_softmaxr   r   r   nll_lossepssumner   rh   rV   r  ignore_indexexpend_targetsrn  lprobsrU   smooth_lossnon_masked_tokenseps_ir"   r"   r#   r    (   $z0ProphetNetForConditionalGeneration._compute_lossc                 C   s
   |  |S r   )r   )rh   r  r"   r"   r#   %prepare_decoder_input_ids_from_labels  r   zHProphetNetForConditionalGeneration.prepare_decoder_input_ids_from_labelsc                 C   r  r   )r~   r  rg   r"   r"   r#   r    r  z.ProphetNetForConditionalGeneration.get_encoderc                 C   r  r   r~   r  rg   r"   r"   r#   get_decoder  r  z.ProphetNetForConditionalGeneration.get_decoder)NNNNNNNNNNNNNNNNNr   )rk   rl   rm   r  r   r   r  rS  r   r   r   r   r  r
   r   r   rq   rT   r   r  r  r  r  r   r"   r"   r   r#   r  S  s    	


{r  zt
    The standalone decoder part of the ProphetNetModel with a lm head on top. The model can be used for causal
    c                        s  e Zd Zg dZdef fddZdd Zdd Zd	d
 Zdd Z	dd Z
e													d%deej deej deej deej deej deej dee deej deej dee dee dee dee deeef fddZd&d!d"Z				d'd#d$Z  ZS )(ProphetNetForCausalLM)z!prophetnet.word_embeddings.weightz)prophetnet.decoder.word_embeddings.weightr  r}   c                    s^   t |}d|_d|_t | t|| _|j| _	|j
| _
tj|j|jdd| _|   d S )NTFr  )r  r  r  is_encoder_decoderr   r   ProphetNetDecoderWrapperr~   r   r   r  r   r   r   rI  r  rO  r   r   r"   r#   r     s   

zProphetNetForCausalLM.__init__c                 C   s
   | j jjS r   r~   r  rD  rg   r"   r"   r#   rS  +  r   z*ProphetNetForCausalLM.get_input_embeddingsc                 C   s   || j j_d S r   r  rV  r"   r"   r#   rX  .  s   z*ProphetNetForCausalLM.set_input_embeddingsc                 C   s$   | j jr| | jjj| j d S d S r   )r}   r  r  r~   r  rD  r  rg   r"   r"   r#   r  1  s   z"ProphetNetForCausalLM._tie_weightsc                 C   s   || j _d S r   r  )rh   r  r"   r"   r#   set_decoder5  r   z!ProphetNetForCausalLM.set_decoderc                 C   r  r   r  rg   r"   r"   r#   r  8  r  z!ProphetNetForCausalLM.get_decoderNr   r   r_   rr  rY  rs  rX   rZ  r  r<  r   r[  r\  r   c                 C   s4  |dur|n| j j}| jj|||||||||
|||d}|dur#|jn|jdd \}}|d || j j|d}| |}|dddf }| j jdkrU|ddddf nd}d}|	durc| ||	}|st	dd ||fD }|dur|f| |dd  S ||dd  S t
||||j|j|j|j|j|jd		S )
a	  
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetForCausalLM
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetForCausalLM.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits

        >>> # Model can also be used with EncoderDecoder framework
        >>> from transformers import BertTokenizer, EncoderDecoderModel, AutoTokenizer
        >>> import torch

        >>> tokenizer_enc = BertTokenizer.from_pretrained("google-bert/bert-large-uncased")
        >>> tokenizer_dec = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        ...     "google-bert/bert-large-uncased", "microsoft/prophetnet-large-uncased"
        ... )

        >>> ARTICLE = (
        ...     "the us state department said wednesday it had received no "
        ...     "formal word from bolivia that it was expelling the us ambassador there "
        ...     "but said the charges made against him are `` baseless ."
        ... )
        >>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids
        >>> labels = tokenizer_dec(
        ...     "us rejects charges against its ambassador in bolivia", return_tensors="pt"
        ... ).input_ids
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=labels[:, :-1], labels=labels[:, 1:])

        >>> loss = outputs.loss
        ```N)r   r   r_   rr  rY  rs  rX   rZ  r<  r   r[  r\  r&   r   rI   r   c                 s   r`  r   r"   ra  r"   r"   r#   rc    rd  z0ProphetNetForCausalLM.forward.<locals>.<genexpr>)	rU   rV   rW   rX   rw   rx   ry   rz   r]   )r}   re  r~   r  r   r   r1   r  r  rq   r{   rX   rw   rx   ry   rz   r]   )rh   r   r   r_   rr  rY  rs  rX   rZ  r  r<  r   r[  r\  r4  r   r0   r  r  rV   rW   rU   r  r"   r"   r#   r   ;  sJ   A 
$.zProphetNetForCausalLM.forwardr   c                 C   r  r  r  r  r"   r"   r#   r    r  z#ProphetNetForCausalLM._compute_lossc           
      K   s~   |d u r
| |j}|d ur| dkr|d d dd f }|||||d}|dd  | D ]\}}	||vr<|	||< q0|S )Nr   rI   )r   r   rY  rX   r<  r   )new_onesr   r   popitems)
rh   r   rX   r   rY  r<  kwargsmodel_inputsr   rW  r"   r"   r#   prepare_inputs_for_generation  s    	z3ProphetNetForCausalLM.prepare_inputs_for_generationr  r  )NNNN)rk   rl   rm   r  r   r   rS  rX  r  r  r  r   r   r   r   r
   r   r   rq   r{   r   r  r  r   r"   r"   r   r#   r    sr    	


nr  c                       s6   e Zd ZdZdef fddZdd Zdd Z  ZS )	r  z
    This is a wrapper class, so that [`ProphetNetForCausalLM`] can correctly be loaded from pretrained prophetnet
    classes.
    r}   c                    s@   t  | tj|j|j|jd| _t|| jd| _	| 
  d S )NrF  rR  )r   r   r   r   rI  r   r   rD  rm  r  rO  r   r   r"   r#   r     s   z!ProphetNetDecoderWrapper.__init__c                 C   s   |  | j| j  d S r   )r  rD  r  rS  rg   r"   r"   r#   r    s   z%ProphetNetDecoderWrapper._tie_weightsc                 O   s   | j |i |S r   )r  )rh   argsr  r"   r"   r#   r     s   z ProphetNetDecoderWrapper.forward)	rk   rl   rm   rn   r   r   r  r   r   r"   r"   r   r#   r    s
    	r  )rm  rC  r  r  r  r|   r5  )?rn   r  r<   rd   dataclassesr   typingr   r   r   r   r   torch.nnr   activationsr	   cache_utilsr
   r   r   
generationr   modeling_layersr   modeling_outputsr   modeling_utilsr   utilsr   r   r   utils.deprecationr   configuration_prophetnetr   
get_loggerrk   rx  r   r5   rH   rR   rT   rs   rv   r{   r|   r   r   Moduler   r   r   r+  r6  rC  rm  r  r  r  r  __all__r"   r"   r"   r#   <module>   s   


64&.&+   E+Lu  O  8 X