o
    eiZ                    @   sh  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZm	Z	 ddl
mZ dd	lmZmZmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZ ddlmZ ddlmZ ee Z!eeddG dd deZ"eeG dd deZ#eddeG dd deZ$eG dd de$Z%eddG dd de$Z&eddG d d! d!e$eZ'g d"Z(dS )#zRAG model implementation.    )Callable)	dataclassN)nn   )CacheEncoderDecoderCache)PreTrainedConfig)GenerationConfigGenerationMixinGenerationModeLogitsProcessorListStoppingCriteriaList)GENERATION_MODES_MAPPING)ModelOutput)PreTrainedModel)auto_docstringlogging   )	RagConfig)RagRetrieverzI
    Base class for retriever augmented marginalized models outputs.
    )custom_introc                   @   sz  e Zd ZU dZdZejdB ed< dZejdB ed< dZ	ejdB ed< dZ
edB ed< dZejdB ed< dZejdB ed< dZejdB ed	< dZejdB ed
< dZejdB ed< dZeejdf dB ed< dZeejdf dB ed< dZejdB ed< dZeejdf dB ed< dZeejdf dB ed< dZeejdf dB ed< dZeejdf dB ed< dZeejdf dB ed< dS )RetrievAugLMMarginOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head. The score is possibly marginalized over all documents for
        each vocabulary token.
    doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
        Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
        `question_encoder_last_hidden_state`.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains precomputed hidden-states (key and values in the attention blocks) of the decoder that can be used
        (see `past_key_values` input) to speed up sequential decoding.
    retrieved_doc_embeds (`torch.FloatTensor` of shape `(batch_size, config.n_docs, hidden_size)`, *optional*, returned when *output_retrieved=True*):
        Embedded documents retrieved by the retriever. Is used with `question_encoder_last_hidden_state` to compute
        the `doc_scores`.
    retrieved_doc_ids (`torch.LongTensor` of shape `(batch_size, config.n_docs)`, *optional*, returned when *output_retrieved=True*):
        The indexes of the embedded documents retrieved by the retriever.
    context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Input ids post-processed from the retrieved documents and the question encoder input_ids by the retriever.
    context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
        retriever.
    question_encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden states at the output of the last layer of the question encoder pooled output of the
        model.
    question_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden states of the question encoder at the output of each layer plus the initial embedding outputs.
    question_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights of the question encoder, after the attention softmax, used to compute the weighted
        average in the self-attention heads.
    generator_enc_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the generator encoder of the model.
    generator_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden states of the generator encoder at the output of each layer plus the initial embedding outputs.
    generator_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights of the generator encoder, after the attention softmax, used to compute the weighted
        average in the self-attention heads.
    generator_dec_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden states of the generator decoder at the output of each layer plus the initial embedding outputs.
    generator_dec_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights of the generator decoder, after the attention softmax, used to compute the weighted
        average in the self-attention heads.
    generator_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Cross-attentions weights of the generator decoder, after the attention softmax, used to compute the
        weighted average in the cross-attention heads.
    Nlosslogits
doc_scorespast_key_valuesretrieved_doc_embedsretrieved_doc_idscontext_input_idscontext_attention_mask"question_encoder_last_hidden_state.question_enc_hidden_statesquestion_enc_attentionsgenerator_enc_last_hidden_stategenerator_enc_hidden_statesgenerator_enc_attentionsgenerator_dec_hidden_statesgenerator_dec_attentionsgenerator_cross_attentions)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   r   r   r   r   
LongTensorr   r   r    r!   tupler"   r#   r$   r%   r&   r'   r(    r2   r2   b/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/rag/modeling_rag.pyr   $   s&   
 Fr   c                   @   sh  e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
dB ed< dZejdB ed< dZejdB ed< dZejdB ed< dZejdB ed	< dZejdB ed
< dZeejdf dB ed< dZeejdf dB ed< dZejdB ed< dZeejdf dB ed< dZeejdf dB ed< dZeejdf dB ed< dZeejdf dB ed< dZeejdf dB ed< dS )RetrievAugLMOutputa"  
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head. The score is possibly marginalized over all documents for
        each vocabulary token.
    doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
        Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
        `question_encoder_last_hidden_state`.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains precomputed hidden-states (key and values in the attention blocks) of the decoder that can be used
        (see `past_key_values` input) to speed up sequential decoding.
    retrieved_doc_embeds (`torch.FloatTensor` of shape `(batch_size, config.n_docs, hidden_size)`, *optional*, returned when *output_retrieved=True*):
        Embedded documents retrieved by the retriever. Is used with `question_encoder_last_hidden_state` to compute
        the `doc_scores`.
    retrieved_doc_ids (`torch.LongTensor` of shape `(batch_size, config.n_docs)`, *optional*, returned when *output_retrieved=True*):
        The indexes of the embedded documents retrieved by the retriever.
    context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Input ids post-processed from the retrieved documents and the question encoder input_ids by the retriever.
    context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
        Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
        retriever.
    question_encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden states at the output of the last layer of the question encoder pooled output of the
        model.
    question_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden states of the question encoder at the output of each layer plus the initial embedding outputs.
    question_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights of the question encoder, after the attention softmax, used to compute the weighted
        average in the self-attention heads.
    generator_enc_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the generator encoder of the model.
    generator_enc_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden states of the generator encoder at the output of each layer plus the initial embedding outputs.
    generator_enc_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights of the generator encoder, after the attention softmax, used to compute the weighted
        average in the self-attention heads.
    generator_dec_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden states of the generator decoder at the output of each layer plus the initial embedding outputs.
    generator_dec_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights of the generator decoder, after the attention softmax, used to compute the weighted
        average in the self-attention heads.
    generator_cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Cross-attentions weights of the generator decoder, after the attention softmax, used to compute the
        weighted average in the cross-attention heads.
    Nr   r   r   r   r   r   r   r    .r!   r"   r#   r$   r%   r&   r'   r(   )r)   r*   r+   r,   r   r-   r.   r/   r   r   r   r   r   r0   r   r   r    r!   r1   r"   r#   r$   r%   r&   r'   r(   r2   r2   r2   r3   r4      s$   
 Dr4   a  
    RAG models were released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP
    Tasks](https://huggingface.co/papers/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandra Piktus et al.

    RAG is a retriever augmented model and encapsulate three components: a question encoder, a dataset retriever and a
    generator, the encoder and generator are trainable while the retriever is just an indexed dataset.
    c                
   @   sT   e Zd ZU eed< dZdZdZe			dde	dB de	dB de
dB defd	d
ZdS )RagPreTrainedModelconfigragTN.question_encoder_pretrained_model_name_or_path'generator_pretrained_model_name_or_path	retrieverreturnc                 K   s~  dd |  D }dd |  D }|D ]}|d| = q|D ]}|d| = q|dd}|du rd|dus8J dd	d
lm}	 d|vr[d	dlm}
 |
j|fi |ddi\}}||d< |	j|fi |}|dd}|du r|dusvJ dd	dlm} d|vrd	dlm}
 |
j|fi |ddi\}}||d< |j|fi |}|d}|du rt	j
|j|jfi |}| ||||dS )a  
        Instantiates an question encoder and a generator from one or two base classes of the library from pretrained
        model checkpoints.

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
        the model, you need to first set it back in training mode with `model.train()`.

        Params:
            question_encoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
                Information necessary to initiate the question encoder. Can be either:

                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                    - A path to a *directory* containing model weights saved using
                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.

            generator_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
                Information necessary to initiate the generator. Can be either:

                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                    - A path to a *directory* containing model weights saved using
                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.

            model_args (remaining positional arguments, *optional*):
                All remaining positional arguments will be passed to the underlying model's `__init__` method.
            retriever ([`RagRetriever`], *optional*):
                The retriever to use.
            kwwargs (remaining dictionary of keyword arguments, *optional*):
                Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
                `output_attentions=True`).

                - To update the question_encoder configuration, use the prefix *question_encoder_* for each
                  configuration parameter.
                - To update the generator configuration, use the prefix *generator_* for each configuration parameter.
                - To update the parent model configuration, do not use a prefix for each configuration parameter.

                Behaves differently depending on whether a `config` is provided or automatically loaded.

        Example:

        ```python
        >>> from transformers import RagModel

        >>> # initialize a RAG from two pretrained models.
        >>> model = RagModel.from_pretrained_question_encoder_generator(
        ...     "facebook/dpr-question_encoder-single-nq-base", "google-t5/t5-small"
        ... )
        >>> # saving model after fine-tuning
        >>> model.save_pretrained("./rag")
        >>> # load fine-tuned model
        >>> model = RagModel.from_pretrained("./rag")
        ```c                 S   ,   i | ]\}}| d r|td d |qS )question_encoder_N
startswithlen.0argumentvaluer2   r2   r3   
<dictcomp>)      zQRagPreTrainedModel.from_pretrained_question_encoder_generator.<locals>.<dictcomp>c                 S   r<   )
generator_Nr>   rA   r2   r2   r3   rE   /  rF   r=   rG   modelNznIf `model` is not defined as an argument, a `question_encoder_pretrained_model_name_or_path` has to be defined   	AutoModelr6   )
AutoConfigreturn_unused_kwargsTzqIf `generator_model` is not defined as an argument, a `generator_pretrained_model_name_or_path` has to be definedAutoModelForSeq2SeqLM)question_encoder	generatorr6   r:   )itemspopauto.modeling_autorK   auto.configuration_autorL   from_pretrainedrO   getr   'from_question_encoder_generator_configsr6   )clsr8   r9   r:   kwargskwargs_question_encoderkwargs_generatorkeyrP   rK   rL   question_encoder_configrQ   rO   generator_configr6   r2   r2   r3   *from_pretrained_question_encoder_generator   sx   <


z=RagPreTrainedModel.from_pretrained_question_encoder_generator)NNN)r)   r*   r+   r   r/   base_model_prefix_supports_flash_attn_supports_sdpaclassmethodstrr   r   r`   r2   r2   r2   r3   r5      s$   
 r5   c                !       s
  e Zd Z				ddedB dedB dedB dedB f fddZe														ddej	dB d	ej
dB d
eeej  dB dej	dB dejdB dedB dejdB dej	dB dej	dB dedB dedB dedB dedB dedB deej
 eB fddZ  ZS )RagModelNr6   rP   rQ   r:   c                    s
  |dus|dur|dusJ d|du r!t j|j|jfi |}nt|| js2J d| d| j t | |du rHddlm} |	|j
}|du rXddlm} |	|j}|| _| jdurst|tspJ dt| j d	|| _|| _
|| _d| _d
| _|   dS )  
        question_encoder (`PreTrainedModel`, *optional*):
            The model responsible for encoding the question into hidden states for retrieval.
        generator (`PreTrainedModel`, *optional*):
            The model responsible for generating text based on retrieved documents.
        retriever (`RagRetriever`, *optional*):
            The component responsible for retrieving documents from a knowledge base given the encoded question.
        NzQEither a configuration or an question_encoder and a generator has to be provided.zconfig: z has to be of type rI   rJ   rN   z`self.retriever` is of type z&, but should be of type `RagRetriever`F)r   rX   r6   
isinstanceconfig_classsuper__init__rT   rK   from_configrP   rO   rQ   r:   r   typectx_encodercontext_encoder_training	post_init)selfr6   rP   rQ   r:   rZ   rK   rO   	__class__r2   r3   rk   u  s8   "
zRagModel.__init__	input_idsattention_maskencoder_outputsdecoder_input_idsdecoder_attention_maskr   r   r   r   	use_cacheoutput_attentionsoutput_hidden_statesoutput_retrievedn_docsr;   c                 K   sx  |dur|n| j j}|
dur|
n| j j}
|dur|n| j j}|dur$|n| j j}|dur.|n| j j}| jduoF|du pB|	du pB|du oF|du }|du r|r| j||dd}|d }| j|| j	dt
jd t| jj dd|dd	}| jr|d
 |d |d |d |d |d f\}}	}}}}|	|}|		|}	|	|}|	|}| j||ddj}|d||jd }t
|d|ddd}nM|d
 |d |d |d f\}}	}}|	|}|	|}|		|}	t
|d|ddd}n|dusJ d|	dusJ d|dusJ d|dusJ d|jd | dks7J d| d|jd  d|durC|j|dd}|durO|j|dd}| j||	|||||
|dd	}|skd}d}d}d}d}n|j}|j}|rw|sd}d}	d}d}td*i d|jd|d|jd
|d|	d|d |d!|d"|d#|d$|jd%|jd&|j d'|j!d(|j"d)|j#S )+ay  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. [`RagConfig`], used to initialize the model, specifies
            which generator to use, it also specifies a compatible generator tokenizer. Use that tokenizer class to
            obtain the indices.

            [What are input IDs?](../glossary#input-ids)
        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*)
            Tuple consists of (`generator_enc_last_hidden_state`, *optional*: `generator_enc_hidden_states`,
            *optional*: `generator_enc_attentions`). `generator_enc_last_hidden_state` of shape `(batch_size, n_docs *
            sequence_length, hidden_size)` is a sequence of hidden-states at the output of the last layer of the
            generator's encoder.

            Used by the ([`RagModel`]) model during decoding.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Provide for generation tasks. `None` by default, construct as per instructions for the generator model
            you're using with your RAG instance.
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size,  target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
            Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
            `question_encoder_last_hidden_state`. If the model has is not initialized with a `retriever` `doc_scores`
            has to be provided to the forward pass. `doc_scores` can be computed via
            `question_encoder_last_hidden_state` and `retrieved_doc_embeds`, see examples for more information.
        context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
            Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
            retriever. If the model was not initialized with a `retriever` ``context_input_ids` has to be provided to
            the forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
        context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`,*optional*, returned when *output_retrieved=True*):
            Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
            retriever. If the model has is not initialized with a `retriever` `context_attention_mask` has to be
            provided to the forward pass. `context_attention_mask` are returned by [`~RagRetriever.__call__`].
        output_retrieved (`bool`, *optional*):
            Whether or not to return the `retrieved_doc_embeds`, `retrieved_doc_ids`, `context_input_ids` and
            `context_attention_mask`. See returned tensors for more detail.
        n_docs (`int`, *optional*):
            The number of documents to retrieve.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, RagRetriever, RagModel
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-base")
        >>> retriever = RagRetriever.from_pretrained(
        ...     "facebook/rag-token-base", index_name="exact", use_dummy_dataset=True
        ... )
        >>> # initialize with RagRetriever to do everything in one forward call
        >>> model = RagModel.from_pretrained("facebook/rag-token-base", retriever=retriever)

        >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
        >>> outputs = model(input_ids=inputs["input_ids"])
        ```NT)ru   return_dictr   cpudevicedtypeprefixptr   r}   return_tensorsr   r   r   tokenized_doc_idstokenized_doc_attention_maskdoc_idsr   rI   zMake sure that `context_input_ids` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.zMake sure that `context_attention_mask` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.zMake sure that `doc_scores` are passed, if no `retriever` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.z^Make sure that `doc_scores` are passed when passing `encoder_outputs` to the forward function.M The first dimension of `context_input_ids` should be a multiple of `n_docs`=	, but is .dim)	rt   ru   rv   rw   rx   r   ry   rz   r~   Nr   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r2   )$r6   r}   ry   rz   r{   r|   r:   rP   detachtor-   float32numpygetattrrQ   ro   rn   pooler_outputviewshapebmm	unsqueeze	transposesqueezerepeat_interleavehidden_states
attentionsr4   r   r   encoder_last_hidden_stateencoder_hidden_statesencoder_attentionsdecoder_hidden_statesdecoder_attentionscross_attentions)rq   rt   ru   rv   rw   rx   r   r   r   r   ry   rz   r{   r|   r}   rZ   has_to_retrievequestion_enc_outputsr    retriever_outputsr   retrieved_doc_input_idsretrieved_doc_attention_maskr   gen_outputsr!   r"   r2   r2   r3   forward  s&  J

	









	
zRagModel.forwardNNNN)NNNNNNNNNNNNNN)r)   r*   r+   r   r   r   rk   r   r-   r0   Tensorr1   r.   
BoolTensorr   boolintr4   r   __classcell__r2   r2   rr   r3   rf   s  sx    4	
rf   zu
    A RAG-sequence model implementation. It performs RAG-sequence specific marginalization in the forward pass.
    c                &       s  e Zd Z				d2dedB dedB dedB dedB f fddZdefdd	Zd
efddZe																		d3de
jdB de
jdB deee
j  dB de
jdB de
jdB dedB de
jdB de
jdB de
jdB dedB dedB dedB dedB dedB dedB de
jdB dedB def$dd Zed!d" Zed#d$ Zed%d& Ze
 									d4de
jdB de
jdB de
jdB de
jdB de
jdB d'edB d(edB d)edB dedB de
jfd*d+Z	d5d.d/Zed0d1 Z  ZS )6RagSequenceForGenerationNr6   rP   rQ   r:   c                    j   |dus|dur|dusJ d|du r t j|j|jfi |}t | t||||d| _|   dS rg   NzHEither a configuration or an encoder and a generator has to be provided.)r6   rP   rQ   r:   r   rX   r6   rj   rk   rf   r7   rp   rq   r6   rP   rQ   r:   rZ   rr   r2   r3   rk     s   z!RagSequenceForGeneration.__init__c                 C      || j _d S r   r7   r:   rq   r:   r2   r2   r3   set_retriever     z&RagSequenceForGeneration.set_retrieverrn   c                 C      d| j _|| j _d S NTr7   ro   rn   rq   rn   r2   r2   r3    set_context_encoder_for_training     z9RagSequenceForGeneration.set_context_encoder_for_trainingrt   ru   rv   rw   rx   r   r   r   r   ry   rz   r{   r|   exclude_bos_scorereduce_losslabelsr}   r;   c                 K   s6  |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur*|du r(|}d}
| j||||||||	||
||||d}d}|durS| j|j|j||| j j||d}t	di d|d|jd|jd|j
d	|jd
|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jS )a3  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. [`RagConfig`], used to initialize the model, specifies
            which generator to use, it also specifies a compatible generator tokenizer. Use that tokenizer class to
            obtain the indices.

            [What are input IDs?](../glossary#input-ids)
        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*)
            Tuple consists of (`generator_enc_last_hidden_state`, *optional*: `generator_enc_hidden_states`,
            *optional*: `generator_enc_attentions`). `generator_enc_last_hidden_state` of shape `(batch_size, n_docs *
            sequence_length, hidden_size)` is a sequence of hidden-states at the output of the last layer of the
            generator's encoder.

            Used by the ([`RagModel`]) model during decoding.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Provide for generation tasks. `None` by default, construct as per instructions for the generator model
            you're using with your RAG instance.
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size,  target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
            Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
            retriever. If the model was not initialized with a `retriever` ``context_input_ids` has to be provided to
            the forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
        context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`,*optional*, returned when *output_retrieved=True*):
            Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
            retriever. If the model has is not initialized with a `retriever` `context_attention_mask` has to be
            provided to the forward pass. `context_attention_mask` are returned by [`~RagRetriever.__call__`].
        doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
            Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
            `question_encoder_last_hidden_state`. If the model has is not initialized with a `retriever` `doc_scores`
            has to be provided to the forward pass. `doc_scores` can be computed via
            `question_encoder_last_hidden_state` and `retrieved_doc_embeds`, see examples for more information.
        output_retrieved (`bool`, *optional*):
            Whether or not to return the `retrieved_doc_embeds`, `retrieved_doc_ids`, `context_input_ids` and
            `context_attention_mask`. See returned tensors for more detail.
        exclude_bos_score (`bool`, *optional*):
            Only relevant if `labels` is passed. If `True`, the score of the BOS token is disregarded when computing
            the loss.
        reduce_loss (`bool`, *optional*):
            Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum`
            operation.
        n_docs (`int`, *optional*):
            The number of documents to retrieve.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, RagRetriever, RagSequenceForGeneration
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/rag-sequence-nq")
        >>> retriever = RagRetriever.from_pretrained(
        ...     "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True
        ... )
        >>> # initialize with RagRetriever to do everything in one forward call
        >>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

        >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
        >>> targets = tokenizer(text_target="In Paris, there are 10 million people.", return_tensors="pt")
        >>> input_ids = inputs["input_ids"]
        >>> labels = targets["input_ids"]
        >>> outputs = model(input_ids=input_ids, labels=labels)

        >>> # or use retriever separately
        >>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", use_dummy_dataset=True)
        >>> # 1. Encode
        >>> question_hidden_states = model.question_encoder(input_ids)[0]
        >>> # 2. Retrieve
        >>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
        >>> doc_scores = torch.bmm(
        ...     question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)
        ... ).squeeze(1)
        >>> # 3. Forward to generator
        >>> outputs = model(
        ...     context_input_ids=docs_dict["context_input_ids"],
        ...     context_attention_mask=docs_dict["context_attention_mask"],
        ...     doc_scores=doc_scores,
        ...     decoder_input_ids=labels,
        ... )
        ```NFrt   ru   rv   rw   rx   r   r   r   r   ry   rz   r{   r|   r}   )r   epsilonr   r}   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r2   )r6   r}   r   r   r7   get_nllr   r   label_smoothingr   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   )rq   rt   ru   rv   rw   rx   r   r   r   r   ry   rz   r{   r|   r   r   r   r}   rZ   outputsr   r2   r2   r3   r     s   g
	
z RagSequenceForGeneration.forwardc                 C      | j jS r   r   rq   r2   r2   r3   r:   _     z"RagSequenceForGeneration.retrieverc                 C   r   r   r7   rQ   r   r2   r2   r3   rQ   c  r   z"RagSequenceForGeneration.generatorc                 C   r   r   r7   rP   r   r2   r2   r3   rP   g  r   z)RagSequenceForGeneration.question_encoderdo_deduplicationnum_return_sequences	num_beamsc
                 K   sZ  |	dur|	n| j j}	|dur|n| j j}|dur|n| j j}|dur$|n| j j}|dus4|dus4J d| jdurg|du rg| j||dd }| j|| jdt	j
d t| jj dd|	dd	d
 }||}g }||
d< ||
d< d|
d< |dur~|jd n|jd |	 }t|D ]}|||	 |d |	  }| jj|fi |
}|rt	tdd |D  }|jd }|dur|||d  |d}| ||dd}nC|dusJ d|dusJ d||d}|||	 |d |	  }||d}|||d ddf }||d}| ||||dd}|d  |d }|||  q| j|| j jjdS )a  
        Implements RAG sequence "thorough" decoding. Read the [`~generation.GenerationMixin.generate`]` documentation
        for more information on how to set other generate input parameters.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                The sequence used as a prompt for the generation. If `input_ids` is not passed, then
                `context_input_ids` has to be provided.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
                Input IDs post-processed from the retrieved documents and the question encoder input_ids by the
                retriever.
            context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
                Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
                retriever.

                If the model is not initialized with a `retriever` or `input_ids` is not given, `context_input_ids` and
                `context_attention_mask` have to be provided to the forward pass. They are returned by
                [`~RagRetriever.__call__`].
            doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
                Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
                `question_encoder_last_hidden_state`.

                If the model is not initialized with a `retriever` or `input_ids` is not given, `doc_scores` has to be
                provided to the forward pass. `doc_scores` are returned by [`~RagRetriever.__call__`].
            do_deduplication (`bool`, *optional*):
                Whether or not to deduplicate the generations from different context documents for a given input. Has
                to be set to `False` if used while training with distributed backend.
            num_return_sequences(`int`, *optional*, defaults to 1):
                The number of independently computed returned sequences for each element in the batch. Note that this
                is not the value we pass to the `generator`'s `[`~generation.GenerationMixin.generate`]` function,
                where we set `num_return_sequences` to `num_beams`.
            num_beams (`int`, *optional*, defaults to 1):
                Number of beams for beam search. 1 means no beam search.
            n_docs (`int`, *optional*, defaults to `config.n_docs`)
                Number of documents to retrieve and/or number of documents for which to generate an answer.
            kwargs (`dict[str, Any]`, *optional*):
                Additional kwargs will be passed to [`~generation.GenerationMixin.generate`].

        Return:
            `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated
            sequences. The second dimension (sequence length) is either equal to `max_length` or shorter if all batches
            finished early due to the `eos_token_id`.
        Nz= At least one of input_ids or context_input_ids must be givenru   r   r   r   r   r   r   r   r   r   ru   r   c                 S   s   i | ]	}t | |qS r2   )re   tolist)rB   kr2   r2   r3   rE     s    z5RagSequenceForGeneration.generate.<locals>.<dictcomp>T)r   r   zMake sure that `context_attention_mask` are passed, if no `input_ids` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.zMake sure that `doc_scores` are passed, if no `input_ids` is set. Alternatively, you can set a retriever using the `set_retriever(...)` function.)r   r   r   r   r   r   )pad_token_id)r6   r}   r   r   r   r:   rP   r   r   r-   r   r   r   rQ   r   rangegeneratestacklistvaluesrepeattopkappend_cat_and_padr   )rq   rt   ru   r   r   r   r   r   r   r}   model_kwargsnum_doc_return_sequencesquestion_hidden_stateshypos
batch_sizeindexgenerator_input_idsoutput_sequencesnum_candidatesnew_input_idsr   individual_input_idsindividual_attention_maskindividual_doc_scorestop_cand_indsr2   r2   r3   r   k  s~   A
	 

z!RagSequenceForGeneration.generateF        c                    sB  t d d dd f jd d jjjgd|d ur#|n jj} jj	p/ jjj	}|d uo@d d df 
| }	 fdd}
tjj|dd|jd | |d|d}tjj|dddd}|d d d d d dd d f }|d d d d ddd d f }|d d d d dd d d f }t j||| |gdd}ddd|dd | ksJ |jdd}|jdd	d
}|
||\}}|r|	r|d d d d dd f dn|d}|d}|d}|d}| }| }|r| }| }||d }d| | ||  }|S )Nr   r   c                    D     jjj}| r| |d ||d | d|dfS Nr   r   eqr6   rQ   r   anymasked_fill_r   ll
smooth_objpad_maskrq   targetr2   r3   
_mask_pads  
   z4RagSequenceForGeneration.get_nll.<locals>._mask_padsr   r   rI   r   r   Tr   keepdim      ?)r-   catnewr   fill_r6   rQ   r   r}   bos_token_idr   allr   
functionallog_softmaxr   sizer   r   r   gathersum	logsumexp)rq   
seq_logitsr   r   r   r   r   r}   r  use_bosr   seq_logprobsdoc_logprobsfirst_token_scoressecond_token_scores	remainderrag_logprobsr   r   nll_losssmooth_losseps_ir   r2   r   r3   r     s@   2"   2


z RagSequenceForGeneration.get_nllc                 C   sv   | d  tdd | D tdd | D |}d}| D ]}|||||jd  d |jd f< ||jd 7 }q|S )Nr   c                 s       | ]}|j d  V  qdS )r   Nr   rB   tr2   r2   r3   	<genexpr>?      z8RagSequenceForGeneration._cat_and_pad.<locals>.<genexpr>c                 s   r  )r   Nr  r  r2   r2   r3   r  ?  r  r   )r  r
  maxr  r   )tensorsr   outputindr  r2   r2   r3   r   =  s   2$z%RagSequenceForGeneration._cat_and_padr   NNNNNNNNNNNNNNNNN)	NNNNNNNNN)Fr   FN)r)   r*   r+   r   r   r   rk   r   r   r   r-   r0   r   r1   r   r   r.   r   r   r   r   propertyr:   rQ   rP   no_gradr   r   staticmethodr   r   r2   r2   rr   r3   r     s    	
 !


	
 
;r   zo
    A RAG-token model implementation. It performs RAG-token specific marginalization in the forward pass.
    c                &       s^  e Zd Z				dAdedB dedB dedB dedB f fddZdefdd	Zd
efddZ						dBddZ	e
dd Ze
dd Ze
dd Zedd ZdCddZe																	dDdejdB dejdB deeej  dB dejdB dejdB dedB dejdB d ejdB d!ejdB d"edB d#edB d$edB d%edB d&edB d'edB d(ejdB d)edB d*ef$d+d,Ze dddddddde e f
dejdB dejdB dejdB d ejdB d!ejdB d)edB d-e dB d.e!eejge"e f dB d/edB d0edB d*ejfd1d2Z#d3d4 Z$d5d6 Z%d7d8 Z&d9d: Z'dCd;d<Z(dEd?d@Z)  Z*S )FRagTokenForGenerationNr6   rP   rQ   r:   c                    r   r   r   r   rr   r2   r3   rk   M  s   zRagTokenForGeneration.__init__c                 C   r   r   r   r   r2   r2   r3   r   m  r   z#RagTokenForGeneration.set_retrieverrn   c                 C   r   r   r   r   r2   r2   r3   r   p  r   z6RagTokenForGeneration.set_context_encoder_for_trainingc           	   
   K   s4   |d ur|d d dd f }d ||||||d|d	S )Nr   T)	rt   rv   r   r   rw   r   ry   do_marginalizer}   r2   )	rq   rw   r   ru   ry   rv   r   r}   rZ   r2   r2   r3   prepare_inputs_for_generationt  s   z3RagTokenForGeneration.prepare_inputs_for_generationc                 C   r   r   r   r   r2   r2   r3   r:     r   zRagTokenForGeneration.retrieverc                 C   r   r   r   r   r2   r2   r3   rQ     r   zRagTokenForGeneration.generatorc                 C   r   r   r   r   r2   r2   r3   rP     r   z&RagTokenForGeneration.question_encoderc           	         s   dd  d}t t| D ]S}t| tr@ fdd| jj| j| jj| j| jj| j| jj| jfD \}}}}||||f}n fdd| j| j| j| jfD \}}||f}||f7 }qt	| |S )zeReorders cache for generation. BART-inspired but we need to take care of the extra dimension for docsc                 S   s^   | j d |j d  }| jd|g| j dd  R  } | d|} | jdg| j dd  R  }|S )Nr   r   r   rI   )r   r   index_select)r   	new_orderr}   resultr2   r2   r3   _reorder_stacked  s
   z>RagTokenForGeneration._reorder_cache.<locals>._reorder_stackedr2   c                 3   "    | ]} | |jV  qd S r   r   r   rB   xr+  beam_idxr2   r3   r    
    
z7RagTokenForGeneration._reorder_cache.<locals>.<genexpr>c                 3   r,  r   r-  r.  r0  r2   r3   r    r2  )
r   r@   rh   r   self_attention_cachelayerskeysr   cross_attention_cacherm   )	r   r1  reordered_pastidxself_attention_kself_attention_vcross_attention_kcross_attention_v	new_tupler2   r0  r3   _reorder_cache  s$   
	
z$RagTokenForGeneration._reorder_cachec                 C   sp   |d ur|n| j j}tjj|dd|jd | |d|d}tj|dd}||	d	d }tj
|ddS )Nr   r   r   r   )r6   r}   r   r  r  r   r   r  r-   r   r  )rq   r  r   r}   r  r  log_prob_sumr2   r2   r3   marginalize  s   z!RagTokenForGeneration.marginalizert   ru   rv   rw   rx   r   r   r   r   ry   rz   r{   r|   r&  r   r   r}   r;   c                 K   sX  |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur*|du r(|}d}
| j||||||||	||
||||d}d}|j}|dur[|dusLJ | j|j|j||| j j|d}|re| 	||j|}t
di d|d|d|jd|jd	|jd
|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jS )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. [`RagConfig`], used to initialize the model, specifies
            which generator to use, it also specifies a compatible generator tokenizer. Use that tokenizer class to
            obtain the indices.

            [What are input IDs?](../glossary#input-ids)
        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*)
            Tuple consists of (`generator_enc_last_hidden_state`, *optional*: `generator_enc_hidden_states`,
            *optional*: `generator_enc_attentions`). `generator_enc_last_hidden_state` of shape `(batch_size, n_docs *
            sequence_length, hidden_size)` is a sequence of hidden-states at the output of the last layer of the
            generator's encoder.

            Used by the ([`RagModel`]) model during decoding.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Provide for generation tasks. `None` by default, construct as per instructions for the generator model
            you're using with your RAG instance.
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size,  target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
            Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
            retriever. If the model was not initialized with a `retriever` ``context_input_ids` has to be provided to
            the forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
        context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`,*optional*, returned when *output_retrieved=True*):
            Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
            retriever. If the model has is not initialized with a `retriever` `context_attention_mask` has to be
            provided to the forward pass. `context_attention_mask` are returned by [`~RagRetriever.__call__`].
        doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
            Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
            `question_encoder_last_hidden_state`. If the model has is not initialized with a `retriever` `doc_scores`
            has to be provided to the forward pass. `doc_scores` can be computed via
            `question_encoder_last_hidden_state` and `retrieved_doc_embeds`, see examples for more information.
        output_retrieved (`bool`, *optional*):
            Whether or not to return the `retrieved_doc_embeds`, `retrieved_doc_ids`, `context_input_ids` and
            `context_attention_mask`. See returned tensors for more detail.
        do_marginalize (`bool`, *optional*):
            If `True`, the logits are marginalized over all documents by making use of
            `torch.nn.functional.log_softmax`.
        reduce_loss (`bool`, *optional*):
            Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum`
            operation.
        n_docs (`int`, *optional*):
            The number of documents to retrieve.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, RagRetriever, RagTokenForGeneration
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/rag-token-nq")
        >>> retriever = RagRetriever.from_pretrained(
        ...     "facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True
        ... )
        >>> # initialize with RagRetriever to do everything in one forward call
        >>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

        >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
        >>> targets = tokenizer(text_target="In Paris, there are 10 million people.", return_tensors="pt")
        >>> input_ids = inputs["input_ids"]
        >>> labels = targets["input_ids"]
        >>> outputs = model(input_ids=input_ids, labels=labels)

        >>> # or use retriever separately
        >>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", use_dummy_dataset=True)
        >>> # 1. Encode
        >>> question_hidden_states = model.question_encoder(input_ids)[0]
        >>> # 2. Retrieve
        >>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
        >>> doc_scores = torch.bmm(
        ...     question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)
        ... ).squeeze(1)
        >>> # 3. Forward to generator
        >>> outputs = model(
        ...     context_input_ids=docs_dict["context_input_ids"],
        ...     context_attention_mask=docs_dict["context_attention_mask"],
        ...     doc_scores=doc_scores,
        ...     decoder_input_ids=labels,
        ... )

        >>> # or directly generate
        >>> generated = model.generate(
        ...     context_input_ids=docs_dict["context_input_ids"],
        ...     context_attention_mask=docs_dict["context_attention_mask"],
        ...     doc_scores=doc_scores,
        ... )
        >>> generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)
        ```NFr   )r   r   r}   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r2   )r6   r}   r&  r   r7   r   r   r   r   r@  r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   )rq   rt   ru   rv   rw   rx   r   r   r   r   ry   rz   r{   r|   r&  r   r   r}   rZ   r   r   r   r2   r2   r3   r     s   o		
zRagTokenForGeneration.forwardgeneration_configprefix_allowed_tokens_fnlogits_processorstopping_criteriac                    s  |  d|ddd}| j|fi |\}}| }|tjtjtjtjfvr,td| dt	t
| t| }| |  | ||| |dddu}| || durWn| jj| jdur|du r| j||dd }| j|| jdtjd	 t	| jjd
ddd}|d |d |d }}}||}||}||}t|d|ddd}|jd  dksJ d d|jd  d|jd   | j j! }|||dd}tj" |j# df|j$tj%t&| ' j(d}|jd }|d }d& fdd	}|||j#d}|||j#d|d< |j)|j#dd}||d< ||d< ||d< |d < | j*|||||	|j(d!}| j+||
d"}| j,||d|jd |j-d d# | .||||d$< || |f|||d%||S )'a  
        Implements RAG token decoding.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                The sequence used as a prompt for the generation. If `input_ids` is not passed, then
                `context_input_ids` has to be provided.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            context_input_ids (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
                Input IDs post-processed from the retrieved documents and the question encoder `input_ids` by the
                retriever.

                If the model has is not initialized with a `retriever`, `context_input_ids` has to be provided to the
                forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
            context_attention_mask (`torch.LongTensor` of shape `(batch_size * config.n_docs, config.max_combined_length)`, *optional*, returned when *output_retrieved=True*):
                Attention mask post-processed from the retrieved documents and the question encoder `input_ids` by the
                retriever.

                If the model has is not initialized with a `retriever`, `context_input_ids` has to be provided to the
                forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
            doc_scores (`torch.FloatTensor` of shape `(batch_size, config.n_docs)`):
                Score between each retrieved document embeddings (see `retrieved_doc_embeds`) and
                `question_encoder_last_hidden_state`.

                If the model has is not initialized with a `retriever`, `context_input_ids` has to be provided to the
                forward pass. `context_input_ids` are returned by [`~RagRetriever.__call__`].
            n_docs (`int`, *optional*, defaults to `config.n_docs`)
                Number of documents to retrieve and/or number of documents for which to generate an answer.
            generation_config (`~generation.GenerationConfig`, *optional*):
                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
                passed to generate matching the attributes of `generation_config` will override them. If
                `generation_config` is not provided, the default will be used, which has the following loading
                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
                default values, whose documentation should be checked to parameterize generation.
            prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`, *optional*):
                If provided, this function constraints the beam search to allowed tokens only at each step. If not
                provided no constraint is applied. This function takes 2 arguments `inputs_ids` and the batch ID
                `batch_id`. It has to return a list with the allowed tokens for the next generation step conditioned on
                the previously generated tokens `inputs_ids` and the batch ID `batch_id`. This argument is useful for
                constrained generation conditioned on the prefix, as described in [Autoregressive Entity
                Retrieval](https://huggingface.co/papers/2010.00904).
            logits_processor (`LogitsProcessorList`, *optional*):
                Custom logits processors that complement the default logits processors built from arguments and a
                model's config. If a logit processor is passed that is already created with the arguments or a model's
                config an error is thrown.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                Custom stopping criteria that complement the default stopping criteria built from arguments and a
                model's config. If a stopping criteria is passed that is already created with the arguments or a
                model's config an error is thrown.
            kwargs (`dict[str, Any]`, *optional*):
                Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
                forwarded to the `forward` function of the model.

        Return:
            `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated
            sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches
            finished early due to the `eos_token_id`.
        NFz!RAG model is not compatible with z5 generation. Please check your generation parameters.ru   r   r   r   r   r   r   r   r   r   r   r   rI   r   r   r   T)rt   ru   r~   )r   r   r   last_hidden_statec                    sl   | d d d d f   df| jdd   } |  |f| jdd   } |   |  f| jdd   S )Nr   r   )reshaper   expand)tensorr   r   r}   r2   r3   extend_enc_output  s   ,"z9RagTokenForGeneration.generate.<locals>.extend_enc_output)r   r   r   rv   r}   )rA  input_ids_seq_lengthencoder_input_idsrB  rC  r   )rA  rD  )generation_moder   max_cache_lengthprefill_outputs)rC  rD  rA  r   )/_extract_generation_mode_kwargs_prepare_generation_configget_generation_moder   SAMPLEGREEDY_SEARCHBEAM_SEARCHBEAM_SAMPLE
ValueErrorr   rm   r   _validate_model_kwargscopy_validate_generation_moderW   _prepare_special_tokensr6   r}   r:   rP   r   r   r-   r   r   rQ   r   r   r   r   r   r7   get_encoderfullr   decoder_start_token_idlongnext
parametersr   r   _get_logits_processor_get_stopping_criteria_prepare_cache_for_generation
max_length_prefill)rq   rt   ru   r   r   r   r}   rA  rB  rC  rD  rZ   generation_mode_kwargsr   rM  decoding_methodkwargs_has_attention_maskr   outr   encoderrv   rK  rE  rJ  prepared_logits_processorprepared_stopping_criteriar2   rI  r3   r   v  s   Q





	
		zRagTokenForGeneration.generatec                 C   s   |  ||}|S r   )r>  )rq   r   r1  r2   r2   r3   _temporary_reorder_cacheE  s   z.RagTokenForGeneration._temporary_reorder_cachec                 C      | j j S r   )r7   rQ   get_input_embeddingsr   r2   r2   r3   rp  L  r   z*RagTokenForGeneration.get_input_embeddingsc                 C   ro  r   )r7   rQ   get_output_embeddingsr   r2   r2   r3   rq  O  r   z+RagTokenForGeneration.get_output_embeddingsc                 C   s   | j j|S r   )r7   rQ   set_output_embeddings)rq   new_embeddingsr2   r2   r3   rr  R  s   z+RagTokenForGeneration.set_output_embeddingsc                 C   sX   |du r| j j}||j}|ddddf  |ddddf< ||dddf< |S )zCShift input ids one token to the right, and pad with start_token_idNr   r   r   )r6   r^  	new_zerosr   clone)rq   rt   start_token_idshifted_input_idsr2   r2   r3   shift_tokens_rightU  s   (z(RagTokenForGeneration.shift_tokens_rightFr   c                    s  |d ur|n j j}td d dd f jd d j jjgd fdd} 	|||}
d | ksDJ |jdd}	|jddd}
||	|
\}	}
|	d}	|
d}
|	 }|
 }|rs| }| }||d }d	| | ||  }|S )
Nr   r   c                    r   r   r   r   r   r2   r3   r   e  r   z1RagTokenForGeneration.get_nll.<locals>._mask_padsr   r   Tr   r   )r6   r}   r-   r  r  r   r  rQ   r   r@  r   r   r	  r
  r  )rq   r  r   r   r   r   r}   r   r  r   r   r  r  r  r   r2   r   r3   r   ^  s*   2


zRagTokenForGeneration.get_nllr   )NNNNNNr   r!  )Fr   N)+r)   r*   r+   r   r   r   rk   r   r   r'  r"  r:   rQ   rP   r$  r>  r@  r   r-   r0   r.   r1   r   r   r   r   r   r   r   r#  r   r   r	   r   r   r   rn  rp  rq  rr  rx  r   r   r2   r2   rr   r3   r%  G  s    





 	
 -	
 O
	r%  )rf   r5   r   r%  ))r,   collections.abcr   dataclassesr   r-   r   cache_utilsr   r   configuration_utilsr   
generationr	   r
   r   r   r   generation.utilsr   modeling_outputsr   modeling_utilsr   utilsr   r   configuration_ragr   retrieval_ragr   
get_loggerr)   loggerr   r4   r5   rf   r   r%  __all__r2   r2   r2   r3   <module>   sb   
ZW	       3    ;