o
    ̳i0?                     @   s  d dl mZmZmZmZ d dlZd dlmZ dejdejdejfddZ	d	ddd
dejde
dee deej dejf
ddZ	d*dd	dddedejdejdeej deej de
dee deejejf fddZdejdejdejdejfddZ	d*dejdee dejfddZdejfd d!Ze d d	ddddd"ded#ejd$ed%ede
dee deee  d&eej d'ee deejejf fd(d)ZdS )+    )CallableListOptionalTupleN)TransformerDecoderprobsqreturnc                 C   s   t j| | dddjt jdS )z(Samples from a multinomial distribution.T)dimkeepdim)dtype)torchargmaxtoint)r   r    r   T/home/ubuntu/.local/lib/python3.10/site-packages/torchtune/generation/_generation.pymultinomial_sample_one   s   r   g      ?temperaturetop_kr   logitsr   r   c                C   s   | t |d } |dur.t| t|| d\}}|ddd}t| |k td | } tj	j
j| dd}|du rCt|d}t||S )a:  Generic sample from a probability distribution. Includes support for Top-K sampling
    and Temperature.

    Args:
        logits (torch.Tensor): logits from which to sample
        temperature (float): value to scale the predicted logits by, default 1.0.
        top_k (Optional[int]): If specified, we prune the sampling to only token ids within the top_k probabilities
        q (Optional[torch.Tensor]): randomly sampled tensor for softmax sampling trick. If None,
            we use the default softmax sampling trick. Default None.

    Example:
        >>> from torchtune.generation import sample
        >>> logits = torch.empty(3, 3).uniform_(0, 1)
        >>> sample(logits)
        tensor([[1],
                [2],
                [0]], dtype=torch.int32)

    Returns:
        torch.Tensor: sampled token id
    gh㈵>Nr
   Infr      )maxr   topkminsizeselect	unsqueezewherefloatnn
functionalsoftmax
empty_likeexponential_r   )r   r   r   r   v_pivotr   r   r   r   sample   s   
r,   )maskr   r   model	input_posxr-   c                C   s8   | |||ddddf }t | |||d|dfS )a  
    Generates the next tokens given a prompt, and also returns the corresponding logits.

    Args:
        model (TransformerDecoder): model used for generation
        input_pos (torch.Tensor): tensor with the positional encodings associated with the given prompt,
            with shape [bsz x seq_length].
        x (torch.Tensor): tensor with the token IDs associated with the given prompt,
            with shape [bsz x seq_length].
        q (Optional[torch.Tensor]): randomly sampled tensor for softmax sampling trick.
            See https://github.com/pytorch-labs/gpt-fast/blob/32971d3129541c5bfb4f715abc33d1c5f408d204/generate.py#L40
        mask (Optional[torch.Tensor]): attention mask with shape [bsz x seq_length x seq_length],
            default None.
        temperature (float): value to scale the predicted logits by, default 1.0.
        top_k (Optional[int]): Top-k value to use for sampling, default None.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: tuple of two tensors:
            - tokens (torch.Tensor): tensor with the generated tokens,
                with shape [bsz x 1].
            - logits (torch.Tensor): tensor with the logits associated with the generated tokens,
                with shape [bsz x 1 x vocab_size].

    )r/   r-   Nr
   r   r   )r,   cloner!   )r.   r/   r0   r   r-   r   r   r   r   r   r   generate_next_tokenB   s   $r2   tokensstop_tokensstop_token_reachedc                 C   s   t | | }||O }|S )z2Updates which sequences have reached a stop token.)r   isinflatten)r3   r4   r5   stop_token_reached_currr   r   r   update_stop_tokens_trackerm   s   r9   padding_masktarget_seq_lenc                 C   s   | j \}}|du r|n|}||k rtdtjtj||| jtddd|dd}|dd|	| dddddf 
d|d |jddd	td
g |S )a  
    Converts a padding mask of shape ``[bsz, seq_len]`` to a ``[bsz, seq_len, seq_len]`` causal attention mask suitable for
    consumption by :func:`~torch.nn.functional.scaled_dot_product_attention`. If ``target_seq_len``
    is provided, this will return a mask of shape ``[bsz, seq_len, target_seq_len]``. This is useful
    when generating masks for static KV caches where the maximum length the caches have been setup with
    are longer than the current sequence.

    Args:
        padding_mask (torch.Tensor): Boolean tensor where False indicates the corresponding token in the sequence
            is a padding token and should be masked out in attention, with shape [bsz x seq_length]
        target_seq_len (Optional[int]): target sequence length to create attention mask with. Default None.

    Returns:
        torch.Tensor: Boolean causal mask with shape
            - [bsz, seq_length, seq_length] or
            - [bsz, seq_length, target_seq_len] if ``target_seq_len`` was specified.

    Raises:
        AssertionError: if ``target_seq_len < seq_len``, the sequence length of the padding mask.

    Example:
        >>> padding_mask = torch.tensor([[False, True, True, True]])
        >>> get_causal_mask_from_padding_mask(padding_mask, target_seq_len=5)
        tensor([[[ True, False, False, False, False],
                  [False,  True, False, False, False],
                  [False,  True,  True, False, False],
                  [False,  True,  True,  True, False]]])
        ])
    NzNtarget_seq_len cannot be shorter than the sequence length of the padding mask.devicer   r   )diagonalr      r
   )dim1dim2T)shapeAssertionErrorr   trilonesr=   boolrepeatnarrowmul_expandr>   copy_Tensor)r:   r;   bszseq_lenr-   r   r   r   !get_causal_mask_from_padding_masky   s   
 
0rO   c                 C   s   |  dd |  tjS )a  
    Calculates position ids given a padding mask which right-shifts position ids to start
    from the first valid token.

    Args:
        padding_mask (torch.Tensor): Boolean tensor where False indicates the corresponding token in the sequence
            is a padding token and should be masked out in attention. Shape [bsz, seq_len]

    Returns:
        torch.Tensor: position ids which are appropriately shifted according to any padding values.

    Example:
        >>> padding_mask = torch.tensor([False, False, False, True, True, True, True, True])
        >>> get_position_ids_from_padding_mask(padding_mask)
        torch.Tensor([0, 0, 0, 0, 1, 2, 3, 4])
    r
   r   )cumsumr   r   r   )r:   r   r   r   "get_position_ids_from_padding_mask   s   rQ   )pad_idr   r   r4   rngcustom_generate_next_tokenpromptmax_generated_tokensrR   rS   rT   c             
   C   sv  |j dkr|ddn|}|du rt}| \}	}
|
| }| }|  }|s)|n| j}||k}| sKtj	j
j|d|fdd}t||d}t|}nttj||tj|jdd}tjd||jd	d}|ru|ddd|
f }n|ddd|
d|
f }d}|durtj|	| jjf|jd	jd|d
}t| |ddd|
f  |||||d\}}tj||gdd}|
}tj|	tj|jd}|rtj||j|jdnd}tj|	|
d ftj|jd}|durt|||}|  r||fS t |d D ]}|durtj||!|	d gdd}|r,|dd|f " }|dd|dddf " }n!| }|ddd|d f }|ddd|d d|d f }d}|duretj|	| jjf|jd	jd|d
}|| || ||||d\}}tj||gdd}tj||gdd}|d7 }|durt|||}| r nq|dur||9 }||dd|j#d  ddf 9 }||fS )a1	  
    Generates tokens from a model conditioned on a prompt, and also returns logits for the generations.

    Args:
        model (TransformerDecoder): model used for generation
        prompt (torch.Tensor): tensor with the token IDs associated with the given prompt,
            with shape either [seq_length] or [bsz x seq_length].
        max_generated_tokens (int): number of tokens to be generated
        pad_id (int): token ID to use for padding, default 0.
        temperature (float): value to scale the predicted logits by, default 1.0.
        top_k (Optional[int]): If specified, we prune the sampling to only token ids within the top_k probabilities,
            default None.
        stop_tokens (Optional[List[int]]): If specified, generation is stopped when any of these tokens are generated,
            default None.
        rng (Optional[torch.Generator]): random number generator, default None.
        custom_generate_next_token (Optional[Callable]): If specified, we'll use the
            ``custom_generate_next_token function``. This is generally only useful if
            you want to specify a ``torch.compile`` version of the generate next token for
            performance reasons. If None, we use the default :func:`generate_next_token`.
            Default is None.

    Note:
        This function has only been tested with decoder-only models.

    Examples:
        >>> model = torchtune.models.llama3.llama3_8b()
        >>> tokenizer = torchtune.models.llama3.llama3_tokenizer()
        >>> prompt = tokenizer.encode("Hi my name is")
        >>> rng.manual_seed(42)
        >>> output, logits = generate(model, torch.tensor(prompt), max_generated_tokens=100, pad_id=0)
        >>> print(tokenizer.decode(output[0].tolist()))
        Hi my name is Jeremy and I'm a friendly language model assistant!

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: tuple of two tensors:
            - tokens (torch.Tensor): tensor with the generated tokens,
                with shape ``[bsz x seq_len + num_generated_tokens]`` where ``num_generated_tokens``
                may be less than ``max_generated_tokens`` if ``stop_tokens`` are provided.
            - logits (torch.Tensor): tensor with the logits associated with the generated tokens,
                with shape ``[bsz x num_generated_tokens x vocab_size]``.
    r   r
   Nr   T)value)r;   )r   r=   )r=   )	generator)r/   r-   r0   r   r   r   r   r<   )r/   r0   r-   r   r   r   )$ndimviewr2   r   r1   caches_are_enableddecoder_max_cache_seq_lenallr   r$   r%   padrO   rQ   rD   rE   rF   r=   r!   arangeemptytok_embeddingsnum_embeddingsr(   squeezecatzerostensorr   int32r9   itemrangereshape
contiguousrB   )r.   rU   rV   rR   r   r   r4   rS   rT   rM   prompt_lengthtotal_response_lengthgenerated_tokensincremental_decodingmax_seq_lenpadding_masksmasksr/   
curr_masksr   r3   generated_logitscurr_posr5   stop_token_maskr*   curr_input_posr   r   r   r   generate   s   6




"

	


"rx   )N)typingr   r   r   r   r   torchtune.modules.transformerr   rL   r   r#   r   r,   r2   r9   rO   rQ   no_grad	Generatorrx   r   r   r   r   <module>   s   
4	
+

1

	
