o
    ̳i*                     @   sd   d dl mZmZmZ d dlZd dlmZ d dlm	Z	 d dl
mZmZ d dlmZ G dd de	ZdS )	    )DictListOptionalN)
functional)Dataset)CROSS_ENTROPY_IGNORE_IDX	PACK_TYPE)tqdmc                   @   s   e Zd ZdZdddddededed	ee d
eddfddZd!ddZ	defddZ
dedefddZdeddfddZdedefddZdededefddZdefddZdedeeejf fdd ZdS )"PackedDataseta  
    Performs greedy sample packing on a provided dataset. This is done as a single
    preprocessing step before training begins. Shuffling is done outside of this
    class on packed samples with a ``Sampler`` as part of the dataloader. Currently,
    this only supports in-memory map-style datasets.

    The class loads, tokenizes, and packs examples on initialization - no tokenization is done during training.

    The general flow on initialization is: load tokenized sample -> add to buffer ->
    when buffer is long enough, add to ``self.packs``.

    During training, returns self.packs[idx] as input, label, attention mask, and
    position ids. The attention mask is a lower triangular block mask to prevent
    samples from cross-attending within a pack. The position ids indicate the position
    of each token relative to its sample within a pack. These are all padded to max
    sequence length, so a batch-wise collator is not needed.

    A packed sample is made up of individual smaller sequence length samples jammed together
    within ``max_seq_len``. For example, if max_seq_len is 6 and there are varied
    length samples::

        tokens = [
            [S1, S1, S1, S2, S2, pad],
            [S3, S3, S4, S4, pad, pad],
            ...,
        ]

    To prevent cross-contamination, the following mask would be returned for the
    first pack in the example::

        mask = [
            [1, 0, 0, 0, 0, 0],
            [1, 1, 0, 0, 0, 0],
            [1, 1, 1, 0, 0, 0],
            [0, 0, 0, 1, 0, 0],
            [0, 0, 0, 1, 1, 0],
            [0, 0, 0, 0, 0, 1],
        ]

    The position ids would be::

        input_pos = [
            [0, 1, 2, 0, 1, 2],
            [0, 1, 0, 1, 2, 3],
            ...,
        ]

    The identity matrix is used in the mask for pad tokens instead of a causal mask.
    For position ids for pad tokens, we simply continue to increment from the previous
    sample normally.

    Args:
        ds (Dataset): dataset to sample pack. This should return a dictionary with field
            "tokens" and "labels" containing the tokenized and label samples.
        max_seq_len (int): Maximum number of tokens to pack
        padding_idx (int): padding index for the tokenizer. Default is 0.
        max_packs (Optional[int]): Maximum number of packs. Default is None, which will create as many
            packs as possible.
        split_across_pack (bool): if the last sample in a pack does not fit in ``max_seq_len``,
            split the sample into the next pack, or move it entirely to the beginning of the next pack.
            For pre-training, typically this is set to True for general text completion. For
            fine-tuning, typically this is set to False to avoid truncating sentences in instruct
            tuning. Default is False.
    r   NF)padding_idx	max_packssplit_across_packdsmax_seq_lenr   r   r   returnc                C   s6   || _ || _|| _|| _|| _g | _d| _|   d S )Nr   )r   r   r   r   r   packsprevious_sample_boundary_pack)selfr   r   r   r   r    r   N/home/ubuntu/.local/lib/python3.10/site-packages/torchtune/datasets/_packed.py__init__S   s   	zPackedDataset.__init__c                    s  g g g g d}t j rt j rt j nd}|dkr&tt jddd} jD ]}|d |d }}t|}| jkrL j	sLt
d| d	 j d
|d  |7  < |d  |7  < |d   fddt|D 7  < |d  |g7  < t|d  jkr  s |}t|d  jkr  r|dkr|  t|d  _  r nq)t|d dkrǈ jdu st j jk rɈ | dS dS dS )zIterate through the dataset. Use a buffer to hold samples until max_seq_len,
        then append the buffer to self.packs as a single "packed" sample. Continue
        until max_packs or end of dataset.tokenslabels	input_posseq_lensr   zPacking datasetT)totaldescdynamic_ncolsr   r   zDataset sample is too long (z > zA). Please set `split_across_pack=True` or increase `max_seq_len`.r   c                    s   g | ]}| j  qS r   )r   ).0xr   r   r   
<listcomp>   s    z'PackedDataset._pack.<locals>.<listcomp>r   N)torchdistributedis_availableis_initializedget_rankr	   lenr   r   r   
ValueErrorrange_should_stop_packing_split_and_add_packupdater   r   r   	_add_pack)r   current_packrankpbarsampler   r   seq_lenr   r"   r   r   f   sT   

"
zPackedDataset._packc                 C   s"   | j durt| j| j krdS dS )z<If max packs is set, stop packing when we reach that number.NTF)r   r)   r   r"   r   r   r   r,      s   z"PackedDataset._should_stop_packingr0   c                 C   s   | j r| j}| jt|d dd  }|dkr|gng }n| j}g }|d d| |d d| |d d| |d dd | d}| | | j rUt|d |d n|d d }|d |d |d |d |d |d |gdS )	zSplits the current pack at the boundary, processes it, adds it to ``self.packs`` and
        returns the start of the next pack.r   Nr   r   r   r   r   )r   r   sumr   r/   r)   )r   r0   boundaryleftover_seq_lenseq_len_paddingpacknext_seq_lenr   r   r   r-      s*   

z!PackedDataset._split_and_add_packr:   c                 C   s*   |  |}| j|| jd}| j| dS )z2Processes, pads and adds a pack to ``self.packs``.)r   N)_convert_to_tensors	_pad_packr   r   appendr   r:   r   r   r   r/      s   
zPackedDataset._add_packc                 C   sN   t j|d t jdt j|d t jdt j|d t jdt j|d t jddS )z[Converts a pack into tensors. Pack comes in as a dict of lists and is converted to tensors.r   )dtyper   r   r   r   )r$   tensorlongr?   r   r   r   r<      s
   z!PackedDataset._convert_to_tensorsc           
      C   s   | j t|d  }tj|d d|f|d}tj|d d| j t|d  ftd}|dkr9t|d t|ggn|d }t|d d d |d d | j  t|d  d }t	|d| j d }t|d |g}	|||	|d	S )
z$Pads a pack to ``self.max_seq_len``.r   r   )valuer   r   r   r5      r   )
r   r)   Fpadr   r$   catrA   arangeclamp)
r   r:   r   num_padding_tokenspadded_tokenspadded_labelspadded_seq_lens	num_rangeclamped_num_rangepadded_input_posr   r   r   r=      s4   		 zPackedDataset._pad_packc                 C   s
   t | jS N)r)   r   r"   r   r   r   __len__     
zPackedDataset.__len__idxc                 C   s
   | j | S rQ   )r   )r   rT   r   r   r   __getitem__  rS   zPackedDataset.__getitem__)r   N)__name__
__module____qualname____doc__r   intr   boolr   r   r,   r   r-   r/   r<   r=   rR   r   strr$   TensorrU   r   r   r   r   r
      s4    F

@(	* r
   )typingr   r   r   r$   torch.nnr   rE   torch.utils.datar   torchtune.data._commonr   r   r	   r
   r   r   r   r   <module>   s   