o
    7wiC#                     @   sp   d Z ddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZ edddgZG d	d
 d
ZG dd dZdS )z,Batch collation

Authors
  * Aku Rouhe 2020
    N)default_convert)
pin_memory)batch_pad_rightmod_default_collaterecursive_to
PaddedDatadatalengthsc                   @   sb   e Zd ZdZddei ddfddZdd Zdd	 Zd
d Zdd Z	dd Z
dd Zedd ZdS )PaddedBatcha+  Collate_fn when examples are dicts and have variable-length sequences.

    Different elements in the examples get matched by key.
    All numpy tensors get converted to Torch (PyTorch default_convert)
    Then, by default, all torch.Tensor valued elements get padded and support
    collective pin_memory() and to() calls.
    Regular Python data types are just collected in a list.

    Arguments
    ---------
    examples : list
        List of example dicts, as produced by Dataloader.
    padded_keys : list, None
        (Optional) List of keys to pad on. If None, pad all torch.Tensors
    device_prep_keys : list, None
        (Optional) Only these keys participate in collective memory pinning and moving with
        to().
        If None, defaults to all items with torch.Tensor values.
    padding_func : callable, optional
        Called with a list of tensors to be padded together. Needs to return
        two tensors: the padded data, and another tensor for the data lengths.
    padding_kwargs : dict
        (Optional) Extra kwargs to pass to padding_func. E.G. mode, value
    apply_default_convert : bool
        Whether to apply PyTorch default_convert (numpy to torch recursively,
        etc.) on all data. Default:True, usually does the right thing.
    nonpadded_stack : bool
        Whether to apply PyTorch-default_collate-like stacking on values that
        didn't get padded. This stacks if it can, but doesn't error out if it
        cannot. Default:True, usually does the right thing.

    Example
    -------
    >>> batch = PaddedBatch([
    ...     {"id": "ex1", "foo": torch.Tensor([1.])},
    ...     {"id": "ex2", "foo": torch.Tensor([2., 1.])}])
    >>> # Attribute or key-based access:
    >>> batch.id
    ['ex1', 'ex2']
    >>> batch["id"]
    ['ex1', 'ex2']
    >>> # torch.Tensors get padded
    >>> type(batch.foo)
    <class 'speechbrain.dataio.batch.PaddedData'>
    >>> batch.foo.data
    tensor([[1., 0.],
            [2., 1.]])
    >>> batch.foo.lengths
    tensor([0.5000, 1.0000])
    >>> # Batch supports collective operations:
    >>> _ = batch.to(dtype=torch.half)
    >>> batch.foo.data
    tensor([[1., 0.],
            [2., 1.]], dtype=torch.float16)
    >>> batch.foo.lengths
    tensor([0.5000, 1.0000], dtype=torch.float16)
    >>> # Numpy tensors get converted to torch and padded as well:
    >>> import numpy as np
    >>> batch = PaddedBatch([
    ...     {"wav": np.asarray([1,2,3,4])},
    ...     {"wav": np.asarray([1,2,3])}])
    >>> batch.wav  # +ELLIPSIS
    PaddedData(data=tensor([[1, 2,...
    >>> # Basic stacking collation deals with non padded data:
    >>> batch = PaddedBatch([
    ...     {"spk_id": torch.tensor([1]), "wav": torch.tensor([.1,.0,.3])},
    ...     {"spk_id": torch.tensor([2]), "wav": torch.tensor([.2,.3,-.1])}],
    ...     padded_keys=["wav"])
    >>> batch.spk_id
    tensor([[1],
            [2]])
    >>> # And some data is left alone:
    >>> batch = PaddedBatch([
    ...     {"text": ["Hello"]},
    ...     {"text": ["How", "are", "you?"]}])
    >>> batch.text
    [['Hello'], ['How', 'are', 'you?']]

    NTc           
         s   t || _t|d  | _g | _g | _| jD ]b  fdd|D }|r(t|}|d ur0 |v s<|d u rSt|d t	j
rS| j  t||fi | }	t|  |	 n|rYt|}t|  | |d urg |v ss|d u ryt|d t	j
ry| j  qd S )Nr   c                    s   g | ]}|  qS  r   ).0examplekeyr   U/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/speechbrain/dataio/batch.py
<listcomp>x   s    z(PaddedBatch.__init__.<locals>.<listcomp>)len_PaddedBatch__lengthlistkeys_PaddedBatch__keys_PaddedBatch__padded_keys_PaddedBatch__device_prep_keysr   
isinstancetorchTensorappendr   setattrr   )
selfexamplespadded_keysdevice_prep_keyspadding_funcpadding_kwargsapply_default_convertnonpadded_stackvaluespaddedr   r   r   __init__i   s*   


zPaddedBatch.__init__c                 C      | j S Nr   r   r   r   r   __len__   s   zPaddedBatch.__len__c                 C   s"   || j v r
t| |S td| )NzBatch doesn't have key: )r   getattrKeyError)r   r   r   r   r   __getitem__   s   

zPaddedBatch.__getitem__c                    s   t  fdd jD S )a~  Iterates over the different elements of the batch.

        Returns
        -------
        Iterator over the batch.

        Example
        -------
        >>> batch = PaddedBatch([
        ...     {"id": "ex1", "val": torch.Tensor([1.])},
        ...     {"id": "ex2", "val": torch.Tensor([2., 1.])}])
        >>> ids, vals = batch
        >>> ids
        ['ex1', 'ex2']
        c                 3   s    | ]}t  |V  qd S r*   )r.   )r   r   r,   r   r   	<genexpr>   s    z'PaddedBatch.__iter__.<locals>.<genexpr>)iterr   r,   r   r,   r   __iter__   s   zPaddedBatch.__iter__c                 C   s.   | j D ]}t| |}t|}t| || q| S )z3In-place, moves relevant elements to pinned memory.)r   r.   recursive_pin_memoryr   )r   r   valuepinnedr   r   r   r      s
   

zPaddedBatch.pin_memoryc                 O   s<   | j D ]}t| |}t|g|R i |}t| || q| S )zwIn-place move/cast relevant elements.

        Passes all arguments to torch.Tensor.to, see its documentation.
        )r   r.   r   r   )r   argskwargsr   r5   movedr   r   r   to   s
   

zPaddedBatch.toc                 C   s   | j | }t| |S )zGets the position.)r   r.   )r   posr   r   r   r   at_position   s   

zPaddedBatch.at_positionc                 C   r)   )zReturns the bach sizer+   r,   r   r   r   	batchsize   s   zPaddedBatch.batchsize)__name__
__module____qualname____doc__r   r(   r-   r0   r3   r   r:   r<   propertyr=   r   r   r   r   r
      s"    S
%r
   c                   @   sP   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd ZdS )BatchsizeGuessera;  Try to figure out the batchsize, but never error out

    If this cannot figure out anything else, will fallback to guessing 1

    Example
    -------
    >>> guesser = BatchsizeGuesser()
    >>> # Works with simple tensors:
    >>> guesser(torch.randn((2,3)))
    2
    >>> # Works with sequences of tensors:
    >>> guesser((torch.randn((2,3)), torch.randint(high=5, size=(2,))))
    2
    >>> # Works with PaddedBatch:
    >>> guesser(PaddedBatch([{"wav": [1.,2.,3.]}, {"wav": [4.,5.,6.]}]))
    2
    >>> guesser("Even weird non-batches have a fallback")
    1

    c                 C   s
   d | _ d S r*   )methodr,   r   r   r   r(      s   
zBatchsizeGuesser.__init__c                 C   s"   z|  |W S    | | Y S r*   )rD   find_suitable_methodr   batchr   r   r   __call__   s   zBatchsizeGuesser.__call__c                 C   s   z|  |}| j | _|W S    Y z| |}| j| _|W S    Y z| |}| j| _|W S    Y z| |}| j| _|W S    Y | |}| || _|S )z/Try the different methods and note which worked)
attr_basedrD   torch_tensor_bslen_of_firstlen_of_iter_firstfallback)r   rG   bsr   r   r   rE      s6   




z%BatchsizeGuesser.find_suitable_methodc                 C   s   |j S )zImplementation of attr_based.)r=   rF   r   r   r   rI     s   zBatchsizeGuesser.attr_basedc                 C   s
   |j d S )z"Implementation of torch_tensor_bs.r   )shaperF   r   r   r   rJ   	  s   
z BatchsizeGuesser.torch_tensor_bsc                 C   s   t |d S )zImplementation of len_of_first.r   )r   rF   r   r   r   rK     s   zBatchsizeGuesser.len_of_firstc                 C   s   t tt|S )z$Implementation of len_of_iter_first.)r   nextr2   rF   r   r   r   rL     s   z"BatchsizeGuesser.len_of_iter_firstc                 C   s   dS )zImplementation of fallback.   r   rF   r   r   r   rM     s   zBatchsizeGuesser.fallbackN)r>   r?   r@   rA   r(   rH   rE   rI   rJ   rK   rL   rM   r   r   r   r   rC      s    rC   )rA   collectionsr   torch.utils.data._utils.collater   "torch.utils.data._utils.pin_memoryr   r4   speechbrain.utils.data_utilsr   r   r   
namedtupler   r
   rC   r   r   r   r   <module>   s     0