o
    2wia                     @   s  d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZ d dlZd dlZd dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZmZmZ G dd dZ 			d9dede!de
e de
ej" deej#ej#f f
ddZ$				d:dede!de
e de%de
e! deeej#ej#f eej#ej#ef f fddZ&e&Z'			d;dede!de
e de%fddZ(		d<dede!dede)e*f de!deej#eej#ej#f f f
dd Z+dedej#fd!d"Z,e j-dfd#eeej#ej.f  d$ee)e*f d%e%dej#fd&d'Z/	 	d=d#eeej#ej.f  d$ee)e*f d%e%dej#fd(d)Z0	 				d>dee de
e d*e%de
e! d+e
e deee	ej# ef ee	ej# ee	f f fd,d-Z1		d?dee de
e d*e%dee	ej# e	ej# ef fd.d/Z2	d@dee de
e de	ej# fd0d1Z3	dAd2ed*e%de
e! de
ej# fd3d4Z4d2edej#fd5d6Z5	dBd2ed*e%de
eej#e
ej# f  fd7d8Z6dS )C    N)Executor)partial)repeat)IterableListOptionalTupleUnion)CrossEntropyLoss)CutSet	Recording)suppress_audio_loading_errors)suppress_video_loading_errors)CutMixedCut)DEFAULT_PADDING_VALUESecondscompute_num_samplesc                   @   s   e Zd ZdZ						ddeded	ed
edededefddZdedee	j
e	j
f fddZde	jde	jdee fddZdS )TokenCollatera  Collate list of tokens

    Map sentences to integers. Sentences are padded to equal length.
    Beginning and end-of-sequence symbols can be added.
    Call .inverse(tokens_batch, tokens_lens) to reconstruct batch as string sentences.

    Example:
        >>> token_collater = TokenCollater(cuts)
        >>> tokens_batch, tokens_lens = token_collater(cuts.subset(first=32))
        >>> original_sentences = token_collater.inverse(tokens_batch, tokens_lens)

    Returns:
        tokens_batch: IntTensor of shape (B, L)
            B: batch dimension, number of input sentences
            L: length of the longest sentence
        tokens_lens: IntTensor of shape (B,)
            Length of each sentence after adding <eos> and <bos>
            but before padding.
    T<pad><bos><eos><unk>cutsadd_eosadd_bos
pad_symbol
bos_symbol
eos_symbol
unk_symbolc           
      C   s   || _ || _|| _|| _|| _|| _dd |D }||g|r!|gng  |r(|gng  t| }	dd t|	D | _dd |	D | _	d S )Nc                 S   s"   h | ]}|j d  jD ]}|q
qS r   )supervisionstext).0cutchar r&   U/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lhotse/dataset/collation.py	<setcomp>9   s   " z)TokenCollater.__init__.<locals>.<setcomp>c                 S   s   i | ]\}}||qS r&   r&   )r#   idxtokenr&   r&   r'   
<dictcomp>A       z*TokenCollater.__init__.<locals>.<dictcomp>c                 S   s   g | ]}|qS r&   r&   r#   r*   r&   r&   r'   
<listcomp>B   s    z*TokenCollater.__init__.<locals>.<listcomp>)
r   r   r   r   r   r   sorted	enumerate	token2idx	idx2token)
selfr   r   r   r   r   r   r   tokenstokens_uniquer&   r&   r'   __init__'   s"   
zTokenCollater.__init__returnc                    sv   dd |D }t t|t d  fdd|D }ttjfdd|D tjd}tfdd|D }||fS )Nc                 S   s"   g | ]}d  dd |jD qS ) c                 s       | ]}|j V  qd S N)r"   )r#   supervisionr&   r&   r'   	<genexpr>F       z4TokenCollater.__call__.<locals>.<listcomp>.<genexpr>)joinr!   r#   r$   r&   r&   r'   r.   E   s    z*TokenCollater.__call__.<locals>.<listcomp>keyc                    sL   g | ]"}j rjgng t| jrjgng  jg t|   qS r&   )r   r   listr   r   r   lenr#   seqmax_lenr3   r&   r'   r.   K   s    c                    s   g | ]} fd d|D qS )c                       g | ]} j | qS r&   )r1   r-   r3   r&   r'   r.   U   r,   z5TokenCollater.__call__.<locals>.<listcomp>.<listcomp>r&   rD   rI   r&   r'   r.   U   s    dtypec                    s(   g | ]}t |t j t j qS r&   )rC   intr   r   rD   rI   r&   r'   r.   [   s    )rC   maxtorch
from_numpynparrayint64	IntTensor)r3   r   token_sequencesseqstokens_batchtokens_lensr&   rF   r'   __call__D   s&   
zTokenCollater.__call__rV   rW   c                    s,    j rdnd fddt||D }|S )N   r   c              
      s:   g | ]\}}d   fdd||t j  D qS ) c                    rH   r&   )r2   )r#   r)   rI   r&   r'   r.   i   s    z4TokenCollater.inverse.<locals>.<listcomp>.<listcomp>)r>   rL   r   )r#   tokens_listendr3   startr&   r'   r.   g   s    
z)TokenCollater.inverse.<locals>.<listcomp>)r   zip)r3   rV   rW   	sentencesr&   r]   r'   inversec   s
   	zTokenCollater.inverseN)TTr   r   r   r   )__name__
__module____qualname____doc__r   boolstrr6   r   rN   TensorrX   
LongTensorrS   r   ra   r&   r&   r&   r'   r      s>    
r   rightr   pad_directionexecutorfeatures_dtyper7   c           
      C   s   t dd | D sJ tjdd | D tjd}| jt| |d} tt| }tj	t
| |j|j|d}|du rMt| D ]
\}}t|||< q>||fS t|t| D ]\}}	|	||< qU||fS )a1  
    Load features for all the cuts and return them as a batch in a torch tensor.
    The output shape is ``(batch, time, features)``.
    The cuts will be padded with silence if necessary.

    :param cuts: a :class:`CutSet` used to load the features.
    :param pad_direction: where to apply the padding (``right``, ``left``, or ``both``).
    :param executor: an instance of ThreadPoolExecutor or ProcessPoolExecutor; when provided,
        we will use it to read the features concurrently.
    :return: a tuple of tensors ``(features, features_lens)``.
    c                 s   r9   r:   has_featuresr?   r&   r&   r'   r<      r=   z#collate_features.<locals>.<genexpr>c                 S   s   g | ]}|j qS r&   )
num_framesr?   r&   r&   r'   r.      s    z$collate_features.<locals>.<listcomp>rJ   )rp   	directionN)allrN   tensorrL   padrM   itemnextiteremptyrC   rp   num_featuresr0   _read_featuresmap)
r   rk   rl   rm   features_lens	first_cutfeaturesr)   r$   example_featuresr&   r&   r'   collate_featuress   s   
r   Ffault_tolerantrecording_fieldc           
      C   s"  | D ]"}|du r|j sJ d|j q||s$J d| d|j qg }| D ]}|du r3|j}nt|jt||jd}|| q)| j	t
dd | D |dd	} t| ||||d
\}} }t|d jdkrot|dd}ntdd |D dddd}tj|tjd}	|r||	| fS ||	fS )a  
    Load audio samples for all the cuts and return them as a batch in a torch tensor.
    The output shape is ``(batch, time)``.
    The cuts will be padded with silence if necessary.

    :param cuts: a :class:`CutSet` used to load the audio samples.
    :param pad_direction: where to apply the padding (``right``, ``left``, or ``both``).
    :param executor: an instance of ThreadPoolExecutor or ProcessPoolExecutor; when provided,
        we will use it to read audio concurrently.
    :param fault_tolerant: when ``True``, the cuts for which audio loading failed
        will be skipped. Setting this parameter will cause the function to return a 3-tuple,
        where the third element is a CutSet for which the audio data were sucessfully read.
    :param recording_field: when specified, we will try to load recordings from a custom field with this name
        (i.e., ``cut.load_<recording_field>()`` instead of default ``cut.load_audio()``).
    :return: a tuple of tensors ``(audio, audio_lens)``, or ``(audio, audio_lens, cuts)``.
    NzMissing recording in cut zMissing custom recording field z in cut )sampling_ratec                 s   r9   r:   durationr?   r&   r&   r'   r<      r=   z collate_audio.<locals>.<genexpr>Tr   rq   preserve_id)suppress_errorsr   filter_aux_iterr   rY   g        )padding_valuec                 S   s   g | ]}| d dqS )r   rY   )	transposer#   ar&   r&   r'   r.      s    z!collate_audio.<locals>.<listcomp>   rJ   )has_recordingid
has_customnum_samplesr   r   getattrr   appendrt   rM   read_audio_from_cutsrC   shapecollate_vectorscollate_matricesr   rN   rs   int32)
r   rk   rl   r   r   r$   sample_countsr   audios
audio_lensr&   r&   r'   collate_audio   sN   
r   c           	         s   t dd | D sJ i  | D ]}|j|jjf |j< q| jtdd | D |dd} t| ||d\}}} t	|}t	|}tj
 fdd| D tjd	}tj
 fd
d| D tjd	}|rf||||| fS ||||fS )aw  
    Load video and audio for all cuts and return them as a batch in torch tensors.
    The output video shape is ``(batch, time, channel, height, width)``.
    The output audio shape is ``(batch, channel, time)``.
    The cuts will be padded with silence if necessary.

    .. note:: We expect each video to contain audio and the same number of audio channels.
        We may support padding missing channels at a later time.

    :param cuts: a :class:`CutSet` used to load the audio samples.
    :param pad_direction: where to apply the padding (``right``, ``left``, or ``both``).
    :param executor: an instance of ThreadPoolExecutor or ProcessPoolExecutor; when provided,
        we will use it to read video concurrently.
    :param fault_tolerant: when ``True``, the cuts for which video/audio loading failed
        will be skipped. Setting this parameter will cause the function to return a 5-tuple,
        where the fifth element is a CutSet for which the audio data were sucessfully read.
    :return: a tuple of tensors ``(video, video_lens, audio, audio_lens)``,
        or ``(video, video_lens, audio, audio_lens, cuts)``.
    c                 s   r9   r:   )	has_videor?   r&   r&   r'   r<      r=   z collate_video.<locals>.<genexpr>c                 s   r9   r:   r   r#   cr&   r&   r'   r<     r=   Tr   r   c                       g | ]	} |j  d  qS r    r   r?   id2lensr&   r'   r.         z!collate_video.<locals>.<listcomp>rJ   c                    r   )rY   r   r?   r   r&   r'   r.     r   )rr   r   videorp   r   rt   rM   read_video_from_cutsrN   stackrs   r   )	r   rk   rl   r   r$   videosr   r   
video_lensr&   r   r'   collate_video   s&   

r   field	pad_valuec                    s  ddl m}m} t| d t|r0tfdd| D s$J dtfdd| D S t|r|du rGt	d	 d
t
 d t
}jfdd| D }tjfdd|D tjd}t|tjd}|j }	t|g|jR }
|j t fddtjtjtjtjfD rtj |tj|
 d }t|D ]Q\}}|j }|dkrtd|n&|dkrt|	| |	n|dkr|	| d }t||	| ntd| d|ftfddtt|jD  }|||< q||fS tfdd| D S )a  
    Load custom arrays for all the cuts and return them as a batch in a torch tensor.
    The output shapes are:

        - ``(batch, d0, d1, d2, ...)`` for :class:`lhotse.array.Array` of shape ``(d0, d1, d2, ...)``.
            Note: all arrays have to be of the same shape, as we expect these represent fixed-size
            embeddings.

        - ``(batch, d0, pad_dt, d1, ...)`` for :class:`lhotse.array.TemporalArray` of shape
            ``(d0, dt, d1, ...)`` where ``dt`` indicates temporal dimension (variable-sized),
            and ``pad_dt`` indicates temporal dimension after padding (equal-sized for all cuts).
            We expect these represent temporal data, such as alignments, posteriors, features, etc.

        - ``(batch, )`` for anything else, such as int or float: we will simply stack them into
            a list and tensorize it.

    .. note:: This function disregards the ``frame_shift`` attribute of
        :class:`lhotse.array.TemporalArray` when padding; it simply pads all the arrays
        to the longest one found in the mini-batch. Because of that, the function
        will work correctly even if the user supplied inconsistent meta-data.

    .. note:: Temporal arrays of integer type that are smaller than torch.int64,
        will be automatically promoted to torch.int64.

    :param cuts: a :class:`CutSet` used to load the features.
    :param field: name of the custom field to be retrieved.
    :param pad_value: value to be used for padding the temporal arrays.
        Ignored for non-temporal array and non-array attributes.
    :param pad_direction: where to apply the padding (``right``, ``left``, or ``both``).
    :return: a collated data tensor, or a tuple of tensors ``(collated_data, sequence_lens)``.
    r   )ArrayTemporalArrayc                 3   s"    | ]}t | jjkV  qd S r:   )r   r   r   )r   first_manifestr&   r'   r<   A  s     z'collate_custom_field.<locals>.<genexpr>zCannot collate manifests of type Array with different shapes, because we don't know which dimension must be padded. Use TemporalArray manifests and try again.c                       g | ]
}t | qS r&   rN   rO   load_customr   r   r&   r'   r.   F      z(collate_custom_field.<locals>.<listcomp>Nz6Argument 'pad_value' not passed -- we will pad field 'z' with .c                    r   r&   r   r   r   r&   r'   r.   Y  r   c                    s   g | ]}|j   qS r&   r   r   )temporal_dimr&   r'   r.   [  r,   rJ   r@   c                 3   s    | ]}| kV  qd S r:   r&   )r#   drJ   r&   r'   r<   a  s    rj   leftbothr   z$Unexpected pad_direction argument: ''c                 3   s(    | ]}| kr
nt d d d V  qd S r:   )slice)r#   i)r   temporal_slicer&   r'   r<   s  s
    
c                    s   g | ]}t | qS r&   )r   r   r   r&   r'   r.   }  r,   )lhotse.arrayr   r   r   
isinstancerr   rN   r   warningswarnr   r   rs   r   rM   numelr   rC   rK   anyuint8int8int16rR   onesr0   r   
ValueErrortuplerange)r   r   r   rk   r   r   arrsarr_lenslargest_arrmaxlencollated_shapetensorsaidxr   alenhalfindicesr&   )rK   r   r   r   r   r'   collate_custom_field  sZ   %

	
&



r   c                 C   s   t dd | D sJ t dd | D sJ | | } tt| }tt| t|j|j|j	}t
| D ]\}}t|jdd||< q4|S )a5  
    Load features for all the cuts and return them as a batch in a torch tensor.
    The cuts have to be of type ``MixedCut`` and their tracks will be interpreted as individual channels.
    The output shape is ``(batch, channel, time, features)``.
    The cuts will be padded with silence if necessary.
    c                 s   r9   r:   rn   r?   r&   r&   r'   r<     r=   z1collate_multi_channel_features.<locals>.<genexpr>c                 s   s    | ]}t |tV  qd S r:   )r   r   r?   r&   r&   r'   r<     s    F)mixed)rr   rt   rv   rw   rN   rx   rC   tracksrp   ry   r0   rO   load_features)r   r}   r~   r)   r$   r&   r&   r'   collate_multi_channel_features  s   
r   r   r   matching_shapesc                    s   dd | D } t dd | D sJ dt| dd d |r-t  fd	d| D s-J d
 t|  jd | }t| D ]\}}|||d|jd f< q>|S )a  
    Convert an iterable of 1-D tensors (of possibly various lengths)
    into a single stacked tensor.

    :param tensors: an iterable of 1-D tensors.
    :param padding_value: the padding value inserted to make all tensors have the same length.
    :param matching_shapes: when ``True``, will fail when input tensors have different shapes.
    :return: a tensor with shape ``(B, L)`` where ``B`` is the number of input tensors and
        ``L`` is the number of items in the longest tensor.
    c                 S   &   g | ]}t |tjr|nt|qS r&   r   rN   rh   rO   r#   tr&   r&   r'   r.         z#collate_vectors.<locals>.<listcomp>c                 s       | ]
}t |jd kV  qdS )rY   NrC   r   r   r&   r&   r'   r<         z"collate_vectors.<locals>.<genexpr>z Expected only 1-D input tensors.c                 S   
   | j d S Nr   r   r   r&   r&   r'   <lambda>     
 z!collate_vectors.<locals>.<lambda>r@   c                 3       | ]	}|j  j kV  qd S r:   r   r   longestr&   r'   r<         
IAll tensors must have the same shape when matching_shapes is set to True.r   Nrr   rM   new_onesrC   r   r0   r   r   r   resultr   r   r&   r   r'   r     s   
r   c                    s   dd | D } t dd | D sJ dt| dd d |r-t  fd	d| D s-J d
 jt| g jR  | }t| D ]\}}|||d|jd f< q?|S )a,  
    Convert an iterable of 2-D tensors (of possibly various first dimension, but consistent second dimension)
    into a single stacked tensor.

    :param tensors: an iterable of 2-D tensors.
    :param padding_value: the padding value inserted to make all tensors have the same length.
    :param matching_shapes: when ``True``, will fail when input tensors have different shapes.
    :return: a tensor with shape ``(B, L, F)`` where ``B`` is the number of input tensors,
        ``L`` is the largest found shape[0], and ``F`` is equal to shape[1].
    c                 S   r   r&   r   r   r&   r&   r'   r.     r   z$collate_matrices.<locals>.<listcomp>c                 s   r   )r   Nr   r   r&   r&   r'   r<     r   z#collate_matrices.<locals>.<genexpr>z Expected only 2-D input tensors.c                 S   r   r   r   r   r&   r&   r'   r     r   z"collate_matrices.<locals>.<lambda>r@   c                 3   r   r:   r   r   r   r&   r'   r<     r   r   Nr   r   r   r&   r   r'   r     s   
r   r   r   c              	   C   s   d}|du rt dg}d}|du rtn|j}g }g }g }	tt| |tt||d| |D ]\}
\}}}|du r8q,|| || |	| q,|t|f}|rV||	f }|S )a  
    Loads audio data from an iterable of cuts.

    :param cuts: a CutSet or iterable of cuts.
    :param executor: optional Executor (e.g., ThreadPoolExecutor or ProcessPoolExecutor)
        to perform the audio reads in parallel.
    :param suppress_errors: when set to ``True``, will enable fault-tolerant data reads;
        we will skip the cuts and audio data for the instances that failed (and emit a warning).
        When ``False`` (default), the errors will not be suppressed.
    :param recording_field: when specified, we will try to load recordings from a custom field with this name
        (i.e., ``cut.load_<recording_field>()`` instead of default ``cut.load_audio()``).
    :param filter_aux_iter: when specified, we will iterate over this iterator and discard the elements
        for which a corresponding cut failed to load audio, if ``suppress_errors`` is set to ``True``.
        This iterator is expected to be of the same length as ``cuts``.
    :return: a tuple of two items: a list of audio tensors (with different shapes),
        and a list of cuts for which we read the data successfully.
        If ``filter_aux_iter`` is specified, it returns a 3-tuple where the third element is
        the filtered auxiliary iterator.
    TNF)r   r   )	r   r{   r0   r_   r   _read_audior   r   	from_cuts)r   rl   r   r   r   aux_requestedmap_fnr   ok_cutsaux_iter_outr)   r$   maybe_audioaux_itemansr&   r&   r'   r     s>   



r   c                 C   s   |du rt n|j }g }g }g }tt| |tt|d| D ]\}\}}	|	du r(q|	\}
}||
 || || q||t|fS )a  
    Loads audio data from an iterable of cuts.

    :param cuts: a CutSet or iterable of cuts.
    :param executor: optional Executor (e.g., ThreadPoolExecutor or ProcessPoolExecutor)
        to perform the audio reads in parallel.
    :param suppress_errors: when set to ``True``, will enable fault-tolerant data reads;
        we will skip the cuts and audio data for the instances that failed (and emit a warning).
        When ``False`` (default), the errors will not be suppressed.
    :return: a tuple of two items: a list of audio tensors (with different shapes),
        and a list of cuts for which we read the data successfully.
    Nr   )r{   r0   r_   r   _read_videor   r   r   )r   rl   r   r   r   r   r   r)   r$   	maybe_ansr   audior&   r&   r'   r     s.   

r   c                 C   s    |d u rt n|j }t|t| S r:   )r{   rB   rz   )r   rl   r   r&   r&   r'   read_features_from_cuts>  s   r   r$   c                 C   s   t |d= |du r|  }nt| |}t|ts%J d| dt| | |}|jd dkr6|d}t	
|W  d   S 1 sEw   Y  dS )z{
    Loads audio data from cut, or returns None if there was an error
    and ``suppress_errors`` was set to ``True``.
    enabledNzExpected 'getattr(cut, z)' to yield Recording, got r   rY   )r   
load_audior   r   r   typer   r   squeezerN   rO   )r$   r   r   r   attrr&   r&   r'   r   E  s   



$r   c                 C   s   t |  S r:   )rN   rO   r   )r$   r&   r&   r'   rz   Z  s   rz   c                 C   s6   t |d |  W  d   S 1 sw   Y  dS )z
    Loads video + audio data from cut, or returns None if there was an error
    and ``suppress_errors`` was set to ``True``.
    r  N)r   
load_video)r$   r   r&   r&   r'   r   ^  s   $r   )rj   NN)rj   NFN)rj   NF)Nrj   )r   F)NFNN)NFr:   )FN)F)7r   concurrent.futuresr   	functoolsr   	itertoolsr   typingr   r   r   r   r	   numpyrP   rN   torch.nnr
   lhotser   r   lhotse.audior   lhotse.audio.utilsr   
lhotse.cutr   r   lhotse.utilsr   r   r   r   rg   rK   rh   r   rf   r   collate_multi_channel_audior   rL   floatr   r   ignore_indexndarrayr   r   r   r   r   r   rz   r   r&   r&   r&   r'   <module>   s0   c
# 
I
9
j

 

(
>
,

