o
    SiHn                     @   s  d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZ d dlZd dlZd dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZmZ G dd dZ			d?dede de
e de
ej! deej"ej"f f
ddZ#				d@dede de
e de$de
e  deeej"ej"f eej"ej"ef f fddZ%e%Z&					dAdede$de de
e de$de
e  fddZ'		dBdede d ede(e)f de deej"eej"ej"f f f
d!d"Z*dedej"fd#d$Z+e j,ddfd%eeej"ej-f  d&ee(e)f de d'e$dej"f
d(d)Z.	 	dCd%eeej"ej-f  d&ee(e)f d'e$dej"fd*d+Z/	 				dDdee de
e d,e$de
e  d-e
e deee	ej" ef ee	ej" ee	f f fd.d/Z0				dEdee de$de
e d,e$de
e  dee	ej" e	ej" ef fd0d1Z1	dFdee de
e de	ej" fd2d3Z2	dGd4ed,e$de
e  de
ej" fd5d6Z3d4edej"fd7d8Z4			dHd4ede$d,e$de
e  de
eej"e
ej" f  f
d9d:Z5	;dIded<e dej"fd=d>Z6dS )J    N)Executor)partial)repeat)IterableListOptionalTupleUnion)CrossEntropyLoss)CutSet	Recording)suppress_audio_loading_errors)suppress_video_loading_errors)CutMixedCut)DEFAULT_PADDING_VALUEcompute_num_samplesc                   @   s   e Zd ZdZ						ddeded	ed
edededefddZdedee	j
e	j
f fddZde	jde	jdee fddZdS )TokenCollatera  Collate list of tokens

    Map sentences to integers. Sentences are padded to equal length.
    Beginning and end-of-sequence symbols can be added.
    Call .inverse(tokens_batch, tokens_lens) to reconstruct batch as string sentences.

    Example:
        >>> token_collater = TokenCollater(cuts)
        >>> tokens_batch, tokens_lens = token_collater(cuts.subset(first=32))
        >>> original_sentences = token_collater.inverse(tokens_batch, tokens_lens)

    Returns:
        tokens_batch: IntTensor of shape (B, L)
            B: batch dimension, number of input sentences
            L: length of the longest sentence
        tokens_lens: IntTensor of shape (B,)
            Length of each sentence after adding <eos> and <bos>
            but before padding.
    T<pad><bos><eos><unk>cutsadd_eosadd_bos
pad_symbol
bos_symbol
eos_symbol
unk_symbolc           
      C   s   || _ || _|| _|| _|| _|| _dd |D }||g|r!|gng  |r(|gng  t| }	dd t|	D | _dd |	D | _	d S )Nc                 S   s"   h | ]}|j d  jD ]}|q
qS r   )supervisionstext).0cutchar r%   L/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/dataset/collation.py	<setcomp>9   s   " z)TokenCollater.__init__.<locals>.<setcomp>c                 S   s   i | ]\}}||qS r%   r%   )r"   idxtokenr%   r%   r&   
<dictcomp>A       z*TokenCollater.__init__.<locals>.<dictcomp>c                 S   s   g | ]}|qS r%   r%   r"   r)   r%   r%   r&   
<listcomp>B   s    z*TokenCollater.__init__.<locals>.<listcomp>)
r   r   r   r   r   r   sorted	enumerate	token2idx	idx2token)
selfr   r   r   r   r   r   r   tokenstokens_uniquer%   r%   r&   __init__'   s"   
zTokenCollater.__init__returnc                    sv   dd |D }t t|t d  fdd|D }ttjfdd|D tjd}tfdd|D }||fS )Nc                 S   s"   g | ]}d  dd |jD qS ) c                 s       | ]}|j V  qd S N)r!   )r"   supervisionr%   r%   r&   	<genexpr>F       z4TokenCollater.__call__.<locals>.<listcomp>.<genexpr>)joinr    r"   r#   r%   r%   r&   r-   E   s    z*TokenCollater.__call__.<locals>.<listcomp>keyc                    sL   g | ]"}j rjgng t| jrjgng  jg t|   qS r%   )r   r   listr   r   r   lenr"   seqmax_lenr2   r%   r&   r-   K   s    c                    s   g | ]} fd d|D qS )c                       g | ]} j | qS r%   )r0   r,   r2   r%   r&   r-   U   r+   z5TokenCollater.__call__.<locals>.<listcomp>.<listcomp>r%   rC   rH   r%   r&   r-   U   s    dtypec                    s(   g | ]}t |t j t j qS r%   )rB   intr   r   rC   rH   r%   r&   r-   [   s    )rB   maxtorch
from_numpynparrayint64	IntTensor)r2   r   token_sequencesseqstokens_batchtokens_lensr%   rE   r&   __call__D   s&   
zTokenCollater.__call__rU   rV   c                    s,    j rdnd fddt||D }|S )N   r   c              
      s:   g | ]\}}d   fdd||t j  D qS ) c                    rG   r%   )r1   )r"   r(   rH   r%   r&   r-   i   s    z4TokenCollater.inverse.<locals>.<listcomp>.<listcomp>)r=   rK   r   )r"   tokens_listendr2   startr%   r&   r-   g   s    
z)TokenCollater.inverse.<locals>.<listcomp>)r   zip)r2   rU   rV   	sentencesr%   r\   r&   inversec   s
   	zTokenCollater.inverseN)TTr   r   r   r   )__name__
__module____qualname____doc__r   boolstrr5   r   rM   TensorrW   
LongTensorrR   r   r`   r%   r%   r%   r&   r      s>    
r   rightr   pad_directionexecutorfeatures_dtyper6   c           
      C   s   t dd | D sJ tjdd | D tjd}| jt| |d} tt| }tj	t
| |j|j|d}|du rMt| D ]
\}}t|||< q>||fS t|t| D ]\}}	|	||< qU||fS )a1  
    Load features for all the cuts and return them as a batch in a torch tensor.
    The output shape is ``(batch, time, features)``.
    The cuts will be padded with silence if necessary.

    :param cuts: a :class:`CutSet` used to load the features.
    :param pad_direction: where to apply the padding (``right``, ``left``, or ``both``).
    :param executor: an instance of ThreadPoolExecutor or ProcessPoolExecutor; when provided,
        we will use it to read the features concurrently.
    :return: a tuple of tensors ``(features, features_lens)``.
    c                 s   r8   r9   has_featuresr>   r%   r%   r&   r;      r<   z#collate_features.<locals>.<genexpr>c                 S   s   g | ]}|j qS r%   )
num_framesr>   r%   r%   r&   r-      s    z$collate_features.<locals>.<listcomp>rI   )ro   	directionN)allrM   tensorrK   padrL   itemnextiteremptyrB   ro   num_featuresr/   _read_featuresmap)
r   rj   rk   rl   features_lens	first_cutfeaturesr(   r#   example_featuresr%   r%   r&   collate_featuress   s   
r   Ffault_tolerantrecording_fieldc           
      C   s"  | D ]"}|du r|j sJ d|j q||s$J d| d|j qg }| D ]}|du r3|j}nt|jt||jd}|| q)| j	t
dd | D |dd	} t| ||||d
\}} }t|d jdkrot|dd}ntdd |D dddd}tj|tjd}	|r||	| fS ||	fS )a  
    Load audio samples for all the cuts and return them as a batch in a torch tensor.
    The output shape is ``(batch, time)``.
    The cuts will be padded with silence if necessary.

    :param cuts: a :class:`CutSet` used to load the audio samples.
    :param pad_direction: where to apply the padding (``right``, ``left``, or ``both``).
    :param executor: an instance of ThreadPoolExecutor or ProcessPoolExecutor; when provided,
        we will use it to read audio concurrently.
    :param fault_tolerant: when ``True``, the cuts for which audio loading failed
        will be skipped. Setting this parameter will cause the function to return a 3-tuple,
        where the third element is a CutSet for which the audio data were sucessfully read.
    :param recording_field: when specified, we will try to load recordings from a custom field with this name
        (i.e., ``cut.load_<recording_field>()`` instead of default ``cut.load_audio()``).
    :return: a tuple of tensors ``(audio, audio_lens)``, or ``(audio, audio_lens, cuts)``.
    NzMissing recording in cut Missing custom recording field  in cut )sampling_ratec                 s   r8   r9   durationr>   r%   r%   r&   r;      r<   z collate_audio.<locals>.<genexpr>Tr   rp   preserve_id)suppress_errorsr   filter_aux_iterr   rX   g        )padding_valuec                 S   s   g | ]}| d dqS )r   rX   )	transposer"   ar%   r%   r&   r-      s    z!collate_audio.<locals>.<listcomp>   rI   )has_recordingid
has_customnum_samplesr   r   getattrr   appendrs   rL   read_audio_from_cutsrB   shapecollate_vectorscollate_matricesr   rM   rr   int32)
r   rj   rk   r   r   r#   sample_countsr   audios
audio_lensr%   r%   r&   collate_audio   sN   
r   T
with_audioc                    sp  | D ]3}|du r|j sJ d|j q||s$J d| d|j t||j s5J d| d|j qi  | D ]%}|du rG|j}|j}nt||j}t|jt||j}||j	f |j< q:| j
tdd | D |d	d
} t| |||d\}	}
} t|	}	tj fdd| D tjd}|rt|
}
tj fdd| D tjd}nd\}
}|r|	||
|| fS |	||
|fS )aw  
    Load video and audio for all cuts and return them as a batch in torch tensors.
    The output video shape is ``(batch, time, channel, height, width)``.
    The output audio shape is ``(batch, channel, time)``.
    The cuts will be padded with silence if necessary.

    .. note:: We expect each video to contain audio and the same number of audio channels.
        We may support padding missing channels at a later time.

    :param cuts: a :class:`CutSet` used to load the audio samples.
    :param with_audio: should the audio data be loaded.
    :param pad_direction: where to apply the padding (``right``, ``left``, or ``both``).
    :param executor: an instance of ThreadPoolExecutor or ProcessPoolExecutor; when provided,
        we will use it to read video concurrently.
    :param fault_tolerant: when ``True``, the cuts for which video/audio loading failed
        will be skipped. Setting this parameter will cause the function to return a 5-tuple,
        where the fifth element is a CutSet for which the audio data were sucessfully read.
    :param recording_field: when specified, we will try to load recordings from a custom field with this name
        (i.e., ``cut.load_<recording_field>()`` instead of default ``cut.load_video()``).
    :return: a tuple of tensors ``(video, video_lens, audio, audio_lens)``,
        or ``(video, video_lens, audio, audio_lens, cuts)``.
    Nz&Missing video in the recording of cut r   r   z(Missing video in custom recording field z of cut c                 s   r8   r9   r   r"   cr%   r%   r&   r;     r<   z collate_video.<locals>.<genexpr>Tr   )r   rk   r   c                       g | ]	} |j  d  qS )rX   r   r>   id2lensr%   r&   r-   "      z!collate_video.<locals>.<listcomp>rI   c                    r   r   r   r>   r   r%   r&   r-   &  r   )NN)	has_videor   r   r   videor   r   r   r   ro   rs   rL   read_video_from_cutsrM   stackrr   r   )r   r   rj   rk   r   r   r#   r   r   videosr   
video_lensr   r%   r   r&   collate_video   sX   

r   field	pad_valuec                    s:  ddl m}m} ddlm} t| d t|r6tfdd| D s*J dt	fdd| D S t|r|d	u rMt
d
 dt d t}jfdd| D }tjfdd|D tjd}t|tjd}	|	j }
t|g|	jR }|	j t fddtjtjtjtjfD rtj |tj| d }t|D ]Q\}}|j }|dkrtd|n&|dkrt|
| |
n|dkr|
| d }t||
| ntd| d|ftfddtt|jD  }|||< q||fS t|rt | S tt!rt"| |dS tfdd| D S )a  
    Load custom arrays for all the cuts and return them as a batch in a torch tensor.
    The output shapes are:

        - ``(batch, d0, d1, d2, ...)`` for :class:`lhotse.array.Array` of shape ``(d0, d1, d2, ...)``.
            Note: all arrays have to be of the same shape, as we expect these represent fixed-size
            embeddings.

        - ``(batch, d0, pad_dt, d1, ...)`` for :class:`lhotse.array.TemporalArray` of shape
            ``(d0, dt, d1, ...)`` where ``dt`` indicates temporal dimension (variable-sized),
            and ``pad_dt`` indicates temporal dimension after padding (equal-sized for all cuts).
            We expect these represent temporal data, such as alignments, posteriors, features, etc.

        - ``(batch, )`` for anything else, such as int or float: we will simply stack them into
            a list and tensorize it.

    .. note:: This function disregards the ``frame_shift`` attribute of
        :class:`lhotse.array.TemporalArray` when padding; it simply pads all the arrays
        to the longest one found in the mini-batch. Because of that, the function
        will work correctly even if the user supplied inconsistent meta-data.

    .. note:: Temporal arrays of integer type that are smaller than torch.int64,
        will be automatically promoted to torch.int64.

    :param cuts: a :class:`CutSet` used to load the features.
    :param field: name of the custom field to be retrieved.
    :param pad_value: value to be used for padding the temporal arrays.
        Ignored for non-temporal array and non-array attributes.
    :param pad_direction: where to apply the padding (``right``, ``left``, or ``both``).
    :return: a collated data tensor, or a tuple of tensors ``(collated_data, sequence_lens)``.
    r   )ArrayTemporalArray)Imagec                 3   s"    | ]}t | jjkV  qd S r9   )r   r   r   )r   first_manifestr%   r&   r;   ]  s     z'collate_custom_field.<locals>.<genexpr>zCannot collate manifests of type Array with different shapes, because we don't know which dimension must be padded. Use TemporalArray manifests and try again.c                       g | ]
}t | qS r%   rM   rN   load_customr   r   r%   r&   r-   b      z(collate_custom_field.<locals>.<listcomp>Nz6Argument 'pad_value' not passed -- we will pad field 'z' with .c                    r   r%   r   r   r   r%   r&   r-   u  r   c                    s   g | ]}|j   qS r%   r   r   )temporal_dimr%   r&   r-   w  r+   rI   r?   c                 3   s    | ]}| kV  qd S r9   r%   )r"   drI   r%   r&   r;   }  s    ri   leftbothr   z$Unexpected pad_direction argument: ''c                 3   s(    | ]}| kr
nt d d d V  qd S r9   )slice)r"   i)r   temporal_slicer%   r&   r;     s
    
)r   rj   c                    s   g | ]}t | qS r%   )r   r   r   r%   r&   r-     r+   )#lhotse.arrayr   r   lhotse.imager   r   
isinstancerq   rM   r   warningswarnr   r   rr   r   rL   numelr   rB   rJ   anyuint8int8int16rQ   onesr/   r   
ValueErrortuplerangecollate_imagesr   r   )r   r   r   rj   r   r   r   arrsarr_lenslargest_arrmaxlencollated_shapetensorsaidxr   alenhalfindicesr%   )rJ   r   r   r   r   r&   collate_custom_field1  sd   %

	
&




r   c                 C   s   t dd | D sJ t dd | D sJ | | } tt| }tt| t|j|j|j	}t
| D ]\}}t|jdd||< q4|S )a5  
    Load features for all the cuts and return them as a batch in a torch tensor.
    The cuts have to be of type ``MixedCut`` and their tracks will be interpreted as individual channels.
    The output shape is ``(batch, channel, time, features)``.
    The cuts will be padded with silence if necessary.
    c                 s   r8   r9   rm   r>   r%   r%   r&   r;     r<   z1collate_multi_channel_features.<locals>.<genexpr>c                 s   s    | ]}t |tV  qd S r9   )r   r   r>   r%   r%   r&   r;     s    F)mixed)rq   rs   ru   rv   rM   rw   rB   tracksro   rx   r/   rN   load_features)r   r|   r}   r(   r#   r%   r%   r&   collate_multi_channel_features  s   
r   r   r   matching_shapesc                    s   dd | D } t dd | D sJ d|dvrtd| t| dd	 d
 |r8t  fdd| D s8J d t|  jd | }t| D ] \}}|dkr]|||d|jd f< qI||||jd  df< qI|S )a@  
    Convert an iterable of 1-D tensors (of possibly various lengths)
    into a single stacked tensor.

    :param tensors: an iterable of 1-D tensors.
    :param padding_value: the padding value inserted to make all tensors have the same length.
    :param pad_direction: where to apply the padding (``right`` or ``left``).
    :param matching_shapes: when ``True``, will fail when input tensors have different shapes.
    :return: a tensor with shape ``(B, L)`` where ``B`` is the number of input tensors and
        ``L`` is the number of items in the longest tensor.
    c                 S   &   g | ]}t |tjr|nt|qS r%   r   rM   rg   rN   r"   tr%   r%   r&   r-         z#collate_vectors.<locals>.<listcomp>c                 s       | ]
}t |jd kV  qdS )rX   NrB   r   r   r%   r%   r&   r;         z"collate_vectors.<locals>.<genexpr>z Expected only 1-D input tensors.)r   ri   z-pad_direction must be 'left' or 'right', got c                 S   
   | j d S Nr   r   r   r%   r%   r&   <lambda>     
 z!collate_vectors.<locals>.<lambda>r?   c                 3       | ]	}|j  j kV  qd S r9   r   r   longestr%   r&   r;         
IAll tensors must have the same shape when matching_shapes is set to True.r   ri   N)rq   r   rL   new_onesrB   r   r/   )r   r   rj   r   resultr   r   r%   r   r&   r     s*   
r   c                    s   dd | D } t dd | D sJ dt| dd d |r-t  fd	d| D s-J d
 jt| g jR  | }t| D ]\}}|||d|jd f< q?|S )a,  
    Convert an iterable of 2-D tensors (of possibly various first dimension, but consistent second dimension)
    into a single stacked tensor.

    :param tensors: an iterable of 2-D tensors.
    :param padding_value: the padding value inserted to make all tensors have the same length.
    :param matching_shapes: when ``True``, will fail when input tensors have different shapes.
    :return: a tensor with shape ``(B, L, F)`` where ``B`` is the number of input tensors,
        ``L`` is the largest found shape[0], and ``F`` is equal to shape[1].
    c                 S   r   r%   r   r   r%   r%   r&   r-     r   z$collate_matrices.<locals>.<listcomp>c                 s   r   )r   Nr   r   r%   r%   r&   r;     r   z#collate_matrices.<locals>.<genexpr>z Expected only 2-D input tensors.c                 S   r   r   r   r   r%   r%   r&   r     r   z"collate_matrices.<locals>.<lambda>r?   c                 3   r   r9   r   r   r   r%   r&   r;     r   r   Nr   )rq   rL   r   rB   r   r/   )r   r   r   r   r   r   r%   r   r&   r     s   
r   r   r   c              	   C   s   d}|du rt dg}d}|du rtn|j}g }g }g }	tt| |tt||d| |D ]\}
\}}}|du r8q,|| || |	| q,|t|f}|rV||	f }|S )a  
    Loads audio data from an iterable of cuts.

    :param cuts: a CutSet or iterable of cuts.
    :param executor: optional Executor (e.g., ThreadPoolExecutor or ProcessPoolExecutor)
        to perform the audio reads in parallel.
    :param suppress_errors: when set to ``True``, will enable fault-tolerant data reads;
        we will skip the cuts and audio data for the instances that failed (and emit a warning).
        When ``False`` (default), the errors will not be suppressed.
    :param recording_field: when specified, we will try to load recordings from a custom field with this name
        (i.e., ``cut.load_<recording_field>()`` instead of default ``cut.load_audio()``).
    :param filter_aux_iter: when specified, we will iterate over this iterator and discard the elements
        for which a corresponding cut failed to load audio, if ``suppress_errors`` is set to ``True``.
        This iterator is expected to be of the same length as ``cuts``.
    :return: a tuple of two items: a list of audio tensors (with different shapes),
        and a list of cuts for which we read the data successfully.
        If ``filter_aux_iter`` is specified, it returns a 3-tuple where the third element is
        the filtered auxiliary iterator.
    TNF)r   r   )	r   rz   r/   r^   r   _read_audior   r   	from_cuts)r   rk   r   r   r   aux_requestedmap_fnr   ok_cutsaux_iter_outr(   r#   maybe_audioaux_itemansr%   r%   r&   r      s>   



r   c              
   C   s   |du rt n|j }g }g }g }tt| |tt|||d| D ]\}	\}
}|du r*q|\}}|| || ||
 q||t|fS )a  
    Loads audio data from an iterable of cuts.

    :param cuts: a CutSet or iterable of cuts.
    :param with_audio: should the audio data be loaded.
    :param executor: optional Executor (e.g., ThreadPoolExecutor or ProcessPoolExecutor)
        to perform the audio reads in parallel.
    :param suppress_errors: when set to ``True``, will enable fault-tolerant data reads;
        we will skip the cuts and audio data for the instances that failed (and emit a warning).
        When ``False`` (default), the errors will not be suppressed.
    :param recording_field: when specified, we will try to load recordings from a custom field with this name
        (i.e., ``cut.load_<recording_field>()`` instead of default ``cut.load_video()``).
    :return: a tuple of two items: a list of audio tensors (with different shapes),
        and a list of cuts for which we read the data successfully.
    N)r   r   r   )rz   r/   r^   r   _read_videor   r   r   )r   r   rk   r   r   r   r   r   r   r(   r#   	maybe_ansr   audior%   r%   r&   r   <  s2   

r   c                 C   s    |d u rt n|j }t|t| S r9   )rz   rA   ry   )r   rk   r   r%   r%   r&   read_features_from_cutsn  s   r  r#   c                 C   s   t |d= |du r|  }nt| |}t|ts%J d| dt| | |}|jd dkr6|d}t	
|W  d   S 1 sEw   Y  dS )z{
    Loads audio data from cut, or returns None if there was an error
    and ``suppress_errors`` was set to ``True``.
    enabledNExpected 'getattr(cut, )' to yield Recording, got r   rX   )r   
load_audior   r   r   typer   r   squeezerM   rN   )r#   r   r   r  attrr%   r%   r&   r   u  s   



$r   c                 C   s   t |  S r9   )rM   rN   r   )r#   r%   r%   r&   ry     s   ry   c                 C   s   t |d6 |du r| j|dW  d   S t| |}t|ts-J d| dt| | j||dW  d   S 1 s>w   Y  dS )z
    Loads video + audio data from cut, or returns None if there was an error
    and ``suppress_errors`` was set to ``True``.
    r  N)r   r  r  )r   
load_videor   r   r   r  r   )r#   r   r   r   r
  r%   r%   r&   r     s   


$r   imageimage_fieldc                    s     fdd| D }t |}|S )a8  
    Load images for all cuts and return them as a batch in a torch tensor.
    The output image shape is ``(batch, height, width, channel)``.

    :param cuts: a :class:`CutSet` used to load the images.
    :param image_field: the field in the cut to load the images from.
    :return: tensor of collated imagesc                    r   r%   )rM   	as_tensorr   r>   r  r%   r&   r-     r   z"collate_images.<locals>.<listcomp>)rM   r   )r   r  imagesr%   r  r&   r     s   
r   )ri   NN)ri   NFN)Tri   NFN)Nri   )r   F)NFNN)TNFNr9   )FN)TFN)r  )7r   concurrent.futuresr   	functoolsr   	itertoolsr   typingr   r   r   r   r	   numpyrO   rM   torch.nnr
   lhotser   r   lhotse.audior   lhotse.audio.utilsr   
lhotse.cutr   r   lhotse.utilsr   r   r   rf   rJ   rg   r   re   r   collate_multi_channel_audior   rK   floatr   r   ignore_indexndarrayr   r   r   r   r  r   ry   r   r   r%   r%   r%   r&   <module>   sj   c
# 
I
T
o

)

(
>
3


