o
    2wi                  "   @   s  d dl Z d dlmZmZ d dlmZ d dlmZ d dlm	Z	m
Z
 d dlZd dlZd dlmZmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ e dd Zejedddejdejddddejde dejdddeg dddejdddeg dddejdd dg d!d"ejd#d$e d%d&d'ejd(dd)d*ejd+dd,d*ejd-d e d.dejd/d0d1e d2dejd3d4dd5de!de!d6e!d7e!d8e	e! d9e d:e"d;e"d<e d=e d>e"fd?d@Z#ejedddejdAejdddBdejddCejddddDdEejddFedGdHgdGdId'ejd/d0d1e dJdejd3d4dd5dAe!dKe
e! dLe!d=e d>e f
dMdNZ$dedKedOedLe!fdPdQZ%dS )R    N)ProcessPoolExecutoras_completed)partial)Path)ListOptional)CutSetFbankFeatureExtractor)cli)MemoryRawWriter)ArrayTarWriter)Pathlikec                   C   s   dS )z-Lhotse Shar format for optimized I/O commandsN r   r   r   R/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lhotse/bin/modes/shar.pyshar   s   r   T)show_default)context_settingscutsF)existsdir_okay)typeoutdirz-az--audionone)r   wavflacmp3opusoriginalzFormat in which to export audio. Original will save in the same format as the original audio (disabled by default, enabling will make a copy of the data))defaultr   helpz-fz
--features)r   lilcomnumpyz_Format in which to export features (disabled by default, enabling will make a copy of the data)z-cz--customzCustom fields to export. Use syntax NAME:FORMAT, e.g.: -c target_recording:flac -c embedding:numpy. Use format options for audio and features depending on the custom fields type, or 'jsonl' for metadata.)multipler   r    z-sz--shard-sizei  z%The number of cuts in a single shard.)r   r   r    z--shuffle/--no-shufflez8Should we shuffle the cuts before splitting into shards.)r   r    z--fault-tolerant/--fast-failzDShould we skip over cuts that failed to load data or raise an error.z--seedzRandom seed.z-jz
--num-jobs   zNumber of parallel workers. We recommend to keep this number low on machines with slow disks as the speed of I/O will likely be the bottleneck.z-vz	--verbose)countaudiofeaturescustom
shard_sizeshufflefault_tolerantseednum_jobsverbosec                 C   s   t | } |r| jt|d} i }|dkr||d< |dkr"||d< |r4|D ]}|d\}}|||< q&t|jddd | j||||	||
d d	S )
aV  
    Export CutSet from CUTS into Lhotse Shar format in OUTDIR.

    This script partitions the input manifest into smaller pieces called shards
    with SHARD_SIZE cuts per shard. The input is optionally shuffled.
    In addition to sharding, the user can choose to export AUDIO or FEATURES
    into sequentially readable tar files with a selected compression type.
    This typically yields very high speedups vs random read formats such as HDF5,
    especially on slower disks or clusters, at the expense of a data copy.

    The result is readable in Python using: CutSet.from_shar(OUTDIR)
    )rngr   	recordingr'   :T)parentsexist_ok)
output_dirfieldsr)   r-   r+   r.   N)	r   	from_filer*   randomRandomsplitr   mkdirto_shar)r   r   r&   r'   r(   r)   r*   r+   r,   r-   r.   r5   itemkeyfmtr   r   r   export   s*   
L

r?   shar_dir)r   	file_okayz--feature-configzTOptional manifest specifying feature extractor configuration (use Fbank by default).)r   r    z--compressionr!   r"   z>Which compression to use (lilcom is lossy, numpy is lossless).zNumber of parallel workers.feature_configcompressionc                 C   s   dd t | dD }dd }|r'tdt| d ttjdt|d	}g }t|B}|D ])}	|	d
 d }
|
j	dd }|

d| d}||jtt|	|||d q0|t|D ]}|  q`W d   dS 1 srw   Y  dS )az  
    Compute features for Lhotse Shar cuts stored in SHAR_DIR.

    The features are computed sequentially on CPU within shards,
    and parallelized across shards up to NUM_JOBS concurrent workers.

    FEATURE_CONFIG defines the feature extractor type and settings.
    You can generate default feature extractor settings with:
    lhotse feat write-default-config --help
    c              
   S   s2   g | ]}|g| d d|jd dggdqS ) r0   r   .tar)r   r0   )	with_namejoinsuffixes).0pr   r   r   
<listcomp>   s    z$compute_features.<locals>.<listcomp>zcuts.*.jsonl*c                 S   s   | S )Nr   )xr   r   r   <lambda>   s    z"compute_features.<locals>.<lambda>zComputing features for z shards.zShard progress)desctotalr   r   .r$   z	features.rE   )r   rB   output_pathrC   N)r   globclickecholenr   tqdmr   namer9   rF   appendsubmitcompute_features_one_shardr   	from_sharr   result)r@   rB   rC   r-   r.   shardsprogbarfuturesexshard	cuts_path	shard_idxrQ   fr   r   r   compute_features}   s4   $
	
"re   rQ   c                 C   s   |d ur	t |nt }t }t|d |d }| D ]}|||}|j|j| |j	d qW d    d S 1 s9w   Y  d S )N)r)   rC   )r=   valuemanifest)
r
   	from_yamlr	   r   r   compute_and_store_featureswriteidload_featuresr'   )r   rB   rQ   rC   	extractor	in_memorywritercutr   r   r   rZ      s   
"rZ   )&r7   concurrent.futuresr   r   	functoolsr   pathlibr   typingr   r   rS   rV   lhotser   r	   r
   lhotse.bin.modes.cli_baser   lhotse.features.ior   lhotse.sharr   lhotse.utilsr   groupr   commanddictargumentoptionChoiceintstrboolr?   re   rZ   r   r   r   r   <module>   s    
	
"3
0