o
    i                     @  s   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZ d dl	Z	d dl
Zd dlmZ d dlZd dlZd dlmZ ee jjZeeejvrTejd ee d dlmZ d dlmZ d dlmZ dddZ dddZ!dd Z"e#dkr}e"  dS dS )    )annotationsN)Path)load_dotenv)PipelineConfig)
HotEncoder)Segmentbucketstr	token_keyc                 C  sV   | j ||dd  }tt|}|d }|d }dd t||D S )NBucketKeyBody
segment_idxcodec2_tokensc                 S  s&   i | ]\}}|t j|t jd  qS ))dtype)np
frombufferuint16copy).0sidb r   scripts/compare_sft_tokens.py
<dictcomp>    s   & z&load_remote_tokens.<locals>.<dictcomp>)	
get_objectreadpq
read_tableioBytesIOcolumn	to_pylistzip)s3r   r
   bodytableseg_idsrawr   r   r   load_remote_tokens   s
   r*   	shard_keywantedset[str]c                 C  s&  | j || ddd  }i }tjt|ddo}dd | D }|D ]R}t|jj	}	|	|vr3q&|
|}
tt|
 \}}|dkrPtj||d}|jd	 d
kr^|jd	dd}|jd d }td||d||	< t|t|krx n	q&W d    |S W d    |S 1 sw   Y  |S )Nz	audio.tarr   r   r)fileobjmodec                 S  s$   g | ]}|  r|jd r|qS )z.flac)isfilenameendswith)r   mr   r   r   
<listcomp>'   s   $ z'load_local_segments.<locals>.<listcomp>i>  r      T)dimkeepdim        )start_send_saudio)r   r   tarfileopenr    r!   
getmembersr   r2   stemextractfile
torchaudioload
functionalresampleshapemeanr   len)r%   r   r+   r,   r&   outtarmembersr4   r   fwavsrdurr   r   r   load_local_segments#   s6   


rQ   c               
   C  s  t  } | jddd | jddd | jdtdd |  }ttd  tj	d	d
}t
jdtjd tjd tjd dd}t|||j}tt| d |j }t|||jt|}t }d|j_d|j_t|jdd}|  d}	d}
d}d}|D ]J}||vrq}|j|| gdd}|sq}|d jd   !t"j#}|| }|
d7 }
|j$|j$kr|d7 }||k}|t%|& 7 }|' r|	d7 }	q}t(|
||	t)d|	 t*|
d dt)d| t*|d dd d S )Nz--shard-keyT)requiredz--token-keyz--limiti  )typedefaultz.envR2_BUCKET_DESTINATIONfinalsftdatar%   R2_ENDPOINT_URLR2_ACCESS_KEY_IDR2_SECRET_ACCESS_KEYauto)endpoint_urlaws_access_key_idaws_secret_access_keyregion_namez!/tmp/pipeline/xcodec2_custom.ckpt   cuda)devicer   r:   r6   )xcodec_batch_size_overrided      )samples_comparedsame_lengthbit_exact_samplesbit_exact_pctavg_token_match_pct)+argparseArgumentParseradd_argumentint
parse_argsr   PROJECT_ROOTosenvirongetboto3clientr*   r
   listsortedkeyslimitrQ   r+   setr   from_envcodecxcodec2_custom_ckptxcodec_batch_sizer   rD   encode_segmentsr   squeezecpunumpyastyper   r   rG   floatrH   allprintroundmax)apargsr   r%   remote
picked_idssegscfgenc	bit_exacttotalsame_lentoken_match_sumr   rJ   local
remote_arreqr   r   r   main9   sV   & r   __main__)r   r	   r
   r	   )r   r	   r+   r	   r,   r-   )$
__future__r   rj   r    rp   sysr>   pathlibr   rs   r   r   pyarrow.parquetparquetr   torchrC   dotenvr   __file__resolveparentro   r	   pathinsertcodecbench.pipeline.configr   codecbench.pipeline.encoderr   codecbench.pipeline.vadr   r*   rQ   r   __name__r   r   r   r   <module>   s2   

3
