o
    wi                     @   s  d dl Zd dlZd dlmZmZ d dlmZ d dlm	Z
 d dlmZ d dlmZ d dlmZ d dlmZ ed	 G d
d dejjjZedkrd dlZe Zejddd ejdedg dd ejddd ejdddgd ejdedd ejdedd e Z ee j!dZ!e!"ej#Z!e!j$Z$ee j!ddZ%e&d d!dd"d#dd$Z'ee'd dee$e%d%e%d&Z(ej)j*e!e(e
j+e j,e j-e j.e j/d'dd(d d)d*d+g dd,e0ej1j2d-d.dd/ e j3dure!4e j3 dS dS dS )0    N)collate_matricescollate_vectors)	OmegaConf)	lightning)speechlm)!get_lhotse_dataloader_from_config)AutoTokenizer)HFAutoModelForSpeechSeq2Seqmediumc                       s&   e Zd Zd fdd	Zdd Z  ZS )LhotseHfNeMoDatasetc                    s    t    || _|| _|| _d S )N)super__init__	processor	tokenizerdecoder_mask_fill)selfr   r   r   	__class__ ^/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/tests/collections/speechlm/hf/sft.pyr      s   

zLhotseHfNeMoDataset.__init__c              	   C   s   g }|D ]}|  }|| j||jd|jd jd qtdd |D d}tdd |D d}|d d d df }||| j	k| j
j}|d d d	d f d}|||d
S )Nptr   )sampling_ratereturn_tensorstextc                 S   s   g | ]	}|d   dqS )input_featuresr   )squeeze).0fr   r   r   
<listcomp>1   s    z3LhotseHfNeMoDataset.__getitem__.<locals>.<listcomp>)tensorsc                 S   s   g | ]}|j d  jqS )r   )supervisionstokens)r   cr   r   r   r   2   s       )r   labelsdecoder_input_ids)
load_audioappendr   r   r!   r   r   r   masked_fillr   r   pad_idreshape)r   cutsfeaturescutaudior   r&   r'   r   r   r   __getitem__$   s,   
	zLhotseHfNeMoDataset.__getitem__)r   )__name__
__module____qualname__r   r1   __classcell__r   r   r   r   r      s    r   __main__z--modelzopenai/whisper-large-v3)defaultz
--strategyauto)r8   ddpfsdp)typer7   choicesz	--devicesr%   z--acceleratorgpu)r7   r<   z--max-stepsd   )r;   r7   z--model-save-path)
model_nameT)include_special_tokensz;/home/TestData/speechlm/lhotse/libri/libri-train-5.jsonl.gzi>        )	cuts_pathsample_rateshufflenum_workers
batch_sizeshuffle_buffer_size)r   r   )global_rank
world_sizedatasetr   z
bf16-mixedg        
   g      ?F)devices	max_stepsacceleratorstrategy	precisionlog_every_n_stepslimit_val_batchesnum_sanity_val_stepsaccumulate_grad_batchesgradient_clip_valuse_distributed_sampler	callbacksloggergh㈵>)lr)modeldatatraineroptimlog)5fiddlefdltorchlhotse.dataset.collationr   r   	omegaconfr   nemor   nlnemo.collectionsr   #nemo.collections.common.data.lhotser   =nemo.collections.common.tokenizers.huggingface.auto_tokenizerr    nemo.collections.speechlm.modelsr	   set_float32_matmul_precisionutilsr\   Datasetr   r2   argparseArgumentParserparseradd_argumentstrint
parse_argsargsr[   tofloatr   r   createconfigtrain_dataloaderapifinetuneTrainerrM   rN   rO   rP   buildadampytorch_adam_with_flat_lrmodel_save_pathsave_pretrainedr   r   r   r   <module>   s   
#
@