o
    i                     @   sR   d dl Z d dlZd dlmZ d dlmZmZ eddG dd de jj	j
ZdS )    N)tables)extract_fbankload_audio_text_image_videodataset_classesAudioLLMVicunaDatasetc                       sj   e Zd ZdZ					ddededef fdd	Zd
d Zdd Z	dd Z
dd ZddefddZ  ZS )r   z
    AudioLLMDataset
    N        index_dsint_pad_valuefloat_pad_valuec                    s  t    tj|}||fi || _|dd }	|	r-tj|	}
|
di |di }	|	| _|dd }|rItj|}|di |di }|| _|| _	|d u rUdn|j
| _
d| _|| _|| _|dd| _d	| _|d
d| _| j| _|dd| _|dd| _d| _d| _d S )Npreprocessor_speechpreprocessor_speech_confpreprocessor_textpreprocessor_text_confi>  soundpromptzTranscribe speech to text. IGNORE_INDEXiaudio_adaptor_downsample_rate   audio_encoder_downsample_rate   zUSER: {}
 ASSISTANT:z{} )super__init__r   index_ds_classesgetr	   preprocessor_classesr   r   frontendfs	data_type	tokenizerr   r   	prompt_afr   r
   r   r   prompt_templateanswer_template)selfpathr	   r   r!   r
   r   kwargsindex_ds_classr   preprocessor_speech_classr   preprocessor_text_class	__class__r   `/home/ubuntu/.local/lib/python3.10/site-packages/funasr/datasets/llm_datasets_vicuna/datasets.pyr      s8   



zAudioLLMVicunaDataset.__init__c                 C      | j | }| j |S N)r	   get_source_lenr%   indexitemr   r   r-   r0   8      
z$AudioLLMVicunaDataset.get_source_lenc                 C   r.   r/   )r	   get_target_lenr1   r   r   r-   r5   <   r4   z$AudioLLMVicunaDataset.get_target_lenc                 C   s
   t | jS r/   )lenr	   )r%   r   r   r-   __len__@   s   
zAudioLLMVicunaDataset.__len__c              
   C   s  | j | }|d }t|| jd}| jr| j|| jd}t|| j| jdd\}}|d}|jd d | j	 | j
 }t|fd}|d }	| jrL| |	}	| j| j| _| j| j}
t|
}| j|	 }d	| j|}| j|}|| jjg }t|}tj|tjd
}t||f}|d}| j|	 }d	| j|}| j|}t|| jjg }tj|tjd
}t||f}d|d || < |d}| j|| < dg| dg|  }tj|tj d
}| j|	}tj|tjd
}tjt|gtj!d
}|||||||||d	S )Nsource)r   T)r    r   is_finalr      r   targetz{}{})dtype)	speechspeech_lengthstexttext_lengths	input_idsattention_mask
labels_ids
label_mask
audio_mask)"r	   r   r   r   r   r    r   squeezeshaper   r   torchfullr   r#   formatr   
prompt_prer!   encoder6   r$   lowerpad_token_idtensorint64catgecopydeepcopyeos_token_idr   float32int32)r%   r2   r3   r8   data_srcr=   r>   audio_pseudo_lengthaudio_pseudor;   prompt_ids_preprompt_pre_lengthinputprompt_inputprompt_input_idsrA   input_ids_lengthrB   answerprompt_answerprompt_answer_idsrC   rD   rE   idsr?   r@   r   r   r-   __getitem__C   sl   





z!AudioLLMVicunaDataset.__getitem__samplesc                 C   s   i }|D ]}|  D ]}||vrg ||< || ||  q
q| D ]0\}}t|d tjrS|d jtjks?|d jtjkrC| j	}n| j
}tjjjj|d|d||< q#|S )Nr   T)batch_firstpadding_value)keysappenditems
isinstancerH   Tensorr<   rP   rW   r
   r   nnutilsrnnpad_sequence)r%   rf   outputssamplekey	data_list	pad_valuer   r   r-   collator   s"    

zAudioLLMVicunaDataset.collator)NNNr   r   r/   )__name__
__module____qualname____doc__strintfloatr   r0   r5   r7   re   listrw   __classcell__r   r   r+   r-   r      s&    *I)rH   rS   funasr.registerr   funasr.utils.load_utilsr   r   registerro   dataDatasetr   r   r   r   r-   <module>   s    
