o
    i[=                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZm	Z	 e
ddG dd dejjjZe
ddG dd dejjjZdS )	    N)tables)extract_fbankload_audio_text_image_videodataset_classesSenseVoiceDatasetc                       t   e Zd ZdZ					ddededef fdd	Zd
d Zdd Z	dd Z
dd ZddefddZdddZ  ZS )r   z
    SenseVoiceDataset
    N        index_dsint_pad_valuefloat_pad_valuec                    :  t    tj|}||fi || _|dd }	|	r,tj|	}
|
di |d}	|	| _|dd }|rGtj|}|di |d}|| _|| _	|d u rSdn|j
| _
d| _|| _|| _|| _|dd| _|d	d
| _|d| _|d| _d| _|dd| _d| _ddlm} t| j	|rd| _d S d S Npreprocessor_speechpreprocessor_speech_confpreprocessor_textpreprocessor_text_confi>  soundsosz<|startoftranscript|>eosz<|endoftext|>
batch_size
batch_typer   retry   F)WhisperFrontendT super__init__r   index_ds_classesgetr
   preprocessor_classesr   r   frontendfs	data_type	tokenizerr   r   r   r   r   r   prompt_ids_lenr   permute!funasr.frontends.whisper_frontendr   
isinstanceselfpathr
   r"   r%   r   r   kwargsindex_ds_classr   preprocessor_speech_classr   preprocessor_text_classr   	__class__r   a/home/ubuntu/.local/lib/python3.10/site-packages/funasr/datasets/sense_voice_datasets/datasets.pyr      @   


zSenseVoiceDataset.__init__c                 C      | j | }| j |S Nr
   get_source_lenr+   indexitemr   r   r3   r8   ?      
z SenseVoiceDataset.get_source_lenc                 C   r5   r6   r
   get_target_lenr9   r   r   r3   r>   C   r<   z SenseVoiceDataset.get_target_lenc                 C   
   t | jS r6   lenr
   r+   r   r   r3   __len__G      
zSenseVoiceDataset.__len__c                 C   s  d }t | jD ]5}|dkr|}ntdt| jd }| j| }|d }z	t|| jd}W n! t	yP } zt
dt| dt   W Y d }~qd }~ww | jr\| j|| jd}t|| j| jdd\}	}
|
| jkrnq| jrx|	dd	d
}	|d }| jr| |}|dd}|dd}t| jtr| j | | }| jj|dd}n| | }| jj|dd}| jg| }t|d
 }|| _| jj|dd}t|d
 }|dkrqt| jtr| jj| jdd}n| jg}|| | }t|}tj|tjd}tj|gtjd}dg| d
g|  d
g }t|}tj|tj d}tj|gtjd}|	dd d d d f |
||||d} |S |S )Nr   r   sourcer#   Loading wav failed! , Tr$   r"   is_final      targetpromptz<|ASR|>text_language<|zh|>allallowed_special   dtype)speechspeech_lengthstexttext_lengthstarget_masktarget_mask_lengths)!ranger   torchrandintrA   r
   r;   r   r#   	Exceptionloggingerrorstr	traceback
format_excr   r   r$   r"   r   r'   r   r    r)   r   r%   encoder&   r   tensorint64int32float32)r+   r:   outputidx	index_curr;   rE   data_srcerW   rX   rM   taskrO   rN   
prompt_idsr&   
target_idstarget_ids_lenr   idsids_lengthsrY   rZ   r[   r\   r   r   r3   __getitem__J   sz   



zSenseVoiceDataset.__getitem__samplesc              	   C   s  i }|D ]}|d u rq|  D ]}||vrg ||< || ||  qqt|dk rtd tjdtjdd d d d d f tjdgtj	dd d d f tjdgtj	dd d d f tjdgtj	dd d d f tdg| j
 dgd  dg gd d d f d}|S | D ]0\}}t|d tjr|d jtjks|d jtj	kr| j}n| j}tjjjj|d	|d
||< q| jdkrtdD ]	}| j||d}q|S )NrL   ERROR: data is empty!
      rU   rz     r   )rW   rX   rY   rZ   r[   Tbatch_firstpadding_valueexamplei)keysappendrA   ra   rb   r^   randrj   rg   ri   r&   itemsr)   TensorrV   rh   r   r   nnutilsrnnpad_sequencer   r]   _filter_badcaser+   rw   outputssamplekey	data_list	pad_valuer   r   r   r3   collator   sf   
 


"
 


zSenseVoiceDataset.collatorr   c              
   C   s"  |d j \}}}|| | jd krtddd }|dk r d}td||  d| j d| d	|  | D ]\}}|| ||| d ||< q8|d
   }	|d d d d |	d d f |d< |d   }
|d d d d |
f |d< |d   }|d d d d |f |d< |S )NrW         ?r   rK   r   Warning, b * t:  > , drop half data th, beg:rX   rZ   rY   r\   r[   	shaper   r^   r_   r;   ra   infor   max)r+   r   r   bt_begr   r   speech_lengths_maxtext_lengths_maxtarget_mask_lengths_maxr   r   r3   r      s"    "z!SenseVoiceDataset._filter_badcaseNNNr   r	   r6   r   __name__
__module____qualname____doc__rc   intfloatr   r8   r>   rC   rv   listr   r   __classcell__r   r   r1   r3   r      s(    .Q8SenseVoiceCTCDatasetc                       r   )r   z
    SenseVoiceCTCDataset
    Nr   r	   r
   r   r   c                    r   r   r   r*   r1   r   r3   r      r4   zSenseVoiceCTCDataset.__init__c                 C   r5   r6   r7   r9   r   r   r3   r8     r<   z#SenseVoiceCTCDataset.get_source_lenc                 C   r5   r6   r=   r9   r   r   r3   r>   "  r<   z#SenseVoiceCTCDataset.get_target_lenc                 C   r?   r6   r@   rB   r   r   r3   rC   &  rD   zSenseVoiceCTCDataset.__len__c                 C   s   d }t | jD ]}|dkr|}ntdt| jd }| j| }|d }z	t|| jd}W n! t	yO } zt
dt| dt   W Y d }~qd }~ww | jr[| j|| jd}t|| j| jdd\}	}
|
| jkrmq| jrw|	dd	d
}	|d }| jr| |}|dd}|dd}|dd}|dd}| jj|dd}t|}|dkrq| jj|dd}| jj|dd}| jj|dd}| jj|dd}|| | | | }t|}tj|tjd}tj|gtjd}|	dd d d d f |
||d} |S |S )Nr   r   rE   rF   rG   rH   TrI   rK   rL   rM   
emo_targetz<|NEUTRAL|>event_targetz
<|Speech|>rO   rP   with_or_wo_itnz	<|woitn|>rQ   rR   rT   rU   rW   rX   rY   rZ   )r]   r   r^   r_   rA   r
   r;   r   r#   r`   ra   rb   rc   rd   re   r   r   r$   r"   r   r'   r   r    r%   rf   rg   rh   ri   )r+   r:   rk   rl   rm   r;   rE   rn   ro   rW   rX   
asr_targetr   r   rO   punc_itn_bottomrr   rs   lid_idsemo_ids	event_idspunc_itn_bottom_idsrt   ru   rY   rZ   r   r   r3   rv   )  sb   



z SenseVoiceCTCDataset.__getitem__rw   c                 C   sp  i }|D ]}|d u rq|  D ]}||vrg ||< || ||  qqt|dk rntd tjdtjdd d d d d f tjdgtj	dd d d f tjdgtj	dd d d f tjdgtj	dd d d f d}|S |
 D ]0\}}t|d tjr|d jtjks|d jtj	kr| j}n| j}tjjjj|d	|d
||< qr| jdkrtdD ]	}| j||d}q|S )NrL   rx   ry   rU   rz   r|   r   r   Tr}   r   r   )r   r   rA   ra   rb   r^   r   rj   rg   ri   r   r)   r   rV   rh   r   r   r   r   r   r   r   r]   r   r   r   r   r3   r   g  s`   
 


 


zSenseVoiceCTCDataset.collatorr   c              
   C   s   |d j \}}}|| | jd krytddd }|dk r d}td||  d| j d| d	|  | D ]\}}|| ||| d ||< q8|d
   }	|d d d d |	d d f |d< |d   }
|d d d d |
f |d< |S )NrW   r   r   rK   r   r   r   r   r   rX   rZ   rY   r   )r+   r   r   r   r   r   r   r   r   r   r   r   r   r3   r     s    "z$SenseVoiceCTCDataset._filter_badcaser   r6   r   r   r   r   r1   r3   r      s(    .>5)ra   rer^   randomrd   funasr.registerr   funasr.utils.load_utilsr   r   registerr   dataDatasetr   r   r   r   r   r3   <module>   s    
 
_