o
    iK                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZm	Z	 e
ddG dd dejjjZe
ddG dd dejjjZdS )	    N)tables)extract_fbankload_audio_text_image_videodataset_classesOpenAIDatasetc                       j   e Zd ZdZ					ddededef fdd	Zd
d Zdd Z	dd Z
dd ZddefddZ  ZS )r   
    SenseVoiceDataset
    N        index_dsint_pad_valuefloat_pad_valuec                      t    tj|}||fi || _|dd }	|	r,tj|	}
|
di |d}	|	| _|dd }|rGtj|}|di |d}|| _|| _	|d u rSdn|j
| _
d| _|| _|| _|| _|dd| _|d	d
| _|d| _|d| _d| _|dd| _d| _ddlm} t| j	|rd| _td| _|dd| _|dd| _|dd| _|dd| _|dd| _ d S )Npreprocessor_speechpreprocessor_speech_confpreprocessor_textpreprocessor_text_conf>  soundsos<|startoftranscript|>eos<|endoftext|>
batch_size
batch_typer   retryd   FWhisperFrontendT)(<\|startofspeech\|>.*?<\|endofspeech\|>)max_token_lengthi   batch_size_scale_ratio_max      ?batch_size_token_max	  audio_adaptor_downsample_rate   audio_encoder_downsample_rate    )!super__init__r   index_ds_classesgetr   preprocessor_classesr   r   frontendfs	data_type	tokenizerr   r   r   r   r   r   prompt_ids_lenr   permute!funasr.frontends.whisper_frontendr   
isinstancerecompilepatternr    r!   r#   r%   r'   selfpathr   r/   r2   r   r   kwargsindex_ds_classr   preprocessor_speech_classr   preprocessor_text_classr   	__class__r)   \/home/ubuntu/.local/lib/python3.10/site-packages/funasr/datasets/openai_datasets/datasets.pyr+      J   

zOpenAIDataset.__init__c                 C      | j | }| j |S Nr   get_source_lenr;   indexitemr)   r)   rC   rH   F      
zOpenAIDataset.get_source_lenc                 C   rE   rF   r   get_target_lenrI   r)   r)   rC   rN   J   rL   zOpenAIDataset.get_target_lenc                 C   
   t | jS rF   lenr   r;   r)   r)   rC   __len__N      
zOpenAIDataset.__len__c           &      C   sx  d }t | jD ]}d}|dkr|}ntdt| jd }| j| }|d }|d }|d }	g g g g g g f\}
}}}}}tt|||	D ]\}\}}}d| d| d	}| j	
|}g }g }g }g }t|D ]\}}|d
s| j|}||7 }|dgt| 7 }qi|d
ddd}|dr'zt|dd  | jd}W n# ty } ztdt| dt   d}W Y d }~qid }~ww t|| j| jdd\}} | jr|ddd}| jdkrd| d  d d d  }!d|!d d d  }!n| jdkr| d  }!|!d | j d }"dg|" }t|g}||7 }|dgt| 7 }qi|r,qFdgt| }#| d}| j|}$|
||$ 7 }
||#|$ 7 }||7 }|| qFt|
| jkrotdt|
 d| j d|  d}|rsqtj |
tj!d}
tj dgt|
 tj"d}%tj |tj!d}|dd d d d f }| }tj |tj#d}tj |tj"d}|||||
|%|d} |S |S )NFr   r)   systemuser	assistant<|im_start|>system
<|im_end|>
<|im_start|>user
!<|im_end|>
<|im_start|>assistant
<|startofspeech|> <|endofspeech|>!   r0   Loading wav failed! , Tr1   r/   is_finalr&   r(      
<|im_end|>input_ids > max_token_length: >dtype)speechspeech_lengths
fbank_mask	fbank_beg	input_idsattention_mask
labels_ids)$ranger   torchrandintrQ   r   rK   	enumeratezipr9   split
startswithr2   encodereplacer   r0   	Exceptionloggingerrorstr	traceback
format_excr   r1   r/   r4   r'   r%   appendr    infotensorint64int32float32)&r;   rJ   outputidxbadcase_flag	index_currK   rU   rV   rW   rp   labelsfbank
fbank_lensrn   ro   isystem_promptuser_prompt
target_outsource_inputsplits
source_idsfbank_mask_ifbank_beg_ifbank_lens_iksub_str	sub_tokendata_srcerl   rm   olenssub_token_lensource_mask
target_idsrq   r)   r)   rC   __getitem__Q   s   







	zOpenAIDataset.__getitem__samplesc                 C   s:  t | jD ]}d}i }|D ]}|d u rq| D ]}||vr"g ||< || ||  qq| D ]0\}}t|d tjra|d jtj	ksM|d jtj
krQ| j}n| j}tjjjj|d|d||< q1| jdkr|d j\}	}
|	dkr|	|
 | jkrtd| d	|	 d
|
 d|	|
  d| j d |d d }q |S |S NFr   T)batch_firstpadding_valueexamplerp   r_   z	Warning, z	th, b*t: *=z > batch_size_sample_max: z, drop last datar	   )rs   r   keysr   itemsr6   rt   Tensorrk   r   r   r   r   nnutilsrnnpad_sequencer   shaper#   r}   r   r;   r   r   r   outputssamplekey	data_list	pad_valuebtr)   r)   rC   collator   s<    


(zOpenAIDataset.collatorNNNr	   r
   rF   __name__
__module____qualname____doc__r   intfloatr+   rH   rN   rS   r   listr   __classcell__r)   r)   rA   rC   r   
   s&    6oOpenAIDatasetMultiTurnc                       r   )r   r   Nr	   r
   r   r   r   c                    r   )Nr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   Fr   Tr   r    i  r!   r"   r#   r$   multiturn_num_max   max_source_lengthi  r)   )!r*   r+   r   r,   r-   r   r.   r   r   r/   r0   r1   r2   r   r   r   r   r   r   r3   r   r4   r5   r   r6   r7   r8   r9   r    r!   r#   r   r   r:   rA   r)   rC   r+      rD   zOpenAIDatasetMultiTurn.__init__c                 C   rE   rF   rG   rI   r)   r)   rC   rH   $  rL   z%OpenAIDatasetMultiTurn.get_source_lenc                 C   rE   rF   rM   rI   r)   r)   rC   rN   (  rL   z%OpenAIDatasetMultiTurn.get_target_lenc                 C   rO   rF   rP   rR   r)   r)   rC   rS   ,  rT   zOpenAIDatasetMultiTurn.__len__c           )      C   s  d }t | jD ]}d}|dkr|}ntdt| jd }| j| }|d }|d }|d }	g g g g g g g f\}
}}}}}}tt|||	D ]^\}\}}}|| j	krX nPt|
| j
krstdt|
 d| j
 d	|   n5|dkrd
| d| d}nd| d}| j|}g }g }g }d}d}g }t|D ]\}}|ds| j|}||7 }|dgt| 7 }q|dddd}|dr_zt|dd  | jd} W n# ty }! ztdt|! d	t   d}W Y d }!~!qd }!~!ww t| | j| jdd\}"}#|#| jkrtd|# d| j d	|  d}| jr)|"ddd}"d|#d  d d d  }$d|$d d d  }$|$d d d }dg| }%t|}||%7 }|dgt|% 7 }q|rdqH||t|
 g7 }||g7 }dgt| }&| d}| j|}'|
||' 7 }
||&|' 7 }||"dd d d d f  ||7 }||# qH|rqtj |
tj!d}
tj dgt|
 tj"d}(tj |tj!d}tj |tj#d}tj |tj"d}tj |tj"d}||||||
|(|d} |S |S )NFr   r)   rU   rV   rW   rh   ri   rb   rX   rY   rZ   z<|im_start|>user
r	   r[   r\   r]   r^   r_   r`   ra   Trc   z$speech_lengths > max_source_length: r&   re   rf   rg   rj   )rl   rm   rn   ro   fake_token_lenrp   rq   rr   )$rs   r   rt   ru   rQ   r   rK   rv   rw   r   r    r}   r   r9   rx   ry   r2   rz   r{   r   r0   r|   r~   r   r   r   r   r1   r/   r   r4   r   r   r   r   r   ))r;   rJ   r   r   r   r   rK   rU   rV   rW   rp   r   r   r   rn   ro   r   r   r   r   r   r   r   r   fbank_ir   fake_token_len_ir   r   r   r   r   r   r   rl   rm   r   
fake_tokenr   r   rq   r)   r)   rC   r   /  s   










z"OpenAIDatasetMultiTurn.__getitem__r   c                 C   s`  t | jD ]}d}i }|D ]2}|d u rq| D ]&}||vr"g ||< t|| ttfr5|| ||  q|| ||  qq| D ]0\}}t|d t	j
rt|d jt	jks`|d jt	jkrd| j}n| j}t	jjjj|d|d||< qD| jdkr|d j\}	}
|	dkr|	|
 | jkrtd| d	|	 d
|
 d|	|
  d| j d |d d }q |S |S r   )rs   r   r   r6   r   tupleextendr   r   rt   r   rk   r   r   r   r   r   r   r   r   r   r   r#   r}   r   r   r)   r)   rC   r     s@    


(zOpenAIDatasetMultiTurn.collatorr   rF   r   r)   r)   rA   rC   r      s(    6 )r}   r7   rt   randomr   funasr.registerr   funasr.utils.load_utilsr   r   registerr   dataDatasetr   r   r)   r)   r)   rC   <module>   s    
 
^