o
    ॵi.                     @   s   d dl Z d dlZd dlZd dlZd dlmZ d dlZd dlZd dl	Z	d dl
Z
d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ dd	lmZmZmZ dd
lmZ ddlmZ G dd dZdS )    N)path)Image)File)OFATokenizerOFATokenizerZH)
load_image)Trie   )_get_kaldi_fbank_get_torchaudio_fbankconvert_waveform)OFA_TASK_KEY_MAPPING)set_torch_seedc                   @   sr   e Zd ZdZdd ZdddZeddd	Zed
d Zdd Z	dd Z
dd Z		dddZdejfddZdS )OfaBasePreprocessorz#
    OFA base preprocessor for
    c                 O   s.  || _ || _| j jdd| _tj|rtj|}| jdkr&t	
|}n| jdv r1t
|}nt|dd tdD  |dd tdD  | j jd	d
dkr\|ddg || _t|jg| _t|jg| _t|jg| _dd |  D  | _| _|jdd| _|jdd| _|jdd| _| j jdd| _| j jdd| _| j jdd}t j!"| t#| | j jdd}|rg d| _$g d| _%n
g d| _$g d| _%| j jdd| _&d d t'| j j( D | _)t*| j d!r| j j+j)d"ur| j j+j) D ]
\}	}
|
| j)|	< qt,-d#d t.j/D | _0d"| _1| j jd$d"rt23|| j jj4}t5|d%d&d'}t67|}W d"   n	1 sPw   Y  || _8d(d | j8 D | _9t:|j| _1t;|< D ]\}}| j=d)| ddd*}| j1>|jg|?  |jg  qod"| _@d"| _Ad"S )+zpreprocess the data via the vocab.txt from the `model_dir` path

        Args:
            cfg(modelscope.utils.config.ConfigDict) : model config
            model_dir (str): model path
        languageen)zhcnc                 S      g | ]}d  |qS )z	<code_{}>format.0i r   U/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/preprocessors/ofa/base.py
<listcomp>1       z0OfaBasePreprocessor.__init__.<locals>.<listcomp>i    c                 S   r   )z<bin_{}>r   r   r   r   r   r   2   r   i  multimodal_typedefaulttext2sqlz>=z<=c                 S      i | ]\}}||qS r   r   )r   keyvaluer   r   r   
<dictcomp>9   s    z0OfaBasePreprocessor.__init__.<locals>.<dictcomp>max_src_length   max_tgt_lengthmax_image_sizei   prompt_typenoneseed   imagenet_default_mean_and_stdF)g
ףp=
?gv/?gCl?)gZd;O?gy&1?g?)      ?r.   r.   patch_image_sizei  c                 S   s   i | ]}||qS r   r   r   r"   r   r   r   r$   N   s    datasetNc                 S   s   i | ]}|d qS Nr   r0   r   r   r   r$   W   s    
answer2labelrzutf-8)encodingc                 S   r!   r   r   )r   kvr   r   r   r$   _   r    )add_bosadd_eos)Bcfgmodemodelgetr   osr   existsabspathr   from_pretrainedr   NotImplementedError
add_tokensrange	tokenizertorch
LongTensorbos_token_idbos_itempad_token_idpad_itemeos_token_ideos_item	get_vocabitemstgt_dictsrc_dictr%   r'   r(   r)   nprandomr+   r   meanstdr/   r   task
column_maphasattrr1   str	maketransstringpunctuationtranstabconstraint_trieospjoinr3   openjsonload	ans2label	label2ansr   	enumeratekeystokenize_textinserttolisttrain_audio_feature_transformstest_audio_feature_transforms)selfr;   	model_dirr<   argskwargsrF   r+   r-   r6   r7   ans2label_filereaderans2label_dictr   answeranswer_itemr   r   r   __init__   s   









zOfaBasePreprocessor.__init__Tc                 C   sX   |du rdS | j || jddddd d}|r t| j|g}|r*t|| jg}|S )a  
        Using `OFATokenizer` to tokenize text input.

        Args:
            text (`str`): Input text.
            add_bos ('bool', **optional**, default to `True`)
                Whether or not to add beginning of sentence token in
                the front of sentence.
            add_eos ('bool', **optional**, default to `True`)
                Whether or not to add ending of sentence token in
                the end of sentence.
        Returns:
            A list of tokens with the max length of `max_src_length + 2`
        NFTpt)
max_lengthadd_special_tokens
truncationreturn_tensors	input_idsr   )rF   r%   squeezerG   catrJ   rN   )rn   textr9   r:   inputsr   r   r   ri   k   s$   z!OfaBasePreprocessor.tokenize_textNc                 C   s|   |   ddddddd} tdd| } | d} | d} | d}|d	ur<t||kr<d	|d	| } | S )
a  
        Preprocessing for text sentence.

        step 1. Get the lower case of input text.
        step 2. Remove the words within `,.!?*#:;~ ` in the beginning
            of the sentence.
        step 3. Replace the words within `-/` or pattern `\s{2,}` with word ` `
            and replace tag `<person>` with `person`.
        step 4. Remove the `\n` in the end of the sentence.
        step 5. Split the sentence with token ` `, If `max_words` is not None,
            make a length truncation.

        Args:
            caption (`str`): Input text.
            max_words (`int`, **optional**, default `None`):
                The max length of input text. If None, do nothing, else
                make a truncation.

        Returns:
            A sequence of `str`.
        	,.!?*#:;~-r8   /z<person>person\s{2,}
N
lowerlstripreplaceresubrstripstripsplitlenra   )caption	max_wordscaption_wordsr   r   r   pre_caption   s   


zOfaBasePreprocessor.pre_captionc                 C   sl   |   ddddd} tdd| } | d} | d} | d}t||kr4d	|d| } | S )aM  
        Preprocessing for text sentence.
        Note that this function is very similar to `pre_caption`, should be merged in the future version.

        step 1. Get the lower case of input text.
        step 2. Remove the words within `,.!?*#:;~ ` in the beginning
            of the sentence.
        step 3. Replace the words within `-/` or pattern `\s{2,}` with word ` `.
        step 4. Remove the `\n` in the end of the sentence.
        step 5. Split the sentence with token ` `, If `max_words` is not None,
            make a length truncation.

        Args:
            question (`str`): Input text.
            max_ques_words (`int`, **optional**, default `None`):
                The max length of input text. If None, do nothing, else
                make a truncation.

        Returns:
            A sequence of `str`.
        r   r   r8   r   r   r   Nr   )questionmax_ques_wordsquestion_wordsr   r   r   pre_question   s$   


z OfaBasePreprocessor.pre_questionc           	      C   s   |d }| | jjdd }| jrPtt|t| jf	 }t|| }t
|t|D ]}| j |||   }| j|}d|| |< q.||d< dS dS )z&
        Add constraint mask.
        targetr   )dimTconstraint_maskN)nerL   sumitemr_   rG   zerosr   rQ   boolrE   rJ   rk   get_next_layer)	rn   sample
target_itmlen_label_itmr   	start_idxr   constraint_prefix_tokenconstraint_nodesr   r   r   add_constraint_mask   s&   z'OfaBasePreprocessor.add_constraint_maskc                 C   s    t |tjr
|}|S t|}|S )av  
        Get the pillow image. If the input is not a pillow image ,it will load
        image from a local path or an external url.

        Args:
            path_or_url_or_pil (`Union[str, Image]`):
                Can be:
                    - A path or url reference to an image
                    - A pillow image.
        Returns:
            A pillow image.
        )
isinstancer   r   )rn   path_or_url_or_pilimager   r   r   get_img_pil   s
   zOfaBasePreprocessor.get_img_pilc                 C   sN   t |trt|}|S t |trt|}t|}|S tdt| d)NzUnsupported input type: .)	r   bytesioBytesIOrZ   r   read	TypeErrortype)rn   path_or_urlaudio_bytes
file_bytesr   r   r   get_audio_bytes   s   




z#OfaBasePreprocessor.get_audio_bytes>  Fc           	      C   s   t j||dt|gdt|gg\}}t||ddd\}}|d }| }t||d}|d u r6t||d}|d u r>td|rK| j	d urK| 	|}n| rZ| 
|d urZ| 
|}t| }| |}|S )NspeedrateT)to_mononormalize_volumei   P   zGPlease install pyKaldi or torchaudio to enable fbank feature extraction)
torchaudiosox_effectsapply_effects_tensorrZ   r   numpyr
   r   ImportErrorrl   rm   rG   
from_numpyfloatpack_frames)	rn   waveformsample_rater   target_sample_rateis_train	_waveform_fbankr   r   r   prepare_fbank  s6   



z!OfaBasePreprocessor.prepare_fbankfeaturec                 C   sB   | j jdkr|S |jd | j j }|d | j j|  }||dS )Nr	   r   )r;   n_frames_per_stepshapereshape)rn   r   n_packed_framesr   r   r   r   '  s
   zOfaBasePreprocessor.pack_frames)TTr2   )r   F)__name__
__module____qualname____doc__rw   ri   staticmethodr   r   r   r   r   r   rG   Tensorr   r   r   r   r   r      s    
N(
)
r   )r   r?   r   r\   r   r`   rc   r   rS   rG   r   PILr   modelscope.fileior   !modelscope.models.multi_modal.ofar   r   modelscope.preprocessors.imager   modelscope.utils.trier   utils.audio_helperr
   r   r   utils.constantr   utils.random_helpr   r   r   r   r   r   <module>   s$   