o
    ॵi                     @   sD   d dl mZmZ d dlZd dlmZ ddlmZ G dd deZdS )    )AnyDictN)ModeKeys   )OfaBasePreprocessorc                       s   e Zd ZdZejf fdd	Zdeee	f deee	f fddZ
dd	 Zdeee	f deee	f fd
dZdeee	f deee	f fddZ  ZS )!OfaTextClassificationPreprocessorz9
    OFA preprocessor for text classification tasks.
    c                    s&   t t| j|||g|R i | dS )zpreprocess the data

        Args:
            cfg(modelscope.utils.config.ConfigDict) : model config
            model_dir (str): model path,
            mode: preprocessor mode (model mode)
        N)superr   __init__)selfcfg	model_dirmodeargskwargs	__class__ d/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/preprocessors/ofa/text_classification.pyr	      s   
z*OfaTextClassificationPreprocessor.__init__datareturnc                 C   s    | j tjkr| |S | |S )N)r   r   TRAIN_build_train_sample_build_infer_sample)r
   r   r   r   r   __call__   s   

z*OfaTextClassificationPreprocessor.__call__c                 C   sf   d |d    d| j }d |d    d| j }d}|||}| |}|S )aE  
        Building text classification task's instruction.

        The `data` should contains key `text` and `text2`, and the final instruction
        is like  ` can text1 " {} " imply text2 " {} "?`, the first `{}` refer to
        the value of `text` and the latter refer to `text2`

        step 1. Preprocess for input text `text` and `text2` in `data`.
            - Do lower, stripe and restrict the maximum length as `max_src_length`.
        step 2. Using instruction template to generate the final instruction.
        step 3. Tokenize the instruction as result.
         textNtext2z% can text1 " {} " imply text2 " {} "?)joinlowerstripsplitmax_src_lengthformattokenize_text)r
   r   text1r   promptr   instruction_itmr   r   r   _build_instruction%   s   
z4OfaTextClassificationPreprocessor._build_instructionc                 C   s   |  |}d|v sJ d|d }| jr| j| }| jd| dd}| jdkr+|}n| jdkr<t|dd	 |g}ntt| j|d
d	 g}| j|d
t	| < |||d}| 
| |S )a  
        Building training samples.

        step 1. Building instruction for text classification using `_build_instruction`.
        step 2. If the `label` is not text, transfer it to text using `label2ans`.
        step 3. Tokenize the label data.
        step 4. Concatenate the instruction and label tokens as the target item.
            - padding the instruction tokens from target item as `target`.
            - remove the eos token from target item as `prev_output_tokens`.
        step 5. Add constraint mask.

        Args:
            data (`Dict[str, Any]`): Input data, should contains the key of `text`, `text2`
                and `label`, both of them refer to a text input, and the target of this job
                is to find whether or not `text` imply `text2`, the `label` is the supervised
                data for training.
        Return:
            A dict object, contains source text input, target tokens and previous output
            tokens and constraint mask.
        labelz-there must has `label` column in train phase r   F)add_bosnoneprev_outputr   N)sourcetargetprev_output_tokens)r'   	label2ansr#   prompt_typetorchcatNotImplementedErrorbos_itempad_itemlenadd_constraint_mask)r
   r   r&   r(   	label_itm
target_itmprev_output_itmsampler   r   r   r   ;   s&   




z5OfaTextClassificationPreprocessor._build_train_samplec                 C   sr   |  |}| jdkrg }| j}n| jdkr"|dd }|dd }nt|||d}d|v r7| j|d  |d< |S )aT  
        Building inference samples.

        step 1. Building instruction for text classification using `_build_instruction`.
        step 2. Whether or not to add `prefix_token`.
        step 3. Whether or not to add `label` data.

        Args:
            data (`Dict[str, Any]`): Input data, should contains the key of `text` and `text2`,
                both of them refer to a text input, and the target of this job is to find
                whether or not `text` imply `text2`.
        Return:
            A dict object, contains source text input, prefix tokens and label data.
        r*   r+   Nr,   )r-   prefix_tokendecoder_promptr(   )r'   r1   r5   r4   r0   )r
   r   r&   r=   r>   r<   r   r   r   r   f   s   


z5OfaTextClassificationPreprocessor._build_infer_sample)__name__
__module____qualname____doc__r   	INFERENCEr	   r   strr   r   r'   r   r   __classcell__r   r   r   r   r   
   s    ""*+r   )	typingr   r   r2   modelscope.utils.constantr   baser   r   r   r   r   r   <module>   s
   