o
    ॵiZ                     @   sD   d dl mZmZ d dlZd dlmZ ddlmZ G dd deZdS )    )AnyDictN)ModeKeys   )OfaBasePreprocessorc                       s   e Zd ZdZejf fdd	Zdeee	f deee	f fddZ
deee	f deee	f f fdd	Zdeee	f deee	f f fd
dZdd Z  ZS )OfaSummarizationPreprocessorz3
    OFA preprocessor for summarization tasks.
    c                    s&   t t| j|||g|R i | dS )zpreprocess the data

        Args:
            cfg(modelscope.utils.config.ConfigDict) : model config
            model_dir (str): model path,
            mode: preprocessor mode (model mode)
        N)superr   __init__)selfcfg	model_dirmodeargskwargs	__class__ ^/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/preprocessors/ofa/summarization.pyr	      s   
z%OfaSummarizationPreprocessor.__init__datareturnc                 C   s    | j tjkr| |S | |S )N)r   r   TRAIN_build_train_sample_build_infer_sample)r
   r   r   r   r   __call__   s   

z%OfaSummarizationPreprocessor.__call__c                    s   |  |}|d  }t j|| jd}|dddd}| j|dd|d< | |d d	d
  }t	
| j|g|d< |S )a  
        Building training samples.

        step 1. Preprocess the data using the logic of `_build_infer_sample`
            and make sure the label data in the result.
        step 2. Preprocess the label data. Contains:
            - Get the lower case of label, and using `pre_caption` function
            to do the str preprocessing as new input label.
            - Tokenize the new input label as `target` for model input.
            - Add noise to the `target`
            - Calculate the `prev_output_tokens` from noise `target` for model input.

        Args:
            data (`Dict[str, Any]`): Input data, should contains the key of `image`, `prompt` and
                `label`, `image` refers the image input data, `prompt` refers the text input data
                and the `label` is the supervised data for training.
        Return:
            A dict object, contains source, image, mask, label, target tokens,
            and previous output tokens data.
        label	max_words[unk]unk<unk>F)add_bostargetNprev_output_tokens)r   lowerr   pre_captionmax_tgt_lengthreplacetokenize_textadd_noise_to_tgtclonetorchcatbos_item)r
   r   sample
target_strr!   noise_target_itemr   r   r   r   %   s   
z0OfaSummarizationPreprocessor._build_train_samplec                    s   t  j|| jd  | jd}|dddd}| jjdd}||}| 	|}| j
dkr3| j}n| j
d	kr?|d
d }nt||d}d| jv r[| jd |v r[|| jd  |d< |S )a   
        Building inference samples.

        step 1. Preprocessing the input text via `pre_cation` function, see more
            details from the doc of `pre_cation`.
        step 2. Uniform the unknown token, such as `<unk>` -> `unk` and `<unk>` -> `unk`.
        step 3. Get the prompt from input, concatenate with the input text, as new input.
        step 4. Tokenize the input text and generate the decoder prompt.
        step 5. Determine Whether or not to add labels to the sample.

        Args:
            data (`Dict[str, Any]`): Input data, should contains the key of `image` and `prompt`,
                the former refers the image input data, and the later refers the text input data.
        Return:
            A dict object, contains text, decoder prompt and label data.
        textr   r   r   r   promptz, " {} " Summarize the article with a title: noneprev_outputNr"   )sourcedecoder_promptsummaryr   )r   r%   
column_mapmax_src_lengthr'   r   modelgetformatr(   prompt_typer-   NotImplementedError)r
   r   r5   r2   r1   inputsr6   r.   r   r   r   r   E   s(   



z0OfaSummarizationPreprocessor._build_infer_samplec                 C   sh   t |d | jjddk }t jdt| j	| jjdd | jjdd |
 fd	||< |S )
a  
        Add noise token to the target sentence.

        step 1. Sampling from uniform distribution to randomly select the
            noise indices.
        step 2. Sampling from normal distribution as noise token to replace
            the relative token in the target.

        Args:
            target: A sequence of tokens.
        Returns:
            A sequence of tokens.
        r   noise_ratiog           	num_codesi    num_binsi  )size)r+   FloatTensorrD   uniform_r   r:   r;   randintlensrc_dictsum)r
   r!   noise_indicesr   r   r   r)   k   s   
z-OfaSummarizationPreprocessor.add_noise_to_tgt)__name__
__module____qualname____doc__r   	INFERENCEr	   r   strr   r   r   r   r)   __classcell__r   r   r   r   r   
   s    "&& &r   )	typingr   r   r+   modelscope.utils.constantr   baser   r   r   r   r   r   <module>   s
   