o
    ॵi                     @   sh   d dl mZmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ ddlmZ G d	d
 d
eZdS )    )AnyDictN)Image)
transforms)
load_image)ModeKeys   )OfaBasePreprocessorc                       s   e Zd ZdZejf fdd	Zdeee	f deee	f fddZ
deee	f deee	f fdd	Zdeee	f deee	f fd
dZ  ZS )&OfaVisualQuestionAnsweringPreprocessorz5
    OFA preprocessor for question answer tasks.
    c              	      sf   t t| j|||g|R i | tdd tj| j| jftjjdt	 tj
| j| jdg| _dS )zpreprocess the data

        Args:
            cfg(modelscope.utils.config.ConfigDict) : model config
            model_dir (str): model path,
            mode: preprocessor mode (model mode)
        c                 S   s
   |  dS )NRGB)convert)image r   j/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/preprocessors/ofa/visual_question_answering.py<lambda>#   s   
 zAOfaVisualQuestionAnsweringPreprocessor.__init__.<locals>.<lambda>)interpolation)meanstdN)superr
   __init__r   ComposeResizepatch_image_sizeInterpolationModeBICUBICToTensor	Normalizer   r   patch_resize_transform)selfcfg	model_dirmodeargskwargs	__class__r   r   r      s$   
z/OfaVisualQuestionAnsweringPreprocessor.__init__datareturnc                 C   s    | j tjkr| |S | |S )N)r!   r   TRAIN_build_train_sample_build_infer_sample)r   r&   r   r   r   __call__+   s   

z/OfaVisualQuestionAnsweringPreprocessor.__call__c                 C   s  |  |}| jd|d ddd}| jdkr,t| j|g}t|dd | jg}n<| jdkrGt|d	 |g}t|dd | jg}n!| jd
krft|d	 dd |g}t|dd | jg}nt| j	j
|dt| d < ||d< ||d< | jdurtt|t| jf }t|t| d }tt|t| d t|D ]}| j g|||   }	| j|	}
d|| |
< q||d< |S )a&  
        Building training samples.

        step 1. Preprocess the data using the logic of `_build_infer_sample`
            and make sure the label data in the result.
        step 2. Preprocessing the label data to generate `target` and `prev_output_token`.
            - add blank in the front out label data and tokenize it as `target` item.
            - if `prompt_type` is `None`, add the bos token as previous output tokens,
            add eos tokens as target items.
            - if `prompt_type` is `src`, concatenate source text input with target item as
            previous output tokens, remove the bos token and add eos token as target items.
            - if `prompt_type` is `prev_output`, just like the `prompt_type` is src, the
            difference is that it will remove the eos token in source text input in this
            setting.
            - padding the source item as final target item.
        step 3. Add constraint mask.

        Args:
            data (`Dict[str, Any]`): Input data, should contains the key of `image`
                `text` and `label`.
        Return:
            A dict object, contains source text input, patch images, patch masks
            with `Tensor([True])`, decoder prompt, label, target previous output tokens
            and constraint mask.
        z {}labelF)add_bosadd_eosnoner   Nsrcsourceprev_outputprev_output_tokenstargetTconstraint_mask)r*   tokenize_textformatprompt_typetorchcatbos_itemeos_itemNotImplementedError	tokenizerpad_token_idlenconstraint_triezerostgt_dictboolrangebostolistget_next_layer)r   r&   sampletgt_itemprev_output_itemtarget_itemr6   	start_idxiconstraint_prefix_tokenconstraint_nodesr   r   r   r)   1   sJ   




z:OfaVisualQuestionAnsweringPreprocessor._build_train_samplec                 C   s   |  || jd  }| |}|| jd  }| || j}|ds&|d n|}| d| }| jdkr9| j}n| jdkrA|}n| jdkrM|dd	 }nt	||t
d
g|d}d| jv ro| jd |v ro|| jd  |d< |S )a<  
        Building inference samples.

        step 1. Preprocessing image input for model's image input.
            - get pillow image from data.
            - do some transforms to the pillow image, such as resize, normalize etc.
        step 2. Preprocessing the text input for model's text input.
            - add blank in the front of input text.
            - tokenize the result above as source text input.
        step 3. Calculating the decoder prompt.
            - if `prompt_type` is `None`, using bos token.
            - if `prompt_type` is `src`, using source text input
            - if `prompt_type` is `prev_output`, using source text input without eos token.
        step 4. Whether or not to add label data which refer to an answer to the question
            in this task.

        Args:
            data (`Dict[str, Any]`): Input data, should contains the key of `image`
                `text`.
        Return:
            A dict object, contains source text input, patch images, patch masks
            with `Tensor([True])`, decoder prompt and label.
        r   text? r/   r0   r2   Nr3   T)r1   patch_image
patch_maskdecoder_promptanswerr,   )get_img_pil
column_mapr   pre_questionmax_src_lengthendswithr7   r9   r<   r>   r:   tensor)r   r&   r   rU   rR   inputsrW   rJ   r   r   r   r*   o   s*   




z:OfaVisualQuestionAnsweringPreprocessor._build_infer_sample)__name__
__module____qualname____doc__r   	INFERENCEr   r   strr   r+   r)   r*   __classcell__r   r   r$   r   r
      s    ""*>r
   )typingr   r   r:   PILr   torchvisionr   modelscope.preprocessors.imager   modelscope.utils.constantr   baser	   r
   r   r   r   r   <module>   s   