o
    ॵi                     @   sh   d dl mZmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ ddlmZ G d	d
 d
eZdS )    )AnyDictN)Image)
transforms)
load_image)ModeKeys   )OfaBasePreprocessorc                       s   e Zd ZdZejf fdd	Zdeee	f deee	f fddZ
deee	f deee	f fdd	Zdeee	f deee	f fd
dZ  ZS )OfaVisualEntailmentPreprocessorz7
    OFA preprocessor for visual entailment tasks.
    c              	      sf   t t| j|||g|R i | tdd tj| j| jftjjdt	 tj
| j| jdg| _dS )zpreprocess the data

        Args:
            cfg(modelscope.utils.config.ConfigDict) : model config
            model_dir (str): model path,
            mode: preprocessor mode (model mode)
        c                 S   s
   |  dS )NRGB)convert)image r   b/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/preprocessors/ofa/visual_entailment.py<lambda>#   s   
 z:OfaVisualEntailmentPreprocessor.__init__.<locals>.<lambda>)interpolation)meanstdN)superr
   __init__r   ComposeResizepatch_image_sizeInterpolationModeBICUBICToTensor	Normalizer   r   patch_resize_transform)selfcfg	model_dirmodeargskwargs	__class__r   r   r      s$   
z(OfaVisualEntailmentPreprocessor.__init__datareturnc                 C   s    | j tjkr| |S | |S )N)r!   r   TRAIN_build_train_sample_build_infer_sample)r   r&   r   r   r   __call__+   s   

z(OfaVisualEntailmentPreprocessor.__call__c                 C   s  |  |}d|d }|d di|d< | j|ddd}| jdkr6t| j|g}t|dd	 | jg}n<| jd
krQt|d |g}t|dd	 | jg}n!| jdkrpt|d d	d |g}t|dd	 | jg}nt| j	j
|d	t| d < ||d< ||d< | jd	urtt|t| jf }t|t| d }tt|t| d t|D ]}	| j g|||	   }
| j|
}d||	 |< q||d< |S )aO  
        Building training samples.

        step 1. Preprocess the data using the logic of `_build_infer_sample`
            and make sure the label data in the result.
        step 2. Preprocess the label data to generate the `target` and
        `prev_output_tokens`.
            - tokenize the label data.
            - calculate the target item.
                1) if `promp_type` is `None`, using tokenized label data.
                2) if `promp_type` is `src`, concatenating the `source` data
                and tokenized label data.
                3) if `promp_type` is `prev_output`, concatenating the `source`
                data without eos token and tokenized label data
        step 3. Add constraint mask

      Args:
            data (`Dict[str, Any]`): Input data, should contains the key of `text`
                `text2` and `label` are optional.
        Return:
            A dict object, contains source text input, patch images, patch masks
            with `Tensor([True])` value, decoder prompt, label, target, previous
            output tokens and constraint mask.
        z {}labelg      ?ref_dictF)add_bosadd_eosnoner   Nsrcsourceprev_outputtargetprev_output_tokensTconstraint_mask)r*   formattokenize_textprompt_typetorchcatbos_itemeos_itemNotImplementedError	tokenizerpad_token_idlenconstraint_triezerostgt_dictboolrangebostolistget_next_layer)r   r&   sampler5   tgt_itemprev_output_itemtarget_itemr7   	start_idxiconstraint_prefix_tokenconstraint_nodesr   r   r   r)   1   sJ   




z3OfaVisualEntailmentPreprocessor._build_train_samplec                 C   sN  |  || jd  }| |}d|vr-| || jd  | j}| jjdd}||}n3d|v s:J d|	  | || jd  | j}| || jd  | j}| jjdd}|||}| 
|}| jdkrpg }	| j}
n| jd	kr|d
d }	|d
d }
nt||tdg|	|
d}d| jv r| jd |v r|| jd  |d< |S )aC  
        Building inference samples.

        step 1. Preprocessing the image as model's image input.
            - get the pillow image input from `data`
            - do some transforms to the pillow image, such as resize, normalize etc.
        step 2. Building the instruction as model's source text input.
            - use text input to build instruction. so far, we support two kind of
            input form, we will take different examples to both of them to explain
            how to use them.
                1) only `text` input in data. this setting can solve the tasks which
                judge whether or not the input `text` describe the input image.
                2) both `text` and `text2` input in data. this setting can solve the
                tasks which judge whether or not the `text` together with input image
                can imply the `text2`
            - tokenize the instruction above.
        step 3. Calculate the decoder prompt input.
        step 4. Whether or not to add label data.

        Args:
            data (`Dict[str, Any]`): Input data, should contains the key of `text`
                `text2` and `label` are optional.
        Return:
            A dict object, contains source text input, patch images, patch masks
            with `Tensor([True])` value, decoder prompt and label.
        r   text2textpromptz  does the image describe " {} "?ztext must be in the input z/ can image and text1 " {} " imply text2 " {} "?r0   r3   Nr4   T)r2   patch_image
patch_maskprefix_tokendecoder_promptrelationr,   )get_img_pil
column_mapr   pre_captionmax_src_lengthr   modelgetr8   keysr9   r:   r=   r?   r;   tensor)r   r&   r   rV   
hypothesisrU   rT   captioninputsrX   rY   rK   r   r   r   r*   o   sT   





z3OfaVisualEntailmentPreprocessor._build_infer_sample)__name__
__module____qualname____doc__r   	INFERENCEr   r   strr   r+   r)   r*   __classcell__r   r   r$   r   r
      s    ""*>r
   )typingr   r   r;   PILr   torchvisionr   modelscope.preprocessors.imager   modelscope.utils.constantr   baser	   r
   r   r   r   r   <module>   s   