o
    ei\                     @   sF   d Z ddlmZ ddlmZ ddlmZ eG dd deZdgZdS )z(
Image/Text processor class for CLIPSeg
   )ProcessorMixin)BatchEncoding)auto_docstringc                       s,   e Zd Zd fdd	ZedddZ  ZS )CLIPSegProcessorNc                    s   t  || d S )N)super__init__)selfimage_processor	tokenizerkwargs	__class__ l/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/clipseg/processing_clipseg.pyr      s   zCLIPSegProcessor.__init__c           
      K   s2  |du r|du r|du rt d|dur|durt d| j| jfd| jji|}|dur;| j|fd|i|d }|durL| j|fd|i|d }|dur]| j|fd|i|d }	|durn|durn|	j|jd}|S |dur}|dur}|	j|d	< |S |dur|S |durd
|ji}|S ttdi |	|dS )a  
        visual_prompt (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
            The visual prompt image or batch of images to be prepared. Each visual prompt image can be a PIL image,
            NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape
            (C, H, W), where C is a number of channels, H and W are image height and width.

        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        Nz9You have to specify either text, visual prompt or images.zMYou have to specify exactly one type of prompt. Either text or visual prompt.tokenizer_init_kwargsreturn_tensorstext_kwargsimages_kwargs)pixel_valuesconditional_pixel_valuesr   r   )datatensor_typer   )	
ValueError_merge_kwargsvalid_processor_kwargsr
   init_kwargsr	   r   r   dict)
r   textimagesvisual_promptr   r   output_kwargsencodingprompt_featuresimage_featuresr   r   r   __call__   sX   
zCLIPSegProcessor.__call__)NN)NNNN)__name__
__module____qualname__r   r   r$   __classcell__r   r   r   r   r      s    r   N)	__doc__processing_utilsr   tokenization_utils_baser   utilsr   r   __all__r   r   r   r   <module>   s   
@