o
    ei                     @   s   d Z ddlZddlmZ ddlmZmZmZ ddlm	Z	m
Z
 ddlmZmZ G dd	 d	ed
dZeeZeG dd deZdgZdS )z
Processor class for Donut.
    N   )
ImageInput)ProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)auto_docstringloggingc                   @   s   e Zd Zi ZdS )DonutProcessorKwargsN)__name__
__module____qualname__	_defaults r   r   h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/donut/processing_donut.pyr      s    r   F)totalc                	       sp   e Zd Zd fdd	Ze		ddedB deee B eB e	B dB de
e fddZed	d
 ZdddZ  ZS )DonutProcessorNc                    s   t  || d S )N)super__init__)selfimage_processor	tokenizerkwargs	__class__r   r   r   #   s   zDonutProcessor.__init__imagestextr   c                 K   s   |d u r|d u rt d| jtfd| jji|}|d ur(| j|fi |d }|d urC|d ur8|d dd | j|fi |d }|d u rI|S |d u rO|S |d |d< |d |d< |S )	NzBYou need to specify either an `images` or `text` input to process.tokenizer_init_kwargsimages_kwargstext_kwargsadd_special_tokensF	input_idslabels)
ValueError_merge_kwargsr   r   init_kwargsr   
setdefault)r   r   r   r   output_kwargsinputs	encodingsr   r   r   __call__&   s,   zDonutProcessor.__call__c                 C   s   | j j}t|ddg S )Nr"   r#   )r   model_input_nameslist)r   image_processor_input_namesr   r   r   r,   F   s   z DonutProcessor.model_input_namesFc                 C   s@  |du r	| j  }i }|rtd|tj}|du rn|| d }d|vr(n|d|dd  }|tdtd  }t|}td| d|tj}	|	du rZ|	|d}n|	
 }	t|}
t|	}t|
 d| |tjtjB }|dur|
d }d|v rd|v r| j|d|d	}|rt|dkr|d
 }|||< n?g ||< |dD ]%}| }||v r|d
 dkr|dd dkr|dd }|| | qt|| dkr|| d
 ||< |||	t|	 d  }|dd dkr
|g| j|dd d|d	 S |s|r|r|gS |S |rg S d|iS )zS
        Convert a (generated) token sequence into an ordered JSON format.
        Nz<s_>   z</s_ z(.*?)T)is_inner_valueadded_vocabr   z<sep/><z/>   text_sequence)r   get_added_vocabresearch
IGNORECASEstartindexlenescapereplacegroupDOTALLstrip
token2jsonsplitappendfind)r   tokensr2   r3   outputpotential_startstart_tokenkeykey_escaped	end_tokenstart_token_escapedend_token_escapedcontentvalueleafr   r   r   rD   L   s\   



$,zDonutProcessor.token2json)NN)FN)r   r   r   r   r	   r   strr-   r   r   r   r   r+   propertyr,   rD   __classcell__r   r   r   r   r   !   s    
r   )__doc__r9   image_utilsr   processing_utilsr   r   r   tokenization_utils_baser   r   utilsr	   r
   r   
get_loggerr   loggerr   __all__r   r   r   r   <module>   s   

e