o
    ¾e¦iN  ã                   @   sZ   d Z ddlmZ ddlmZ ddlmZmZmZ ddl	m
Z
 e
G dd„ deƒƒZdgZdS )	z
Processor class for MarkupLM.
é   )Ú
TensorType)ÚProcessorMixin)ÚBatchEncodingÚPaddingStrategyÚTruncationStrategy)Úauto_docstringc                        sº   e Zd ZdZ‡ fdd„Ze																			ddedeeB eB d	eeB e	B d
e
dB de
de
dB dedB dedB dedededededeeB dB defdd„ƒZ‡  ZS )ÚMarkupLMProcessorTc                    s   t ƒ  ||¡ d S )N)ÚsuperÚ__init__)ÚselfÚfeature_extractorÚ	tokenizer©Ú	__class__© ún/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/markuplm/processing_markuplm.pyr
      s   zMarkupLMProcessor.__init__NFé    Úadd_special_tokensÚpaddingÚ
truncationÚ
max_lengthÚstrideÚpad_to_multiple_ofÚreturn_token_type_idsÚreturn_attention_maskÚreturn_overflowing_tokensÚreturn_special_tokens_maskÚreturn_offsets_mappingÚreturn_lengthÚverboseÚreturn_tensorsÚreturnc                 K   sž  | j r)|du rtdƒ‚|dus|dus|durtdƒ‚|  |¡}|d }|d }n|dur1tdƒ‚|du s9|du r=tdƒ‚|durL| j rLt|tƒrL|g}| jdi d|durW|n|“d	|dur`|n8d“d|“d
|“d|“d|“d|“d|	“d|
“d|“d|“d|“d|“d|“d|“d|“d|“d|“|¤Ž}|S “d|“d
|“d|“d|“d|“d|	“d|
“d|“d|“d|“d|“d|“d|“d|“d|“d|“|¤Ž}|S )a‡  
        html_strings (`str` or `list[str]`, *optional*):
            Raw HTML strings to parse and process. When `parse_html=True` (default), these strings are parsed
            to extract nodes and xpaths automatically. If provided, `nodes`, `xpaths`, and `node_labels` should
            not be provided. Required when `parse_html=True`.
        nodes (`list[list[str]]`, *optional*):
            Pre-extracted HTML nodes as a list of lists, where each inner list contains the text content of nodes
            for a single document. Required when `parse_html=False`. Should not be provided when `parse_html=True`.
        xpaths (`list[list[str]]`, *optional*):
            Pre-extracted XPath expressions corresponding to the nodes. Should be a list of lists with the same
            structure as `nodes`, where each XPath identifies the location of the corresponding node in the HTML
            tree. Required when `parse_html=False`. Should not be provided when `parse_html=True`.
        node_labels (`list[list[int]]`, *optional*):
            Labels for the nodes, typically used for training or fine-tuning tasks. Should be a list of lists
            with the same structure as `nodes`, where each label corresponds to a node. Optional and only used
            when `parse_html=False`.
        questions (`str` or `list[str]`, *optional*):
            Question strings for question-answering tasks. When provided, the tokenizer processes questions
            as the first sequence and nodes as the second sequence (text_pair). If a single string is provided,
            it is converted to a list to match the batch dimension of the parsed HTML.
        NzDMake sure to pass HTML strings in case `parse_html` is set to `True`zUPlease don't pass nodes, xpaths nor node labels in case `parse_html` is set to `True`ÚnodesÚxpathsz@You have passed HTML strings but `parse_html` is set to `False`.zIMake sure to pass nodes and xpaths in case `parse_html` is set to `False`ÚtextÚ	text_pairÚnode_labelsr   r   r   r   r   r   r   r   r   r   r   r   r   r    r   )Ú
parse_htmlÚ
ValueErrorr   Ú
isinstanceÚstrr   )r   Úhtml_stringsr"   r#   r&   Ú	questionsr   r   r   r   r   r   r   r   r   r   r   r   r   r    ÚkwargsÚfeaturesÚencoded_inputsr   r   r   Ú__call__   sº   .ÿ



€ÿþýüûúùø	÷
öõôóòñðïíìþýüûúùø	÷
öõôóòñðïízMarkupLMProcessor.__call__)NNNNNTFNNr   NNNFFFFTN)Ú__name__Ú
__module__Ú__qualname__r'   r
   r   Úboolr*   r   r   Úintr   r   r0   Ú__classcell__r   r   r   r   r      sl    ìù
ø
	÷
öõôóòñðïîí
ìêr   N)Ú__doc__Ú
file_utilsr   Úprocessing_utilsr   Útokenization_utils_baser   r   r   Úutilsr   r   Ú__all__r   r   r   r   Ú<module>   s   
d