o
    5tie                     @   s  d Z ddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
 ddlmZmZmZmZ ddlZddlmZmZmZmZmZmZmZmZmZmZmZmZmZmZ ddlmZ d	d
l m!Z! d	dl"m#Z#m$Z$ d	dl%m&Z& d	dl'm(Z( d	dl)m*Z* d	dl+m,Z, d	dl-m-Z- d	dlm.Z.m/Z/m0Z0m1Z1m2Z2m3Z3 e4e4e(  Z5ej6G dd dej7Z8G dd dZ9de9de:dee de1de,de;e:e4e( f fddZ<dS )zA
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
    N)Counter)Sequence)Decimal)AnyOptionalUnioncast)BoolDateTime
DictionaryInfinityInteger	IpAddressNanNoneType
NullString
RealNumberStrictLevelStringTypecodeis_empty_sequence)AbstractType   )ColumnDataProperty)MIN_STRICT_LEVEL_MAPDefaultValue)DataPropertyConverter)DataProperty)Format)Preprocessor)logger)DateTimeFormatterStrictLevelMap	TransFuncTypeHintTypeValueMapnormalize_type_hintc                   @   s   e Zd ZdZdZdZdZdS )MatrixFormatting            N)__name__
__module____qualname__	EXCEPTIONTRIM	FILL_NONEHEADER_ALIGNED r3   r3   K/home/ubuntu/.local/lib/python3.10/site-packages/dataproperty/_extractor.pyr'   5   s
    r'   c                   @   s  e Zd ZdZdydee ddfddZdzddZede	e
 fd	d
Zejde	e
 ddfdd
ZedefddZejdeddfddZedee fddZejde	ee
ef  ddfddZedefddZejdeddfddZedefddZejdeddfddZedefddZejdeddfddZedee
 fddZejde
ddfddZedefdd Zejdeddfd!d Zedefd"d#Zejdeddfd$d#Zede	e fd%d&Zejde	e ddfd'd&Zedeee ee df fd(d)Zejdeee ee f ddfd*d)Zede
fd+d,Zejde
ddfd-d,Zede fd.d/Z!e!jde ddfd0d/Z!edefd1d2Z"e"jdeddfd3d2Z"ede#fd4d5Z$e$jde#ddfd6d5Z$d7e%deee
edf ddfd8d9Z&d:e'ddfd;d<Z(ede)e%ef fd=d>Z*e*jde)e%ef ddfd?d>Z*edee+ fd@dAZ,e,jdee+ ddfdBdAZ,ede-fdCdDZ.e.jde-ddfdEdDZ.edefdFdGZ/e/jdee ddfdHdGZ/de0de1fdIdJZ2dKe	e0 dee1 fdLdMZ3	dydNe0dOee	e4  dee4 fdPdQZ5dRe	e	e0  de6fdSdTZ7dee1 fdUdVZ8dWe0defdXdYZ9de defdZd[Z:	 e;de0defd\d]Z<d^edefd_d`Z=d^edefdadbZ>			d{dce0ddedeee dfee  de1f
dgdhZ?			d{dce0ddedeee dfee  de1f
didjZ@dRe	e	e0  de6fdkdlZAdRe	e	e0  de6fdmdnZB			d{doe	e0 ddedeee dfee  dee1 f
dpdqZCdre	e	e0  de	e	e0  fdsdtZDdee4 fdudvZEdzdwdxZFdS )|DataPropertyExtractora  
    .. py:attribute:: quoting_flags

        Configurations to add double quote to for each items in a matrix,
        where |Typecode| of table-value is |True| in the ``quote_flag_table``
        mapping table. ``quote_flag_table`` should be a dictionary.
        And is ``{ Typecode : bool }``. Defaults to:

        .. code-block:: json
            :caption: The default values

            {
                Typecode.BOOL: False,
                Typecode.DATETIME: False,
                Typecode.DICTIONARY: False,
                Typecode.INFINITY: False,
                Typecode.INTEGER: False,
                Typecode.IP_ADDRESS: False,
                Typecode.LIST: False,
                Typecode.NAN: False,
                Typecode.NULL_STRING: False,
                Typecode.NONE: False,
                Typecode.REAL_NUMBER: False,
                Typecode.STRING: False,
            }
    Nmax_precisionreturnc                 C   s   t j| _|d u rt j| _n|| _g | _d | _g | _d | _d| _	d| _
tj| _g | _d | _t j| _ttttttf tf t j| _d| _t | _tt j| _ g | _!tt j"| _#d | _$t%j&| _'|  | (  d S )NTr   r   ))r   MAX_WORKERSmax_workersMAX_PRECISION%_DataPropertyExtractor__max_precision_DataPropertyExtractor__headers)_DataPropertyExtractor__default_type_hint&_DataPropertyExtractor__col_type_hints(_DataPropertyExtractor__strip_str_header+_DataPropertyExtractor__is_formatting_float0_DataPropertyExtractor__min_col_ascii_char_widthr   NONE,_DataPropertyExtractor__default_format_flags)_DataPropertyExtractor__format_flags_list"_DataPropertyExtractor__float_typeDATETIME_FORMAT+_DataPropertyExtractor__datetime_format_strcopydeepcopyr   dictr   r   strintSTRICT_LEVEL_MAP(_DataPropertyExtractor__strict_level_map2_DataPropertyExtractor__east_asian_ambiguous_widthr   $_DataPropertyExtractor__preprocessorTYPE_VALUE_MAP&_DataPropertyExtractor__type_value_map'_DataPropertyExtractor__trans_func_listQUOTING_FLAGS%_DataPropertyExtractor__quoting_flags*_DataPropertyExtractor__datetime_formatterr'   r0   )_DataPropertyExtractor__matrix_formatting#_DataPropertyExtractor__clear_cache)selfr6   r3   r3   r4   __init__`   s4   
zDataPropertyExtractor.__init__c                 C   sT   |    | d| _| d| _| d| _| d| _| d | dd| _d S )Nr   r   TF )Nr[   )+_DataPropertyExtractor__update_dp_converter!_DataPropertyExtractor__to_dp_raw%_DataPropertyExtractor__dp_cache_zero$_DataPropertyExtractor__dp_cache_one%_DataPropertyExtractor__dp_cache_true&_DataPropertyExtractor__dp_cache_false$_DataPropertyExtractor__dp_cache_maprY   r3   r3   r4   __clear_cache   s   z#DataPropertyExtractor.__clear_cachec                 C      | j S N)r<   rc   r3   r3   r4   headers      zDataPropertyExtractor.headersvaluec                 C       | j |krd S || _ |   d S rf   )r<   rX   rY   ri   r3   r3   r4   rg         
c                 C   re   rf   )r=   rc   r3   r3   r4   default_type_hint   rh   z'DataPropertyExtractor.default_type_hintc                 C   rj   rf   )r=   rX   rk   r3   r3   r4   rm      rl   c                 C   re   rf   )r>   rc   r3   r3   r4   column_type_hints   rh   z'DataPropertyExtractor.column_type_hintsc                 C   sv   g }|D ]&}t |}|tttttttjt	t
tttd fvr%tdt| || q| j|kr2d S || _|   d S )Nzinvalid type hint: )r&   r	   r
   r   r   r   r   typepyListr   r   r   r   r   
ValueErrortypeappendr>   rX   )rY   ri   normalized_type_hints	type_hintr3   r3   r4   rn      s0   
c                 C   re   rf   r@   rc   r3   r3   r4   is_formatting_float   rh   z)DataPropertyExtractor.is_formatting_floatc                 C   s
   || _ d S rf   rv   rk   r3   r3   r4   rw      s   
c                 C   re   rf   )r;   rc   r3   r3   r4   r6      rh   z#DataPropertyExtractor.max_precisionc                 C   rj   rf   )r;   rX   rk   r3   r3   r4   r6      rl   c                 C   re   rf   )rP   rc   r3   r3   r4   preprocessor   rh   z"DataPropertyExtractor.preprocessorc                 C   s    | j |krd S || _|   d S rf   )rx   rP   r\   rk   r3   r3   r4   rx      rl   c                 C   re   rf   )r?   rc   r3   r3   r4   strip_str_header   rh   z&DataPropertyExtractor.strip_str_headerc                 C   rj   rf   )r?   rX   rk   r3   r3   r4   ry      rl   c                 C   re   rf   )rA   rc   r3   r3   r4   min_column_width   rh   z&DataPropertyExtractor.min_column_widthc                 C   rj   rf   )rA   rX   rk   r3   r3   r4   rz      rl   c                 C   re   rf   )rC   rc   r3   r3   r4   default_format_flags   rh   z*DataPropertyExtractor.default_format_flagsc                 C   rj   rf   )rC   rX   rk   r3   r3   r4   r{     rl   c                 C   re   rf   )rD   rc   r3   r3   r4   format_flags_list  rh   z'DataPropertyExtractor.format_flags_listc                 C   rj   rf   )rD   rX   rk   r3   r3   r4   r|     rl   c                 C   re   rf   )rE   rc   r3   r3   r4   
float_type  rh   z DataPropertyExtractor.float_typec                 C   rj   rf   )rE   rX   rk   r3   r3   r4   r}     rl   c                 C   re   rf   )rG   rc   r3   r3   r4   datetime_format_str#  rh   z)DataPropertyExtractor.datetime_format_strc                 C   rj   rf   )rG   rX   rk   r3   r3   r4   r~   '  rl   c                 C   re   rf   )rN   rc   r3   r3   r4   strict_level_map/  rh   z&DataPropertyExtractor.strict_level_mapc                 C   s6   | j |krd S tttttf tf || _ |   d S rf   )rN   r   rJ   r   r   rK   rL   rX   rk   r3   r3   r4   r   3  s   
c                 C   re   rf   )rO   rc   r3   r3   r4   east_asian_ambiguous_width;  rh   z0DataPropertyExtractor.east_asian_ambiguous_widthc                 C   rj   rf   )rO   rX   rk   r3   r3   r4   r   ?  rl   c                 C   re   rf   )rR   rc   r3   r3   r4   type_value_mapG  rh   z$DataPropertyExtractor.type_value_mapc                 C   rj   rf   rR   rX   rk   r3   r3   r4   r   K  rl   keyc                 C   s   || j |< |   d S rf   r   )rY   r   ri   r3   r3   r4   set_type_valueS  s   
z$DataPropertyExtractor.set_type_value
trans_funcc                 C   s   | j d| |   d S )Nr   )rS   insertrX   )rY   r   r3   r3   r4   register_trans_funcW  s   z)DataPropertyExtractor.register_trans_funcc                 C   re   rf   )rU   rc   r3   r3   r4   quoting_flags[  rh   z#DataPropertyExtractor.quoting_flagsc                 C   rj   rf   )rU   rX   rk   r3   r3   r4   r   _  rl   c                 C   re   rf   )rV   rc   r3   r3   r4   datetime_formatterg  rh   z(DataPropertyExtractor.datetime_formatterc                 C   rj   rf   )rV   rX   rk   r3   r3   r4   r   k  rl   c                 C   re   rf   )rW   rc   r3   r3   r4   matrix_formattings  rh   z'DataPropertyExtractor.matrix_formattingc                 C   rj   rf   )rW   rX   rk   r3   r3   r4   r   w  rl   c                 C   s   | j sJ | j S rf   )#_DataPropertyExtractor__max_workersrc   r3   r3   r4   r9     s   
z!DataPropertyExtractor.max_workersc                 C   sr   z
ddl m}m} W n ty   td d}Y nw dtjv r+|dkr+td d}|| _| js7t	j
| _d S d S )Nr   )SemLock
sem_unlinkz9This platform lacks a functioning sem_open implementationr   pytestz@set max_workers to 1 to avoid deadlock when executed from pytest)_multiprocessingr   r   ImportErrorr    debugsysmodulesr   r   r8   )rY   ri   r   r   r3   r3   r4   r9     s   

c                 C   s   |    | |S rf   )r\   _DataPropertyExtractor__to_dprk   r3   r3   r4   to_dp  s   
zDataPropertyExtractor.to_dpvaluesc                 C   s   t |rg S |   | |S rf   )r   r\   _to_dp_list)rY   r   r3   r3   r4   
to_dp_list  s   
z DataPropertyExtractor.to_dp_listvalue_dp_matrixprevious_column_dp_listc           
      C   s  |   }td dg}| jr|dt| j  |d|r$t|nd d| j g | j	rC|dd
dd	 | j	D  n|d
 |D ]}t| qJtd tt| D ]b\}}z||  W n! ty   |t|| j| j| || j| j| j| jd Y nw || }|  z	|||  W n ttfy   Y nw |D ]}	||	 q|  tdt|d q]|S )Nz"converting to column dataproperty:z	  params:z    headers=z    prev_col_count={}z    matrix_formatting=z    column_type_hints=({})z, c                 S   s   g | ]	}|r	|j nd qS )none)r,   ).0ru   r3   r3   r4   
<listcomp>  s    z;DataPropertyExtractor.to_column_dp_list.<locals>.<listcomp>z    column_type_hints=()z
  results:column_indexr}   	min_widthformat_flagsrw   r~   r   r6   z    s),_DataPropertyExtractor__get_col_dp_list_baser    r   rg   rs   lenextendformatr   rn   join	enumeratezip
IndexErrorr   r}   rz   (_DataPropertyExtractor__get_format_flagsrw   r~   r   r;   begin_updatemerge	TypeErrorupdate_body
end_updaterK   )
rY   r   r   col_dp_listlogslogcol_idxvalue_dp_listcol_dpvalue_dpr3   r3   r4   to_column_dp_list  sn   



z'DataPropertyExtractor.to_column_dp_listvalue_matrixc                 C   sb   |    td| j d| j  | |}| |r"td |S | jdkr,| |S | |S )Nzmax_workers=z, preprocessor=zalready a dataproperty matrixr   )	r\   r    r   r9   rP   )_DataPropertyExtractor__strip_data_matrix$_DataPropertyExtractor__is_dp_matrix'_DataPropertyExtractor__to_dp_matrix_st'_DataPropertyExtractor__to_dp_matrix_mtrY   r   r3   r3   r4   to_dp_matrix  s   





z"DataPropertyExtractor.to_dp_matrixc                 C   s0   |    t| j}| j|_| j| jt|t	dS )Nru   rx   r   )
r\   rH   rI   rP   ry   	strip_strr   rg   r   r   rY   rx   r3   r3   r4   to_header_dp_list  s   z'DataPropertyExtractor.to_header_dp_listkwargsc                 K   s   | j jdi |}|   |S )Nr3   )rP   updater\   )rY   r   
is_updatedr3   r3   r4   update_preprocessor  s   z)DataPropertyExtractor.update_preprocessorc                 C   s2   t | j}| j| || jkrdS |   dS )NFT)rH   rI   rN   r   rX   )rY   ri   orgr3   r3   r4   update_strict_level_map  s   
z-DataPropertyExtractor.update_strict_level_mapc              	   C   s.   z
t | d d tW S  ttfy   Y dS w )Nr   F)
isinstancer   r   r   )ri   r3   r3   r4   __is_dp_matrix)  s
   z$DataPropertyExtractor.__is_dp_matrixr   c              	   C   *   z| j | W S  ttfy   | j Y S w rf   )rn   r   r   rm   rY   r   r3   r3   r4   __get_col_type_hint0  
   
z)DataPropertyExtractor.__get_col_type_hintc              	   C   r   rf   )r|   r   r   rC   r   r3   r3   r4   __get_format_flags6  r   z(DataPropertyExtractor.__get_format_flagsdataru   rx   r   c                 C   s   | j D ]}||}q|r| j||||dS z|| jv r!| j| W S W n	 ty+   Y nw |dkr:|du r7| jS | jS |dkrH|du rE| jS | jS | j||||dS )Nr   r   Fr   T)rS   r]   rb   r   ra   r^   r`   r_   )rY   r   ru   rx   r   r   r3   r3   r4   __to_dp<  s6   


zDataPropertyExtractor.__to_dpc              	   C   s   |rt |j|j|j|j|jd}nt | jj| jj| jj| jj| jjd}t|||d ur-|n| j	| j
| j|d ur9|n| j| jd}| j|S )N)dequoteline_break_handlingline_break_replr   is_escape_formula_injection)rx   ru   r}   r~   r   r   )r   r   r   r   r   r   rx   rP   r   rm   r}   r~   r   r   $_DataPropertyExtractor__dp_converterconvert)rY   r   ru   rx   r   r   r3   r3   r4   __to_dp_rawb  s2   
z!DataPropertyExtractor.__to_dp_rawc                    s"   t t fddtt| D  S )Nc                 3   s0    | ]\}}t  || | jd  V  qdS )r   N)_to_dp_list_helper)_DataPropertyExtractor__get_col_type_hintrP   r   r   r   rc   r3   r4   	<genexpr>  s    
z:DataPropertyExtractor.__to_dp_matrix_st.<locals>.<genexpr>)listr   r   r   r3   rc   r4   __to_dp_matrix_st  s   

z'DataPropertyExtractor.__to_dp_matrix_stc                    s   ddl m} i  |j(fddtt| D }||D ]}| \}}| |< q"W d    n1 s9w   Y  tt fddt	 D  S )Nr   )futuresc                    s,   g | ]\}}  t|||jqS r3   )submitr   r   rP   r   )executorrY   r3   r4   r     s    	z;DataPropertyExtractor.__to_dp_matrix_mt.<locals>.<listcomp>c                 3   s    | ]} | V  qd S rf   r3   )r   r   )col_data_mapr3   r4   r     s    z:DataPropertyExtractor.__to_dp_matrix_mt.<locals>.<genexpr>)

concurrentr   ProcessPoolExecutorr9   r   r   as_completedresultr   sorted)rY   r   r   future_listfuturer   r   r3   )r   r   rY   r4   __to_dp_matrix_mt  s   
	
z'DataPropertyExtractor.__to_dp_matrix_mt	data_listc              	   C   s   t |rg S t }g }|D ]F}|}|d u r7z|dd \}}	||| jtjd s,d }W n	 ty6   Y nw | j|||r?|n| j	|d}
||
j
  d7  < ||
 q|S )Nr   r   )r}   strict_level)r   ru   rx   r   )r   r   most_commonr}   r   MAXis_typer   r   rP   
type_classrs   )rY   r   ru   rx   r   type_counterdp_listr   expect_type_hint_countdatapropr3   r3   r4   r     s8   
z!DataPropertyExtractor._to_dp_listdata_matrixc                    s  | j rt| j nd}z	dd  D }W n ty   g  Y S w | j r1t|g| }t|g| }n|r<t|}t|}nd}d}| jtjkrT||krRtd	|| S | jtj
krd|dkra|n|n| jtjkrm|n| jtjkrv|ntd| j  fddt|D S )Nr   c                 S   s   g | ]}t |qS r3   )r   )r   r   r3   r3   r4   r     s    z=DataPropertyExtractor.__strip_data_matrix.<locals>.<listcomp>z,nonuniform column size found: min={}, max={}zunknown matrix formatting: c                    s2   g | ]\}}t  | d  d g|   qS rf   )r   )r   row_idxcol_sizer   format_col_sizer3   r4   r     s     )rg   r   r   minmaxr   r'   r/   rq   r   r2   r0   r1   r   )rY   r   header_col_sizecol_size_listmin_col_sizemax_col_sizer3   r   r4   __strip_data_matrix  sD   
z)DataPropertyExtractor.__strip_data_matrixc                 C   s`   |   }g }t|D ]#\}}t|| j| j| || j| j| j| j	d}|
| || q
|S )Nr   )r   r   r   r}   rz   r   rw   r~   r   r;   update_headerrs   )rY   header_dp_listr   r   	header_dpr   r3   r3   r4   __get_col_dp_list_base  s    

z,DataPropertyExtractor.__get_col_dp_list_basec              	   C   sH   t | jj| jj| jj| jjd}t|| j| j	| j
| j| j| jd| _d S )N)r   r   is_escape_html_tagr   )rx   r   r   r   r~   r}   r   )r   rP   r   rx   r   r  r   r   r   r   r   r~   r}   r   r   r   r3   r3   r4   __update_dp_converter  s   z+DataPropertyExtractor.__update_dp_converterrf   )r7   N)NNN)Gr,   r-   r.   __doc__r   rL   rZ   rX   propertyr   rK   rg   setterr$   rm   r   rn   r   boolrw   r6   r   rx   ry   rz   r{   r|   rr   floatr   r}   r~   r"   r   r   r%   r   r   r   r#   r   rJ   r   r!   r   r'   r   r9   r   r   r   r   r   r   DataPropertyMatrixr   r   r   r   staticmethodr   r   r   r   r]   r   r   r   r   r   r\   r3   r3   r3   r4   r5   D   s(   
$ "$"

H	
)
$
"%,r5   	extractorr   r   ru   rx   r7   c                 C   s   || j |||dfS )N)ru   rx   )r   )r  r   r   ru   rx   r3   r3   r4   r   '  s   r   )=r	  rH   enumr   typingcollectionsr   collections.abcr   decimalr   r   r   r   r   ro   r	   r
   r   r   r   r   r   r   r   r   r   r   r   r   typepy.typer   _columnr   _commonr   r   
_converterr   _datapropertyr   
_formatterr   _preprocessorr   r    r!   r"   r#   r$   r%   r&   r   r  uniqueEnumr'   r5   rL   tupler   r3   r3   r3   r4   <module>   sT    @ 
     h