o
    }oi+                     @   s   d dl Z d dlmZmZmZ d dlZd dlmZ d dlm	Z	m
Z
mZ d dlmZ g dZG dd deZG d	d
 d
eZG dd deZG dd deZeeedZG dd deZdS )    N)DictListTuple)ndarray)PowerTransformerQuantileTransformerRobustScaler)logging)IntCode	FloatCodeCategoryCodeColumnCodesc                   @   s   e Zd ZdefddZddedededed	ef
d
dZdede	e fddZ
de	e defddZede	eeef  fddZdS )Codedata_seriesc                 C      t  )zd
        @params:
            data_series: an array of input data used to calculate mapping
        NotImplementedError)selfr    r   c/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/common/tokenizers/column_coder.pycompute_code   s   zCode.compute_codeTcol_namecode_lenstart_idfillallhasnanc                 C   s(   || _ || _|| _|| _|| _|| _dS )a  
        @params:
            col_name: name of the column
            code_len: number of tokens used to code the column.
            start_id: offset for token_id. 
            fillall: if True, reserve space for digit number even the digit number is
            not present in the data_series. Otherwise, only reserve space for the numbers
            in the data_series. 
            hasnan: if True, reserve space for nan
        N)namer   r   end_idr   r   )r   r   r   r   r   r   r   r   r   __init__#   s   
zCode.__init__itemreturnc                 C   r   Nr   r   r   r   r   r   encode5      zCode.encodeidsc                 C   r   r!   r   r   r%   r   r   r   decode8   r$   zCode.decodec                 C   s   | j | jfgS )v
        get the vocab id range for each of the encoded tokens
        @returns [(min, max), (min, max), ...]
        )r   r   r   r   r   r   
code_range;   s   zCode.code_rangeN)TT)__name__
__module____qualname__r   r   strintboolr   r   r#   r'   propertyr   r*   r   r   r   r   r      s      r   c                       s   e Zd Z	ddedededededef fd	d
ZdefddZdefddZ	de
defddZdedefddZedeeeef  fddZdedee fddZdee defddZ  ZS )r
   Td   r   r   r   r   baser   c                    s$   t  ||||| || _d | _d S r!   )superr   r3   int_min)r   r   r   r   r   r3   r   	__class__r   r   r   E   s   
zIntCode.__init__r   c                 C   s&  |  |}dd t| jD }dd t| jD }t| jD ]G}|| }|| }|| j|  | j }| jr<td| j}	n	tt| }	tt	|	D ]}
t
|	|
 }| j||< ||| j< |  jd7  _qKq|| _|| _d| _| jr|  jd7  _g }| j}|D ]}||d d  q|| _d S d S )Nc                 S      g | ]}i qS r   r   .0_r   r   r   
<listcomp>O       z(IntCode.compute_code.<locals>.<listcomp>c                 S   r8   r   r   r9   r   r   r   r<   P   r=   r      nan)array_convert_to_intranger   r3   r   sortednpuniquetolistlenr.   r   digits_id_to_itemdigits_item_to_idNA_tokenr   r*   appendNA_token_id)r   r   significant_valrG   rH   i
id_to_item
item_to_idv
uniq_itemskr   codesrangesr   r   r   r   L   s6   



zIntCode.compute_codevalc                 C   s   | t}| | _|| j S r!   )astyper/   minr5   r   rU   r   r   r   r@   i   s   


zIntCode.array_convert_to_intr    c                 C   s   t || j S r!   )r/   r5   rX   r   r   r   convert_to_intn      zIntCode.convert_to_intc                 C   s
   || j  S r!   )r5   rX   r   r   r   reverse_convert_to_intq   s   
zIntCode.reverse_convert_to_intc                 C   s   g }d}t t| jD ]=}| j|  }|dkr7| jr)|t|t|d f n|t|t|d f n|t|t|d f |d7 }q|S )r(   r      r>   )	reversedrA   r   rG   keysr   rJ   rW   max)r   outputscrM   r%   r   r   r   r*   t   s   
zIntCode.code_ranger   c                 C   sD  | j r|| jkr| jS | j s|| jkrtd| j dt|}| |}g }t| jD ]}|| j	|  | j	 }|
t| q,|| j	| j  dkrNtdg }tt| jD ]H}|| }|| j| v ro|
| j| |  qWtdd | j|  D }	tt|	t| }
t|	|
 }|
| j| |  td qW|S )Nzcolum z* cannot handle nan, please set hasnan=Truer   znot right lengthc                 S   s   g | ]}t |qS r   )r/   )r:   dr   r   r   r<      s    z"IntCode.encode.<locals>.<listcomp>z1out of domain num is encounterd, use nearest code)r   rI   rK   
ValueErrorr   floatrY   rA   r   r3   rJ   r.   r]   rH   rC   arrayr^   argminabsr/   r	   warning)r   r   rU   val_intdigitsrM   digitrS   	digit_strallowed_digitsnear_idr   r   r   r#      s.   
zIntCode.encoder%   c                 C   sz   | j r|d | jd kr| jS d}tt| jD ]}t| j| || j| d   }||| j|  7 }q| 	|}t
|S )Nr   r>   )r   rK   rI   r]   rA   r   r/   rG   r3   r[   r.   )r   r%   rP   rM   rk   r   r   r   r'      s    
zIntCode.decode)Tr2   T)r+   r,   r-   r.   r/   r0   r   r   r   r@   rd   rY   r[   r1   r   r   r*   r#   r'   __classcell__r   r   r6   r   r
   D   s.    r
   c                       s   e Zd Z				ddededededed	ed
ef fddZdedefddZde	fddZ
dedefddZdee defddZ  ZS )r   Tr2   quantiler   r   r   r   r3   r   	transformc                    sd   t  |||||| |dkrtdd| _d S |dkr$tddd| _d S |dkr.t | _d S td	)
Nzyeo-johnsonT)standardizerp   uniformr2   )output_distributionn_quantilesrobustzJSupported data transformations are "yeo-johnson", "quantile", and "robust")r4   r   r   scalerr   r   rc   )r   r   r   r   r   r3   r   rq   r6   r   r   r      s   
zFloatCode.__init__rU   r    c                 C   s`   t jt |dd}| j|d d d f d d df | j }|| j| j  t	}|d }|S )Nr   axis)
rC   expand_dimsre   rw   rq   mvalr3   extra_digitsrV   r/   )r   rU   valuesoutputr   r   r   rY      s
   *zFloatCode.convert_to_intc                 C   s   | j |d d d f d d df }| | _|| j }tt| | jd }| j	| }|dk r6t
d|| _|| j| j  t}|S )Nr   r>   z%need large length to code the nummber)rw   fit_transformrW   r{   r/   mathlogr_   r3   r   rc   r|   rV   )r   rU   r}   rj   r|   r   r   r   r@      s   $


zFloatCode.array_convert_to_intc                 C   sJ   || j | j  }tjt|dd}| j|d d d f | j d }|S )Nr   rx   )r   r   )r3   r|   rC   rz   re   rw   inverse_transformr{   )r   rU   rP   r   r   r   r[      s   "z FloatCode.reverse_convert_to_intr%   c                 C   s   | j r|d | jd kr| jS d}tt| jD ]}t| j| || j| d   }||| j|  7 }q| 	|}t
tttd| j| j  d}|d| dS )Nr   r>   g?.f)r   rK   rI   r]   rA   r   r/   rG   r3   r[   r_   rg   rC   log10r|   )r   r%   rP   rM   rk   accuracyr   r   r   r'      s    
$zFloatCode.decode)Tr2   Trp   )r+   r,   r-   r.   r/   r0   r   rd   rY   r   r@   r[   r   r'   ro   r   r   r6   r   r      s0    r   c                       s\   e Zd Zdedef fddZdefddZdee fd	d
Z	dee defddZ
  ZS )r   r   r   c                    s   t  |d|dd d S )Nr>   TF)r4   r   )r   r   r   r6   r   r   r      s   zCategoryCode.__init__r   c                 C   sf   t | }i }i }tt|D ]}t|| }| j||< ||| j< |  jd7  _q|| _|| _d S )Nr>   )	rC   rD   rE   rA   rF   r.   r   rN   rO   )r   r   rQ   rN   rO   rM   r   r   r   r   r      s   


zCategoryCode.compute_coder    c                 C   s   | j | gS r!   )rO   r"   r   r   r   r#      s   zCategoryCode.encoder%   c                 C   s   | j |d  S )Nr   )rN   r&   r   r   r   r'      rZ   zCategoryCode.decode)r+   r,   r-   r.   r/   r   r   r   r   r#   r'   ro   r   r   r6   r   r      s
    r   )r/   rd   categoryc                   @   s   e Zd Zdd Zedd ZdedefddZd	ed
ede	e
 fddZd	ede	e
 defddZde
de	ee
e
f  fddZedd ZdS )r   c                 C   s   i | _ g | _g | _d S r!   )column_codescolumnssizesr)   r   r   r   r     s   
zColumnCodes.__init__c                 C   s   | j | jd  jS )N)r   r   r   r)   r   r   r   
vocab_size
  s   zColumnCodes.vocab_sizer   ccodec                 C   s(   | j | || j|< | j|j d S r!   )r   rJ   r   r   r   )r   r   r   r   r   r   register  s   
zColumnCodes.registercolr   r    c                 C   s.   || j v r| j | |S td| d| )Nzcannot encode  )r   r#   rc   )r   r   r   r   r   r   r#     s   
zColumnCodes.encoder%   c                 C   s"   || j v r| j | |S td)Nzcannot decode)r   r'   rc   )r   r   r%   r   r   r   r'     s   
zColumnCodes.decode	column_idc                 C   s   | j | j|  jS r!   )r   r   r*   )r   r   r   r   r   	get_range  s   zColumnCodes.get_rangec                 C   s   |  }d}d }|D ]7}|d }t |d  }|di }	|d u r!|n|j}
|
|	d< ||	d< |di |	}|||  ||| q	|S )Nr   r   	code_typeargsr   r   r   )
column_mapgetr   r   r   )clscolumn_configsexample_arraysr   begccconfigr   coderr   r   r   r   r   get_column_codes"  s   zColumnCodes.get_column_codesN)r+   r,   r-   r   r1   r   r.   r   r   r   r/   r#   r'   r   r   classmethodr   r   r   r   r   r     s    
r   )r   typingr   r   r   numpyrC   r   sklearn.preprocessingr   r   r   
nemo.utilsr	   __all__objectr   r
   r   r   r   r   r   r   r   r   <module>   s   )k;