o
    ߥi                     @   s|   d dl mZmZ d dlZd dlmZ d dlmZ dedefddZ	d	e
dedefd
dZG dd deZG dd deZdS )    )ListUnionN)AutoTokenizer)GPT2TokenizerFaststart_extra_idmax_lenc                    sv   dt dtf fdd}d}d}| D ]!}|dkr(|d7 }| kr'|||}d}q|||}d}|| }q|||}|S )	z Encode whitespaces to extra tokens in GPT-J.

    >>> encode_whitespaces('a\n  b\n   c', 10, 10)
    'a\n<|extratoken_10|>b\n<|extratoken_11|>c'
    acc_lentextc                    sX   | dkr|S | dkr|d S |  ksJ d  d|  d |  }d| d}|| S )	Nr       zMax whitespace run length z, but found    <|extratoken_|> )r   r	   extra_idextra_tokenr   r   r   \/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/codegeex/tokenizer.pypush_acc_space   s   z*encode_whitespaces.<locals>.push_acc_spacer    r   r
   )intstr)r	   r   r   r   r   reschr   r   r   encode_whitespaces	   s   




r   r	   c                 C   s@   t d|d D ]}|d | }d| d}| |d| } q| S )z Decode the whitespace-encoded strings produced by encode_whitespace.

    >>> text = 'a\n  b\n   c'
    >>> s, l = 10, 10
    >>> text == decode_whitespaces(encode_whitespaces(text, s, l), s, l)
    True
    r   r
   r   r   r   )rangereplace)r	   r   r   ltoken_idtokenr   r   r   decode_whitespaces,   s
   r    c                   @   s   e Zd Z		ddedee defddZdefd	d
ZdefddZdedefddZ	dd Z
defddZdefddZdeeef fddZdd Zdd ZdS ) Code13BDictionaryN	dict_fileextra_token_idspad_to_vocab_sizec                 C   s   t  | _t  | _d| _g | _| dd | dd | dd | dd | | |d u r9dd tdd	D }|D ]}| |d q;|dkrO| | d S d S )
Nr   z<s>z<pad>z</s>z<unk>c                 S   s   g | ]}t |qS r   )r   .0xr   r   r   
<listcomp>O   s    z.Code13BDictionary.__init__.<locals>.<listcomp>iQ  i  )	dict_idx_count_num_symbols_symbols_add_symbol
_load_dictr   _pad_to_vocab_size)selfr#   r$   r%   r   r   r   r   __init__=   s    
zCode13BDictionary.__init__
vocab_sizec                 C   sB   |t |  }|dkrd S td|d D ]}| d|d qd S )Nr   r
   zvocab_pad_token{})lenr   r/   format)r2   r4   num_padir   r   r   r1   X   s   z$Code13BDictionary._pad_to_vocab_sizec                 C   sr   t |d*}|D ]}| }|dks|drq| \}}| |t| qW d    d S 1 s2w   Y  d S )Nrr   #)openstrip
startswithsplitr/   r   )r2   r#   flinesymcountr   r   r   r0   _   s   "zCode13BDictionary._load_dictrA   rB   c                 C   s4   | j | j|< || j|< | j| |  j d7  _ d S )Nr
   )r-   r+   r,   r.   append)r2   rA   rB   r   r   r   r/   h   s   
zCode13BDictionary._add_symbolc                 C   s   | j S N)r-   r2   r   r   r   __len__n   s   zCode13BDictionary.__len__c                 C   
   | j | S rD   )r+   )r2   rA   r   r   r   indexq      
zCode13BDictionary.indexidxc                 C   rG   rD   )r.   )r2   rJ   r   r   r   stringt   rI   zCode13BDictionary.stringr   c                 C   s   t |tr	t|}| |S rD   )
isinstancer   r   rH   )r2   r   r   r   r   	map_tokenw   s   

zCode13BDictionary.map_tokenc                    s    fdd|D S )Nc                    s   g | ]}  |qS r   )rM   r'   r   rE   r   r   r)   }   s    z0Code13BDictionary.map_tokens.<locals>.<listcomp>r   )r2   tokensr   rE   r   
map_tokens|   s   zCode13BDictionary.map_tokensc                    s     fdd|D }dd |D S )Nc                    s"   g | ]}|d kr
dn  |qS )iP  50256)rK   rN   rE   r   r   r)      s    z3Code13BDictionary.decode_tokens.<locals>.<listcomp>c                 S   s   g | ]}| d st|qS )vocab_pad_token)r=   r   r&   r   r   r   r)      s    r   )r2   rO   decodedr   rE   r   decode_tokens   s   
zCode13BDictionary.decode_tokens)Nr"   )__name__
__module____qualname__r   r   r   r3   r1   r0   r/   rF   rH   rK   r   rM   rP   rT   r   r   r   r   r!   ;   s&    
	r!   c                   @   sN   e Zd Z						ddedededed	ef
d
dZdefddZdd ZdS )CodeGeeXTokenizerNEleutherAI/gpt-j-6B
   codegeex-13b	tokenizertokenizer_pathr   r   r#   c                 C   s|   |d ur|nt || _|dvrtd| d|| _|| _|| _|d ur4| jdkr0t|ddnd | _nd | _| jj	| _	d S )N)r[   codegeex-python-13bzInvalid mode z5, choose from ['codegeex-13b', 'codegeex-python-13b']r^   i   )r%   )
r   from_pretrainedr\   
ValueErrorr   r   moder!   	code_dicteos_token_id)r2   r\   r]   r   r   ra   r#   r   r   r   r3      s&   	

zCodeGeeXTokenizer.__init__codec                 C   sr   | j dkrt|| j| j}| j|ddj}|S | j dkr7t|| j| j}| j| j|}t	
|dd}|S )Nr[   F)is_split_into_wordsr^   r
   r"   )ra   r   r   r   r\   	input_idsrb   rP   encodetorch
LongTensorreshape)r2   rd   rf   r   r   r   encode_code   s   

zCodeGeeXTokenizer.encode_codec                 C   sr   | j dkr| jj|dd}t|| j| j}|S | j dkr7| j| d g}| jj|dd}t|| j| j}|S )Nr[   F)skip_special_tokensr^   r   )	ra   r\   decoder    r   r   rb   rT   tolist)r2   rf   r	   output_coder   r   r   decode_code   s   

zCodeGeeXTokenizer.decode_code)NrY   rZ   rZ   r[   N)	rU   rV   rW   r   r   r   r3   rk   rp   r   r   r   r   rX      s(    
rX   )typingr   r   rh   transformersr   transformers.models.gpt2r   r   r   r   r    objectr!   rX   r   r   r   r   <module>   s   #L