o
    ½Òi·  ã                   @   sd   d Z ddlZddlZddlmZ ddlmZ e d¡Zde	de	fdd	„Z
ddd„Zde	fdd„ZdS )av  
Utility functions for the extended Indic tokenizer.

The extended tokenizer uses BPE merges that compose byte-fallback tokens
(<0xNN>) into Indic subwords. Encoding works perfectly. Decoding requires
a post-processing step to convert <0xNN> patterns back to Unicode text,
because HF's ByteFallback decoder only handles individual byte tokens,
not merged multi-byte tokens.
é    N)ÚPath)ÚPreTrainedTokenizerFastz<0x([0-9A-Fa-f]{2})>ÚtextÚreturnc                 C   s    g }d}|t | ƒk rKt | |¡}|r:g }|r-| t| d¡dƒ¡ | ¡ }t | |¡}|s| t|ƒjddd¡ n| | | ¡ |d7 }|t | ƒk s
d 	|¡S )uH  Convert contiguous <0xNN> byte patterns in text to Unicode.

    Example: '<0xE0><0xA4><0xA8>' â†’ 'à¤¨' (Devanagari NA)

    This is needed because the extended tokenizer's BPE merges create
    tokens whose names contain <0xNN> patterns. The standard HF decoder
    only handles individual <0xNN> tokens, not merged ones.
    r   é   é   zutf-8Úreplace)ÚerrorsÚ )
ÚlenÚBYTE_PATTERNÚmatchÚappendÚintÚgroupÚendÚbytesÚdecodeÚjoin)r   ÚresultÚposÚmÚ	byte_vals© r   ú@/home/ubuntu/training/cohere-transcribe-indic/tokenizer_utils.pyÚdecode_byte_patterns   s    	ýõ
r   Tc                 C   s   | j ||d}t|ƒS )zŠDecode token IDs to Unicode text with byte-pattern fix.

    Use this instead of tokenizer.decode() for the extended Indic tokenizer.
    )Úskip_special_tokens)r   r   )Ú	tokenizerÚidsr   Úrawr   r   r   Údecode_tokens.   s   r    Ú
model_pathc              
   C   s¾   t | ƒ}|d }|d }| ¡ std|› ƒ‚| ¡ s"td|› ƒ‚t | ¡ ¡}tt|ƒ| d¡| d¡| d¡| d¡| d	g ¡d
d}|d }| ¡ r]t | ¡ ¡}| d|j	¡|_	|S )aW  Load the extended fast tokenizer without splitting control tokens.

    `from_pretrained()` currently honors `split_special_tokens=true` from the
    saved config, which breaks decoder prompts like
    `<|startofcontext|>...<|nodiarize|>` into dozens of subword pieces.
    For training we need those prompt/control tokens to stay atomic.
    ztokenizer.jsonzspecial_tokens_map.jsonzMissing tokenizer.json in z#Missing special_tokens_map.json in Ú	bos_tokenÚ	eos_tokenÚ	pad_tokenÚ	unk_tokenÚadditional_special_tokensF)Útokenizer_filer"   r#   r$   r%   r&   Úsplit_special_tokensztokenizer_config.jsonÚmodel_max_length)
r   ÚexistsÚFileNotFoundErrorÚjsonÚloadsÚ	read_textr   ÚstrÚgetr)   )r!   Ú	model_dirr'   Úspecial_tokens_fileÚspecialr   Útokenizer_configÚcfgr   r   r   Úload_extended_tokenizer7   s,   
ùr6   )T)Ú__doc__r,   ÚreÚpathlibr   Útransformersr   Úcompiler   r/   r   r    r6   r   r   r   r   Ú<module>   s    


	