o
    ڷiH                  	   @  s  U d dl mZ d dlZd dlmZ d dlmZ ddlmZm	Z	m
Z
mZ ddlmZmZmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZmZmZmZmZ edZe  Z!e!"e#d g Z$de%d< g Z&de%d< eD ]!Z'zee'r{e$(e' ne&(e' W qn e)y   e&(e' Y qnw e$e& Z*de%d< 									d6d7d(d)Z+									d6d8d,d-Z,									d6d9d0d1Z-									d:d;d4d5Z.dS )<    )annotationsN)PathLike)BinaryIO   )coherence_ratioencoding_languagesmb_encoding_languagesmerge_coherence_ratios)IANA_SUPPORTEDIANA_SUPPORTED_SIMILARTOO_BIG_SEQUENCETOO_SMALL_SEQUENCETRACE)
mess_ratio)CharsetMatchCharsetMatches)any_specified_encodingcut_sequence_chunks	iana_nameidentify_sig_or_bomis_multi_byte_encodingshould_strip_sig_or_bomcharset_normalizerz)%(asctime)s | %(levelname)s | %(message)sz	list[str]_mb_supported_sb_supportedIANA_SUPPORTED_MB_FIRST      皙?TF皙?	sequencesbytes | bytearraystepsint
chunk_size	thresholdfloatcp_isolationlist[str] | Nonecp_exclusionpreemptive_behaviourboolexplainlanguage_thresholdenable_fallbackreturnr   c
           @      C  s  t | ttfstdt| |rtj}
tt	 t
t t| }|dkrDtd |r8tt	 t
|
 tt| dddg dgS |durZttd	d
| dd |D }ng }|durrttdd
| dd |D }ng }||| krttd||| d}|}|dkr|| |k rt|| }t| tk }t| tk}|rttd| n|rttd| g }|rt| nd}|dur|| ttd| t }g }g }t }t }i }d}t }d}d}d}d}d}d}t }t }t| \} }!| dur||  ttdt|!|  |d d|vr#|d |t D ]'}"|r4|"|vr4q'|r>|"|v r>q'|"|v rEq'||" d}#| |"k}$|$oVt|"}%|"dv rh|$shttd|" q'|"dv ry|$syttd|" q'|"|v rttd|" q'|"|v rttd|" q'zt|"}&W n ttfy   ttd|" Y q'w |r|&stt |"}'ntt!|"}'|'"|sttd|"|'| q'|r|&s||krttd|"|| q'|r|&sttd |" q'z9|r|&du rt#|%du r| dtd! n	| t|!td! |"d" nt#|%du r&| n| t|!d |"d"}#W n+ t$t%fy] }( zt |(t%sMttd#|"t#|( ||" W Y d}(~(q'd}(~(ww t&|$sddnt|!|t|| })|&o||#duo|t|#|k }*|*rttd$|" |#dur{|&s{t'|#}+|(|+},|,dur{|,\}-}.}/|/rCt| |"|-|$|.|du s|"|ddfv r|#nd|d%}0||0 ||" ttd&|"t)|-d' d(d) |"|ddfv r	|-d*k r	|-dkrtd+|0j* |rtt	 t
|
 t|0g  S ||0 t|rA|du s||v rAd|v rAd|v rA|+ }1td+|1j* |r:tt	 t
|
 t|1g  S q'||" ttd,|" |	ry|"dd|d-d.fv ryt| |"||$g |#|d%}2|"|kro|2}n
|"dkrw|2}n|2}q'tt|)d/ }3t,|3d0}3d}4d}5g }6g }7zLt-| |"|)||$|%|!|&|#	D ]=}8|6|8 |7t.|8||d1u odt|  kod0kn   |7d2 |kr|4d7 }4|4|3ks|$r|%du r nqW n! t$y }( zttd3|"t#|( |3}4d1}5W Y d}(~(nd}(~(ww |5s:|r:|&s:z| td4d j/|"d5d6 W n# t$y9 }( zttd7|"t#|( ||" W Y d}(~(q'd}(~(ww |7rEt0|7t|7 nd}9|9|ksQ|4|3kr||" |"t1v rb|2t1|"  |#duru|&su|3t'|#|9g df ttd8|"|4t)|9d' d(d) |	r|"dd|d-d.fv r|5st| |"||$g |#|d%}2|"|kr|2}n
|"dkr|2}n|2}q'ttd9|"t)|9d' d(d) |&st |"}:nt!|"}:|:rttd:|"t#|: g };|"dkr|6D ]}8t4|8||:rd;|:nd}<|;|< qt5|;}=nt5|;}=|=rttd<|=|" t| |"|9|$|=|du s%|"|ddfv r'|#nd|d%}>||> |#durD|&sD|3t'|#|9|=d1f |rS|&sS|9d=k rS|d7 }|"|ddfv r|9d*k r|9dkrtd+|>j* |rytt	 t
|
 t|>g  S ||> t|r|du s||v rd|v rd|v r|+ }1td+|1j* |rtt	 t
|
 t|1g  S |s|&s|=rt,d>d? |=D dd@nd}?|?dAkrd|v rd|v rd1}|2|: ttdB|"|9|? |s-|&r-|*r-|#dur-t|#|dC k r-|"dDvr-d|v r-d|v r-d1}ttdE|"|9t|#|t|#| d'  |"| krNtdF|" |rEtt	 t
|
 t||" g  S q't|dkr|s`|s`|rfttdG |rvtdH|j* || n2|r~|du s|r|r|j6|j6ks|durtdI || n|rtdJ || |rtdK|+ j*t|d  ntdL |rtt	 t
|
 |S )Maf  
    Given a raw bytes sequence, return the best possibles charset usable to render str objects.
    If there is no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
    And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.

    The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
    but never take it for granted. Can improve the performance.

    You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
    purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
    toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
    Custom logging format and handler can be set manually.
    z3Expected object of type bytes or bytearray, got: {}r   z<Encoding detection on empty bytes, assuming utf_8 intention.utf_8g        F Nz`cp_isolation is set. use this flag for debugging purpose. limited list of encoding allowed : %s.z, c                 S     g | ]}t |d qS Fr   .0cp r8   L/home/ubuntu/vllm_env/lib/python3.10/site-packages/charset_normalizer/api.py
<listcomp>s       zfrom_bytes.<locals>.<listcomp>zacp_exclusion is set. use this flag for debugging purpose. limited list of encoding excluded : %s.c                 S  r2   r3   r4   r5   r8   r8   r9   r:   ~   r;   z^override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.r   z>Trying to detect encoding from a tiny portion of ({}) byte(s).zIUsing lazy str decoding because the payload is quite large, ({}) byte(s).z@Detected declarative mark in sequence. Priority +1 given for %s.   zIDetected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.ascii>   utf_16utf_32z\Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.>   utf_7zREncoding %s won't be tested as-is because detection is unreliable without BOM/SIG.zY%s is deemed too similar to a code page that was already considered unsuited. Continuing!zESkipping %s: already fast-tracked from a similar successful encoding.z2Encoding %s does not provide an IncrementalDecoderzbSkipping %s: definitive match already found, this encoding targets different languages (%s vs %s).zXSkipping %s: already accumulated %d same-family results after definitive match (cap=%d).zCSkipping single-byte %s: multi-byte definitive match already found.g    A)encodingz9Code page %s does not fit given bytes sequence at ALL. %szpCode page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes.)preemptive_declarationzM%s fast-tracked (identical decoded payload to a prior encoding, chaos=%f %%).d      )ndigitsr   z.Encoding detection: %s is most likely the one.zZ%s fast-skipped (identical decoded payload to a prior encoding that failed chaos probing).r>   r?         TzaLazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %sg     j@strict)errorsz^LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %szc%s was excluded because of initial chaos probing. Gave up %i time(s). Computed mean chaos is %f %%.z=%s passed initial chaos probing. Mean measured chaos is %f %%z&{} should target any language(s) of {},z We detected language {} using {}g{Gz?c                 s  s    | ]\}}|V  qd S )Nr8   )r6   _vr8   r8   r9   	<genexpr>  s    zfrom_bytes.<locals>.<genexpr>)defaultg      ?zyDefinitive match found: %s (chaos=%.3f, coherence=%.2f). Encodings targeting different language families will be skipped.g\(\?>	   r@   r0   r>   r?   	utf_16_be	utf_16_le	utf_32_be	utf_32_le	utf_8_sigzjMulti-byte definitive match: %s (chaos=%.3f, decoded=%d/%d=%.1f%%). Single-byte encodings will be skipped.zoEncoding detection: %s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.zONothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.z7Encoding detection: %s will be used as a fallback matchz:Encoding detection: utf_8 will be used as a fallback matchz:Encoding detection: ascii will be used as a fallback matchz]Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.z=Encoding detection: Unable to determine any suitable charset.)7
isinstance	bytearraybytes	TypeErrorformattypeloggerlevel
addHandlerexplain_handlersetLevelr   lendebugremoveHandlerr   r   logjoinr#   r   r   r   appendsetr   r   addr   r   ModuleNotFoundErrorImportErrorr   r   intersectionstrUnicodeDecodeErrorLookupErrorrangehashgetroundrA   bestmaxr   r   decodesumr   update
setdefaultr   r	   fingerprint)@r    r"   r$   r%   r'   r)   r*   r,   r-   r.   previous_logger_levellengthis_too_small_sequenceis_too_large_sequenceprioritized_encodingsspecified_encodingtestedtested_but_hard_failuretested_but_soft_failuresoft_failure_skipsuccess_fast_trackedpayload_result_cachedefinitive_match_founddefinitive_target_languages post_definitive_sb_success_countPOST_DEFINITIVE_SB_CAPmb_definitive_match_foundfallback_asciifallback_u8fallback_specifiedresultsearly_stop_resultssig_encodingsig_payloadencoding_ianadecoded_payloadbom_or_sig_availablestrip_sig_or_bomis_multi_byte_decoderenc_languageser_multi_byte_bonuspayload_hashcachedcached_mess	cached_cdcached_passed
fast_matchprobable_resultfallback_entrymax_chunk_gave_upearly_stop_countlazy_str_hard_failure	md_chunks	md_ratioschunkmean_mess_ratiotarget_languages	cd_ratioschunk_languagescd_ratios_mergedcurrent_matchbest_coherencer8   r8   r9   
from_bytes9   s  





	











	




















	


&







	















	
















r   fpr   c
           
      C  s   t |  |||||||||	
S )z
    Same thing than the function from_bytes but using a file pointer that is already ready.
    Will not close the file pointer.
    )r   read)
r   r"   r$   r%   r'   r)   r*   r,   r-   r.   r8   r8   r9   from_fpR  s   r   pathstr | bytes | PathLikec
                 C  sH   t | d}
t|
|||||||||	
W  d   S 1 sw   Y  dS )z
    Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
    Can raise IOError.
    rbN)openr   )r   r"   r$   r%   r'   r)   r*   r,   r-   r.   r   r8   r8   r9   	from_pathp  s   $r   fp_or_path_or_payload!PathLike | str | BinaryIO | bytesc
                 C  s   t | ttfrt| |||||||||	d
}
|
 S t | ttfr0t| |||||||||	d
}
|
 S t| |||||||||	d
}
|
 S )a)  
    Detect if the given input (file, bytes, or path) points to a binary file. aka. not a string.
    Based on the same main heuristic algorithms and default kwargs at the sole exception that fallbacks match
    are disabled to be stricter around ASCII-compatible but unlikely to be a string.
    )	r"   r$   r%   r'   r)   r*   r,   r-   r.   )rU   rk   r   r   rW   rV   r   r   )r   r"   r$   r%   r'   r)   r*   r,   r-   r.   guessesr8   r8   r9   	is_binary  s\   -r   )	r   r   r   NNTFr   T)r    r!   r"   r#   r$   r#   r%   r&   r'   r(   r)   r(   r*   r+   r,   r+   r-   r&   r.   r+   r/   r   )r   r   r"   r#   r$   r#   r%   r&   r'   r(   r)   r(   r*   r+   r,   r+   r-   r&   r.   r+   r/   r   )r   r   r"   r#   r$   r#   r%   r&   r'   r(   r)   r(   r*   r+   r,   r+   r-   r&   r.   r+   r/   r   )	r   r   r   NNTFr   F)r   r   r"   r#   r$   r#   r%   r&   r'   r(   r)   r(   r*   r+   r,   r+   r-   r&   r.   r+   r/   r+   )/
__future__r   loggingosr   typingr   cdr   r   r   r	   constantr
   r   r   r   r   mdr   modelsr   r   utilsr   r   r   r   r   r   	getLoggerr[   StreamHandlerr^   setFormatter	Formatterr   __annotations__r   _supported_encre   ri   r   r   r   r   r   r8   r8   r8   r9   <module>   s     
		
      ! !