o
    i                     @   s&  d Z ddlZddlZddlmZ g dZg dZg dZddd	d
ddddddd
ZddddddddZ	g dZ
edZdedefddZdedefd d!Zdd#edefd$d%Zd&edefd'd(Zed)Zed*ejZed+Zed,Zed-Zed.Zed/Zed0Zed1ejZed2Zed3Zed4ejZed5Z ed6Z!ed7ejZ"ed8Z#ed9Z$ed:Z%ed;Z&ed<Z'dedefd=d>Z(d?edefd@dAZ)d?edefdBdCZ*d?edefdDdEZ+d?edefdFdGZ,d?edefdHdIZ-d?edefdJdKZ.d?edefdLdMZ/dd?edOe0defdPdQZ1d?edefdRdSZ2d?edefdTdUZ3d?edefdVdWZ4d?edefdXdYZ5d?edefdZd[Z6d?edefd\d]Z7d?edefd^d_Z8dd?ed`e0defdadbZ9d?edefdcddZ:dd?edfedefdgdhZ;dd?edfedefdidjZ<d?edefdkdlZ=dd?edfedefdmdnZ>dd?edfedefdodpZ?d?edefdqdrZ@d?edefdsdtZAdd?edvedefdwdxZBd?edefdydzZCd?edefd{d|ZDdd?ed}eeE defd~dZFG dd dZGeHdkreG ZIg dZJeKd eKd eKd eJD ]\ZLZMeKdeL d eKdeM  eKdeIeM  qeKd eKd! eKd dD ]ZNeKdeNddeeN  q9eKd eKd% eKd dD ]ZOeKdeO deeO  qXeKd eKd eKd eGdNdZPdD ]ZMeKdeM  eKdePeM  q{dS dS )zU
text_preprocessing.py
A comprehensive text preprocessing library for NLP pipelines.
    N)Optional) onetwothreefourfivesixseveneightnineteneleventwelvethirteenfourteenfifteensixteen	seventeeneighteennineteen)
r   r   twentythirtyfortyfiftysixtyseventyeightyninety)r   thousandmillionbilliontrillionfirstsecondthirdfourthfifthsixthseventheighthninthtwelfth)
r   r   r   r   r   r	   r
   r   r   r   dollareuropoundyenrupeewonbitcoin)$u   €   £   ¥u   ₹u   ₩u   ₿))  M)i  CM)  D)i  CD)d   C)Z   XC)2   L)(   XL)
   X)	   IX)   V)   IV)   Iz<\b(M{0,4})(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\bnreturnc                 C   s   | dkrdS g }| d }| d }|r| t|  d |dk r*|r)| t|  nt|d  }t|d  }| |rA| d| n| d|S )	u*   Convert a number 0–999 to English words.r   r   r=    hundred   rE   - )append_ONES_TENSjoin)rO   partshundreds	remainder	tens_word	ones_word r^   H/home/ubuntu/.local/lib/python3.10/site-packages/kittentts/preprocess.py_three_digits_to_words,   s   
r`   c                 C   s   t | ts	t| } | dkrdS | dk rdt|   S d|   kr%dkrBn n| d dkrB| d dkrB| d }|dk rBt|  dS g }ttD ](\}}| d }|rft|}||rc| d	|  n| | d } | dkrp nqHd		t
|S )
u  
    Convert an integer to its English word representation.

    Examples:
        1200      → "twelve hundred"
        1000      → "one thousand"
        1_000_000 → "one million"
        -42       → "negative forty-two"
        0         → "zero"
    r   zero	negative r=   i'  r7   rR   rQ   rT   )
isinstanceintnumber_to_wordsrV   	enumerate_SCALEr`   rU   striprX   reversed)rO   rZ   rY   iscalechunkchunk_wordsr^   r^   r_   re   ?   s*   
0 re   pointdecimal_sepc           	         s   t | tr| n|  }|d}|r|dd }d|v rM|dd\}}|r+tt|nd}dgtdd   d fdd|D }| d| d| }ntt|}|rZd	| S |S )
uu  
    Convert a float (or numeric string) to words, reading decimal digits individually.
    Accepts a string to preserve trailing zeros (e.g. "1.50" → "one point five zero").

    Examples:
        3.14   → "three point one four"
        -0.5   → "negative zero point five"
        "3.10" → "three point one zero"
        1.007  → "one point zero zero seven"
    rS   rM   N.ra   rT   c                 3   s    | ]	} t | V  qd S N)rd   ).0d	digit_mapr^   r_   	<genexpr>z   s    z!float_to_words.<locals>.<genexpr>rb   )rc   str
startswithsplitre   rd   rV   rX   )	valuero   textnegativeint_partdec_part	int_words	dec_wordsresultr^   rt   r_   float_to_wordse   s   
r   sc                 C   sT   dddddddd}d	}d	}t |  D ]}|| }|||kr!|n| 7 }|}q|S )
z-Convert a Roman numeral string to an integer.rM   rI   rE   rA   r=   r:   r7   )rN   rJ   rF   rB   r>   r;   r8   r   )ri   upper)r   valr   prevchcurrr^   r^   r_   roman_to_int   s   r   zhttps?://\S+|www\.\S+z\b[\w.+-]+@[\w-]+\.[a-z]{2,}\bz#\w+z@\w+z<[^>]+>z![^\w\s.,?!;:\-\u2014\u2013\u2026]z\s+z(?<![a-zA-Z])-?[\d,]+(?:\.\d+)?z\b(\d+)(st|nd|rd|th)\bz(-?[\d,]+(?:\.\d+)?)\s*%uD   ([$€£¥₹₩₿])\s*([\d,]+(?:\.\d+)?)\s*([KMBT])?(?![a-zA-Z\d])z-\b(\d{1,2}):(\d{2})(?::(\d{2}))?\s*(am|pm)?\bz(?<!\w)(\d+)-(\d+)(?!\w)z/\b([a-zA-Z][a-zA-Z0-9]*)-(\d[\d.]*)(?=[^\d.]|$)u`   (\d+(?:\.\d+)?)\s*(km|kg|mg|ml|gb|mb|kb|tb|hz|khz|mhz|ghz|mph|kph|°[cCfF]|[cCfF]°|ms|ns|µs)\bz5(?<![a-zA-Z])(\d+(?:\.\d+)?)\s*([KMBT])(?![a-zA-Z\d])z<(?<![a-zA-Z\d])(-?\d+(?:\.\d+)?)[eE]([+-]?\d+)(?![a-zA-Z\d])z\b(\d+)\s*/\s*(\d+)\bz\b(\d{1,3})0s\bz(?<!\d)\.([\d])c           	      C   s   t | }d|v r|dd\}}d}n|dd}t|dkr(|d |d dfnd|d df\}}}t D ]\}}||krB|} nq6|drM|d }n|d	r[|d
d d }n|d }|ri| | | S |S )uY   Return the ordinal word for n (e.g. 1 → 'first', 5 → 'fifth', 21 → 'twenty-first').rS   rM   rT      r   r   theNth)re   rsplitlen_ORDINAL_EXCEPTIONSitemsendswith)	rO   wordprefixlastjoinerrY   baseordinallast_ordr^   r^   r_   _ordinal_suffix   s"   2


r   r{   c                 C       dt jdtfdd}t|| S )u  
    Convert ordinal numbers to words.

    Examples:
        "1st place"  → "first place"
        "2nd floor"  → "second floor"
        "3rd base"   → "third base"
        "21st century" → "twenty-first century"
        "100th day"  → "one hundredth day"
    mrP   c                 S   s   t t| dS )NrM   )r   rd   groupr   r^   r^   r_   _replace   s   z!expand_ordinals.<locals>._replace)reMatchrw   _RE_ORDINALsubr{   r   r^   r^   r_   expand_ordinals   s   r   c                 C   r   )u   
    Expand percentage expressions.

    Examples:
        "50% off"    → "fifty percent off"
        "3.5% rate"  → "three point five percent rate"
        "-2% change" → "negative two percent change"
    r   rP   c                 S   s:   |  ddd}d|v rtt|d S tt|d S )NrM   ,r   rp   z percent)r   replacer   floatre   rd   r   rawr^   r^   r_   r      s   z$expand_percentages.<locals>._replace)r   r   rw   _RE_PERCENTr   r   r^   r^   r_   expand_percentages   s   	r   c                    2   ddddd dt jdtf fdd	}t|| S )
up  
    Expand currency amounts, including optional scale suffixes.

    Examples:
        "$100"      → "one hundred dollars"
        "€1,200.50" → "twelve hundred euros and fifty cents"
        "£9.99"     → "nine pounds and ninety-nine cents"
        "$85K"      → "eighty five thousand dollars"
        "$2.5M"     → "two point five million dollars"
    r   r    r!   r"   Kr8   BTr   rP   c                    s@  |  d}|  ddd}|  d}t|d}|r? | }d|v r't|ntt|}| d| d| |r9dnd  S d|v r|dd\}}t|d d 	dd	}	tt|}
|rg|
 d| dn|
}|	rt|	}|d
| d|	dkrzdnd 7 }|S t|}t|}|r| d| |dkr|rdnd n|}|S )NrM   r   r   r      rp   rT   r   0z and z cent)
r   r   _CURRENCY_SYMBOLSgetr   re   rd   rh   ry   ljust)r   symbolr   scale_suffixunit
scale_wordnumr}   r~   dec_valr   r   centsr   words
_scale_mapr^   r_   r     s*   

$ *z!expand_currency.<locals>._replace)r   r   rw   _RE_CURRENCYr   r   r^   r   r_   expand_currency  s   r   c                 C   r   )u   
    Expand time expressions.

    Examples:
        "3:30pm"  → "three thirty pm"
        "14:00"   → "fourteen hundred"
        "9:05 AM" → "nine oh five am"
        "12:00pm" → "twelve pm"
    r   rP   c                 S   s   t | d}t | d}| drd| d  nd}t|}|dkr8| ds2| d| S | | S |dk rG| d	t| | S | dt| | S )
NrM   r   rK   rT   r   r   rQ   rE   z oh )rd   r   lowerre   )r   r   minssuffixh_wordsr^   r^   r_   r   <  s    $zexpand_time.<locals>._replace)r   r   rw   _RE_TIMEr   r   r^   r^   r_   expand_time2  s   
r   c                 C   r   )u   
    Expand numeric ranges.

    Examples:
        "10-20 items"   → "ten to twenty items"
        "pages 100-200" → "pages one hundred to two hundred"
        "2020-2024"     → "twenty twenty to twenty twenty-four"
    r   rP   c                 S   s2   t t| d}t t| d}| d| S )NrM   r   z to )re   rd   r   )r   lohir^   r^   r_   r   S  s   zexpand_ranges.<locals>._replace)r   r   rw   	_RE_RANGEr   r   r^   r^   r_   expand_rangesJ  s   	r   c                 C   s   t dd | S )u  
    Normalise version/model names that use letter-hyphen-number patterns,
    so the number is not misread as negative.

    Examples:
        "GPT-3"      → "GPT 3"
        "gpt-3.5"    → "gpt 3.5"
        "GPL-3"      → "GPL 3"
        "Python-3.10"→ "Python 3.10"
        "v2.0"       stays as "v2.0" (no hyphen — handled by number replacement)
        "IPv6"       stays as "IPv6"
    c                 S   s   |  d d|  d S )NrM   rT   r   )r   r   r^   r^   r_   <lambda>g  s    z$expand_model_names.<locals>.<lambda>)_RE_MODEL_VERr   r{   r^   r^   r_   expand_model_namesZ  s   r   c                    s   i ddddddddd	d
dddddddddddddddddddddd d!d"d#d#d$d$d% d&t jd'tf fd(d)}t|| S )*u   
    Expand common measurement units glued to numbers.

    Examples:
        "100km"  → "one hundred kilometers"
        "50kg"   → "fifty kilograms"
        "25°C"   → "twenty-five degrees Celsius"
        "5GB"    → "five gigabytes"
    km
kilometerskg	kilogramsmg
milligramsmlmillilitersgb	gigabytesmb	megabyteskb	kilobytestb	terabyteshzhertzkhz	kilohertzmhz	megahertzghz	gigahertzmphzmiles per hourkphzkilometers per hourmsmillisecondsnsnanosecondsu   µsmicrosecondszdegrees Celsiuszdegrees Fahrenheit)u   °cu   c°u   °fu   f°r   rP   c                    sX   |  d}|  d } ||  d}d|v rtt|ntt|}| d| S NrM   r   rp   rT   )r   r   r   r   r   re   rd   )r   r   r   expandedr   	_unit_mapr^   r_   r   ~  s
   
 zexpand_units.<locals>._replace)r   r   rw   _RE_UNITr   r   r^   r   r_   expand_unitsj  sP   

r   Tcontext_wordsc                    s4   t dt j dt jdtf fdd}t|S )ux  
    Expand Roman numerals that appear as standalone tokens (optionally
    only when preceded by a title-like word to avoid false positives).

    Examples:
        "World War II"     → "World War two"
        "Chapter IV"       → "Chapter four"
        "Louis XIV"        → "Louis fourteen"
        "mix I with V"     → left unchanged (ambiguous single letters)
    z\b(war|chapter|part|volume|act|scene|book|section|article|king|queen|pope|louis|henry|edward|george|william|james|phase|round|level|stage|class|type|version|episode|season)\br   rP   c                    s   |  d}| s|S t|dkr+|dv r+|  }td|d | } |s+|S zt|}|dkr7|W S t|W S  tyF   | Y S w )Nr   rM   IVX   )	r   rh   r   startmaxsearchr   re   	Exception)r   romanr   	precedingr   _TITLE_WORDSr{   r^   r_   r     s    


z'expand_roman_numerals.<locals>._replace)r   compile
IGNORECASEr   rw   	_RE_ROMANr   )r{   r   r   r^   r  r_   expand_roman_numerals  s   r  c                 C   s   t dd| } td| S )u   
    Normalise bare leading-decimal floats so the number pipeline handles them.

    Examples:
        ".5 teaspoons" → "0.5 teaspoons"
        "-.25 adjustment" → "-0.25 adjustment"
    z(?<!\d)(-)\.([\d])z	\g<1>0.\2z0.\1)r   r   _RE_LEAD_DECr   r^   r^   r_   normalize_leading_decimals  s   	r  c                 C   r   )u  
    Expand scientific-notation numbers to spoken form.

    Examples:
        "1e-4"    → "one times ten to the negative four"
        "2.5e10"  → "two point five times ten to the ten"
        "6.022E23"→ "six point zero two two times ten to the twenty three"
    r   rP   c                 S   sb   |  d}t|  d}d|v rt|ntt|}tt|}|dk r&dnd}| d| | S )NrM   r   rp   r   rb   r   z times ten to the )r   rd   r   re   abs)r   	coeff_rawexpcoeff_words	exp_wordssignr^   r^   r_   r     s   
z,expand_scientific_notation.<locals>._replace)r   r   rw   _RE_SCIr   r   r^   r^   r_   expand_scientific_notation  s   	r  c                    r   )
uT  
    Expand standalone uppercase scale suffixes attached to numbers.

    Examples:
        "7B parameters" → "seven billion parameters"
        "340M model"    → "three hundred forty million model"
        "1.5K salary"   → "one point five thousand salary"
        "$100K budget"  → "$100K budget"  (currency handled upstream)
    r   r    r!   r"   r   r   rP   c                    sJ   |  d}|  d} ||}d|v rt|ntt|}| d| S r   )r   r   r   re   rd   )r   r   r   r   r   _mapr^   r_   r     s
   

z'expand_scale_suffixes.<locals>._replace)r   r   rw   	_RE_SCALEr   r   r^   r  r_   expand_scale_suffixes  s   
r  c                 C   r   )u   
    Expand simple numeric fractions.

    Examples:
        "1/2 cup"  → "one half cup"
        "3/4 mile" → "three quarters mile"
        "2/3 done" → "two thirds done"
        "5/8 inch" → "five eighths inch"
    r   rP   c                 S   s   t | d}t | d}|dkr|  S t|}|dkr'|dkr$dnd}n|dkr4|dkr1dnd}nt|}|dkr@|d	7 }| d
| S )NrM   r   r   halfhalvesrK   quarterquartersr   rT   )rd   r   re   r   )r   r   den	num_words
denom_wordr^   r^   r_   r     s   z"expand_fractions.<locals>._replace)r   r   rw   _RE_FRACTIONr   r   r^   r^   r_   expand_fractions  s   
r  c                    s>   ddddddddd	d
d
 dt jdtf fdd}t|| S )u   
    Expand decade expressions to words.

    Examples:
        "the 80s"    → "the eighties"
        "the 1980s"  → "the nineteen eighties"
        "the 2020s"  → "the twenty twenties"
        "'90s music" → "nineties music"
    rZ   tenstwentiesthirtiesfortiesfiftiessixties	seventieseightiesnineties)
r   rM   r   r   rK   rI            rG   r   rP   c                    sH   t | d}|d } |d}|dk r|S |d }t| d| S )NrM   rE   r   rT   )rd   r   r   re   )r   r   decade_digitdecade_wordcentury_part_decade_mapr^   r_   r     s   z expand_decades.<locals>._replace)r   r   rw   
_RE_DECADEr   r   r^   r-  r_   expand_decades  s
   

	r0  c                    sV   ddddddddd	d
d
 dt dt f fdddtjdt ffdd}td|| S )u   
    Expand IPv4 addresses to spoken digits per octet.

    Examples:
        "192.168.1.1"  → "one nine two dot one six eight dot one dot one"
        "10.0.0.1"     → "one zero dot zero dot zero dot one"
    ra   r   r   r   r   r   r	   r
   r   r   
r   123456789r   rP   c                       d  fdd| D S )NrT   c                 3       | ]} | V  qd S rq   r^   rr   c_dr^   r_   rv   )      z6expand_ip_addresses.<locals>._octet.<locals>.<genexpr>rX   r   r?  r^   r_   _octet(     z#expand_ip_addresses.<locals>._octetr   c                    s   d  fdd|  D S )Nz dot c                 3       | ]} |V  qd S rq   r^   rr   grD  r^   r_   rv   ,  rA  z8expand_ip_addresses.<locals>._replace.<locals>.<genexpr>)rX   groupsr   rI  r^   r_   r   +  s   z%expand_ip_addresses.<locals>._replacez.\b(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})\b)rw   r   r   r   r   r^   )r@  rD  r_   expand_ip_addresses  s   

rK  c                    s   ddddddddd	d
d
 dt dt f fdddt ffddtdfdd| } tdfdd| } tdfdd| } | S )uR  
    Expand US phone numbers to spoken digits before range expansion claims the hyphens.

    Examples:
        "555-1234"       → "five five five one two three four"
        "555-123-4567"   → "five five five one two three four five six seven"
        "1-800-555-0199" → "one eight zero zero five five five zero one nine nine"
    ra   r   r   r   r   r   r	   r
   r   r   r1  r   rP   c                    r;  )NrT   c                 3   r<  rq   r^   r=  r?  r^   r_   rv   >  rA  z8expand_phone_numbers.<locals>._digits.<locals>.<genexpr>rB  rC  r?  r^   r_   _digits=  rE  z%expand_phone_numbers.<locals>._digitsc                     r;  )NrT   c                 3   rF  rq   r^   rG  rL  r^   r_   rv   A  rA  z6expand_phone_numbers.<locals>._join.<locals>.<genexpr>rB  rJ  rM  r^   r_   _join@  rE  z#expand_phone_numbers.<locals>._joinz;(?<!\d-)(?<!\d)\b(\d{1,2})-(\d{3})-(\d{3})-(\d{4})\b(?!-\d)c                        |    S rq   rN  r   rO  r^   r_   r   F      z&expand_phone_numbers.<locals>.<lambda>z1(?<!\d-)(?<!\d)\b(\d{3})-(\d{3})-(\d{4})\b(?!-\d)c                    rP  rq   rN  r   rQ  r^   r_   r   I  rR  z"(?<!\d-)\b(\d{3})-(\d{4})\b(?!-\d)c                    rP  rq   rN  r   rQ  r^   r_   r   L  rR  )rw   r   r   r   r^   )r@  rL  rO  r_   expand_phone_numbers1  s   
	
rS  replace_floatsc                    s$   dt jdtf fdd}t|| S )u0  
    Replace all numeric tokens with their word equivalents.

    Examples:
        "There are 1200 students" → "There are twelve hundred students"
        "Pi is 3.14"              → "Pi is three point one four"
        "gpt-3 rocks"             → "gpt-3 rocks"  (hyphen not treated as minus)
    r   rP   c              	      sX   |   dd}zd|v r rt|W S ttt|W S  ttfy+   |    Y S w )Nr   r   rp   )r   r   r   re   rd   r   
ValueErrorOverflowErrorr   rT  r^   r_   r   ]  s   
z!replace_numbers.<locals>._replace)r   r   rw   
_RE_NUMBERr   )r{   rT  r   r^   rW  r_   replace_numbersT  s   	
rY  c                 C   s   |   S )zConvert text to lowercase.r   r   r^   r^   r_   to_lowercasej  s   r[  r   replacementc                 C      t ||  S )zRemove URLs from text.)_RE_URLr   rh   r{   r\  r^   r^   r_   remove_urlso     r`  c                 C   r]  )z!Remove email addresses from text.)	_RE_EMAILr   rh   r_  r^   r^   r_   remove_emailst  ra  rc  c                 C      t d| S )zStrip HTML tags from text.rT   )_RE_HTMLr   r   r^   r^   r_   remove_html_tagsy     rf  c                 C      t || S )z&Remove hashtags (e.g. #NLP) from text.)_RE_HASHTAGr   r_  r^   r^   r_   remove_hashtags~  rg  rj  c                 C   rh  )zRemove @mentions from text.)_RE_MENTIONr   r_  r^   r^   r_   remove_mentions  rg  rl  c                 C   rd  )zXRemove non-prosodic punctuation, keeping marks that affect speech rhythm and intonation.rT   )	_RE_PUNCTr   r   r^   r^   r_   remove_punctuation  rg  rn  c                 C   s   t d|  S )zKCollapse multiple whitespace characters into a single space and strip ends.rT   )
_RE_SPACESr   rh   r   r^   r^   r_   remove_extra_whitespace  ra  rp  NFCformc                 C   rh  )z7Normalize unicode characters (NFC, NFD, NFKC, or NFKD).)unicodedata	normalize)r{   rr  r^   r^   r_   normalize_unicode  rg  ru  c                 C   s    t d| }ddd |D S )z3Remove diacritical marks (accents) from characters.NFDr   c                 s   s"    | ]}t |d kr|V  qdS )MnN)rs  categoryr=  r^   r^   r_   rv     s     z!remove_accents.<locals>.<genexpr>)rs  rt  rX   )r{   nfkdr^   r^   r_   remove_accents  s   rz  c                 C   sH   ddddddddd	d
ddd}|  D ]\}}tj||| tjd} q| S )u   
    Expand common English contractions.

    Examples:
        "don't"   → "do not"
        "they're" → "they are"
        "I've"    → "I have"
    cannotzwill notz	shall notzis notzlet usz\1 notz\1 arez\1 havez\1 willz\1 wouldz\1 amzit is)z	\bcan't\bz	\bwon't\bz
\bshan't\bz	\bain't\bz	\blet's\bz\b(\w+)n't\bz\b(\w+)'re\bz\b(\w+)'ve\bz\b(\w+)'ll\bz\b(\w+)'d\bz\b(\w+)'m\bz\bit's\b)flags)r   r   r   r  )r{   contractionspatternr\  r^   r^   r_   expand_contractions  s    
r  	stopwordsc                    s0    du rh d |   }d fdd|D S )z
    Remove stopwords from text.

    Args:
        stopwords: Set of words to remove. Uses a built-in English set if None.
    N>6   arj   anatbebydoheinisitmemyofonortoweandarebutdidforhadhasherhimitsmayourshethewasyoubeendoesfromhavethatthemtheythiswerewillwithyourbeingcouldmighttheirthesethosewouldshouldrT   c                 3   s     | ]}|   vr|V  qd S rq   rZ  )rr   r   r  r^   r_   rv     s    z#remove_stopwords.<locals>.<genexpr>)ry   rX   )r{   r  tokensr^   r  r_   remove_stopwords  s   r  c                =   @   s   e Zd ZdZ																														d+dedededed	ed
edededededededededededededededededededededee d ed!ed"ef<d#d$Zd%ed&efd'd(Z	d%ed&efd)d*Z
dS ),TextPreprocessoru  
    Configurable preprocessing pipeline.

    Usage:
        pp = TextPreprocessor(
            lowercase=True,
            replace_numbers=True,
            remove_urls=True,
            remove_html=True,
            remove_punctuation=True,
        )
        clean = pp("GPT-3 costs $0.002 per token — 50% cheaper than before!")
        # → "gpt three costs zero dollars and zero point two cents per token fifty percent cheaper than before"
    TFN	lowercaserY  rT  r  r   r   r   r   r   r   r   r  r  r  r0  rS  rK  r  r  r`  rc  remove_htmlrj  rl  rn  r  r  ru  rz  rp  c                 C   s    dd t   D | _|| _d S )Nc                 S   s   i | ]\}}|d kr||qS )selfr^   )rr   kvr^   r^   r_   
<dictcomp>  s    z-TextPreprocessor.__init__.<locals>.<dictcomp>)localsr   config
_stopwords)r  r  rY  rT  r  r   r   r   r   r   r   r   r  r  r  r0  rS  rK  r  r  r`  rc  r  rj  rl  rn  r  r  ru  rz  rp  r^   r^   r_   __init__  s   !
zTextPreprocessor.__init__r{   rP   c                 C   s
   |  |S rq   )process)r  r{   r^   r^   r_   __call__  s   
zTextPreprocessor.__call__c                 C   s  | j }|d rt|}|d rt|}|d rt|}|d r#t|}|d r+t|}|d r3t|}|d r;t|}|d rCt|}|d	 rKt	|}|d
 rSt
|}|d r[t|}|d rct|}|d rkt|}|d rst|}|d r{t|}|d rt|}|d rt|}|d rt|}|d rt|}|d rt|}|d rt|}|d rt|}|d rt||d d}|d rt|}|d rt|}|d rt|}|d rt|| j}|d rt|}|S )Nru  r  r`  rc  rj  rl  r  rK  r  r   r   r  r   r   r   r  r  r0  rS  r   r   r  rY  rT  rW  rz  rn  r  r  rp  )r  ru  rf  r`  rc  rj  rl  r  rK  r  r   r   r  r   r   r   r  r  r0  rS  r   r   r  rY  rz  rn  r[  r  r  rp  )r  r{   cfgr^   r^   r_   r    st   zTextPreprocessor.process)TTTTTTTTTTTTTTTTTTFTTTFFTFNTFT)__name__
__module____qualname____doc__boolr   setr  rw   r  r  r^   r^   r^   r_   r    s    	

$r  __main__)D)zPlain integerz(There are 1200 students and 42 teachers.)zLarge numberz/The project costs $1,000,000 and took 365 days.)zNegative numberz,Temperature dropped to -5 degrees overnight.)FloatzPi is approximately 3.14159.)zFloat trailing zerozThe voltage is 1.50 volts.)zLeading decimalz-Add .5 teaspoons of salt and .25 cup of milk.)zNegative leading decimalzA -.05 correction was applied.)Zeroz%There were 0 errors and 0.0 warnings.)zComma thousandsz The population is 7,900,000,000.)zScientific e-notationz)Learning rate is 1e-4, weight decay 1e-5.)zScientific capital EzAvogadro's number is 6.022E23.)zScientific large expzThe signal is 2.5e10 Hz.)zModel params Bz2We trained a 7B parameter model and a 13B variant.)zModel params Mz#The 340M model beat the 7B on MMLU.)zScale suffix KzThe salary was $85K per year.)zDollar amountzA coffee costs $4.99 here.)zEuro amountu   Rent is €1,200 per month.)zPound with centsu   The book is £9.99.)
Percentagez$Inflation rose by 3.5% last quarter.)zNegative percentagezStocks fell -2% today.)zOrdinals 1st/2nd/3rdz)She finished 1st, he came 2nd, I was 3rd.)zOrdinal 21stz0It's the 21st century and the 100th anniversary.)zOrdinal 42ndzHe ran his 42nd marathon.)zOrdinal 33rdzOn the 33rd floor.)HalfzCut the recipe in 1/2.)Quartersz.Add 3/4 cup of sugar and 1/4 teaspoon of salt.)Thirdsz&The team completed 2/3 of the project.)Eighthsz!The pipe is 5/8 inch in diameter.)z12-hour timezThe meeting starts at 3:30pm.)z24-hour timezDeparture at 14:00.)zTime with ohzAlarm set for 9:05 AM.)MidnightzThe server restarts at 0:00.)zBare decadezThe 80s music scene was iconic.)zFull decadez&She grew up listening to 1990s grunge.)2000szThe 2000s brought social media.)2020szAI took off in the 2020s.)zApostrophe decadez&Born in the '90s, raised on 2000s pop.)zNumeric rangezRead pages 10-20 for homework.)z
Year rangezThe war lasted from 2020-2024.)zTemperature rangezStore between 5-10 degrees.)zGPT-3zgpt-3 is pretty sick.)zGPT-3.5z$They upgraded to GPT-3.5 last month.)zGPL-3 licensez%This project is licensed under GPL-3.)zPython versionzRequires Python-3.10 or higher.)zMultiple versionsz'Both CUDA-11 and CUDA-12 are supported.)DistancezThe trail is 42km long.)WeightzEach package weighs 500kg.)u   Temperature °Cu   Water boils at 100°C.)zData size GBzDownload the 2.5GB model file.)zFrequency GHzzThe CPU runs at 3.6GHz.)z
Latency mszAverage latency is 12ms.)z	HTML tagsz%<b>Hello</b> World! It's a great day.)zURL and emailz5Visit https://example.com or email hello@example.com.)zHashtags and mentionsz#NLP @user great post!)Contractionsz3I don't know, won't you help? They've already left.)zAin't / let'sz(Ain't no mountain high enough. Let's go!)zScore / ratiozThe final score was 3:0.)zAspect ratiozThe display is 16:9.)z
IP addressz.Connect to server at 192.168.1.1 on port 8080.)zPhone numberz&Call us at 555-1234 or 1-800-555-0199.)zNegative vs. hyphenz(On a scale of -10 to 10, she rated it 8.)EllipsiszHe paused... then spoke.)zEm dash numberu)   The result — 42 — surprised everyone.)zResearch abstractzEWe trained a 7B parameter model for 100 epochs at 1e-4 learning rate.)zGPT benchmarku>   GPT-4 scored 90% on the benchmark — 15% better than GPT-3.5.)zNews headlinez.Fed raises rates by 0.25%, S&P 500 drops 1.2%.)zStartup pitchzEWe raised $2.5M in seed funding and are growing 20% month-over-month.)z	Tech speczBThe M3 chip runs at 4.05GHz with a 40M transistor GPU and 8GB RAM.zF======================================================================zTextPreprocessor Demoz
  []z  IN : z  OUT: zG
======================================================================)r   rM         rR   c   r=   r7   i  i:  i@B iiɚ;z  z>15,u    → )gQ	@g      g)\Ò@g@g&1?gMbP?zexpand_roman_numerals  (opt-in))r  )zWorld War II ended in 1945.zChapter IV begins here.zLouis XIV was king.)rn   )T)r   )rq  rq   )Qr  r   rs  typingr   rV   rW   rg   r   r   _ROMANr  r  rd   rw   r`   re   r   r   r^  r  rb  ri  rk  re  rm  ro  rX  r   r   r   r   r   r   r   r  r  r  r/  r  r   r   r   r   r   r   r   r   r  r  r  r  r  r  r0  rK  rS  rY  r[  r`  rc  rf  rj  rl  rn  rp  ru  rz  r  r  r  r  r  ppcasesprintlabelr{   rO   fpp_romanr^   r^   r^   r_   <module>   s    &















*(# 
W
r