o
    \i&7                     @   sh  U d dl Z d dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZmZ d dlmZ d dlZd dlZd dlZeeeejef f Zeee	e f ZdZdZdZdZg d	Ze	e ed
< dZdZ dedefddZ!deddfddZ"dede	e fddZ#dede$fddZ%dede$fddZ&dede'fddZ(dedeejef fddZ)d ejdejfd!d"Z*d ejdejfd#d$Z+d%edejfd&d'Z,d(eee	e f de	ej fd)d*Z-d+e	e d,edee	e  fd-d.Z.ed/d0G d1d2 d2Z/	3	4dFd5ejd6ed7e0d8e0d9e0de	eeje0f  fd:d;Z1dGd=d>Z2	dHd?ed@e
e deeef fdAdBZ3dCe	e defdDdEZ4dS )I    N)	dataclass)AnyIterableListOptionalTupleUnion)urlparsei>  i     g      ?)(ChineseEnglish	CantoneseArabicGermanFrenchSpanish
Portuguese
IndonesianItalianKoreanRussianThai
VietnameseJapaneseTurkishHindiMalayDutchSwedishDanishFinnishPolishCzechFilipinoPersianGreekRomanian	Hungarian
MacedonianAssameseBengaliGujaratiKannada	MalayalamMarathiOdiaPunjabiTamilTeluguSUPPORTED_LANGUAGESz
<asr_text>z	language languagereturnc                 C   sH   | du rt dt|  }|st d|dd  |dd   S )a?  
    Normalize language name to the canonical format used by Qwen3-ASR:
    first letter uppercase, the rest lowercase (e.g., 'cHINese' -> 'Chinese').

    Args:
        language (str): Input language name.

    Returns:
        str: Normalized language name.

    Raises:
        ValueError: If language is empty.
    Nzlanguage is Nonezlanguage is empty   )
ValueErrorstrstripupperlower)r4   s r=   K/home/ubuntu/training/qwen3-asr-1.7b-phase2-sft/qwen_asr/inference/utils.pynormalize_language_nameT   s    r?   c                 C   s    | t vrtd|  dt  dS )z
    Validate the language is supported.

    Args:
        language (str): Canonical language name.

    Raises:
        ValueError: If unsupported.
    zUnsupported language: z. Supported: N)r3   r7   )r4   r=   r=   r>   validate_languagej   s   
r@   xc                 C   s   t | tr| S | gS N)
isinstancelist)rA   r=   r=   r>   ensure_listx   s   rE   r<   c                 C   s4   zt | }|jdv ot|jW S  ty   Y dS w )N)httphttpsF)r	   schemeboolnetloc	Exception)r<   ur=   r=   r>   is_url|   s   rM   c                 C   s2   |  drdS d| vrd| vrt| dkrdS dS )Nz
data:audioT/\   F)
startswithlen)r<   r=   r=   r>   is_probably_base64   s
   
rS   b64c                 C   s0   d| v r|   dr| ddd } t| S )N,zdata:r6   )r9   rQ   splitbase64	b64decode)rT   r=   r=   r>   decode_base64_bytes   s   
rY   c                 C   s   t | r>tj| }| }W d    n1 sw   Y  t|}tj|ddd\}}W d    n1 s8w   Y  n2t| rft	| }t|}tj|ddd\}}W d    n1 s`w   Y  n
t
j| d dd\}}tj|tjd}t|}||fS )Nfloat32F)dtype	always_2d)srmonor[   )rM   urllibrequesturlopenreadioBytesIOsfrS   rY   librosaloadnpasarrayrZ   int)rA   respaudio_bytesfaudior]   r=   r=   r>   load_audio_any   s$   
rp   ro   c                 C   sf   | j dkr| S | j dkr+| jd dkr | jd | jd kr | j} tj| ddtjS td| j  )Nr6      r      axiszUnsupported audio ndim=)ndimshapeTri   meanastyperZ   r7   )ro   r=   r=   r>   to_mono   s   

"r{   c                 C   s\   |  tj} | jdkr| S ttt| }|dkr| S |dkr%| | } t| dd} | S )Nr           g      ?g      )rz   ri   rZ   sizefloatmaxabsclip)ro   peakr=   r=   r>   float_range_normalize   s   
r   ac                 C   s   t | trt| \}}n(t | tr+t| dkr+t | d tjr+| d t| d }}n	tdt	|  t
t|}|tkrKtj||tdtj}t|}|S )a/  
    Normalize one audio input to mono 16k float32 waveform in [-1, 1].

    Supported inputs:
        - str: local file path / https URL / base64 audio string
        - (np.ndarray, sr): waveform and sampling rate

    Returns:
        np.ndarray:
            Mono 16k float32 waveform in [-1, 1].
    rq   r   r6   zUnsupported audio input type: )orig_sr	target_sr)rC   r8   rp   tuplerR   ri   ndarrayrk   	TypeErrortyper{   rj   SAMPLE_RATErg   resamplerz   rZ   r   )r   ro   r]   r=   r=   r>   normalize_audio_input   s   
&r   audiosc                 C   s   t | }dd |D S )Nc                 S   s   g | ]}t |qS r=   )r   ).0r   r=   r=   r>   
<listcomp>   s    z$normalize_audios.<locals>.<listcomp>)rE   )r   itemsr=   r=   r>   normalize_audios   s   r   xs
chunk_sizec                 c   s@    |dkr
| V  dS t dt| |D ]}| |||  V  qdS )z
    Yield chunks of a list.

    Args:
        xs (List[Any]): Input list.
        chunk_size (int): Chunk size.

    Yields:
        List[Any]: Slices of xs.
    r   N)rangerR   )r   r   ir=   r=   r>   
chunk_list   s   r   T)frozenc                   @   s<   e Zd ZU dZeed< eed< ejed< eed< eed< dS )
AudioChunka[  
    One chunk cut from an original audio.

    Attributes:
        orig_index: Index of the original sample in the input batch.
        chunk_index: Index of this chunk within the original sample.
        wav: Mono float32 waveform.
        sr: Sampling rate.
        offset_sec: Start offset of this chunk in the original audio, in seconds.
    
orig_indexchunk_indexwavr]   
offset_secN)	__name__
__module____qualname____doc__rk   __annotations__ri   r   r~   r=   r=   r=   r>   r      s   
 

r         @      Y@r   r]   max_chunk_secsearch_expand_secmin_window_msc                  C   s6  t j| t jd} | jdkrt j| ddt j} t| jd }|t| }||kr.| dfgS t|| }t|| }t	dt|d | }	g }
d}d}|| |kr|| }t	||| }t
||| }|| |	krl|}n9| || }t |}t j|t j|	t jdd	d
}tt |}|}||	 }||| }tt |}|| | }tt	||d }tt
||}| || }|
||f ||| t| 7 }|}|| |ksQ| || }|
||f tt| }g }|
D ]+\}}|jd |k r|t|jd  }t j|d|fdddt j}|||f q|}
|
S )a  
    Split a long audio into chunks close to max_chunk_sec, using a low-energy boundary.

    This implementation guarantees:
      - Concatenating all returned chunks reproduces the original audio exactly
        (total number of samples is identical, no overlaps, no gaps).

    Args:
        wav: Mono waveform float32.
        sr: Sampling rate.
        max_chunk_sec: Target max chunk duration in seconds.
        search_expand_sec: Boundary search half-window in seconds.
        min_window_ms: Sliding window in milliseconds for energy estimation.

    Returns:
        List[Tuple[np.ndarray, float]]: List of (chunk_wav, offset_sec).
    r_   r6   rs   rt   r   r|      g     @@valid)modeconstant)r   constant_values)ri   rj   rZ   rv   ry   rz   rk   rw   r~   r   minr   convolveonesargminappendMIN_ASR_INPUT_SECONDSpad) r   r]   r   r   r   	total_len	total_secmax_lenexpandwinchunksstartr   cutleftrightboundarysegseg_abswindow_sumsmin_poswstartwendlocalinnerchunktailmin_lenpaddedcoffr   r=   r=   r>   split_audio_into_chunks  s\   


r      c                    s2   dd }d fdd	 | }|||}  | |} | S )Nc                 S   s   g }d}t | }||k rUd}|| |k r2| ||  | | kr2|d7 }|| |k r2| ||  | | ks||krB|| |  ||7 }n|| |||   ||7 }||k sd|S )Nr   r6    )rR   r   join)r<   threshresr   ncountr=   r=   r>   fix_char_repeats[  s     

z4detect_and_fix_repetitions.<locals>.fix_char_repeatsr   c                    s  t | }|d }||k r| S d}g }||| krd}td|d D ]|}|||  |kr- nq| |||  }	d}
td|D ]}|||  }| |||  |	krRd}
 nq<|
r|}|||  }|| |kr| |||  |	kr|d7 }||7 }|| |kr| |||  |	ksm||	 | | |d  || |}d} nq!|rn|| |  |d7 }||| ks|s|| |d   d|S )Nrq   r   Fr6   Tr   )rR   r   r   r   )r<   r   r   r   min_repeat_charsr   resultfoundkpatternr   rep	start_idx	total_rep	end_indexfix_pattern_repeatsr=   r>   r   l  sP     
 
z7detect_and_fix_repetitions.<locals>.fix_pattern_repeatsr   r=   )text	thresholdr   text_rawr=   r   r>   detect_and_fix_repetitionsZ  s   ,

r   rawuser_languagec                 C   s   | du rdS t |  }|sdS t|}|r||fS |}d}t|v }|r-|td\}}nd| fS | }d|v rG| }|sCdS d|fS d}| D ]$}	|	 }	|	sVqM|	 }
|
trq|	t	td  }|rot
|} nqM|| fS )a  
    Parse Qwen3-ASR raw output into (language, text).

    Cases:
      - With tag: "language Chinese<asr_text>...."
      - With newlines: "language Chinese\n...\n<asr_text>...."
      - No tag: treat whole string as text.
      - "language None<asr_text>": treat as empty audio -> ("", "")

    If user_language is provided, language is forced to user_language and raw is treated as text-only
    (the model is expected to output plain transcription without metadata).

    Args:
        raw: Raw decoded string.
        user_language: Canonical language name if user forced language.

    Returns:
        Tuple[str, str]: (language, text)
    N)r   r   r   r6   zlanguage none)r8   r9   r   _ASR_TEXT_TAGrV   r;   
splitlinesrQ   _LANG_PREFIXrR   r?   )r   r   r<   	meta_part	text_parthas_tag
meta_lowertlanglinelowvalr=   r=   r>   parse_asr_output  sB   
r   langsc                 C   sF   g }d}| D ]}|pd  }|sq||krq|| |}qd|S )aM  
    Merge per-chunk languages into a compact comma-separated string,
    keeping order and removing consecutive duplicates and empty entries.

    Example:
      ["Chinese", "English", "English"] -> "Chinese,English"

    Args:
        langs: List of canonical language names.

    Returns:
        str: Merged language string.
    Nr   rU   )r9   r   r   )r   outprevrA   r=   r=   r>   merge_languages  s   

r   )r   r   r   rB   )5rW   rd   urllib.requestr`   dataclassesr   typingr   r   r   r   r   r   urllib.parser	   rg   numpyri   	soundfilerf   r8   r   rk   	AudioLike	MaybeListr   MAX_ASR_INPUT_SECONDSMAX_FORCE_ALIGN_INPUT_SECONDSr   r3   r   r   r   r?   r@   rE   rI   rM   rS   bytesrY   rp   r{   r   r   r   r   r   r~   r   r   r   r   r=   r=   r=   r>   <module>   sz   
 +$"

YF

F