o
    Eiy                     @   s4  d dl mZ d dlmZmZmZmZmZ d dlZ	d dl
Z
d dlmZmZmZ d dlmZmZmZ ede eee eee ddlmZ dd	lmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z# zd d
l$mZ d dl%m&Z& e&'de W n   Y eG dd dZ(eG dd dZ)G dd dZ*dS )    )	dataclass)AnyDictListOptionalUnionN)Qwen3ASRConfig Qwen3ASRForConditionalGenerationQwen3ASRProcessor)
AutoConfig	AutoModelAutoProcessor	qwen3_asr   )Qwen3ForcedAligner)MAX_ASR_INPUT_SECONDSMAX_FORCE_ALIGN_INPUT_SECONDSSAMPLE_RATESUPPORTED_LANGUAGES
AudioChunk	AudioLike
chunk_listmerge_languagesnormalize_audiosnormalize_language_nameparse_asr_outputsplit_audio_into_chunksvalidate_language)r	   )ModelRegistryr	   c                   @   s2   e Zd ZU dZeed< eed< dZee ed< dS )ASRTranscriptiona  
    One transcription result.

    Attributes:
        language (str):
            Merged language string for the sample, e.g. "Chinese" or "Chinese,English".
            Empty string if unknown or silent audio.
        text (str):
            Transcribed text.
        time_stamps (Optional[Any]):
            Forced aligner output (ForcedAlignResult).
            Present only when return_time_stamps=True.
    languagetextNtime_stamps)	__name__
__module____qualname____doc__str__annotations__r"   r   r    r)   r)   O/home/ubuntu/training/qwen3-asr-1.7b-phase2-sft/qwen_asr/inference/qwen3_asr.pyr   9   s
   
 r   c                   @   s   e Zd ZU dZeed< eed< eed< eed< eed< ejed< ejed< e	ed	< e	ed
< e
e	 ed< e	ed< e	ed< e	ed< dS )ASRStreamingStatea  
    Streaming ASR state for one audio stream (single utterance).

    Attributes:
        unfixed_chunk_num (int):
            For the first N chunks, do not use previous ASR result as prefix prompt (reset prefix to "").
        unfixed_token_num (int):
            When chunk_id >= unfixed_chunk_num, rollback the last K tokens from the accumulated text
            before using it as prefix prompt, to reduce boundary jitter.
        chunk_size_sec (float):
            Chunk size in seconds. Audio will be fed to the model in increments of this length.
        chunk_size_samples (int):
            Chunk size in samples at 16kHz (derived from chunk_size_sec).
        chunk_id (int):
            Current chunk index (0-based).
        buffer (np.ndarray):
            Buffered PCM samples that are not yet consumed into a full chunk.
        audio_accum (np.ndarray):
            Accumulated audio from the beginning of the stream up to current time (no padding).
        prompt_raw (str):
            Base prompt generated by chat template (with generation prompt), without appended prefix text.
        context (str):
            Context string.
        force_language (Optional[str]):
            If provided, force output to be text-only by appending "language X<asr_text>" in prompt_raw,
            consistent with non-streaming transcribe().
        language (str):
            Latest parsed language (updated after each chunk decode). Empty if unknown/silent.
        text (str):
            Latest parsed transcription text (updated after each chunk decode).
        _raw_decoded (str):
            Internal accumulated decoded raw text (before parse_asr_output normalization).
            Used for rollback/token trimming and as prefix for prompting.
    unfixed_chunk_numunfixed_token_numchunk_size_secchunk_size_sampleschunk_idbufferaudio_accum
prompt_rawcontextforce_languager    r!   _raw_decodedN)r#   r$   r%   r&   intr(   floatnpndarrayr'   r   r)   r)   r)   r*   r+   M   s   
 "

r+   c                   @   s  e Zd ZdZ				dEdedededee d	ee d
edefddZ	e
				dFded	ee deeeef  d
edee dd fddZe
				dGded	ee deeeef  d
edee dd fddZdee fddZe 			dHdeeee f deeee f deeeeee  f  dedee f
dd Zded!edeeeef  fd"d#Zded$ee defd%d&Zd'ee d(eej d)eee  dee fd*d+Zd'ee d(eej d)eee  dee fd,d-Zd'ee d(eej d)eee  dee fd.d/Zd0ed1edefd2d3Zd4ee dee fd5d6Z 			7	8	9dIdedee d:ed;ed<ede!fd=d>Z"d?ejd@e!de!fdAdBZ#d@e!de!fdCdDZ$dS )JQwen3ASRModela  
    Unified inference wrapper for Qwen3-ASR with two backends:
      - Transformers backend 
      - vLLM backend

    It optionally supports time stamp output via Qwen3-ForcedAligner.

    Notes:
      - Each request uses a context text and exactly one audio.
      - If language is provided, the prompt will force the output to be text-only by appending
        "language {Language}<asr_text>" to the assistant prompt.
    N   backendmodel	processorsampling_paramsforced_alignermax_inference_batch_sizemax_new_tokensc                 C   s   || _ || _|| _|| _|| _t|| _|| _|dkrKt|dd | _	| j	d u rAz
t
| j	| _	W n ty@   t	d| _	Y nw t|dtj| _d S d | _	d | _d S )Ntransformersdevicecpudtype)r>   r?   r@   rA   rB   r7   rC   rD   getattrrF   next
parametersStopIterationtorchfloat32rH   )selfr>   r?   r@   rA   rB   rC   rD   r)   r)   r*   __init__   s$   



zQwen3ASRModel.__init__    pretrained_model_name_or_pathforced_aligner_kwargsreturnc           
   	   K   sX   t j|fi |}tj|dd}d}	|dur!tj|fi |pi }	| d||d|	||dS )a  
        Initialize using Transformers backend.

        Args:
            pretrained_model_name_or_path:
                HuggingFace repo id or local directory.
            forced_aligner:
                Optional forced aligner model path/repo id.
            forced_aligner_kwargs:
                Optional kwargs forwarded to Qwen3ForcedAligner.from_pretrained(...).
            max_inference_batch_size:
                Batch size limit for inference. -1 means no chunking. Small values can avoid OOM.
            max_new_tokens:
                Maximum number of tokens to generate.
            **kwargs:
                Forwarded to AutoModel.from_pretrained(...).

        Returns:
            Qwen3ASRModel
        Tfix_mistral_regexNrE   r>   r?   r@   rA   rB   rC   rD   )r   from_pretrainedr   r   )
clsrR   rB   rS   rC   rD   kwargsr?   r@   forced_aligner_modelr)   r)   r*   rX      s$   zQwen3ASRModel.from_pretrained   c              
   K   s   zddl m} ddl m} W n ty }	 ztd|	d}	~	ww |dd|i|}
tj|dd}|di d	|d
}d}|durKtj|fi |pHi }| d|
||||ddS )a  
        Initialize using vLLM backend.

        Import is isolated to keep vLLM optional.

        Args:
            model:
                Model path/repo for vLLM.
            forced_aligner:
                Optional forced aligner model path/repo id.
            forced_aligner_kwargs:
                Optional kwargs forwarded to Qwen3ForcedAligner.from_pretrained(...).
            max_inference_batch_size:
                Batch size limit for inference. -1 means no chunking. Small values can avoid OOM.
            max_new_tokens:
                Maximum number of tokens to generate.
            **kwargs:
                Forwarded to vllm.LLM(...).

        Returns:
            Qwen3ASRModel

        Raises:
            ImportError: If vLLM is not installed.
        r   )LLM)SamplingParamsz?vLLM is not available. Install with: pip install qwen-asr[vllm]Nr?   TrU   g        )temperature
max_tokensvllmrW   r)   )ra   r]   r^   	ExceptionImportErrorr
   rX   r   )rY   r?   rB   rS   rC   rD   rZ   vLLMr^   ellmr@   rA   r[   r)   r)   r*   r]      s<   #zQwen3ASRModel.LLMc                 C   s   t tS )zy
        Returns the supported language list.

        Returns:
            List[str]: Canonical language names.
        )listr   rO   r)   r)   r*   get_supported_languages"  s   z%Qwen3ASRModel.get_supported_languages Faudior4   r    return_time_stampsc           2         s  |r| j du rtdt|}t|}t|tr|n|g t dkr+|dkr+ |  t |kr=td| dt  |du rGdg| }n*t|trN|n|g}t|dkr_|dkr_|| }t||krqtd| dt| g |D ]#}|du st| dkrd qut	t|}	t
|	 |	 qu|rtnt}
g }t|D ]"\}}t|t|
d}t|D ]\}\}}|t|||t|d	 qq fd
d|D }fdd|D }dd |D }| |||}g }g }t||D ]\}}t||d\}}|| || qdgt| }|rg }g }g }g } tt|||D ])\}!\}"}}#| dkr2q"||"j|"jf || ||# | |! q"g }$tt|| jt|| jt|| jD ]\}%}&}'|$| j j|%|&|'d q`t| D ]\}(}!||! }"|$|( })| |)|"j||!< qwdd t|D }*dd t|D }+dd t|D },t||||D ]'\}"}}}-|*|"j | |+|"j | |r|-dur|,|"j |- qg }.t|D ]+}ddd |+| D }/t|*| }0d}1|r|  |,| }1|.t!|0|/|1d q|.S )a  
        Transcribe audio with optional context and optional forced alignment timestamps.

        Args:
            audio:
                Audio input(s). Supported:
                  - str: local path / URL / base64 data url
                  - (np.ndarray, sr)
                  - list of above
            context:
                Context string(s). If scalar, it will be broadcast to batch size.
            language:
                Optional language(s). If provided, it must be in supported languages.
                If scalar, it will be broadcast to batch size.
                If provided, the prompt will force output to be transcription text only.
            return_time_stamps:
                If True, timestamps are produced via forced aligner and merged across chunks.
                This requires forced_aligner initialized.

        Returns:
            List[ASRTranscription]: One result per input audio.

        Raises:
            ValueError:
                - If return_time_stamps=True but forced_aligner is not provided.
                - If language is unsupported.
                - If batch sizes mismatch for context/language.
        NzSreturn_time_stamps=True requires `forced_aligner` to be provided at initialization.r   zBatch size mismatch: audio=z
, context=z, language=rj   )wavsrmax_chunk_sec)
orig_indexchunk_indexrm   rn   
offset_secc                       g | ]} |j  qS r)   rp   .0c)ctxsr)   r*   
<listcomp>|      z,Qwen3ASRModel.transcribe.<locals>.<listcomp>c                    rs   r)   rt   ru   )
langs_normr)   r*   ry   }  rz   c                 S   s   g | ]}|j qS r)   )rm   ru   r)   r)   r*   ry   ~  s    user_language)rk   r!   r    c                 S      g | ]}g qS r)   r)   rv   _r)   r)   r*   ry         c                 S   r~   r)   r)   r   r)   r)   r*   ry     r   c                 S   r~   r)   r)   r   r)   r)   r*   ry     r   c                 S   s   g | ]}|d ur|qS )Nr)   )rv   tr)   r)   r*   ry     s    )r    r!   r"   )"rB   
ValueErrorr   len
isinstancerg   r'   stripappendr   r   r   r   	enumerater   r   r   
_infer_asrzipr   rm   rn   r   rC   extendalign_offset_align_resultrr   rangerp   joinr   _merge_align_resultsr   )2rO   rk   r4   r    rl   wavsnlangs_inllnro   chunksirm   partsjcwavrr   	chunk_ctx
chunk_lang
chunk_wavsraw_outputsper_chunk_langper_chunk_textoutforced_langlangtxtper_chunk_alignto_align_audioto_align_textto_align_langto_align_idxidxrw   	lang_predaligned_resultsa_chunkt_chunkl_chunkkr	out_langs	out_texts
out_alignsalresultsmerged_textmerged_languagemerged_alignr)   )rx   r{   r*   
transcribe+  s   $





zQwen3ASRModel.transcribeaudio_payloadc                 C   s    d|pdddd|dgdgS )Nsystemrj   )rolecontentuserrk   )typerk   r)   )rO   r4   r   r)   r)   r*   _build_messages  s   zQwen3ASRModel._build_messagesr5   c                 C   s:   | j |dd}| jj|ddd}|r|d| d  }|S )z
        Build the string prompt for one request.

        If force_language is provided, "language X<asr_text>" is appended after the generation prompt
        to request text-only output.
        rj   )r4   r   TF)add_generation_prompttokenizez	language z
<asr_text>)r   r@   apply_chat_template)rO   r4   r5   msgsbaser)   r)   r*   _build_text_prompt  s
   z Qwen3ASRModel._build_text_promptcontextsr   	languagesc                 C   s@   | j dkr| |||S | j dkr| |||S td| j  )a:  
        Run backend inference for chunk-level items.

        Args:
            contexts: List of context strings.
            wavs: List of mono waveforms (np.ndarray).
            languages: List of forced languages or None.

        Returns:
            List[str]: Raw decoded strings (one per chunk).
        rE   ra   zUnknown backend: )r>   _infer_asr_transformers_infer_asr_vllmRuntimeError)rO   r   r   r   r)   r)   r*   r     s
   

zQwen3ASRModel._infer_asrc                    s   g } fddt ||D } j}|d u s|dk rt|}tdt||D ]S}||||  }||||  }	 j||	ddd}
|
 jj jj}
 jj	di |
d j
i} jj|jd d |
d jd	 d f dd
d}|t| q%|S )Nc                    s   g | ]\}} j ||d qS )r4   r5   )r   )rv   rw   flrh   r)   r*   ry     s    z9Qwen3ASRModel._infer_asr_transformers.<locals>.<listcomp>r   ptT)r!   rk   return_tensorspaddingrD   	input_idsr   F)skip_special_tokensclean_up_tokenization_spacesr)   )r   rC   r   r   r@   tor?   rF   rH   generaterD   batch_decode	sequencesshaper   rg   )rO   r   r   r   outstexts
batch_sizer   sub_textsub_wavsinputstext_idsdecodedr)   rh   r*   r     s$   z%Qwen3ASRModel._infer_asr_transformersc                 C   s   g }t |||D ]\}}}| j||d}||d|gid qg }	t|| jD ]}
| jj|
| jdd}|D ]}|	|jd j	 q6q(|	S )Nr   rk   promptmulti_modal_dataFrA   use_tqdmr   )
r   r   r   r   rC   r?   r   rA   outputsr!   )rO   r   r   r   r   rw   wr   r   r   batchr   or)   r)   r*   r   	  s   zQwen3ASRModel._infer_asr_vllmresultrr   c              	   C   s\   |du rdS g }|j D ]}|t||jt|j| dt|j| dd qt||dS )a  
        Apply time offset to a ForcedAlignResult-like object.

        This function assumes:
          - result has attribute `.items` which is a list of items with start_time/end_time in seconds.
          - dataclasses are frozen in upstream implementation, so we reconstruct by type.

        Args:
            result: ForcedAlignResult
            offset_sec: Offset in seconds

        Returns:
            ForcedAlignResult: New object with shifted timestamps.
        N   )r!   
start_timeend_timeitems)r   r   r   r!   roundr   r   )rO   r   rr   r   itr)   r)   r*   r     s   

z"Qwen3ASRModel._offset_align_resultr   c                 C   sJ   |sdS g }|D ]}|du rq| t|j q|sdS t|d |dS )z
        Merge multiple ForcedAlignResult objects into a single one by concatenating items.

        Args:
            results: List of ForcedAlignResult

        Returns:
            ForcedAlignResult or None
        Nr   r   )r   rg   r   r   )rO   r   	all_itemsr   r)   r)   r*   r   3  s   
z"Qwen3ASRModel._merge_align_results             @r,   r-   r.   c           
      C   s   | j dkr	td|du st|dkrtd| d}|dur4t| dkr4tt|}t| |}ttt|t	 }t
d|}| j||d}	tt|t|t|t|dtjd	tjd
tjd	tjd
|	|pjd|ddddS )ay  
        Initialize streaming ASR state for a single stream.

        Notes:
            - Streaming ASR is supported ONLY for vLLM backend.
            - Streaming ASR does NOT support timestamps (forced aligner is not used).
            - Batch inference is NOT supported.

        Args:
            context:
                Context string.
            language:
                Optional forced language. If provided, it must be in supported languages.
                Same behavior as transcribe(): forces text-only output via prompt suffix.
            unfixed_chunk_num:
                For the first N chunks, do not use previous output as prefix prompt (reset prefix to "").
            unfixed_token_num:
                Roll back the last K tokens from accumulated output when using it as prefix prompt
                after unfixed_chunk_num.
            chunk_size_sec:
                Chunk size in seconds (audio is 16k PCM). The function will internally convert it
                to sample count at 16kHz.

        Returns:
            ASRStreamingState: Mutable state object to be passed to streaming_transcribe() and
            finish_streaming_transcribe().

        Raises:
            ValueError:
                - If backend is not "vllm".
                - If chunk_size_sec <= 0.
                - If forced language is invalid (same validation rules as transcribe()).
        ra   zBStreaming ASR is supported only for vLLM backend (backend='vllm').Nr   z!chunk_size_sec must be > 0, got: rj   r   r   r   rH   )r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r    r!   r6   )r>   r   r8   r'   r   r   r   r7   r   r   maxr   r+   r9   zerosrN   )
rO   r4   r    r,   r-   r.   r5   r   r/   r3   r)   r)   r*   init_streaming_stateH  s6   
)
z"Qwen3ASRModel.init_streaming_statepcm16kstatec                 C   s*  | j dkr	td|du rtd|du rtdt|}|jdkr(|d}|jtjkr7|tj	d }n|jtj	d	d
}|j
d dkrQtj|j|gdd|_|jj
d |jkr|jd|j }|j|jd |_|jj
d dkrx||_ntj|j|gdd|_d}|j|jk rd}n9| jj|j}t|j}	 tdt|| }|dkr| jj|d| nd}d|vrn|dkrd}n|d7 }q|j| }	|	d|jgid}
| jj|
g| jd	d}|d jd j}|dur|| n||_t |j|j!d\}}||_"||_| jd7  _|jj
d |jks[|S )a  
        Streaming ASR decode step.

        This function accepts an arbitrary-length 16k PCM float numpy array (mono).
        It buffers incoming samples, and whenever enough samples are accumulated to form one
        full chunk (chunk_size_sec), it runs one incremental decode step and updates:

          - state.language
          - state.text

        The caller only needs to keep passing audio to this function and read state.language/state.text.

        Implementation details:
            - Each time a new chunk is ready, we append it to audio_accum and re-feed *all* audio seen
              so far to the model (no padding).
            - We update the prompt as: state.prompt_raw + prefix_text
            - Prefix rollback strategy:
                * If chunk_id < unfixed_chunk_num: prefix_text = ""
                * Else: rollback last unfixed_token_num tokens from previously accumulated decoded text.

        Notes:
            - vLLM backend only.
            - No timestamps.
            - Single stream only (no batching).

        Args:
            pcm16k:
                16kHz mono PCM waveform (np.ndarray). Length can be any non-negative integer.
                dtype can be float32/float64/int16; it will be converted to float32.
            state:
                Streaming state returned by init_streaming_state().

        Returns:
            ASRStreamingState: The same state object (mutated) for convenience.

        Raises:
            ValueError:
                If backend is not "vllm" or state is invalid.
        ra   zKstreaming_transcribe() is supported only for vLLM backend (backend='vllm').Nz:state must not be None. Call init_streaming_state() first.zpcm16k must not be None.r   r<   g      @F)copyr   axisrj   Tu   �rk   r   r   r|   )#r>   r   r9   asarrayndimreshaperH   int16astyperN   r   concatenater1   r/   r2   r0   r,   r@   	tokenizerencoder6   r7   r-   r   r   decoder3   r?   r   rA   r   r!   r   r5   r    )rO   r   r   xchunkprefixcur_idsr   end_idxr   inpr   gen_textr   r   r)   r)   r*   streaming_transcribe  s\   
(



"
-z"Qwen3ASRModel.streaming_transcribec                 C   sd  | j dkr	td|du rtd|jdu s|jjd dkr |S |j}tjdtjd|_|jjd dkr8||_ntj|j|gdd|_d	}|j	|j
k rNd	}n| jj|j}td
t|t|j }| jj|d| }|j| }|d|jgid}| jj|g| jdd}|d jd j}	|dur||	 n|	|_t|j|jd\}
}|
|_||_| j	d
7  _	|S )a  
        Finish streaming ASR.

        This function flushes the remaining buffered audio in state.buffer (tail audio).
        It sends the remaining samples to the model even if shorter than chunk_size_sec,
        without padding. Then it updates state.language/state.text one last time.

        Notes:
            - vLLM backend only.
            - No timestamps.
            - Single stream only.

        Args:
            state:
                Streaming state.

        Returns:
            ASRStreamingState: Updated state (mutated).

        Raises:
            ValueError:
                If backend is not "vllm" or state is invalid.
        ra   zRfinish_streaming_transcribe() is supported only for vLLM backend (backend='vllm').Nzstate must not be None.r   r   r   r   rj   r   rk   r   Fr   r|   )r>   r   r1   r   r9   r   rN   r2   r  r0   r,   r@   r  r  r6   r   r   r7   r-   r	  r3   r?   r   rA   r   r!   r   r5   r    )rO   r   tailr  r  r  r   r  r   r  r   r   r)   r)   r*   finish_streaming_transcribe  s6   

z)Qwen3ASRModel.finish_streaming_transcribe)NNr<   r=   )NNrQ   r=   )NNr<   r\   )rj   NF)rj   Nr   r   r   )%r#   r$   r%   r&   r'   r   r   r   r7   rP   classmethodr   rX   r]   r   ri   rM   no_gradr   r   boolr   r   r   r   r9   r:   r   r   r   r8   r   r   r+   r   r  r  r)   r)   r)   r*   r;      s    
2?	 "






Inr;   )+dataclassesr   typingr   r   r   r   r   numpyr9   rM   "qwen_asr.core.transformers_backendr   r	   r
   rE   r   r   r   registerqwen3_forced_alignerr   utilsr   r   r   r   r   r   r   r   r   r   r   r   r   qwen_asr.core.vllm_backendra   r   register_modelr   r+   r;   r)   r)   r)   r*   <module>   s,   <5