o
    i                  	   @   sv  d dl Z d dlmZ d dlmZ d dlmZ d dlZd dl	mZ
 d dlZd dlmZ zd dlZW n ey=   edZY nw zd dlmZ W n eyV   eddZY nw G d	d
 d
eeZeG dd dZedejdZeddZde
jej ejB dede
jej ejB fddZde
jej dedede
jej fddZde
jej dedefddZ G dd dZ!dS )    N)	dataclass)Enum)Literal)PlaceholderModulelibrosascipysignalc                   @   s    e Zd ZdZdZdZdZdZdS )ChannelReductionz8Method to reduce multi-channel audio to target channels.meanfirstmaxsumN)__name__
__module____qualname____doc__MEANFIRSTMAXSUM r   r   K/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/multimodal/audio.pyr	      s    r	   c                   @   sP   e Zd ZU dZdZedB ed< ejZ	eed< e
defddZdefd	d
ZdS )	AudioSpeca  Specification for target audio format.

    This dataclass defines the expected audio format for a model's feature
    extractor. It is used to normalize audio data before processing.

    Attributes:
        target_channels: Number of output channels. None means passthrough
            (no normalization). 1 = mono, 2 = stereo, etc.
        channel_reduction: Method to reduce channels when input has more
            channels than target. Only used when reducing channels.
       Ntarget_channelschannel_reductionreturnc                 C   s
   | j duS )z&Whether audio normalization is needed.Nr   selfr   r   r   needs_normalization6   s   
zAudioSpec.needs_normalizationc                 C   s&   | j d u rdS d| j  d| jj dS )NzAudioSpec(passthrough)zAudioSpec(channels=z, reduction=))r   r   valuer   r   r   r   __repr__;   s   

zAudioSpec.__repr__)r   r   r   r   r   int__annotations__r	   r   r   propertyboolr    strr#   r   r   r   r   r   %   s   
 r   r   )r   r   r   audiospecr   c                 C   s  |j s| S | jdkr|jdkr| S td|j d| jdkr(td| j d| jd | jd kr>t| tjr;| jn| j} | jd }||jkrJ| S ||jk rZtd| d	|j t| tj}|jdkr|j	t
jkr~|rvtj| dd
}|S | jdd}|S |j	t
jkr| d }|S |j	t
jkr|rtj| dd
}|S | jddj}|S |j	t
jkr|rtj| dd
}|S | jdd}|S td|j	 | d|j S )aK  Normalize audio to the specified format.

    This function handles channel reduction for multi-channel audio,
    supporting both numpy arrays and torch tensors.

    Args:
        audio: Input audio data. Can be:
            - 1D array/tensor: (time,) - already mono
            - 2D array/tensor: (channels, time) - standard format from torchaudio
            - 2D array/tensor: (time, channels) - format from soundfile
              (will be auto-detected and transposed if time > channels)
        spec: AudioSpec defining the target format.

    Returns:
        Normalized audio in the same type as input (numpy or torch).
        For mono output (target_channels=1), returns 1D array/tensor.

    Raises:
        ValueError: If audio has unsupported dimensions or channel expansion
            is requested (e.g., mono to stereo).
    r   zCannot expand mono audio to z	 channels   zUnsupported audio shape: z. Expected 1D or 2D.r   zCannot expand z channels to )axis)dimzUnknown reduction method: N)r    ndimr   
ValueErrorshape
isinstancenpndarrayTr   r	   r   r
   r   r   r   valuesr   r   )r)   r*   num_channelsis_numpyresultr   r   r   normalize_audioI   sN   






		r9   orig_sr	target_src                C   s   t j| ||dS )Nr:   r;   )r   resampler)   r:   r;   r   r   r   resample_audio_librosa   s   r?   c                C   s8   ||krt | d|| S ||k rt | || dS | S )Nr   )scipy_signalresample_polyr>   r   r   r   resample_audio_scipy   s
   rB   c                   @   sV   e Zd ZdZ		ddedB ded fddZd	eje	j
 d
edeje	j
 fddZdS )AudioResamplerz,Resample audio data to a target sample rate.Nr   r;   method)r   r   c                 C   s   || _ || _d S )N)r;   rD   )r   r;   rD   r   r   r   __init__   s   
zAudioResampler.__init__r)   r:   r   c                C   sx   | j d u r	tdtjt|t| j dddr|S | jdkr&t||| j dS | jdkr3t||| j dS td| j d	)
NzBAudio resampling is not supported when `target_sr` is not providedg        gư>)rel_tolabs_tolr   r<   r   zInvalid resampling method: z.. Supported methods are 'librosa' and 'scipy'.)	r;   RuntimeErrormathisclosefloatrD   r?   rB   r/   )r   r)   r:   r   r   r   r=      s,   


zAudioResampler.resample)Nr   )r   r   r   r   rK   r   rE   nptNDArrayr2   floatingr=   r   r   r   r   rC      s     


rC   )"rI   dataclassesr   enumr   typingr   numpyr2   numpy.typingrL   torchvllm.utils.import_utilsr   r   ImportErrorscipy.signalr   r@   placeholder_attrr(   r	   r   r   MONO_AUDIO_SPECPASSTHROUGH_AUDIO_SPECrM   rN   Tensorr9   rK   r?   rB   rC   r   r   r   r   <module>   s^   	

S


	

