o
    it                     @   s   d dl mZ d dlmZmZ d dlZd dlZddlm	Z	m
Z
 ddlmZ ddlmZmZ e
eZe	ed	d	d
G dd deZdS )    )UserDict)AnyUnionN   )add_end_docstringslogging   )ffmpeg_read)Pipelinebuild_pipeline_init_argsT)has_feature_extractorhas_tokenizerc                	       s   e Zd ZdZdZdZdZdZ fddZde	e
jeeef dedeeeef  f fd	d
Zdd ZdddZdd Zdd Z  ZS )#ZeroShotAudioClassificationPipelinea  
    Zero shot audio classification pipeline using `ClapModel`. This pipeline predicts the class of an audio when you
    provide an audio and a set of `candidate_labels`.

    <Tip warning={true}>

    The default `hypothesis_template` is : `"This is a sound of {}."`. Make sure you update it for your usage.

    </Tip>

    Example:
    ```python
    >>> from transformers import pipeline
    >>> from datasets import load_dataset

    >>> dataset = load_dataset("ashraq/esc50")
    >>> audio = next(iter(dataset["train"]["audio"]))["array"]
    >>> classifier = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-unfused")
    >>> classifier(audio, candidate_labels=["Sound of a dog", "Sound of vacuum cleaner"])
    [{'score': 0.9996, 'label': 'Sound of a dog'}, {'score': 0.0004, 'label': 'Sound of vacuum cleaner'}]
    ```


    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) This audio
    classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"zero-shot-audio-classification"`. See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-audio-classification).
    FTc                    s2   t  jdi | | jdkrtd| j dd S )NptzThe z is only available in PyTorch. )super__init__	framework
ValueError	__class__)selfkwargsr   r   i/home/ubuntu/.local/lib/python3.10/site-packages/transformers/pipelines/zero_shot_audio_classification.pyr   D   s   
z,ZeroShotAudioClassificationPipeline.__init__audiosr   returnc                    s   t  j|fi |S )a  
        Assign labels to the audio(s) passed as inputs.

        Args:
            audios (`str`, `list[str]`, `np.array` or `list[np.array]`):
                The pipeline handles three types of inputs:
                - A string containing a http link pointing to an audio
                - A string containing a local path to an audio
                - An audio loaded in numpy
            candidate_labels (`list[str]`):
                The candidate labels for this audio. They will be formatted using *hypothesis_template*.
            hypothesis_template (`str`, *optional*, defaults to `"This is a sound of {}"`):
                The format used in conjunction with *candidate_labels* to attempt the audio classification by
                replacing the placeholder with the candidate_labels. Pass "{}" if *candidate_labels* are
                already formatted.
        Return:
            A list of dictionaries containing one entry per proposed label. Each dictionary contains the
            following keys:
            - **label** (`str`) -- One of the suggested *candidate_labels*.
            - **score** (`float`) -- The score attributed by the model to that label. It is a value between
                0 and 1, computed as the `softmax` of `logits_per_audio`.
        )r   __call__)r   r   r   r   r   r   r   K   s   z,ZeroShotAudioClassificationPipeline.__call__c                 K   s6   i }d|v r|d |d< d|v r|d |d< |i i fS )Ncandidate_labelshypothesis_templater   )r   r   preprocess_paramsr   r   r   _sanitize_parametersd   s   
z8ZeroShotAudioClassificationPipeline._sanitize_parametersNThis is a sound of {}.c                    s  t |tr/|ds|drt|j}nt|d}| }W d    n1 s*w   Y  t |tr;t	|| j
j}t |tjsEtdt|jdkrPtd| j
|g| j
jdd}| jdkrf|| j}||d	<  fd
d|D }| j|| jdd}|g|d< |S )Nzhttp://zhttps://rbz"We expect a numpy ndarray as inputr   zNWe expect a single channel audio input for ZeroShotAudioClassificationPipeliner   )sampling_ratereturn_tensorsr   c                    s   g | ]}  |qS r   )format).0xr   r   r   
<listcomp>   s    zBZeroShotAudioClassificationPipeline.preprocess.<locals>.<listcomp>T)r$   paddingtext_inputs)
isinstancestr
startswithrequestsgetcontentopenreadbytesr	   feature_extractorr#   npndarray	TypeErrorlenshaper   r   todtype	tokenizer)r   audior   r   finputs	sequencesr+   r   r(   r   
preprocessm   s,   




z.ZeroShotAudioClassificationPipeline.preprocessc                 C   s\   | d}| d}t|d tr|d }n|d d }| jdi ||}||jd}|S )Nr   r+   r   )r   logitsr   )popr,   r   modellogits_per_audio)r   model_inputsr   r+   outputsmodel_outputsr   r   r   _forward   s   


z,ZeroShotAudioClassificationPipeline._forwardc                 C   sb   | d}|d d }| jdkr|jdd}| }ntddd tt||d	d
 dD }|S )Nr   rC   r   r   )dimz`tf` framework not supported.c                 S   s   g | ]	\}}||d qS ))scorelabelr   )r&   rL   candidate_labelr   r   r   r)      s    zCZeroShotAudioClassificationPipeline.postprocess.<locals>.<listcomp>c                 S   s
   | d  S )Nr   r   )r'   r   r   r   <lambda>   s   
 zAZeroShotAudioClassificationPipeline.postprocess.<locals>.<lambda>)key)rD   r   softmaxtolistr   sortedzip)r   rI   r   rC   probsscoresresultr   r   r   postprocess   s   


z/ZeroShotAudioClassificationPipeline.postprocess)Nr!   )__name__
__module____qualname____doc___load_processor_load_image_processor_load_feature_extractor_load_tokenizerr   r   r6   r7   r4   r-   dictr   listr   r    rB   rJ   rX   __classcell__r   r   r   r   r       s    4
	r   )collectionsr   typingr   r   numpyr6   r/   utilsr   r   audio_classificationr	   baser
   r   
get_loggerrY   loggerr   r   r   r   r   <module>   s   
