o
    	۷i+                     @   sv   d dl mZmZmZ ddlmZ ddlmZ ddlm	Z	 e r/d dl
Z
ddlmZ dd	lmZ d
ZG dd de	ZdS )    )AnyUnionoverload   )GenerationConfig)is_torch_available   )PipelineN)%MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING)SpeechT5HifiGanzmicrosoft/speecht5_hifiganc                
       s   e Zd ZdZdZdZdZdZdZdZe	ddZ
dddd fdd	
Zd
d Zdd Zedededeeef fddZedee dedeeeef  fddZdeeee f deeeef eeeef  f f fddZ			dddZdd Z  ZS )TextToAudioPipelinea  
    Text-to-audio generation pipeline using any `AutoModelForTextToWaveform` or `AutoModelForTextToSpectrogram`. This
    pipeline generates an audio file from an input text and optional other conditional inputs.

    Unless the model you're using explicitly sets these generation parameters in its configuration files
    (`generation_config.json`), the following default values will be used:
    - max_new_tokens: 256

    Example:

    ```python
    >>> from transformers import pipeline

    >>> pipe = pipeline(model="suno/bark-small")
    >>> output = pipe("Hey it's HuggingFace on the phone!")

    >>> audio = output["audio"]
    >>> sampling_rate = output["sampling_rate"]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    <Tip>

    You can specify parameters passed to the model by using [`TextToAudioPipeline.__call__.forward_params`] or
    [`TextToAudioPipeline.__call__.generate_kwargs`].

    Example:

    ```python
    >>> from transformers import pipeline

    >>> music_generator = pipeline(task="text-to-audio", model="facebook/musicgen-small", framework="pt")

    >>> # diversify the music generation by adding randomness with a high temperature and set a maximum music length
    >>> generate_kwargs = {
    ...     "do_sample": True,
    ...     "temperature": 0.7,
    ...     "max_new_tokens": 35,
    ... }

    >>> outputs = music_generator("Techno music with high melodic riffs", generate_kwargs=generate_kwargs)
    ```

    </Tip>

    This pipeline can currently be loaded from [`pipeline`] using the following task identifiers: `"text-to-speech"` or
    `"text-to-audio"`.

    See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=text-to-speech).
    TF   )max_new_tokensN)vocodersampling_rateno_processorc          	         s@  t  j|i | || _| jdkrtdd | _| jjt	 v r1|d u r.t
t| jjn|| _|| _| jd ur?| jjj| _| jd u r| jj}| jjdd }|d ur[||  dD ]&}t||d }|d urm|| _q]t|dd d urt|j|d }|d ur|| _q]| jd u r| jst| jdr| jjj| _d S d S d S d S )Ntfz5The TextToAudioPipeline is only available in PyTorch.generation_config)sample_rater   codec_configfeature_extractor)super__init__r   	framework
ValueErrorr   model	__class__r
   valuesr   from_pretrainedDEFAULT_VOCODER_IDtodevicer   config__dict__getupdateto_dictgetattrr   hasattr	processorr   )	selfr   r   r   argskwargsr"   
gen_configsampling_rate_namer    Z/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/pipelines/text_to_audio.pyr   a   s<   


zTextToAudioPipeline.__init__c                 K   sv   t |tr|g}| jjjdkr$| jjddddddd}|| |}| j	r*| j
n| j}||fi |dd	i}|S )
Nbarkmax_input_semantic_lengthr   FT
max_length)r4   add_special_tokensreturn_attention_maskreturn_token_type_idspaddingreturn_tensorspt)
isinstancestrr   r"   
model_typer   semantic_configr$   r%   r   	tokenizerr)   )r*   textr,   
new_kwargspreprocessoroutputr0   r0   r1   
preprocess   s   

	zTextToAudioPipeline.preprocessc                 K   s   | j || jd}|d }|d }| j r7| j || jd}d|vr&| j|d< || | jjdi ||}nt|rDtd|	  | jdi ||d }| j
d urZ| 
|}|S )N)r!   forward_paramsgenerate_kwargsr   zYou're using the `TextToAudioPipeline` with a forward-only model, but `generate_kwargs` is non empty. For forward-only TTA models, please use `forward_params` instead of `generate_kwargs`. For reference, the `generate_kwargs` used here are: r   r0   )_ensure_tensor_on_devicer!   r   can_generater   r%   generatelenr   keysr   )r*   model_inputsr,   rE   rF   rC   r0   r0   r1   _forward   s&   




zTextToAudioPipeline._forwardtext_inputsrE   returnc                 K      d S Nr0   r*   rN   rE   r0   r0   r1   __call__      zTextToAudioPipeline.__call__c                 K   rP   rQ   r0   rR   r0   r0   r1   rS      rT   c                    s   t  j|fi |S )a  
        Generates speech/audio from the inputs. See the [`TextToAudioPipeline`] documentation for more information.

        Args:
            text_inputs (`str` or `list[str]`):
                The text(s) to generate.
            forward_params (`dict`, *optional*):
                Parameters passed to the model generation/forward method. `forward_params` are always passed to the
                underlying model.
            generate_kwargs (`dict`, *optional*):
                The dictionary of ad-hoc parametrization of `generate_config` to be used for the generation call. For a
                complete overview of generate, check the [following
                guide](https://huggingface.co/docs/transformers/en/main_classes/text_generation). `generate_kwargs` are
                only passed to the underlying model if the latter is a generative model.

        Return:
            A `dict` or a list of `dict`: The dictionaries have two keys:

            - **audio** (`np.ndarray` of shape `(nb_channels, audio_length)`) -- The generated audio waveform.
            - **sampling_rate** (`int`) -- The sampling rate of the generated audio waveform.
        )r   rS   rR   r/   r0   r1   rS      s   c                 C   sr   t | dd d ur| j|d< t | dd d ur| j|d< | j|d< |r#|ni |r(|ni d}|d u r2i }i }|||fS )Nassistant_modelassistant_tokenizerr?   )rE   rF   )r'   rU   r?   rV   )r*   preprocess_paramsrE   rF   paramspostprocess_paramsr0   r0   r1   _sanitize_parameters   s   





z(TextToAudioPipeline._sanitize_parametersc                 C   s   i }| j jjdkrd}nd}| jr(t|tr|| }nt|tr%|d }n	|}n| j|}t|t	r=dd |D |d< n|j
dtjd |d< | j|d	< |S )
Ncsmaudiowaveformr   c                 S   s    g | ]}|j d tjd qS )cpur!   dtype)r    torchfloatnumpy).0elr0   r0   r1   
<listcomp>  s     z3TextToAudioPipeline.postprocess.<locals>.<listcomp>r^   r_   r   )r   r"   r=   r   r;   dicttupler)   decodelistr    ra   rb   rc   r   )r*   r\   output_dictwaveform_keyr]   r0   r0   r1   postprocess   s    





zTextToAudioPipeline.postprocess)NNN)__name__
__module____qualname____doc___load_processor_pipeline_calls_generate_load_image_processor_load_feature_extractor_load_tokenizerr   _default_generation_configr   rD   rM   r   r<   r   rg   rS   rj   r   rZ   rm   __classcell__r0   r0   r/   r1   r      s8    5*! (
r   )typingr   r   r   
generationr   utilsr   baser	   ra   models.auto.modeling_autor
   !models.speecht5.modeling_speecht5r   r   r   r0   r0   r0   r1   <module>   s   