o
    ei/3                     @   s   d dl mZmZmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZmZ ddlmZ e	 r=d d	lZdd
lmZ ddlmZ dZG dd deddZG dd deZd	S )    )Any	TypedDictoverload   )
AudioInput)GenerationConfig)is_torch_available)ChatChatType   )PipelineN)%MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING)SpeechT5HifiGanzmicrosoft/speecht5_hifiganc                   @   s"   e Zd ZU dZeed< eed< dS )AudioOutputz
    audio (`AudioInput`):
        The generated audio waveform.
    sampling_rate (`int`):
        The sampling rate of the generated audio waveform.
    audiosampling_rateN)__name__
__module____qualname____doc__r   __annotations__int r   r   b/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/pipelines/text_to_audio.pyr   !   s   
 r   F)totalc                       s   e Zd ZdZdZdZdZdZdZe	ddZ
ddd fdd	
Zd
d Zdd ZedededefddZedee dedee fddZedededefddZedee dedee fddZ fddZ			dddZdd Z  ZS )TextToAudioPipelinea  
    Text-to-audio generation pipeline using any `AutoModelForTextToWaveform` or `AutoModelForTextToSpectrogram`. This
    pipeline generates an audio file from an input text and optional other conditional inputs.

    Unless the model you're using explicitly sets these generation parameters in its configuration files
    (`generation_config.json`), the following default values will be used:
    - max_new_tokens: 256

    Example:

    ```python
    >>> from transformers import pipeline

    >>> pipe = pipeline(model="suno/bark-small")
    >>> output = pipe("Hey it's HuggingFace on the phone!")

    >>> audio = output["audio"]
    >>> sampling_rate = output["sampling_rate"]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    <Tip>

    You can specify parameters passed to the model by using [`TextToAudioPipeline.__call__.forward_params`] or
    [`TextToAudioPipeline.__call__.generate_kwargs`].

    Example:

    ```python
    >>> from transformers import pipeline

    >>> music_generator = pipeline(task="text-to-audio", model="facebook/musicgen-small")

    >>> # diversify the music generation by adding randomness with a high temperature and set a maximum music length
    >>> generate_kwargs = {
    ...     "do_sample": True,
    ...     "temperature": 0.7,
    ...     "max_new_tokens": 35,
    ... }

    >>> outputs = music_generator("Techno music with high melodic riffs", generate_kwargs=generate_kwargs)
    ```

    </Tip>

    This pipeline can currently be loaded from [`pipeline`] using the following task identifiers: `"text-to-speech"` or
    `"text-to-audio"`.

    See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=text-to-speech).
    TNF   )max_new_tokens)vocoderr   c                   s@  t  j|i | d | _| jjt v r%|d u r"tt	
| jjn|| _| jjjdv r/d | _|| _| jd ur=| jjj| _| jd u r| jj}| jjdd }|d urY||  dD ]&}t||d }|d urk|| _q[t|dd d urt|j|d }|d ur|| _q[| jd u r| jd urt| jdr| jjj| _d S d S d S d S )N)musicgenspeecht5generation_config)sample_rater   codec_configfeature_extractor)super__init__r   model	__class__r   valuesr   from_pretrainedDEFAULT_VOCODER_IDtodeviceconfig
model_type	processorr   __dict__getupdateto_dictgetattrr#   hasattrr$   )selfr   r   argskwargsr.   
gen_configsampling_rate_namer(   r   r   r&   m   s:   

 zTextToAudioPipeline.__init__c                 K   s   t |tr|g}| jjjdkr-d}t| jdrt| jjdd}|dddd}|	| |}| j
d ur5| j
n| j}t |trL|j|jfddd|}|S | jjjd	krZd
d |D }| jjjdkrhdd |D }||fi |ddi}|S )Nbarkr   semantic_configmax_input_semantic_lengthFT)
max_lengthadd_special_tokensreturn_attention_maskreturn_token_type_ids)tokenizereturn_dictcsmc                 S   $   g | ]}| d sd| n|qS )[z[0]
startswith.0tr   r   r   
<listcomp>      $ z2TextToAudioPipeline.preprocess.<locals>.<listcomp>diac                 S   rG   )rH   z[S1] rI   rK   r   r   r   rN      rO   return_tensorspt)
isinstancestrr'   r.   r/   r6   r!   r5   r>   r3   r0   	tokenizerr	   apply_chat_templatemessages)r7   textr9   r@   
new_kwargspreprocessoroutputr   r   r   
preprocess   s<   


zTextToAudioPipeline.preprocessc                 K   s   | j || jd}|d }|d }| j rM| j || jd}d|vr&| j|d< || |ddi | jjjdv rAd|vrAd|d< | jjdi ||}nt	|rZt
d	|  | jdi ||d
 }| jd urp| |}|S )N)r-   forward_paramsgenerate_kwargsr!   return_dict_in_generateT)rF   output_audiozYou're using the `TextToAudioPipeline` with a forward-only model, but `generate_kwargs` is non empty. For forward-only TTA models, please use `forward_params` instead of `generate_kwargs`. For reference, the `generate_kwargs` used here are: r   r   )_ensure_tensor_on_devicer-   r'   can_generater!   r3   r.   r/   generatelen
ValueErrorkeysr   )r7   model_inputsr9   r]   r^   r[   r   r   r   _forward   s.   




zTextToAudioPipeline._forwardtext_inputsr]   returnc                 K      d S Nr   r7   ri   r]   r   r   r   __call__      zTextToAudioPipeline.__call__c                 K   rk   rl   r   rm   r   r   r   rn      ro   c                 K   rk   rl   r   rm   r   r   r   rn      ro   c                 K   rk   rl   r   rm   r   r   r   rn      ro   c                    s   t  j|fi |S )aL  
        Generates speech/audio from the inputs. See the [`TextToAudioPipeline`] documentation for more information.

        Args:
            text_inputs (`str`, `list[str]`, `ChatType`, or `list[ChatType]`):
                One or several texts to generate. If strings or a list of string are passed, this pipeline will
                generate the corresponding text. Alternatively, a "chat", in the form of a list of dicts with "role"
                and "content" keys, can be passed, or a list of such chats. When chats are passed, the model's chat
                template will be used to format them before passing them to the model.
            forward_params (`dict`, *optional*):
                Parameters passed to the model generation/forward method. `forward_params` are always passed to the
                underlying model.
            generate_kwargs (`dict`, *optional*):
                The dictionary of ad-hoc parametrization of `generate_config` to be used for the generation call. For a
                complete overview of generate, check the [following
                guide](https://huggingface.co/docs/transformers/en/main_classes/text_generation). `generate_kwargs` are
                only passed to the underlying model if the latter is a generative model.

        Return:
            `AudioOutput` or a list of `AudioOutput`, which is a `TypedDict` with two keys:

            - **audio** (`np.ndarray` of shape `(nb_channels, audio_length)`) -- The generated audio waveform.
            - **sampling_rate** (`int`) -- The sampling rate of the generated audio waveform.
        )r%   rn   rm   r<   r   r   rn      s   c                 C   sr   t | dd d ur| j|d< t | dd d ur| j|d< | j|d< |r#|ni |r(|ni d}|d u r2i }i }|||fS )Nassistant_modelassistant_tokenizerrU   )r]   r^   )r5   rp   rU   rq   )r7   preprocess_paramsr]   r^   paramspostprocess_paramsr   r   r   _sanitize_parameters  s   





z(TextToAudioPipeline._sanitize_parametersc                 C   s   d}t |trd|v r|d }nd}|d }n	t |tr |d }|r-| jd ur-| j|}t |trFdd |D }t|dkrA|n|d }n|jd	tj	d

  }t|| jdS )NFr   T	sequencesr   c                 S   s$   g | ]}|j d tjd  qS )cpur-   dtype)r,   torchfloatnumpysqueeze)rL   elr   r   r   rN   3  rO   z3TextToAudioPipeline.postprocess.<locals>.<listcomp>r   rw   rx   )r   r   )rS   dicttupler0   decodelistrd   r,   rz   r{   r|   r}   r   r   )r7   r   needs_decodingr   r   r   postprocess$  s$   




zTextToAudioPipeline.postprocess)NNN)r   r   r   r   _pipeline_calls_generate_load_processor_load_image_processor_load_feature_extractor_load_tokenizerr   _default_generation_configr&   r\   rh   r   rT   r   r   rn   r   r
   ru   r   __classcell__r   r   r<   r   r   -   s6    4('*  
r   )typingr   r   r   audio_utilsr   
generationr   utilsr   utils.chat_template_utilsr	   r
   baser   rz   models.auto.modeling_autor   !models.speecht5.modeling_speecht5r   r+   r   r   r   r   r   r   <module>   s   