o
    ۷if                     @   s   d dl Z d dlmZmZ d dlZd dlZd dlm  m	Z
 d dlmZmZmZmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlmZ d	d
lmZmZmZmZ e rbd dl m!  m"Z# dZ$ndZ$e%e&Z'dZ(G dd deeeZ)dS )    N)AnyCallable)ClapTextModelWithProjectionRobertaTokenizerRobertaTokenizerFastSpeechT5HifiGan   )AutoencoderKLUNet2DConditionModel)KarrasDiffusionSchedulers)is_torch_xla_availableloggingreplace_example_docstring)randn_tensor   )AudioPipelineOutputDeprecatedPipelineMixinDiffusionPipelineStableDiffusionMixinTFaj  
    Examples:
        ```py
        >>> from diffusers import AudioLDMPipeline
        >>> import torch
        >>> import scipy

        >>> repo_id = "cvssp/audioldm-s-full-v2"
        >>> pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
        >>> pipe = pipe.to("cuda")

        >>> prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
        >>> audio = pipe(prompt, num_inference_steps=10, audio_length_in_s=5.0).audios[0]

        >>> # save the audio sample as a .wav file
        >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio)
        ```
c                $       s  e Zd ZdZdZdZdededee	B de
ded	ef fd
dZ			d1dejdB dejdB fddZdd Zdd Zdd Z			d1ddZd2ddZe ee																 d3d!eee B d"edB d#ed$ed%eee B dB d&edB d'ed(ejeej B dB d)ejdB dejdB dejdB d*ed+eeeejgdf dB d,edB d-e ee!f dB d.edB f d/d0Z"  Z#S )4AudioLDMPipelinea  
    Pipeline for text-to-audio generation using AudioLDM.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
        text_encoder ([`~transformers.ClapTextModelWithProjection`]):
            Frozen text-encoder (`ClapTextModelWithProjection`, specifically the
            [laion/clap-htsat-unfused](https://huggingface.co/laion/clap-htsat-unfused) variant.
        tokenizer ([`PreTrainedTokenizer`]):
            A [`~transformers.RobertaTokenizer`] to tokenize text.
        unet ([`UNet2DConditionModel`]):
            A `UNet2DConditionModel` to denoise the encoded audio latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        vocoder ([`~transformers.SpeechT5HifiGan`]):
            Vocoder of class `SpeechT5HifiGan`.
    z0.33.1ztext_encoder->unet->vaevaetext_encoder	tokenizerunet	schedulervocoderc                    sR   t    | j||||||d t| dd r$dt| jjjd  | _d S d| _d S )N)r   r   r   r   r   r   r   r         )	super__init__register_modulesgetattrlenr   configblock_out_channelsvae_scale_factor)selfr   r   r   r   r   r   	__class__ d/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/pipelines/audioldm/pipeline_audioldm.pyr   W   s   
	2zAudioLDMPipeline.__init__Nprompt_embedsnegative_prompt_embedsc              
   C   s  |durt |trd}n|durt |trt|}n|jd }|du r| j|d| jjddd}	|	j}
|	j}| j|ddd	j}|jd
 |
jd
 krmt	
|
|sm| j|dd| jjd d
f }td| jj d|  | j|
|||d}|j}tj|d
d}|j| jj|d}|j\}}|d|}||| |}|r|du r|du rdg| }n;t|t|urtdt| dt| dt |tr|g}n|t|krtd| dt| d| d| d	|}|jd }| j|d|ddd}|j|}|j|}| j||d}|j}tj|d
d}|r?|jd }|j| jj|d}|d|}||| |}t	||g}|S )a`  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                prompt to be encoded
            device (`torch.device`):
                torch device
            num_waveforms_per_prompt (`int`):
                number of waveforms that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the audio generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
        Nr   r   
max_lengthTpt)paddingr-   
truncationreturn_tensorslongest)r/   r1   z\The following part of your input was truncated because CLAP can only handle sequences up to z	 tokens: )attention_mask)dim)dtypedevice z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)
isinstancestrlistr"   shaper   model_max_length	input_idsr4   torchequalbatch_decodeloggerwarningr   totext_embedsF	normalizer6   repeatviewtype	TypeError
ValueErrorcat)r&   promptr7   num_waveforms_per_promptdo_classifier_free_guidancenegative_promptr+   r,   
batch_sizetext_inputstext_input_idsr4   untruncated_idsremoved_textbs_embedseq_lenuncond_tokensr-   uncond_inputuncond_input_idsr)   r)   r*   _encode_promptl   s   "




zAudioLDMPipeline._encode_promptc                 C   s$   d| j jj | }| j |j}|S )Nr   )r   r#   scaling_factordecodesample)r&   latentsmel_spectrogramr)   r)   r*   decode_latents   s   zAudioLDMPipeline.decode_latentsc                 C   s0   |  dkr|d}| |}|  }|S )N   r   )r5   squeezer   cpufloat)r&   rb   waveformr)   r)   r*   mel_spectrogram_to_waveform   s
   

z,AudioLDMPipeline.mel_spectrogram_to_waveformc                 C   sX   dt t| jjj v }i }|r||d< dt t| jjj v }|r*||d< |S )Neta	generator)setinspect	signaturer   step
parameterskeys)r&   rk   rj   accepts_etaextra_step_kwargsaccepts_generatorr)   r)   r*   prepare_extra_step_kwargs  s   z*AudioLDMPipeline.prepare_extra_step_kwargsc           	      C   sj  || j  }||k rtd| d| d| jjj| j  dkr-td| jjj d| j  d|d u s>|d urKt|tr>|dkrKtd| dt| d|d ur^|d ur^td	| d
| d|d u rj|d u rjtd|d urt|tst|t	stdt| |d ur|d urtd| d| d|d ur|d ur|j
|j
krtd|j
 d|j
 dd S d S d S )NzH`audio_length_in_s` has to be a positive value greater than or equal to z	, but is r9   r   zwThe number of frequency bins in the vocoder's log-mel spectrogram has to be divisible by the VAE scale factor, but got z bins and a scale factor of z5`callback_steps` has to be a positive integer but is z	 of type zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` )r%   rM   r   r#   model_in_dimr:   intrK   r;   r<   r=   )	r&   rO   audio_length_in_svocoder_upsample_factorcallback_stepsrR   r+   r,   min_audio_length_in_sr)   r)   r*   check_inputs  sb   

zAudioLDMPipeline.check_inputsc           	      C   s   ||t || j t | jjj| j f}t|tr+t||kr+tdt| d| d|d u r8t	||||d}n|
|}|| jj }|S )Nz/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)rk   r7   r6   )rw   r%   r   r#   rv   r:   r<   r"   rM   r   rE   r   init_noise_sigma)	r&   rS   num_channels_latentsheightr6   r7   rk   ra   r=   r)   r)   r*   prepare_latentsR  s    
z AudioLDMPipeline.prepare_latents
         @r           TnprO   rx   num_inference_stepsguidance_scalerR   rP   rj   rk   ra   return_dictcallbackrz   cross_attention_kwargsoutput_typec           %   	   C   s   t | jjj| jjj }|du r| jjj| j | }t	|| }t	|| jjj }|| j dkrOt	t 
|| j | j }td| d||  d| d | ||||||
| |durft|trfd}n|durtt|trtt|}n|
jd }| j}|dk}| j||||||
|d	}
| jj||d
 | jj}| jjj}| || |||
j|||	}	| ||}t||| jj  }| j|d}t|D ]\}}|rt |	gd n|	}| j!||}| j||d|
|dj"}|r|#d\} }!| ||!|    }| jj$|||	fi |j%}	|t|d ks!|d |kr@|d | jj dkr@|&  |dur@|| dkr@|t'| jdd }"||"||	 t(rGt)*  qW d   n	1 sSw   Y  | +|	}#| ,|#}$|$ddd|f }$|dkru|$- }$|s{|$fS t.|$dS )u  
        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts to guide audio generation. If not defined, you need to pass `prompt_embeds`.
            audio_length_in_s (`int`, *optional*, defaults to 5.12):
                The length of the generated audio sample in seconds.
            num_inference_steps (`int`, *optional*, defaults to 10):
                The number of denoising steps. More denoising steps usually lead to a higher quality audio at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 2.5):
                A higher guidance scale value encourages the model to generate audio that is closely linked to the text
                `prompt` at the expense of lower sound quality. Guidance scale is enabled when `guidance_scale > 1`.
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts to guide what to not include in audio generation. If not defined, you need to
                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
            num_waveforms_per_prompt (`int`, *optional*, defaults to 1):
                The number of waveforms to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
                applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple.
            callback (`Callable`, *optional*):
                A function that calls every `callback_steps` steps during inference. The function is called with the
                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            output_type (`str`, *optional*, defaults to `"np"`):
                The output format of the generated image. Choose between `"np"` to return a NumPy `np.ndarray` or
                `"pt"` to return a PyTorch `torch.Tensor` object.

        Examples:

        Returns:
            [`~pipelines.AudioPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.AudioPipelineOutput`] is returned, otherwise a `tuple` is
                returned where the first element is a list with the generated audio.
        Nr   zAudio length in seconds z is increased to z; so that it can be handled by the model. It will be cut to z after the denoising process.r   g      ?)r+   r,   )r7   )totalr   )encoder_hidden_statesclass_labelsr   orderr   )audios)/r   prodr   r#   upsample_ratessampling_rater   sample_sizer%   rw   ceilrC   infor|   r:   r;   r<   r"   r=   _execution_devicer]   r   set_timesteps	timestepsin_channelsr   r6   ru   r   progress_bar	enumerater@   rN   scale_model_inputr`   chunkro   prev_sampleupdater!   XLA_AVAILABLExm	mark_steprc   ri   numpyr   )%r&   rO   rx   r   r   rR   rP   rj   rk   ra   r+   r,   r   r   rz   r   r   ry   r   original_waveform_lengthrS   r7   rQ   r   r~   rs   num_warmup_stepsr   itlatent_model_input
noise_prednoise_pred_uncondnoise_pred_textstep_idxrb   audior)   r)   r*   __call__h  s   O


	6
"


zAudioLDMPipeline.__call__)NNN)N)NNr   r   Nr   r   NNNNTNr   Nr   )$__name__
__module____qualname____doc___last_supported_versionmodel_cpu_offload_seqr	   r   r   r   r
   r   r   r   r@   Tensorr]   rc   ri   ru   r|   r   no_gradr   EXAMPLE_DOC_STRINGr;   r<   rg   rw   	Generatorboolr   dictr   r   __classcell__r)   r)   r'   r*   r   <   s    
 


;
	
r   )*rm   typingr   r   r   r   r@   torch.nn.functionalnn
functionalrG   transformersr   r   r   r   modelsr	   r
   
schedulersr   utilsr   r   r   utils.torch_utilsr   pipeline_utilsr   r   r   r   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr   rC   r   r   r)   r)   r)   r*   <module>   s$   
