o
    Gi                     @   s   d dl mZ d dlmZ d dlZd dlmZ ddlmZm	Z	 ddl
mZ ddlmZmZ eeZG dd	 d	ejZeG d
d deZG dd dejZG dd deeZdS )    )	dataclass)piN   )ConfigMixinregister_to_config)
ModelMixin)
BaseOutputloggingc                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	StableAudioPositionalEmbeddingzUsed for continuous timedimc                    s8   t    |d dksJ |d }tt|| _d S )N   r   )super__init__nn	Parametertorchrandnweights)selfr   half_dim	__class__ j/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/stable_audio/modeling_stable_audio.pyr       s   
z'StableAudioPositionalEmbedding.__init__timesreturnc                 C   sN   |d }|| j d   d t }tj| | fdd}tj||fdd}|S )N).Nr   )r   )r   r   r   catsincos)r   r   freqs	fourieredr   r   r   forward&   s
   z&StableAudioPositionalEmbedding.forward
__name__
__module____qualname____doc__intr   r   Tensorr"   __classcell__r   r   r   r   r
      s    r
   c                   @   sH   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	ejdB ed< dS ) StableAudioProjectionModelOutputa  
    Args:
    Class for StableAudio projection layer's outputs.
        text_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states obtained by linearly projecting the hidden-states for the text encoder.
        seconds_start_hidden_states (`torch.Tensor` of shape `(batch_size, 1, hidden_size)`, *optional*):
            Sequence of hidden-states obtained by linearly projecting the audio start hidden states.
        seconds_end_hidden_states (`torch.Tensor` of shape `(batch_size, 1, hidden_size)`, *optional*):
            Sequence of hidden-states obtained by linearly projecting the audio end hidden states.
    Ntext_hidden_statesseconds_start_hidden_statesseconds_end_hidden_states)
r$   r%   r&   r'   r,   r   r)   __annotations__r-   r.   r   r   r   r   r+   .   s
   
 r+   c                       s>   e Zd ZdZ	d
dedB f fddZdejfdd	Z  Z	S )StableAudioNumberConditionera  
    A simple linear projection model to map numbers to a latent space.

    Args:
        number_embedding_dim (`int`):
            Dimensionality of the number embeddings.
        min_value (`int`):
            The minimum value of the seconds number conditioning modules.
        max_value (`int`):
            The maximum value of the seconds number conditioning modules
        internal_dim (`int`):
            Dimensionality of the intermediate number hidden states.
       internal_dimNc                    s@   t    tt|tj|d |d| _|| _|| _|| _	d S )N   )in_featuresout_features)
r   r   r   
Sequentialr
   Lineartime_positional_embeddingnumber_embedding_dim	min_value	max_value)r   r9   r:   r;   r2   r   r   r   r   O   s   

z%StableAudioNumberConditioner.__init__floatsc                 C   s^   | | j| j}|| j | j| j  }t| j j}||}| |}|dd| j	}|S )Nr   r3   )
clampr:   r;   nextr8   
parametersdtypetoviewr9   )r   r<   normalized_floatsembedder_dtype	embeddingfloat_embedsr   r   r   r"   `   s   

z$StableAudioNumberConditioner.forward)r1   r#   r   r   r   r   r0   @   s    r0   c                       sT   e Zd ZdZe fddZ			d
dejdB dejdB dejdB fdd	Z  Z	S )StableAudioProjectionModela  
    A simple linear projection model to map the conditioning values to a shared latent space.

    Args:
        text_encoder_dim (`int`):
            Dimensionality of the text embeddings from the text encoder (T5).
        conditioning_dim (`int`):
            Dimensionality of the output conditioning tensors.
        min_value (`int`):
            The minimum value of the seconds number conditioning modules.
        max_value (`int`):
            The maximum value of the seconds number conditioning modules
    c                    sH   t    ||krt nt||| _t|||| _t|||| _d S )N)	r   r   r   Identityr7   text_projectionr0   start_number_conditionerend_number_conditioner)r   text_encoder_dimconditioning_dimr:   r;   r   r   r   r      s
   
z#StableAudioProjectionModel.__init__Nr,   start_secondsend_secondsc                 C   sP   |d u r|n|  |}|d u r|n| |}|d u r|n| |}t|||dS )N)r,   r-   r.   )rI   rJ   rK   r+   )r   r,   rN   rO   r-   r.   r   r   r   r"      s   z"StableAudioProjectionModel.forward)NNN)
r$   r%   r&   r'   r   r   r   r)   r"   r*   r   r   r   r   rG   r   s    
rG   )dataclassesr   mathr   r   torch.nnr   configuration_utilsr   r   models.modeling_utilsr   utilsr   r	   
get_loggerr$   loggerModuler
   r+   r0   rG   r   r   r   r   <module>   s   
2