o
    ꁱi$                     @   s  d dl Z d dlmZmZmZ d dlZd dlmZ d dlm  m	Z
 d dlmZ d dlmZ d dlmZ d dlmZ ddlmZ eeZG d	d
 d
ejZdd ZG dd dejZG dd dejZG dd dejZG dd dejZG dd deZe ee dgZ!dS )    N)OptionalTupleUnion)	AutoModel)PreTrainedModel)ACT2FN)logging   )VibeVoiceDiffusionHeadConfigc                       sF   e Zd Zddedef fddZdd	 Zd
d ZdefddZ	  Z
S )RMSNormư>TFdimepsc                    sH   t    || _|| _|| _| jrtt|| _	d S | 
dd  d S )Nweight)super__init__r   r   elementwise_affinenn	Parametertorchonesr   register_parameter)selfr   r   r   memory_efficient	__class__ V/home/ubuntu/vibevoice-community/vibevoice/modular/modular_vibevoice_diffusion_head.pyr      s   
zRMSNorm.__init__c                 C   s$   |t |djddd| j  S )N   T)keepdim)r   rsqrtpowmeanr   )r   xr   r   r   _norm   s   $zRMSNorm._normc                 C   s,   |  | |}| jd ur|| j }|S N)r%   floattype_asr   )r   r$   outputr   r   r   forward"   s   

zRMSNorm.forwardreturnc                 C   s   d| j  d| j d| j S )Nzdim=z, eps=z, elementwise_affine=)r   r   r   )r   r   r   r   
extra_repr(   s   zRMSNorm.extra_repr)r   TF)__name__
__module____qualname__intr'   r   r%   r*   strr,   __classcell__r   r   r   r   r      s
    
r   c                 C   s   | d|  | S )z!Apply modulation to input tensor.r	   r   )r$   shiftscaler   r   r   modulate+   s   r5   c                       s8   e Zd ZdZd
 fdd	ZedddZdd	 Z  ZS )TimestepEmbedderz
    Embeds scalar timesteps into vector representations.
    
    Args:
        hidden_size (`int`): Size of the output embedding
        frequency_embedding_size (`int`, optional): Size of the intermediate frequency embedding
       c              	      s@   t    ttj||ddtd tj||dd| _|| _d S NFbiassilu)r   r   r   
SequentialLinearr   mlpfrequency_embedding_size)r   hidden_sizer?   r   r   r   r   8   s   

zTimestepEmbedder.__init__'  c                 C   s   |d }t t| t jd|t jd | | j}| dddf  |d  }t j	t 
|t |gdd}|d rQt j	|t |ddddf gdd}|| jS )a  
        Create sinusoidal timestep embeddings.
        
        Args:
            t (`torch.Tensor`): A 1-D Tensor of N indices, one per batch element.
                            These may be fractional.
            dim (`int`): The dimension of the output.
            max_period (`int`, optional): Controls the minimum frequency of the embeddings.
            
        Returns:
            `torch.Tensor`: An [N, D] Tensor of positional embeddings.
        r   r   )startenddtypeNr   r   r	   )r   expmathlogarangefloat32todevicer'   catcossin
zeros_likerD   )tr   
max_periodhalffreqsargs	embeddingr   r   r   timestep_embeddingB   s    (z#TimestepEmbedder.timestep_embeddingc                 C   s   |  || j}| |}|S r&   )rW   r?   r>   )r   rQ   t_freqt_embr   r   r   r*   Z   s   
zTimestepEmbedder.forward)r7   )rA   )	r-   r.   r/   __doc__r   staticmethodrW   r*   r2   r   r   r   r   r6   0   s    
r6   c                       s(   e Zd ZdZ fddZdd Z  ZS )FeedForwardNetworkz
    Standard feed-forward network with SwiGLU activation.
    
    Args:
        embed_dim (`int`): Input dimension
        ffn_dim (`int`): Hidden dimension
    c                    sZ   t    || _tj| j|dd| _tj| j|dd| _tj|| jdd| _td | _	d S r8   )
r   r   	embed_dimr   r=   	gate_projup_proj	down_projr   act_fn)r   r]   ffn_dimr   r   r   r   h   s   
zFeedForwardNetwork.__init__c                 C   s,   |  |}| |}| |}| || S r&   )r^   r_   ra   r`   )r   r$   gateupr   r   r   r*   t   s   


zFeedForwardNetwork.forwardr-   r.   r/   rZ   r   r*   r2   r   r   r   r   r\   `   s    r\   c                       s,   e Zd ZdZ	d fdd	Zdd Z  ZS )	HeadLayera  
    A layer in the diffusion head.
    
    Args:
        embed_dim (`int`): Input dimension
        ffn_dim (`int`): Hidden dimension
        cond_dim (`int`): Condition embedding dimension
        norm_eps (`float`, optional): Epsilon for normalization
    h㈵>c                    sd   t    || _|| _|| _t| j| j| _t| j|d| _t	
td t	j|d| j dd| _d S )N)r   r;      Fr9   )r   r   r]   cond_dimrb   r\   ffnr   normr   r<   r   r=   adaLN_modulation)r   r]   rb   ri   norm_epsr   r   r   r      s   

zHeadLayer.__init__c                 C   s>   |  |jddd\}}}||| t| |||  }|S )Nrh   r   rE   )rl   chunkrj   r5   rk   )r   r$   c	shift_ffn	scale_ffngate_ffnr   r   r   r*      s    zHeadLayer.forwardrg   re   r   r   r   r   rf   ~   s
    rf   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )
FinalLayera  
    Final layer in the diffusion head.
    
    Args:
        hidden_size (`int`): Input dimension
        output_size (`int`): Output dimension
        cond_size (`int`): Condition embedding dimension
        norm_eps (`float`, optional): Epsilon for normalization
    rg   c                    sR   t    t||dd| _tj||dd| _ttd tj|d| dd| _	d S )NF)r   r   r9   r;   r   )
r   r   r   
norm_finalr   r=   linearr<   r   rl   )r   r@   output_size	cond_sizerm   r   r   r   r      s   

zFinalLayer.__init__c                 C   s8   |  |jddd\}}t| |||}| |}|S )Nr   r   rE   )rl   rn   r5   ru   rv   )r   r$   ro   r3   r4   r   r   r   r*      s   
zFinalLayer.forwardrs   re   r   r   r   r   rt      s    	
rt   c                       s@   e Zd ZdZeZdZdZdZ fddZ	dd Z
dd Z  ZS )	VibeVoiceDiffusionHeadz
    Diffusion head model for vibevoice.
    
    Args:
        config (`VibeVoiceDiffusionHeadConfig`): Model configuration
        latent_size (`int`, optional): Size of the latent space. If not provided, uses `config.latent_size`.
    Tc                    s   t     _ j_ j}tj| jdd_tj jjdd_	t
j_t j j t fddt jD _t j|j jd_  d S )NFr9   c                    s"   g | ]}t  jj jd qS ))r]   rb   ri   rm   )rf   r@   ri   rms_norm_eps).0_configrb   r   r   r   
<listcomp>   s    z3VibeVoiceDiffusionHead.__init__.<locals>.<listcomp>)r@   rw   rx   rm   )r   r   r~   r@   ri   latent_sizer   r=   noisy_images_proj	cond_projr6   
t_embedderr0   head_ffn_ratio
ModuleListrangehead_layerslayersrt   rz   final_layerinitialize_weights)r   r~   r   r   r}   r   r      s$   
zVibeVoiceDiffusionHead.__init__c                 C   s   t jj| jjd jdd t jj| jjd jdd | jD ]}t j|jd jd qt j| j	jd jd t j| j	j
jd dS )z$Initialize the weights of the model.r   g{Gz?)stdr   r   N)r   initnormal_r   r>   r   r   	constant_rl   r   rv   )r   layerr   r   r   r      s   
z)VibeVoiceDiffusionHead.initialize_weightsc                 C   sL   |  |}| |}| |}|| }| jD ]}|||}q| ||}|S )ap  
        Forward pass of the prediction head.
        
        Args:
            noisy_images (`torch.Tensor`): Noisy images/latents to denoise
            timesteps (`torch.Tensor`): Timesteps for diffusion
            condition (`torch.Tensor`): Conditioning information
            
        Returns:
            `torch.Tensor`: The predicted noise/velocity
        )r   r   r   r   r   )r   noisy_images	timesteps	conditionr$   rQ   ro   r   r   r   r   r*      s   



zVibeVoiceDiffusionHead.forward)r-   r.   r/   rZ   r
   config_classsupports_gradient_checkpointing_supports_flash_attn_2_supports_sdpar   r   r*   r2   r   r   r   r   ry      s    $ry   )"rG   typingr   r   r   r   torch.nnr   torch.nn.functional
functionalFtransformers.models.autor   transformers.modeling_utilsr   transformers.activationsr   transformers.utilsr   configuration_vibevoicer
   
get_loggerr-   loggerModuler   r5   r6   r\   rf   rt   ry   register__all__r   r   r   r   <module>   s*    
0&\