
    h$                        d dl Z d dlmZmZmZ d dlZd dlmZ d dlmc m	Z
 d dlmZ d dlmZ d dlmZ d dlmZ ddlmZ  ej        e          Z G d	 d
ej                  Zd Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d de          Z ej         ee           dgZ!dS )    N)OptionalTupleUnion)	AutoModel)PreTrainedModel)ACT2FN)logging   )VibeVoiceDiffusionHeadConfigc                   B     e Zd Zddedef fdZd Zd Zd	efd
Z	 xZ
S )RMSNormư>TFdimepsc                    t                                                       || _        || _        || _        | j        r-t          j        t          j        |                    | _	        d S | 
                    dd            d S )Nweight)super__init__r   r   elementwise_affinenn	Parametertorchonesr   register_parameter)selfr   r   r   memory_efficient	__class__s        Z/workspace/chatterbox-finetuning/src/vibevoice/modular/modular_vibevoice_diffusion_head.pyr   zRMSNorm.__init__   ss    "4" 	4,uz#77DKKK##Hd33333    c                     |t          j        |                    d                              dd          | j        z             z  S )N   T)keepdim)r   rsqrtpowmeanr   )r   xs     r   _normzRMSNorm._norm   s8    5;quuQxx}}R}>>IJJJJr   c                     |                      |                                                              |          }| j        
|| j        z  }|S N)r(   floattype_asr   )r   r'   outputs      r   forwardzRMSNorm.forward"   sB    AGGII&&..q11;"dk)Fr   returnc                 6    d| j          d| j         d| j         S )Nzdim=z, eps=z, elementwise_affine=)r   r   r   )r   s    r   
extra_reprzRMSNorm.extra_repr(   s'    ^dh^^dh^^TE\^^^r   )r   TF)__name__
__module____qualname__intr+   r   r(   r.   strr1   __classcell__r   s   @r   r   r      s        4 4C 4e 4 4 4 4 4 4K K K  _C _ _ _ _ _ _ _ _r   r   c                     | d|z   z  |z   S )z!Apply modulation to input tensor.r
    )r'   shiftscales      r   modulater=   +   s    E	?U""r   c                   B     e Zd ZdZd fd	Zedd            Zd Z xZS )	TimestepEmbedderz
    Embeds scalar timesteps into vector representations.
    
    Args:
        hidden_size (`int`): Size of the output embedding
        frequency_embedding_size (`int`, optional): Size of the intermediate frequency embedding
       c           	          t                                                       t          j        t          j        ||d          t
          d         t          j        ||d                    | _        || _        d S NFbiassilu)r   r   r   
SequentialLinearr   mlpfrequency_embedding_size)r   hidden_sizerI   r   s      r   r   zTimestepEmbedder.__init__8   sm    =I.%HHH6NIk;U;;;	
 
 )A%%%r   '  c           	      D   |dz  }t          j        t          j        |           t          j        d|t           j                  z  |z                                | j                  }| dddf                                         |d         z  }t          j	        t          j
        |          t          j        |          gd          }|dz  r6t          j	        |t          j        |ddddf                   gd          }|                    | j                  S )a  
        Create sinusoidal timestep embeddings.
        
        Args:
            t (`torch.Tensor`): A 1-D Tensor of N indices, one per batch element.
                            These may be fractional.
            dim (`int`): The dimension of the output.
            max_period (`int`, optional): Controls the minimum frequency of the embeddings.
            
        Returns:
            `torch.Tensor`: An [N, D] Tensor of positional embeddings.
        r!   r   )startenddtypeNr"   r   r
   )r   expmathlogarangefloat32todevicer+   catcossin
zeros_likerO   )tr   
max_periodhalffreqsargs	embeddings          r   timestep_embeddingz#TimestepEmbedder.timestep_embeddingB   s    ax	Xj!!!ELqd%-$X$X$XX[__
 

"QX,, 	 Dz!!E$K/Iuy	$@bIII	7 	[	9e.>yBQB?O.P.P"QWYZZZI||AG$$$r   c                 f    |                      || j                  }|                     |          }|S r*   )rb   rI   rH   )r   r\   t_freqt_embs       r   r.   zTimestepEmbedder.forwardZ   s0    ((D,IJJ  r   )r@   )rK   )	r2   r3   r4   __doc__r   staticmethodrb   r.   r7   r8   s   @r   r?   r?   0   s|         A A A A A A % % % \%.      r   r?   c                   (     e Zd ZdZ fdZd Z xZS )FeedForwardNetworkz
    Standard feed-forward network with SwiGLU activation.
    
    Args:
        embed_dim (`int`): Input dimension
        ffn_dim (`int`): Hidden dimension
    c                 @   t                                                       || _        t          j        | j        |d          | _        t          j        | j        |d          | _        t          j        || j        d          | _        t          d         | _	        d S rB   )
r   r   	embed_dimr   rG   	gate_projup_proj	down_projr   act_fn)r   rk   ffn_dimr   s      r   r   zFeedForwardNetwork.__init__h   s    
 	"4>7GGGyuEEE7DNGGGVnr   c                     |                      |          }|                     |          }|                     |          }|                     ||z            S r*   )rl   rm   ro   rn   )r   r'   gateups       r   r.   zFeedForwardNetwork.forwardt   sK    ~~a  \\!__ {{4  ~~dRi(((r   r2   r3   r4   rf   r   r.   r7   r8   s   @r   ri   ri   `   sQ         
% 
% 
% 
% 
%) ) ) ) ) ) )r   ri   c                   ,     e Zd ZdZ	 d fd	Zd Z xZS )	HeadLayera  
    A layer in the diffusion head.
    
    Args:
        embed_dim (`int`): Input dimension
        ffn_dim (`int`): Hidden dimension
        cond_dim (`int`): Condition embedding dimension
        norm_eps (`float`, optional): Epsilon for normalization
    h㈵>c                 j   t                                                       || _        || _        || _        t          | j        | j                  | _        t          | j        |          | _        t          j
        t          d         t          j        |d| j        z  d                    | _        d S )N)r   rE      FrC   )r   r   rk   cond_dimrp   ri   ffnr   normr   rF   r   rG   adaLN_modulation)r   rk   rp   rz   norm_epsr   s        r   r   zHeadLayer.__init__   s     	" %NL
 
 DN999	 "6NIhDN 2???!
 !
r   c           	          |                      |                              dd          \  }}}|||                     t          |                     |          ||                    z  z   }|S )Nry   r"   rP   )r}   chunkr{   r=   r|   )r   r'   c	shift_ffn	scale_ffngate_ffns         r   r.   zHeadLayer.forward   se    )-)>)>q)A)A)G)Gr)G)R)R&	9h488HTYYq\\9i$P$PQQQQr   rw   rt   r8   s   @r   rv   rv   ~   s[          
 
 
 
 
 
,      r   rv   c                   *     e Zd ZdZd fd	Zd Z xZS )
FinalLayera  
    Final layer in the diffusion head.
    
    Args:
        hidden_size (`int`): Input dimension
        output_size (`int`): Output dimension
        cond_size (`int`): Condition embedding dimension
        norm_eps (`float`, optional): Epsilon for normalization
    rw   c                 (   t                                                       t          ||d          | _        t	          j        ||d          | _        t	          j        t          d         t	          j        |d|z  d                    | _	        d S )NF)r   r   rC   rE   r!   )
r   r   r   
norm_finalr   rG   linearrF   r   r}   )r   rJ   output_size	cond_sizer~   r   s        r   r   zFinalLayer.__init__   s    !+8PUVVVi[uEEE "6NIi[u===!
 !
r   c                     |                      |                              dd          \  }}t          |                     |          ||          }|                     |          }|S )Nr!   r"   rP   )r}   r   r=   r   r   )r   r'   r   r;   r<   s        r   r.   zFinalLayer.forward   s]    ,,Q//55aR5@@uT__Q''66KKNNr   r   rt   r8   s   @r   r   r      sV         
 
 
 
 
 
      r   r   c                   >     e Zd ZdZeZdZdZdZ fdZ	d Z
d Z xZS )VibeVoiceDiffusionHeadz
    Diffusion head model for vibevoice.
    
    Args:
        config (`VibeVoiceDiffusionHeadConfig`): Model configuration
        latent_size (`int`, optional): Size of the latent space. If not provided, uses `config.latent_size`.
    Tc                 d    t                                                      _        j         _        j        }t          j        |j        d           _        t          j        j         j        d           _	        t           j                   _        t          j        j        z            t          j         fdt          j                  D                        _        t%          j        | j        j                   _                                          d S )NFrC   c                 T    g | ]$}t          j        j        j                   %S ))rk   rp   rz   r~   )rv   rJ   rz   rms_norm_eps).0_configrp   r   s     r   
<listcomp>z3VibeVoiceDiffusionHead.__init__.<locals>.<listcomp>   sN     %
 %
 %
   ,,	  %
 %
 %
r   )rJ   r   r   r~   )r   r   r   rJ   rz   latent_sizer   rG   noisy_images_proj	cond_projr?   
t_embedderr5   head_ffn_ratio
ModuleListrangehead_layerslayersr   r   final_layerinitialize_weights)r   r   r   rp   r   s   `` @r   r   zVibeVoiceDiffusionHead.__init__   s5    	   *(!#;8JQV!W!W!W6#5t}5QQQ*4=99f(6+@@AA m %
 %
 %
 %
 %
 %
 6-..%
 %
 %
   &*#m(	
 
 
 	!!!!!r   c                    t           j                            | j        j        d         j        d           t           j                            | j        j        d         j        d           | j        D ]2}t           j                            |j        d         j        d           3t           j                            | j	        j        d         j        d           t           j                            | j	        j
        j        d           dS )z$Initialize the weights of the model.r   g{Gz?)stdr!   r"   N)r   initnormal_r   rH   r   r   	constant_r}   r   r   )r   layers     r   r   z)VibeVoiceDiffusionHead.initialize_weights   s     	+A.54@@@
+A.54@@@ [ 	D 	DEGe4R8?CCCC 	$*;B?FJJJ
$*18!<<<<<r   c                     |                      |          }|                     |          }|                     |          }||z   }| j        D ]} |||          }|                     ||          }|S )ap  
        Forward pass of the prediction head.
        
        Args:
            noisy_images (`torch.Tensor`): Noisy images/latents to denoise
            timesteps (`torch.Tensor`): Timesteps for diffusion
            condition (`torch.Tensor`): Conditioning information
            
        Returns:
            `torch.Tensor`: The predicted noise/velocity
        )r   r   r   r   r   )r   noisy_images	timesteps	conditionr'   r\   r   r   s           r   r.   zVibeVoiceDiffusionHead.forward   s~    " ""<00OOI&&NN9--	M[ 	 	EaAAQ""r   )r2   r3   r4   rf   r   config_classsupports_gradient_checkpointing_supports_flash_attn_2_supports_sdpar   r   r.   r7   r8   s   @r   r   r      sy          0L&*#!N"" "" "" "" ""H= = =      r   r   )"rR   typingr   r   r   r   torch.nnr   torch.nn.functional
functionalFtransformers.models.autor   transformers.modeling_utilsr   transformers.activationsr   transformers.utilsr	   configuration_vibevoicer   
get_loggerr2   loggerModuler   r=   r?   ri   rv   r   r   register__all__r:   r   r   <module>r      s!    ) ) ) ) ) ) ) ) ) )                 . . . . . . 7 7 7 7 7 7 + + + + + + & & & & & & A A A A A A 
	H	%	%_ _ _ _ _bi _ _ _.# # #
- - - - -ry - - -`) ) ) ) ) ) ) )<# # # # #	 # # #L       6Y Y Y Y Y_ Y Y Yx 	 /1G H H H r   