o
    i                     @   sh   d dl Z ddlmZmZ d dlmZ d dlZG dd de jjZG dd dejZ	G dd	 d	ejZ
dS )
    N   )MiniCPMModelMiniCPM4Configc                       s&   e Zd Z fddZdddZ  ZS )SinusoidalPosEmbc                    s*   t    || _| jd dksJ dd S )Nr   r   z(SinusoidalPosEmb requires dim to be even)super__init__dim)selfr   	__class__ S/home/ubuntu/.local/lib/python3.10/site-packages/voxcpm/modules/locdit/local_dit.pyr      s   
zSinusoidalPosEmb.__init__  c                 C   s   |j dk r
|d}|j}| jd }td|d  }ttj||j	|d|  }||d |d }tj
| | fdd}|S )N   r   r   i'  )dtypedevicer   )ndim	unsqueezer   r   mathlogtorchexparanger   catsincos)r	   xscaler   half_dimembr   r   r   forward   s   


zSinusoidalPosEmb.forward)r   )__name__
__module____qualname__r   r"   __classcell__r   r   r
   r   r      s    r   c                       s6   e Zd Z	d	dededef fddZdd Z  ZS )
TimestepEmbeddingNin_channelstime_embed_dimout_dimc                    sN   t    tj||dd| _t | _|d ur|}n|}tj||dd| _d S )NTbias)r   r   nnLinearlinear_1SiLUactlinear_2)r	   r(   r)   r*   time_embed_dim_outr
   r   r   r      s   

zTimestepEmbedding.__init__c                 C   s"   |  |}| |}| |}|S N)r/   r1   r2   )r	   sampler   r   r   r"   +   s   


zTimestepEmbedding.forwardr4   )r#   r$   r%   intr   r"   r&   r   r   r
   r   r'      s    r'   c                
       sV   e Zd ZdZ	ddedef fddZdejdejd	ejd
ejdejf
ddZ	  Z
S )VoxCPMLocDiTz6
    Diffusion model with a Transformer backbone.
    @   configr(   c                    s   t    || _|| _|| _tj||jdd| _tj||jdd| _	tj|j| jdd| _
t|j| _t|j|jd| _t|j|jd| _|jdksNJ dt|| _d S )NTr+   )r(   r)   r   z"vocab_size must be 0 for local DiT)r   r   r(   out_channelsr9   r-   r.   hidden_sizein_proj	cond_projout_projr   time_embeddingsr'   time_mlpdelta_time_mlp
vocab_sizer   decoder)r	   r9   r(   r
   r   r   r   7   s$   
zVoxCPMLocDiT.__init__r   mutconddtc           	      C   s   |  |dd }| |dd }|d}| ||j}| |}| ||j}| 	|}|| }t
j|| d||gdd}| j|dd\}}|dd|d dddf }| |}|dd S )a)  
        Forward pass of DiT.
        x: (N, C, T) tensor of inputs
        mu: (N, C) tensor of hidden embedding
        t: (N,) tensor of diffusion timesteps
        cond: (N, C, T') tensor of prefix conditions
        dt: (N,) used for mean velocity (may be supported in the future...)
        r   r   r   F)	is_causalN)r<   	transpose
contiguousr=   sizer?   tor   r@   rA   r   r   r   rC   r>   )	r	   r   rD   rE   rF   rG   prefixhidden_r   r   r   r"   R   s   



zVoxCPMLocDiT.forward)r8   )r#   r$   r%   __doc__r   r6   r   r   Tensorr"   r&   r   r   r
   r   r7   2   s&    r7   )r   minicpm4r   r   torch.nnr-   r   Moduler   r'   r7   r   r   r   r   <module>   s    