o
    پin                      @   sF  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ e ZG dd dejZG dd deZG dd deZG dd deZG dd dejZdejfdej de!de!dej"dej f
ddZG dd dejZ#dej fd d!Z$dS )"    N)*CombinedTimestepGuidanceTextProjEmbeddings)"CombinedTimestepTextProjEmbeddings)PixArtAlphaTextProjectionTimestepEmbedding)	Timesteps)get_timestep_embedding)timestep_embedding)
get_act_fn)ColumnParallelLinear)MLP)current_platformc                       s@   e Zd ZdZ								ddef fd	d
Zdd Z  ZS )
PatchEmbeda  2D Image to Patch Embedding

    Image to Patch Embedding using Conv2d

    A convolution based approach to patchifying a 2D image w/ embedding projection.

    Based on the impl in https://github.com/google-research/vision_transformer

    Hacked together by / Copyright 2020 Ross Wightman

    Remove the _assert function in forward function to be compatible with multi-resolution images.
             NT prefixc	           	         s   t    t|ttB rt|dkr|d |d f}n||f}|| _|| _tj	||||||d| _
|r:||| _d S t | _d S )N   r   )kernel_sizestridebiasdtype)super__init__
isinstancelisttuplelen
patch_sizeflattennnConv3dprojIdentitynorm)	selfr   in_chans	embed_dim
norm_layerr   r   r   r   	__class__ i/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/runtime/layers/visual_embedding.pyr   1   s"   
 zPatchEmbed.__init__c                 C   s0   |  |}| jr|ddd}| |}|S )N   r   )r"   r   	transposer$   )r%   xr+   r+   r,   forwardQ   s
   

zPatchEmbed.forward)r   r   r   NTTNr   )__name__
__module____qualname____doc__strr   r0   __classcell__r+   r+   r)   r,   r   #   s    	 r   c                   @   s"   e Zd ZdejdejfddZdS )r   	timestepsreturnc                 C   s8   t rt|| j| j| j| jdS t|| j| j| j| jdS )N)flip_sin_to_cosdownscale_freq_shiftscale)_is_cudatimestep_embedding_cudanum_channelsr9   r:   r;   timestep_embedding_diffusers)r%   r7   r+   r+   r,   r0   Z   s   zTimesteps.forwardN)r1   r2   r3   torchTensorr0   r+   r+   r+   r,   r   Y   s    r   c                   @      e Zd Zdd ZdS )r   c                 C   sL   t j|  tdddd| _td|d| _td|d| _t||dd| _	d S N   Tr   )r>   r9   r:   )in_channelstime_embed_dimsilu)act_fn)
r    Moduler   r   	time_projr   timestep_embedderguidance_embedderr   text_embedderr%   embedding_dimpooled_projection_dimr+   r+   r,   r   p   s   z3CombinedTimestepGuidanceTextProjEmbeddings.__init__Nr1   r2   r3   r   r+   r+   r+   r,   r   m   s    r   c                   @   rB   )r   c                 C   s>   t j|  tdddd| _td|d| _t||dd| _d S rC   )	r    rI   r   r   rJ   r   rK   r   rM   rN   r+   r+   r,   r      s   z+CombinedTimestepTextProjEmbeddings.__init__NrQ   r+   r+   r+   r,   r      s    r   c                       sX   e Zd ZdZddddejdfdef fdd	Z	dd
ejde	dB dejfddZ
  ZS )TimestepEmbedderz>
    Embeds scalar timesteps into vector representations.
    rG   rD   '  Nr   r   c                    s4   t    || _|| _t|||||d| _|| _d S )N)act_typer   )r   r   frequency_embedding_size
max_periodr   mlp
freq_dtype)r%   hidden_size	act_layerrU   rV   r   rX   r   r)   r+   r,   r      s   


zTimestepEmbedder.__init__ttimestep_seq_lenr8   c                 C   sr   t || j| j| jd| jjjj}|d ur2|j	d | dks#J d|j	d | }|
d||f}| |}|S )N)r   r   z4timestep length is not divisible by timestep_seq_len)r   rU   rV   rX   torW   fc_inweightr   shape	unflatten)r%   r[   r\   t_freq
batch_sizet_embr+   r+   r,   r0      s   
zTimestepEmbedder.forwardN)r1   r2   r3   r4   r@   float32r5   r   rA   intr0   r6   r+   r+   r)   r,   rR      s&    rR   rS   r[   dimrV   r   r8   c              	   C   s   |d }t t| t jd||| jd | }| dddf  |d  }t jt |t 	|gdd}|d rNt j|t 
|ddddf gdd}|S )a  
    Create sinusoidal timestep embeddings.

    Args:
        t: Tensor of shape [B] with timesteps
        dim: Embedding dimension
        max_period: Controls the minimum frequency of the embeddings

    Returns:
        Tensor of shape [B, dim] with embeddings
    r-   r   )startendr   deviceN)rh   r   )r@   expmathlogarangerk   floatcatcossin
zeros_like)r[   rh   rV   r   halffreqsargs	embeddingr+   r+   r,   r      s   
(r   c                       s\   e Zd ZdZ				ddededed	ejdB d
ef
 fddZdej	dej	fddZ
  ZS )ModulateProjectionz Modulation layer for DiT blocks.r-   rG   Nr   rY   factorrZ   r   r   c                    s<   t    || _|| _t||| dd|d| _t|| _d S )NT)r   gather_outputparams_dtype)r   r   r{   rY   r
   linearr	   act)r%   rY   r{   rZ   r   r   r)   r+   r,   r      s   
zModulateProjection.__init__r/   r8   c                 C   s   |  |}| |\}}|S re   )r   r~   )r%   r/   _r+   r+   r,   r0      s   
zModulateProjection.forward)r-   rG   Nr   )r1   r2   r3   r4   rg   r5   r@   r   r   rA   r0   r6   r+   r+   r)   r,   rz      s$    rz   c              	   C   s   | j dksJ d| j  t|dksJ d| || | | jd ks6J d|| |  d| jd  |}|\}}}	| j| jd |||||||	fd} td	| } | j| jd ||| || ||	 fd}
|
S )
z
    Convert patched representation back to image space.

    Args:
        x: Tensor of shape [B, T*H*W, C*P_t*P_h*P_w]
        t, h, w: Temporal and spatial dimensions

    Returns:
        Unpatchified tensor of shape [B, C, T*P_t, H*P_h, W*P_w]
    r   zx.ndim: zpatch_size: r   zt * h * w: z, x.shape[1]: r   )r`   znthwcopq->nctohpwq)ndimr   r`   reshaper@   einsum)r/   r[   hwr   channelscptphpwimgsr+   r+   r,   
unpatchify   s   8
"(r   )%rn   r@   torch.nnr    diffusers.models.embeddingsr   +_CombinedTimestepGuidanceTextProjEmbeddingsr   #_CombinedTimestepTextProjEmbeddingsr   r   r   
_Timestepsr   r?   $sglang.jit_kernel.timestep_embeddingr   r=   /sglang.multimodal_gen.runtime.layers.activationr	   +sglang.multimodal_gen.runtime.layers.linearr
   (sglang.multimodal_gen.runtime.layers.mlpr   'sglang.multimodal_gen.runtime.platformsr   is_cudar<   rI   r   rR   rf   rA   rg   r   rz   r   r+   r+   r+   r,   <module>   sH   6
0
