o
    پi                     @   s  d dl Z d dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZmZ d dlmZ d dlmZmZ d dlmZmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z" d d	l#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/ d
Z0e j1dddZ2e j1dddZ3e j1dd dZ4e j1dddZ5ee2e3e4e0e5 e	d+ddZ6G dd dej7Z8G dd dej7Z9G dd dej7Z:G dd dej7Z;G dd dej7Z<G dd dej7Z=G dd  d ej7Z>G d!d" d"ej7Z?G d#d$ d$ej7Z@d%d& ZAd'd( ZBG d)d* d*eZCeCZDdS ),    N)contextmanager)	rearrange)WanVAEConfig)get_sp_parallel_rankget_sp_world_size)
get_act_fn)DiagonalGaussianDistributionParallelTiledVAE)	AvgDown3DDupUp3DWanCausalConv3dWanRMS_normWanUpsampleattention_block_forwardbind_contextmid_block_forwardresample_forwardresidual_block_forwardresidual_down_block_forwardresidual_up_block_forwardup_block_forward)WanDistAttentionBlockWanDistCausalConv3dWanDistMidBlockWanDistResampleWanDistResidualBlockWanDistResidualDownBlockWanDistResidualUpBlockWanDistUpBlockensure_local_heightgather_and_trim_heightsplit_for_parallel_decodesplit_for_parallel_encode   is_first_frameF)default
feat_cachefeat_idxfirst_chunkc              	   c   s    t | }t|}t|}t|}zd V  W t | t| t| t| d S t | t| t| t| w N)r$   setr&   r'   r(   reset)first_frame_argfeat_cache_argfeat_idx_argfirst_chunk_argis_first_frame_tokenfeat_cache_tokenfeat_idx_tokenfirst_chunk_token r4   d/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/runtime/models/vaes/wanvae.pyforward_contextM   s   









r6   c                	       s<   e Zd ZdZddedededdf fddZd	d
 Z  ZS )WanResampleax  
    A custom resampling module for 2D and 3D data.

    Args:
        dim (int): The number of input/output channels.
        mode (str): The resampling mode. Must be one of:
            - 'none': No resampling (identity operation).
            - 'upsample2d': 2D upsampling with nearest-exact interpolation and convolution.
            - 'upsample3d': 3D upsampling with nearest-exact interpolation, convolution, and causal 3D convolution.
            - 'downsample2d': 2D downsampling with zero-padding and convolution.
            - 'downsample3d': 3D downsampling with zero-padding, convolution, and causal 3D convolution.
    Ndimmodeupsample_out_dimreturnc              	      s  t    || _|| _|d u r|d }|dkr+ttdddtj||ddd| _d S |d	krNttdddtj||ddd| _t	||d d
dd| _
d S |dkrettdtj||ddd| _d S |dkrttdtj||ddd| _t	||d
ddd| _
d S t | _d S )Nr#   
upsample2d)       @r=   znearest-exact)scale_factorr9         padding
upsample3d)r?   r@   r@   )r@   r   r   downsample2d)r   r@   r   r@   )r#   r#   )stridedownsample3d)r#   r@   r@   )r   r   r   )rE   rB   )super__init__r8   r9   nn
Sequentialr   Conv2dresampler   	time_conv	ZeroPad2dIdentity)selfr8   r9   r:   	__class__r4   r5   rH   l   s8   





zWanResample.__init__c                 C   
   t | |S r)   )r   rP   xr4   r4   r5   forward      
zWanResample.forwardr)   )	__name__
__module____qualname____doc__intstrrH   rV   __classcell__r4   r4   rQ   r5   r7   ^   s     %r7   c                       sD   e Zd ZdZ		ddededededd	f
 fd
dZdd Z  Z	S )WanResidualBlockaE  
    A custom residual block module.

    Args:
        in_dim (int): Number of input channels.
        out_dim (int): Number of output channels.
        dropout (float, optional): Dropout rate for the dropout layer. Default is 0.0.
        non_linearity (str, optional): Type of non-linearity to use. Default is "silu".
            siluin_dimout_dimdropoutnon_linearityr;   Nc                    s   t    || _|| _t|| _t|dd| _t||ddd| _	t|dd| _
t|| _t||ddd| _||krCt||d| _d S t | _d S )NFimagesr?   r@   rA   )rG   rH   rb   rc   r   nonlinearityr   norm1r   conv1norm2rI   Dropoutrd   conv2rO   conv_shortcut)rP   rb   rc   rd   re   rQ   r4   r5   rH      s   

zWanResidualBlock.__init__c                 C   rS   r)   )r   rT   r4   r4   r5   rV      rW   zWanResidualBlock.forward)r`   ra   
rX   rY   rZ   r[   r\   floatr]   rH   rV   r^   r4   r4   rQ   r5   r_      s     r_   c                       s*   e Zd ZdZd fddZdd Z  ZS )	WanAttentionBlockz}
    Causal self-attention with a single head.

    Args:
        dim (int): The number of channels in the input tensor.
    r;   Nc                    sB   t    || _t|| _t||d d| _t||d| _d S )Nr?   r@   )	rG   rH   r8   r   normrI   rK   to_qkvproj)rP   r8   rQ   r4   r5   rH      s
   

zWanAttentionBlock.__init__c                 C   rS   r)   )r   rT   r4   r4   r5   rV      rW   zWanAttentionBlock.forwardr;   N)rX   rY   rZ   r[   rH   rV   r^   r4   r4   rQ   r5   rq      s    	rq   c                	       sB   e Zd ZdZ			ddedededef fd	d
Zdd Z  Z	S )WanMidBlockz
    Middle block for WanVAE encoder and decoder.

    Args:
        dim (int): Number of input/output channels.
        dropout (float): Dropout rate.
        non_linearity (str): Type of non-linearity to use.
    r`   ra   r@   r8   rd   re   
num_layersc                    sv   t    || _t||||g}g }t|D ]}|t| |t|||| qt|| _	t|| _
d| _d S )NF)rG   rH   r8   r_   rangeappendrq   rI   
ModuleList
attentionsresnetsgradient_checkpointing)rP   r8   rd   re   rw   r|   r{   _rQ   r4   r5   rH      s   

zWanMidBlock.__init__c                 C   rS   r)   )r   rT   r4   r4   r5   rV      rW   zWanMidBlock.forward)r`   ra   r@   ro   r4   r4   rQ   r5   rv      s    rv   c                       s*   e Zd Z		d fdd	Zdd Z  ZS )WanResidualDownBlockFc           
         s   t    t|||rdnd|rdndd| _g }t|D ]}|t||| |}qt|| _	|rA|r6dnd}	t
||	d| _d S d | _d S )Nr#   r@   factor_tfactor_srF   rD   r9   )rG   rH   r
   avg_shortcutrx   ry   r_   rI   rz   r|   r7   downsampler)
rP   rb   rc   rd   num_res_blockstemperal_downsample	down_flagr|   r~   r9   rQ   r4   r5   rH      s    
	


zWanResidualDownBlock.__init__c                 C   rS   r)   )r   rT   r4   r4   r5   rV     rW   zWanResidualDownBlock.forward)FF)rX   rY   rZ   rH   rV   r^   r4   r4   rQ   r5   r      s
    !r   c                	       R   e Zd ZdZ										
		ddedededef fddZdd Z  Z	S )WanEncoder3dao  
    A 3D encoder module.

    Args:
        dim (int): The base number of channels in the first layer.
        z_dim (int): The dimensionality of the latent space.
        dim_mult (list of int): Multipliers for the number of channels in each block.
        num_res_blocks (int): Number of residual blocks in each block.
        attn_scales (list of float): Scales at which to apply attention mechanisms.
        temperal_downsample (list of bool): Whether to downsample temporally in each block.
        dropout (float): Dropout rate for the dropout layers.
        non_linearity (str): Type of non-linearity to use.
    r?         r@   r#   r   r   r#   r4   TTFr`   ra   Fin_channelsre   is_residualuse_parallel_encodec                    sF  t     | _|| _t|}|| _|| _t|| _t|| _t	|	| _
|| _tt|d d| _ fdddg| D }d}d}t rHt }|r[|dkr[t}t}t}t}t}t}nt}t}t}t}t}t}|||d ddd| _t g | _!t"t#|d d |dd  d	d
D ]f\}\}}|
r| j!$||||||t|d kr|| nd|t|d kd qt%|D ]}| j!$|||| ||v r| j!$|| |}q|t|d kr|| rdnd}| j!$|||d |d }q||||	dd| _&t'|dd| _(|||ddd| _)d| _*d| _+d| _,t r!t | _+t- | _,d S d S )Nr@   r   c                       g | ]} | qS r4   r4   .0ur8   r4   r5   
<listcomp>C      z)WanEncoder3d.__init__.<locals>.<listcomp>      ?r?   rA   TstrictF)r   r   rF   rD   r   r=   rw   rf   ).rG   rH   r8   z_dimlistdim_multr   attn_scalesr   r   rh   r   maxlendownsample_countdistis_initializedr   r   r   r   r   r   r   r   r   r_   rq   r7   rv   conv_inrI   rz   down_blocks	enumeratezipry   rx   	mid_blockr   norm_outconv_outr}   
world_sizerankr   )rP   r   r8   r   r   r   r   r   rd   re   r   r   dimsscaler   CausalConv3dResidualDownBlockResidualBlockAttentionBlockResampleMidBlockirb   rc   r~   r9   rQ   r   r5   rH   (  s~   



.
zWanEncoder3d.__init__c           	      C   sn  d }d }| j r| jdkrt|| j| j| j\}}}t }t }|d ur|}|d d d d t d d d d d f 	 }|j
d dk rk|| d urktj|| d d d d dd d d d f d|j|gdd}| ||| }|||< |d7 }t| t| n| |}| jD ]}||}q| j r| jdkrt||}| |}| |}| |}t }t }|d ur!|}|d d d d t d d d d d f 	 }|j
d dk r|| d urtj|| d d d d dd d d d f d|j|gdd}| ||| }|||< |d7 }t| t| n| |}| j r5| jdkr5t||}|S Nr@   r#   r   r   )r   r   r"   r   r   r&   getr'   CACHE_Tcloneshapetorchcat	unsqueezetodevicer   r*   r   r   r   r   rh   r   r    )	rP   rU   expected_local_heightexpected_height_feat_cache	_feat_idxidxcache_xlayerr4   r4   r5   rV     sp   
,$	








,$	


zWanEncoder3d.forward)r?   r   r   r   r#   r4   r   r`   ra   FF)
rX   rY   rZ   r[   r\   r]   boolrH   rV   r^   r4   r4   rQ   r5   r     s.    
`r   c                       sP   e Zd ZdZ				ddedededed	ed
edef fddZdd Z	  Z
S )WanResidualUpBlocka  
    A block that handles upsampling for the WanVAE decoder.
    Args:
        in_dim (int): Input dimension
        out_dim (int): Output dimension
        num_res_blocks (int): Number of residual blocks
        dropout (float): Dropout rate
        temperal_upsample (bool): Whether to upsample on temporal dimension
        up_flag (bool): Whether to upsample or not
        non_linearity (str): Type of non-linearity to use
    r`   Fra   rb   rc   r   rd   temperal_upsampleup_flagre   c                    s   t    || _|| _|rt|||rdnddd| _nd | _g }|}	t|d D ]}
|t|	||| |}	q(t	
|| _|rN|rCdnd}t|||d| _nd | _d| _d S )Nr#   r@   r   rC   r<   )r9   r:   F)rG   rH   rb   rc   r   r   rx   ry   r_   rI   rz   r|   r7   	upsamplerr}   )rP   rb   rc   r   rd   r   r   re   r|   current_dimr~   upsample_moderQ   r4   r5   rH     s4   





zWanResidualUpBlock.__init__c                 C   rS   r)   )r   rT   r4   r4   r5   rV     rW   zWanResidualUpBlock.forward)r`   FFra   )rX   rY   rZ   r[   r\   rp   r   r]   rH   rV   r^   r4   r4   rQ   r5   r     s,    .r   c                       sN   e Zd ZdZ			ddedededed	edB d
ef fddZdd Z  Z	S )
WanUpBlocka  
    A block that handles upsampling for the WanVAE decoder.

    Args:
        in_dim (int): Input dimension
        out_dim (int): Output dimension
        num_res_blocks (int): Number of residual blocks
        dropout (float): Dropout rate
        upsample_mode (str, optional): Mode for upsampling ('upsample2d' or 'upsample3d')
        non_linearity (str): Type of non-linearity to use
    r`   Nra   rb   rc   r   rd   r   re   c           
         s   t    || _|| _g }|}t|d D ]}	|t|||| |}qt|| _	d | _
|d ur<tt||dg| _
d| _d S )Nr@   r   F)rG   rH   rb   rc   rx   ry   r_   rI   rz   r|   
upsamplersr7   r}   )
rP   rb   rc   r   rd   r   re   r|   r   r~   rQ   r4   r5   rH     s   
	
zWanUpBlock.__init__c                 C   rS   r)   )r   rT   r4   r4   r5   rV   ?  rW   zWanUpBlock.forward)r`   Nra   ro   r4   r4   rQ   r5   r     s&     r   c                	       r   )WanDecoder3dak  
    A 3D decoder module.

    Args:
        dim (int): The base number of channels in the first layer.
        z_dim (int): The dimensionality of the latent space.
        dim_mult (list of int): Multipliers for the number of channels in each block.
        num_res_blocks (int): Number of residual blocks in each block.
        attn_scales (list of float): Scales at which to apply attention mechanisms.
        temperal_upsample (list of bool): Whether to upsample temporally in each block.
        dropout (float): Dropout rate for the dropout layers.
        non_linearity (str): Type of non-linearity to use.
    r   r   r   r#   r4   FTTr`   ra   r?   Fre   out_channelsr   use_parallel_decodec              
      s  t     | _|| _t|}|| _|| _t|| _t|| _t	|| _
|| _d| _ fdd|d g|d d d  D }d}t rFt }|rU|dkrUt}t}t}t}nt}t}t}t}|||d ddd| _||d ||dd| _d| _tg | _tt|d d |dd  d	d
D ]T\}\}}|dkr|
s|d }|t|d k}d }|r|| rd}n|rd}|
r||||||r|| nd||d}n
|||||||d}| j | |r|  jd7  _qt!|dd| _"|||	ddd| _#d| _$d| _%d| _&t r
t | _%t' | _&d S d S )Nr   c                    r   r4   r4   r   r   r4   r5   r   n  r   z)WanDecoder3d.__init__.<locals>.<listcomp>r   r@   r?   rA   r   Tr   r#   rC   r<   F)rb   rc   r   rd   r   r   re   )rb   rc   r   rd   r   re   rf   )(rG   rH   r8   r   r   r   r   r   r   r   rh   r   upsample_countr   r   r   r   r   r   r   r   rv   r   r   r   r   rI   rz   	up_blocksr   r   r   ry   r   r   r   r}   r   r   r   )rP   r8   r   r   r   r   r   rd   re   r   r   r   r   r   r   r   ResidualUpBlockUpBlockr   rb   rc   r   r   up_blockrQ   r   r5   rH   R  s   



&.

zWanDecoder3d.__init__c                 C   sJ  d }| j r| jdkrt|| j| j| j\}}t }t }|d ur|}|d d d d t d d d d d f 	 }|j
d dk rh|| d urhtj|| d d d d dd d d d f d|j|gdd}| ||| }|||< |d7 }t| t| n| |}| |}| jD ]}||}q| |}| |}t }t }|d ur|}|d d d d t d d d d d f 	 }|j
d dk r|| d urtj|| d d d d dd d d d f d|j|gdd}| ||| }|||< |d7 }t| t| n| |}| j r#| jdkr#t||}|S r   )r   r   r!   r   r   r&   r   r'   r   r   r   r   r   r   r   r   r   r*   r   r   r   rh   r   r    )rP   rU   r   r   r   r   r   r   r4   r4   r5   rV     sj   ,$	







,$	


zWanDecoder3d.forward)r   r   r   r#   r4   r   r`   ra   r?   FF)
rX   rY   rZ   r[   r]   r\   r   rH   rV   r^   r4   r4   rQ   r5   r   C  s.    	
hr   c                 C   s\   |dkr| S |   dkrt| d||d} | S |   dkr&t| d||d} | S td| j )Nr@   r   z b c (h q) (w r) -> b (c r q) h wqr   z$b c f (h q) (w r) -> b (c r q) f h wzInvalid input shape: )r8   r   
ValueErrorr   rU   
patch_sizer4   r4   r5   patchify  s   	r   c                 C   sL   |dkr| S |   dkrt| d||d} | S |   dkr$t| d||d} | S )Nr@   r   z b (c r q) h w -> b c (h q) (w r)r   r   z$b (c r q) f h w -> b c f (h q) (w r))r8   r   r   r4   r4   r5   
unpatchify  s   	r   c                	   @   s$  e Zd ZdZdZdeddfddZd#dd	Zd
ej	dej	fddZ
d$d
ej	dej	fddZd
ej	dej	fddZd
ej	dej	fddZdej	dej	fddZd$dej	dej	fddZdej	dej	fddZdej	dej	fddZdejdejfddZ		d%dej	ded ejdB dej	fd!d"ZdS )&AutoencoderKLWanz
    A VAE model with KL loss for encoding videos into latents and decoding latent representations into videos.
    Introduced in [Wan 2.1].
    Fconfigr;   Nc                 C   s@  t j|  t| | |j| _t|j| _t|jd d d | _|jd u r*|j	}n|j}t|j
| _
t|j| _|j| _t|dd| _t|dd| _|jrit|j|j	| jd |j|j|j| j|j|j| jd
| _t| jd | jd d| _t| j| jd| _|jrt|| j|j|j|j| j|j|j|j| jd
| _|j | _ d S )	Nr   r   Fr   r#   )
r   r8   r   r   r   r   r   rd   r   r   r@   )
r8   r   r   r   r   r   rd   r   r   r   )!rI   ModulerH   r	   r   r   r   r   decoder_base_dimbase_dimlatents_meanlatents_stdshift_factorgetattrr   r   load_encoderr   r   r   r   r   rd   r   encoderr   
quant_convpost_quant_convload_decoderr   r   decoderuse_feature_cache)rP   r   r   r4   r4   r5   rH   -  sT   
zAutoencoderKLWan.__init__c                 C   sf   dt fdd}| jjr|| j| _d| _d g| j | _| jjr1|| j| _	d| _
d g| j	 | _d S d S )Nr;   c                 S   s2   d}|   D ]}t|tst|tr|d7 }q|S )Nr   r@   )modules
isinstancer   r   )modelcountmr4   r4   r5   _count_conv3de  s   z3AutoencoderKLWan.clear_cache.<locals>._count_conv3dr   )r\   r   r   r   	_conv_num	_conv_idx	_feat_mapr   r   _enc_conv_num_enc_conv_idx_enc_feat_map)rP   r   r4   r4   r5   clear_cachec  s   zAutoencoderKLWan.clear_cacherU   c                 C   s  | j r|   | jjd urt|| jjd}t| j| jdd |jd }d|d d  }t	|D ]J}t
d |dkrQ| |d d d d d dd d d d f }q/| |d d d d dd|d   dd|  d d d d f }t||gd}q/W d    n1 sw   Y  | |}|d d d | jd d d d d d f |d d | jd d d d d d d f }}	tj||	gdd}t|}|   |S | jjD ]}
t|
tr|
jdkrt|
jj}d|d< t||
j_qt| |}|S )	Nr   r-   r.   r#   r@   r   r   r   rF   )r   r  r   r   r   r6   r   r   r   rx   r'   r*   r   r   r   r   r   r   r   r   r7   r9   r   rM   _paddingtupler	   encode)rP   rU   titer_r   outout_encmulogvarblockr  r4   r4   r5   r  v  s>   

.@
R	zAutoencoderKLWan.encodec                 C   s   t |d | |}W d    n1 sw   Y  | |}|d d d | jd d d d d d f |d d | jd d d d d d d f }}tj||gdd}|S )Nr,   r@   r   )r6   r   r   r   r   r   )rP   rU   first_framer	  r  r  r  r4   r4   r5   _encode  s   
RzAutoencoderKLWan._encodec                 C   r   |d d d d dd d d d f  d}| j|dd}t| |}|d d d d dd f }tj||gdd}|S Nr   r#   T)r  r@   r   )r   r  r	   tiled_encoder   r   rP   rU   r  r  r4   r4   r5   r       (zAutoencoderKLWan.tiled_encodec                 C   r  r  )r   r  r	   spatial_tiled_encoder   r   r  r4   r4   r5   r    r  z%AutoencoderKLWan.spatial_tiled_encodezc                 C   sH  | j r|   |jd }| |}t| j| jd[ t|D ]N}t	d |dkrGt
	d | |d d d d ||d d d d d f }qt
	d | |d d d d ||d d d d d f }t||gd}qW d    n1 sww   Y  | jjd urt|| jjd}| }tj|dd	d
}|   |S t| |}|S )Nr#   r  r   Tr@   Fr        r   minr   )r   r  r   r   r6   r   r   rx   r'   r*   r(   r   r   r   r   r   r   rp   clampr	   decode)rP   r  r  rU   r   r	  r
  r4   r4   r5   r    s2   



2
0zAutoencoderKLWan.decodec                 C   sR   |  |}t|d | |}W d    n1 sw   Y  tj|ddd}|S )Nr  r  r   r  )r   r6   r   r   r  )rP   r  r  rU   r	  r4   r4   r5   _decode  s   
zAutoencoderKLWan._decodec                 C   B   |  j d9  _ t| |}| jd }|d d d d |d f }|S Nr#   r@   )blend_num_framesr	   tiled_decodetemporal_compression_ratiorP   r  decstart_frame_idxr4   r4   r5   r"    
   
zAutoencoderKLWan.tiled_decodec                 C   s4   t | |}| jd }|d d d d |d f }|S )Nr@   )r	   spatial_tiled_decoder#  r$  r4   r4   r5   r(    s   
z%AutoencoderKLWan.spatial_tiled_decodec                 C   r  r   )r!  r	   parallel_tiled_decoder#  r$  r4   r4   r5   r)    r'  z&AutoencoderKLWan.parallel_tiled_decodesamplesample_posterior	generatorc                 C   s8   |}|  |j}|r|j|d}n| }| |}|S )z
        Args:
            sample (`torch.Tensor`): Input sample.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
        )r,  )r  latent_distr*  r9   r  )rP   r*  r+  r,  rU   	posteriorr  r%  r4   r4   r5   rV     s   
zAutoencoderKLWan.forwardru   )F)FN)rX   rY   rZ   r[    _supports_gradient_checkpointingr   rH   r  r   Tensorr  r  r  r  r  r  r"  r(  FloatTensorr)  r   	GeneratorrV   r4   r4   r4   r5   r   %  s:    

6 			
r   )FNNN)Econtextvars
contextlibr   r   torch.distributeddistributedr   torch.nnrI   einopsr   )sglang.multimodal_gen.configs.models.vaesr   8sglang.multimodal_gen.runtime.distributed.parallel_stater   r   /sglang.multimodal_gen.runtime.layers.activationr   0sglang.multimodal_gen.runtime.models.vaes.commonr   r	   Csglang.multimodal_gen.runtime.models.vaes.parallel.wan_common_utilsr
   r   r   r   r   r   r   r   r   r   r   r   r   Asglang.multimodal_gen.runtime.models.vaes.parallel.wan_dist_utilsr   r   r   r   r   r   r   r   r   r    r!   r"   r   
ContextVarr$   r&   r'   r(   r6   r   r7   r_   rq   rv   r   r   r   r   r   r   r   r   
EntryClassr4   r4   r4   r5   <module>   sJ   <87%#' ;?1 ? \