o
    پi2d                     @   s   d dl mZmZ d dlmZ d dlmZ d dlmZm	Z	 d dl
Zd dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ G dd deejZG dd dZdS )    )ABCabstractmethod)Iterator)prod)OptionalcastN)DiagonalGaussianDistribution)randn_tensor)nn)	VAEConfig)get_sp_parallel_rankget_sp_world_sizec                       s  e Zd ZU eed< eed< eed< eed< eed< eed< eed< eed< eed	< eed
< deddf fddZedd Z	edefddZ
edefddZedeejB fddZedejfddZedejfddZdejdefddZdejdejfd d!Zd"ejd#ejd$edejfd%d&Zd"ejd#ejd$edejfd'd(Zd"ejd#ejd$edejfd)d*Zdejdejfd+d,Zdeeejef  fd-d.Zdejdejfd/d0Zdejfd1d2Zdejdejfd3d4Z dejdejfd5d6Z!dejdejfd7d8Z"										d=dedB dedB dedB dedB dedB dedB dedB dedB d	edB d
edB ddfd9d:Z#d>d;d<Z$  Z%S )?ParallelTiledVAEtile_sample_min_heighttile_sample_min_widthtile_sample_min_num_framestile_sample_stride_heighttile_sample_stride_widthtile_sample_stride_num_framesblend_num_frames
use_tilinguse_temporal_tilinguse_parallel_tilingconfigreturnNc                    sd   t    || _|j| _|j| _|j| _|j| _|j| _|j| _|j	| _	|j
| _
|j| _|j| _d S N)super__init__r   r   r   r   r   r   r   r   r   r   r   )selfr   kwargs	__class__ d/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/runtime/models/vaes/common.pyr   $   s   
zParallelTiledVAE.__init__c                 C   s   t |  jS r   )next
parametersdevicer   r"   r"   r#   r&   2      zParallelTiledVAE.devicec                 C      t t| jjS r   )r   intr   temporal_compression_ratior'   r"   r"   r#   r+   6   r(   z+ParallelTiledVAE.temporal_compression_ratioc                 C   r)   r   )r   r*   r   spatial_compression_ratior'   r"   r"   r#   r,   :   r(   z*ParallelTiledVAE.spatial_compression_ratioc                 C   s   t ttjB | jjS r   )r   floattorchTensorr   scaling_factorr'   r"   r"   r#   r0   >   s   zParallelTiledVAE.scaling_factorc                 O      d S r   r"   r   argsr   r"   r"   r#   _encodeB      zParallelTiledVAE._encodec                 O   r1   r   r"   r2   r"   r"   r#   _decodeF   r5   zParallelTiledVAE._decodexc           	      C   s   |j \}}}}}|d | j d }| jr0| jr0|| jkr0| |d d d d d |f }t
|S | jrQ|| jks=|| jkrQ| |d d d d d |f }t
|S | 	|d d d d d |f }t
|S N   )shaper+   r   r   r   tiled_encoder   r   spatial_tiled_encoder4   r   )	r   r7   
batch_sizenum_channels
num_framesheightwidthlatent_num_frameslatentsr"   r"   r#   encodeJ   s   
   zParallelTiledVAE.encodezc                 C   s  |j \}}}}}| j| j }| j| j }| j| j }	|d | j d }
| jr>| jr>t dkr>| 	|d d d d d |
f S | jrX| j
rX||	krX| |d d d d d |
f S | jrs||ksc||krs| |d d d d d |
f S | |d d d d d |
f S r8   )r:   r   r,   r   r   r+   r   r   r   parallel_tiled_decoder   tiled_decodespatial_tiled_decoder6   )r   rE   r=   r>   r?   r@   rA   tile_latent_min_heighttile_latent_min_widthtile_latent_min_num_framesnum_sample_framesr"   r"   r#   decode\   s(   


    zParallelTiledVAE.decodeabblend_extentc              	   C   s   t |jd |jd |}t|D ]@}|d d d d d d | | d d f d||   |d d d d d d |d d f ||   |d d d d d d |d d f< q|S )Nr9   minr:   range)r   rN   rO   rP   yr"   r"   r#   blend_vy      &
($zParallelTiledVAE.blend_vc                 C   s   t |jd |jd |}t|D ]@}|d d d d d d d d | | f d||   |d d d d d d d d |f ||   |d d d d d d d d |f< q|S )Nr9   rR   r   rN   rO   rP   r7   r"   r"   r#   blend_h   rW   zParallelTiledVAE.blend_hc              	   C   s   t |jd |jd |}t|D ]@}|d d d d | | d d d d f d||   |d d d d |d d d d f ||   |d d d d |d d d d f< q|S )Nr9   rR   rY   r"   r"   r#   blend_t   rW   zParallelTiledVAE.blend_tc              
   C   s   |j \}}}}}| j| j }| j| j }| j| j }| j| j }|| }	|| }
g }td|| jD ]6}g }td|| jD ]%}|dddddd||| j ||| j f }| |}|| q<|| q1| 	||	|
||S )zEncode a batch of images using a tiled encoder.

        Args:
            x (`torch.Tensor`): Input batch of videos.

        Returns:
            `torch.Tensor`:
                The latent representation of the encoded videos.
        r   N)
r:   r   r,   r   r   r   rT   r4   append_merge_spatial_tiles)r   r7   _r@   rA   rI   rJ   tile_latent_stride_heighttile_latent_stride_widthblend_heightblend_widthrowsirowjtiler"   r"   r#   r<      sB   





z%ParallelTiledVAE.spatial_tiled_encodec           	      c   sb    d}t |D ]'\}}d}|D ]}t|}||||| f ||fV  ||7 }|d7 }qqd S )Nr   r9   )	enumerater   reshape)	r   gathered_resultsgathered_dim_metadata
global_idxre   per_rank_metadata_start_shaper:   	mul_shaper"   r"   r#   _parallel_data_generator   s    
z)ParallelTiledVAE._parallel_data_generatorc           3   
      s  t  t }}|j\}}}}}| j| j }	| j| j }
| j| j }| j| j }| j	| j }| j
| j }| j| j }| j| j	 }|| d | }|| d |  || d |   }|| }|| d | }|| }t|d | |}g }g }tt||D ]i\}}|| }|| }| }| }|| }|| } || }!|dddd||| d | | |	 |!|!|
 f }"| |"}"|dkr|"ddddddddddf }"|"j}#|"d}$||$ ||# qtj|dd ~tjdgjtjd}%fddt|D }&t|&|% td	d
 |&D }'tj|'jd}(|(dd< dg| })t|(j|gdgt|(j R   }*t|*|( t |)|  fddt|D }+| !|*|)D ]\},}|| }|| }| }| }|,|+| | |< qdg }-d}.t|+D ]P\}/}0| "|0||| j| j	}1|/dkr| #|.|1| j$}1|-|1ddddd| j
ddddf  n|-|1ddddd| j
d ddddf  |1}.qtj|-dd}2|2S )zu
        Parallel version of tiled_decode that distributes both temporal and spatial computation across GPUs
        r9   Nr   rX   dimr&   dtypec                    s    g | ]}t jd  jt jdqS )r9   rt   )r.   zerosr&   int64.0r_   )resultsr"   r#   
<listcomp>5  s    z:ParallelTiledVAE.parallel_tiled_decode.<locals>.<listcomp>c                 s   s    | ]}|  V  qd S r   )item)ry   sizer"   r"   r#   	<genexpr>:  s    z9ParallelTiledVAE.parallel_tiled_decode.<locals>.<genexpr>)r&   c                    s"   g | ]}fd dt  D qS )c                    s   g | ]}d d t  D qS )c                 S   s   g | ]}g qS r"   r"   rx   r"   r"   r#   r{   K  s    zPParallelTiledVAE.parallel_tiled_decode.<locals>.<listcomp>.<listcomp>.<listcomp>rT   rx   )num_w_tilesr"   r#   r{   K  s    zEParallelTiledVAE.parallel_tiled_decode.<locals>.<listcomp>.<listcomp>r   rx   )num_h_tilesr   r"   r#   r{   J  s       )%r   r   r:   r   r,   r   r   r+   r   r   r   rS   ri   rT   r6   rj   r]   r.   cat
contiguoustensorr}   r&   rw   dist
all_gathermaxrv   
zeros_likerepeatlenall_gather_into_tensorall_gather_objectrq   r^   r\   r   )3r   rE   
world_sizerankBCTHWrI   rJ   rK   r`   ra   tile_latent_stride_num_framesrb   rc   num_t_tilestotal_spatial_tilestotal_tilestiles_per_rankstart_tile_idxend_tile_idxlocal_resultslocal_dim_metadata	local_idxrm   t_idxspatial_idxh_idxw_idxt_starth_startw_startrh   r:   decoded_flat
local_size	all_sizesmax_sizepadded_resultsrl   rk   datacurrent_dataresult_sliceslast_slice_datare   tem_data
slice_datadecr"   )r   r   rz   r#   rF      s   









	&




&*z&ParallelTiledVAE.parallel_tiled_decodec                 C   s   g }t |D ]O\}}g }	t |D ]:\}
}|dkr%| ||d  |
 ||}|
dkr4| ||
d  ||}|	|ddddddd|d|f  q|tj|	dd qtj|ddS )z4Helper function to merge spatial tiles with blendingr   r9   NrX   rr   rQ   )ri   rV   rZ   r]   r.   r   )r   tilesrb   rc   stride_heightstride_widthresult_rowsre   rf   
result_rowrg   rh   r"   r"   r#   r^   q  s   .z%ParallelTiledVAE._merge_spatial_tilesc              
   C   s   |j \}}}}}| j| j }| j| j }| j| j }| j| j }| j| j }	| j| j }
g }td||D ]3}g }td||D ]#}|dddddd||| ||| f }| |}|| q>|| q4| 	||	|
| j| jS )z
        Decode a batch of images using a tiled decoder.

        Args:
            z (`torch.Tensor`): Input batch of latent vectors.

        Returns:
            `torch.Tensor`:
                The decoded images.
        r   N)
r:   r   r,   r   r   r   rT   r6   r]   r^   )r   rE   r_   r@   rA   rI   rJ   r`   ra   rb   rc   rd   re   rf   rg   rh   decodedr"   r"   r#   rH     sB   






z%ParallelTiledVAE.spatial_tiled_decodec              
   C   sr  |j \}}}}}| j| j }g }td|| jD ]N}|d d d d ||| j d d d d d f }	| jrD|| jks>|| jkrD| |	}	n| 	|	}	|dkr`|	d d d d dd d d d d f }	|
|	 qg }
t|D ]C\}}	|dkr| ||d  |	| j}	|

|	d d d d d |d d d d f  ql|

|	d d d d d |d d d d d f  qltj|
dd}|S )Nr   r9   r   rr   )r:   r   r+   rT   r   r   r   r   r<   r4   r]   ri   r\   r   r.   r   )r   r7   r_   r?   r@   rA   r   rf   re   rh   r   encr"   r"   r#   r;     s,   
0


&.2zParallelTiledVAE.tiled_encodec              
   C   s  |j \}}}}}| j| j }| j| j }| j| j }	| j| j }
g }td||
D ]Q}|d d d d |||	 d d d d d f }| jrX|j d |ksR|j d |krX| 	|}n| 
|}|dkrt|d d d d dd d d d d f }|| q(g }t|D ]E\}}|dkr| ||d  || j}||d d d d d | jd d d d f  q||d d d d d | jd d d d d f  qtj|dd}|S )Nr   r9   rX   rQ   r   rr   )r:   r   r,   r   r   r+   r   rT   r   rH   r6   r]   ri   r\   r   r.   r   )r   rE   r=   r>   r?   r@   rA   rI   rJ   rK   r   rf   re   rh   r   r   r   r"   r"   r#   rG     s@   



.
&&*zParallelTiledVAE.tiled_decodec                 C   s   d| _ |p| j| _|p| j| _|p| j| _|p| j| _|p| j| _|p%| j| _|dur/|| _n| j| j | _|p:| j | _ |	p@| j| _|
pF| j	| _	dS )a  
        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger images.

        Args:
            tile_sample_min_height (`int`, *optional*):
                The minimum height required for a sample to be separated into tiles across the height dimension.
            tile_sample_min_width (`int`, *optional*):
                The minimum width required for a sample to be separated into tiles across the width dimension.
            tile_sample_min_num_frames (`int`, *optional*):
                The minimum number of frames required for a sample to be separated into tiles across the frame
                dimension.
            tile_sample_stride_height (`int`, *optional*):
                The minimum amount of overlap between two consecutive vertical tiles. This is to ensure that there are
                no tiling artifacts produced across the height dimension.
            tile_sample_stride_width (`int`, *optional*):
                The stride between two consecutive horizontal tiles. This is to ensure that there are no tiling
                artifacts produced across the width dimension.
            tile_sample_stride_num_frames (`int`, *optional*):
                The stride between two consecutive frame tiles. This is to ensure that there are no tiling artifacts
                produced across the frame dimension.
        TN)
r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r"   r"   r#   enable_tiling  s&   $
zParallelTiledVAE.enable_tilingc                 C   s
   d| _ dS )z
        Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
        decoding in one step.
        FN)r   r'   r"   r"   r#   disable_tilingC  s   
zParallelTiledVAE.disable_tiling)
NNNNNNNNNN)r   N)&__name__
__module____qualname__r*   __annotations__boolr   r   propertyr&   r+   r,   r-   r.   r/   r0   r   r4   r6   r   rD   rM   rV   rZ   r\   r<   r   tuplerq   FloatTensorrF   r^   rH   r;   rG   r   r   __classcell__r"   r"   r    r#   r      s   
 






7
 
8.	

?r   c                   @   s   e Zd ZddejdefddZddejdB dejfd	d
Z		dde	d  de
edf dejfddZ	ddejde
edf dejfddZdejfddZdS )r   Fr%   deterministicc                 C   s   || _ tj|ddd\| _| _t| jdd| _|| _td| j | _t| j| _	| jrAtj
| j| j j| j jd | _	| _d S d S )Nr   r9   rr   g      >g      4@      ?rt   )r%   r.   chunkmeanlogvarclampr   expstdvarr   r&   ru   )r   r%   r   r"   r"   r#   r   N  s   z%DiagonalGaussianDistribution.__init__N	generatorr   c                 C   s0   t | jj|| jj| jjd}| j| j|  }|S )N)r   r&   ru   )r	   r   r:   r%   r&   ru   r   )r   r   sampler7   r"   r"   r#   r   Z  s   z#DiagonalGaussianDistribution.sampler9   r      otherdims.c                 C   s   | j r	tdgS |d u r#dtjt| jd| j d | j |d S dtjt| j|j d|j | j|j  d | j |j |d S )N        r   r   g      ?rr   )r   r.   r/   sumpowr   r   r   )r   r   r   r"   r"   r#   kle  s&   
zDiagonalGaussianDistribution.klr   c                 C   sR   | j r	tdgS tdtj }dtj|| j t|| j	 d| j
  |d S )Nr   g       @r   r   rr   )r   r.   r/   nplogpir   r   r   r   r   )r   r   r   logtwopir"   r"   r#   nll|  s    z DiagonalGaussianDistribution.nllc                 C   s   | j S r   )r   r'   r"   r"   r#   mode  s   z!DiagonalGaussianDistribution.mode)Fr   )Nr   )r   )r   r   r   r.   r/   r   r   	Generatorr   r   r   r*   r   r   r   r"   r"   r"   r#   r   L  s*    



r   )abcr   r   collections.abcr   mathr   typingr   r   numpyr   r.   torch.distributeddistributedr   !diffusers.models.autoencoders.vaer   diffusers.utils.torch_utilsr	   r
   $sglang.multimodal_gen.configs.modelsr   )sglang.multimodal_gen.runtime.distributedr   r   Moduler   r"   r"   r"   r#   <module>   s$       8