o
    ۷iA.                     @   s   d dl Z d dlZd dlm  mZ ddlmZmZ ddl	m
Z
 ddddd	ZG d
d dejjZG dd dejjZG dd dejjZG dd dejjZG dd de
eZdS )    N   )ConfigMixinregister_to_config)
ModelMixin)r      )r      )r      )r   r   )g      ?g      ?       @g      @c                       sF   e Zd ZddededB def fddZdejd	ejfd
dZ  ZS )ResBlockNr   channelsmid_channelsdimsc                    s   t    |d u r|}|dkrtjjntjj}|||ddd| _tjd|| _|||ddd| _	tjd|| _
tj | _d S )Nr   r   r   kernel_sizepadding    )super__init__torchnnConv2dConv3dconv1	GroupNormnorm1conv2norm2SiLU
activation)selfr   r   r   Conv	__class__ _/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/pipelines/ltx2/latent_upsampler.pyr   "   s   
zResBlock.__init__hidden_statesreturnc                 C   sH   |}|  |}| |}| |}| |}| |}| || }|S N)r   r   r   r   r   )r   r%   residualr#   r#   r$   forward/   s   




zResBlock.forward)Nr   )	__name__
__module____qualname__intr   r   Tensorr)   __classcell__r#   r#   r!   r$   r
   !   s     r
   c                       s&   e Zd Zd fdd	Zdd Z  ZS )PixelShuffleNDr   r   r   c                    s*   t    || _|| _|dvrtdd S )N)r   r   r   zdims must be 1, 2, or 3)r   r   r   upscale_factors
ValueError)r   r   r2   r!   r#   r$   r   <   s   
zPixelShuffleND.__init__c              
   C   s   | j dkr*|ddg| jd d R dddddddd	ddd	dddS | j dkrN|ddg| jd d R ddd	dddd	dddS | j dkrn|ddg| jd d R ddddd	dddS d S )
Nr   r   r      r         r   )r   	unflattenr2   permuteflattenr   xr#   r#   r$   r)   E   s   

<
6zPixelShuffleND.forward)r1   )r*   r+   r,   r   r)   r/   r#   r#   r!   r$   r0   ;   s    	r0   c                	       sJ   e Zd ZdZddedededdf fdd	Zd
ejdejfddZ  Z	S )BlurDownsamplez
    Anti-aliased spatial downsampling by integer stride using a fixed separable binomial kernel. Applies only on H,W.
    Works for dims=2 or dims=3 (per-frame).
    r5   r   strider   r&   Nc              	      s   t    |dvrtd|  dk s d dkr!td  || _|| _ | _t fddt D }|d d d f |d d d f  }||	  
 }| d	|d d d d d d f  d S )
N)r   r   z$`dims` must be either 2 or 3 but is r   r   r   z0`kernel_size` must be an odd number >= 3 but is c                    s   g | ]
}t  d  |qS )r   )mathcomb).0kr   r#   r$   
<listcomp>o   s    z+BlurDownsample.__init__.<locals>.<listcomp>kernel)r   r   r3   r   r>   r   r   tensorrangesumfloatregister_buffer)r   r   r>   r   rB   k2dr!   rC   r$   r   _   s   
 $zBlurDownsample.__init__r<   c           	      C   s   | j dkr|S | jdkr-|jd }| j|d| j| j}tj||d | j | jd |d}|S |j\}}}}}|dd	dd}| j|d| j| j}tj||d | j | jd |d}|jdd  \}}|
d||f|d|||}|S )Nr   r   )weightbiasr>   r   groupsr   r4   )r>   r   shaperE   expandr   Fconv2d	transposer:   r8   reshape)	r   r<   crL   bf_h2w2r#   r#   r$   r)   t   s   


zBlurDownsample.forward)r5   )
r*   r+   r,   __doc__r-   r   r   r.   r)   r/   r#   r#   r!   r$   r=   Y   s     r=   c                       sB   e Zd ZdZddedef fddZdejd	ejfd
dZ	  Z
S )SpatialRationalResamplera  
    Scales by the spatial size of the input by a rational number `scale`. For example, `scale = 0.75` will downsample
    by a factor of 3 / 4, while `scale = 1.5` will upsample by a factor of 3 / 2. This works by first upsampling the
    input by the (integer) numerator of `scale`, and then performing a blur + stride anti-aliased downsample by the
    (integer) denominator.
       r	   r   scalec                    s   t    t|| _t|d }|d u r"td| dtt  |\| _	| _
tjj|| j	d | ddd| _td| j	| j	fd| _td| j
d| _d S )	NzThe supplied `scale` z( is not supported; supported scales are r   r   r   r   )r2   )r   r>   )r   r   rI   r_    RATIONAL_RESAMPLER_SCALE_MAPPINGgetr3   listkeysnumdenr   r   r   convr0   pixel_shuffler=   	blur_down)r   r   r_   	num_denomr!   r#   r$   r      s   

 z!SpatialRationalResampler.__init__r<   r&   c                 C   s"   |  |}| |}| |}|S r'   )rf   rg   rh   r;   r#   r#   r$   r)      s   


z SpatialRationalResampler.forward)r^   r	   )r*   r+   r,   r\   r-   rI   r   r   r.   r)   r/   r#   r#   r!   r$   r]      s    r]   c                       sl   e Zd ZdZe							dd	ed
ededededededB f fddZde	j
de	j
fddZ  ZS )LTX2LatentUpsamplerModela  
    Model to spatially upsample VAE latents.

    Args:
        in_channels (`int`, defaults to `128`):
            Number of channels in the input latent
        mid_channels (`int`, defaults to `512`):
            Number of channels in the middle layers
        num_blocks_per_stage (`int`, defaults to `4`):
            Number of ResBlocks to use in each stage (pre/post upsampling)
        dims (`int`, defaults to `3`):
            Number of dimensions for convolutions (2 or 3)
        spatial_upsample (`bool`, defaults to `True`):
            Whether to spatially upsample the latent
        temporal_upsample (`bool`, defaults to `False`):
            Whether to temporally upsample the latent
       r^   r   r   TFr	   in_channelsr   num_blocks_per_stager   spatial_upsampletemporal_upsamplerational_spatial_scaleNc           	         s|  t    || _| _|| _ | _|| _|| _ dkrtj	j
ntj	j}||ddd| _tj	d| _tj	 | _tj	 fddt|D | _|rd|rdtj	tj	jd dddtd| _n>|r|d urrt|d	| _n0tj	tj	j
d
 dddtd| _n|rtj	tj	jd dddtd| _ntdtj	 fddt|D | _||ddd| _d S )Nr   r   r   r   r   c                       g | ]}t  d qS )r   r
   rA   rY   r   r   r#   r$   rD          z5LTX2LatentUpsamplerModel.__init__.<locals>.<listcomp>   )r   r_   r   z9Either spatial_upsample or temporal_upsample must be Truec                    rq   rr   rs   rt   ru   r#   r$   rD      rv   )r   r   rl   r   rm   r   rn   ro   r   r   r   r   initial_convr   initial_normr   initial_activation
ModuleListrG   
res_blocks
Sequentialr0   	upsamplerr]   r3   post_upsample_res_blocks
final_conv)	r   rl   r   rm   r   rn   ro   rp   ConvNdr!   ru   r$   r      sD   
"z!LTX2LatentUpsamplerModel.__init__r%   r&   c                 C   s~  |j \}}}}}| jdkrX|ddddddd}| |}| |}| |}| jD ]}||}q,| |}| j	D ]}||}q;| 
|}|d|dfddddd}|S | |}| |}| |}| jD ]}||}qj| jr| |}|d d d d dd d d d d f }n!|ddddddd}| |}|d|dfddddd}| j	D ]}||}q| 
|}|S )Nr   r   r   r   r   r4   )rP   r   r9   r:   rx   ry   rz   r|   r~   r   r   r8   ro   )r   r%   
batch_sizenum_channels
num_framesheightwidthblockr#   r#   r$   r)      s:   















(



z LTX2LatentUpsamplerModel.forward)rk   r^   r   r   TFr	   )r*   r+   r,   r\   r   r-   boolrI   r   r   r.   r)   r/   r#   r#   r!   r$   rj      s4    6rj   )r?   r   torch.nn.functionalr   
functionalrR   configuration_utilsr   r   models.modeling_utilsr   r`   Moduler
   r0   r=   r]   rj   r#   r#   r#   r$   <module>   s   	0!