o
    پi                     @   s   d Z ddlmZmZ ddlZddlZddlm  m	Z
 ddlmZ ddlmZmZ ddlmZ ddlmZmZ ddlmZmZ dd	lmZ dd
lmZ eeZeG dd deZeG dd deZeG dd deZ dS )z
MOVA pipeline configuration.
    )	dataclassfieldN)Image)MOVAAudioConfigMOVAVideoConfigT5Config)DacVAEConfigWanVAEConfig)ModelTaskTypePipelineConfigt5_postprocess_text)init_loggerc                	   @   sT  e Zd ZU dZejZeed< ee	dZ
e	ed< eedZeed< eedZeed< eedZeed< dZeed	< ed
d dZeed< edd dZeed< dZeed< dZedB ed< dZeed< dZeed< dejejB dededejejB fddZdedefdd Z d!d" Z!d#d$ Z"d%d& Z#d'ejdejfd(d)Z$d'ejdejfd*d+Z%dS ),MOVAPipelineConfigz=Configuration for MOVA (text+image -> video+audio) pipelines.	task_type)default_factory
dit_configaudio_dit_config
vae_configaudio_vae_configfp32audio_vae_precisionc                   C   s   t  fS Nr    r   r   g/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/configs/pipeline_configs/mova.py<lambda>*   s    zMOVAPipelineConfig.<lambda>text_encoder_configsc                   C   s   t fS r   r   r   r   r   r   r   ,   s    postprocess_text_funcsdacaudio_vae_typeg?Nboundary_ratio   time_division_factor   time_division_remainderimagetarget_heighttarget_widthreturnc           
      C   s  t |tjtjfstdt| t |tjr tt|}|j	dkr)|d }|j
js6|tjd}|j	dkrZ|jd dv rO|jd dvrO|d}n&|ddd	d}n|j	d
kru|jd	 dvru|jd dv ru|ddd	d}|jd |jd }}||kr||kr|S td|| || || k r|| | }|| d }|dd d ||| f }n|| | }|| d }	|d|	|	| d d f }tj|||fdddd}|S )NzUnsupported image type:    ).Ng     o@   r   )r$   r+   r"   r$   r"   z+Center cropping and resizing image to %dx%d.bilinearFT)sizemodealign_corners	antialias)
isinstancer   torchTensor	TypeErrortype
from_numpynparrayndimdtypeis_floating_pointtofloat32divshape	unsqueezepermuteloggerinfoFinterpolate)
selfr&   r'   r(   image_heightimage_widthcropped_widthleftcropped_heighttopr   r   r   _center_crop_and_resize7   sH   


z*MOVAPipelineConfig._center_crop_and_resize
num_framesc                 C   sR   |d u r|S || j  | jkr'|| j  d | j  | j  | j }td|| |S |S )Nr$   zS`num_frames` (%s) is not compatible with MOVA temporal constraints. Rounding to %s.)r#   r%   rD   warning)rH   rP   adjustedr   r   r   adjust_num_framesh   s$   z$MOVAPipelineConfig.adjust_num_framesc                 C   s   |  |||}|||ffS r   )rO   )rH   r&   r(   r'   _vae_image_processorr   r   r   preprocess_condition_image{   s   z-MOVAPipelineConfig.preprocess_condition_imagec                 C   s@   | j jj}|d | j d }|| jjj||j| |j| f}|S Nr$   )r   arch_configspatial_compression_ratior#   r   out_dimheightwidth)rH   batch
batch_sizerP   spatiallengthrA   r   r   r   prepare_latent_shape   s   
z'MOVAPipelineConfig.prepare_latent_shapec                 C   s    ||j  d |j  }||j|fS rV   )
hop_length
latent_dim)rH   r]   num_samples	audio_vaelatent_Tr   r   r   prepare_audio_latent_shape   s   z-MOVAPipelineConfig.prepare_audio_latent_shapelatentsc                 C   s   t |jdd }t |jdd }|d u s|d u r|S tj||j|jdd|jjddd}dtj||j|jd d|jjddd}|| | S )Nlatents_meanlatents_stddevicer<   r$   g      ?getattrconfigr4   tensorrk   r<   viewz_dim)rH   rg   	video_vaerh   ri   meaninv_stdr   r   r   normalize_video_latents   s   
z*MOVAPipelineConfig.normalize_video_latentsc                 C   s   t |jdd }t |jdd }|d u s|d u r|S tj||j|jdd|jjddd}tj||j|jdd|jjddd}|| | S )Nrh   ri   rj   r$   rl   )rH   rg   rr   rh   ri   rs   stdr   r   r   denormalize_video_latents   s   

z,MOVAPipelineConfig.denormalize_video_latents)&__name__
__module____qualname____doc__r   T2Vr   __annotations__r   r   r   r   r   r
   r   r	   r   r   strr   tupler   r    r!   floatr#   intr%   r4   r5   r   rO   rS   rU   r`   rf   ru   rw   r   r   r   r   r      sF   
 


1r   c                   @      e Zd ZU dZdZeed< dS )MOVA360PConfigzBConfiguration for MOVA 360P (text+image -> video+audio) pipelines.i p max_areaNrx   ry   rz   r{   r   r   r}   r   r   r   r   r         
 r   c                   @   r   )MOVA720PConfigzBConfiguration for MOVA 720P (text+image -> video+audio) pipelines.i  r   Nr   r   r   r   r   r      r   r   )!r{   dataclassesr   r   numpyr9   r4   torch.nn.functionalnn
functionalrF   PILr   )sglang.multimodal_gen.configs.models.ditsr   r   -sglang.multimodal_gen.configs.models.encodersr   )sglang.multimodal_gen.configs.models.vaesr	   r
   3sglang.multimodal_gen.configs.pipeline_configs.baser   r   2sglang.multimodal_gen.configs.pipeline_configs.wanr   1sglang.multimodal_gen.runtime.utils.logging_utilsr   rx   rD   r   r   r   r   r   r   r   <module>   s(    