o
    پi{                     @   s  d dl Z d dlZd dlmZ d dlmZmZmZmZ d dl	m
Z
mZ d dlmZ d dlZd dlZd dlZd dlmZ d dlmZmZmZmZ d dlmZ d d	lmZ d d
lmZ d dlm Z  d dl!m"Z" d dl#m$Z$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+m,Z,m-Z- e)e.Z/G dd de
Z0G dd de1e
Z2de1de1fddZ3dedej4fddZ5dd Z6dd Z7eG d d! d!Z8eG d"d# d#e8Z9eG d$d% d%e9Z:eG d&d' d'e8Z;d(e1de<e= fd)d*Z>dS )+    N)Callable)asdict	dataclassfieldfields)Enumauto)Any)	rearrange)	DiTConfigEncoderConfigModelConfig	VAEConfig)BaseEncoderOutputT5Config)DataType)update_config_from_args"sequence_model_parallel_all_gatherget_sp_parallel_rankget_sp_world_size)get_default_height_width)init_logger)FlexibleArgumentParserStoreBooleanshallow_asdictc                   @   sh   e Zd Ze Ze Ze Ze Ze Ze Z	de
fddZde
fddZde
fddZdefdd	Zd
S )ModelTaskTypereturnc                 C   s   | t jkp| t jkp| t jkS N)r   T2II2ITI2Iself r&   g/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/configs/pipeline_configs/base.pyis_image_gen9   s
   
zModelTaskType.is_image_genc                 C   s   | t jkp	| t jkS r    )r   I2Vr"   r$   r&   r&   r'   requires_image_input@   s   z"ModelTaskType.requires_image_inputc                 C   s(   | t jkp| t jkp| t jkp| t jkS r    )r   r)   r"   r#   TI2Vr$   r&   r&   r'   accepts_image_inputC   s   
z!ModelTaskType.accepts_image_inputc                 C   s   |   rtjS tjS r    )r(   r   IMAGEVIDEOr$   r&   r&   r'   	data_typeK   s   zModelTaskType.data_typeN)__name__
__module____qualname__r   r)   T2Vr+   r!   r"   r#   boolr(   r*   r,   r   r/   r&   r&   r&   r'   r   .   s    r   c                   @   s$   e Zd ZdZdZdZdZdZdZdS )STA_Modez#STA (Sliding Tile Attention) modes.STA_inferenceSTA_searching
STA_tuningSTA_tuning_cfgN)	r0   r1   r2   __doc__STA_INFERENCESTA_SEARCHING
STA_TUNINGSTA_TUNING_CFGNONEr&   r&   r&   r'   r5   R   s    r5   promptr   c                 C   s   | S r    r&   )r@   r&   r&   r'   preprocess_text\      rA   outputc                 C   s   t r    )NotImplementedError)rC   _text_inputsr&   r&   r'   postprocess_text`   rB   rF   c                 C   s   zddl m}m} | }W n ty   d}Y nw | jd }|| dkr=|||  }| dd |d}tj| |gdd} |dkrkz| }W n tyQ   d}Y nw | jd }|| }|| }	|	| }
| |	|
 } | S | S )z
    Shard rotary embeddings [S, D] along sequence for SP.
    If S is not divisible by SP degree, pad by repeating the last row.
    r   r      Ndim)8sglang.multimodal_gen.runtime.distributed.parallel_stater   r   	Exceptionshaperepeattorchcat)embr   r   sp_world_sizeseq_lenpad_lenpadrank	local_lenstartendr&   r&   r'   shard_rotary_emb_for_spd   s0   



rZ   c                 C   sb   |j }t|dkr|d }n|d |d }}|| }| jd |kr/| d d d |d d f } | S )N   rG   rH   )raw_latent_shapelenrM   )latentsbatch	raw_shapetarget_tokenswidthheightr&   r&   r'   maybe_unpad_latents   s   
re   c                   @   s  e Zd ZU dZejZeed< dZe	ed< dZ
e	dB ed< dZeed< dZeed	< d
Zeed< dZedB ed< dZeed< eedZeed< dZe	ed< eedZeed< dZe	ed< dZeed< dZeed< eedZeed< dZe	ed< dZedd dZe edf ed< edd dZ!e e	df ed < ed!d dZ"e#e$ ed"< ed#d dZ%e$ed$< d%d& Z&ed'd dZ'e e(e	ge	f df ed(< ed)d dZ)e e(e*ge+j,f df ed*< dZ-e	dB ed+< e.j/Z0e.ed,< d-Z1e2ed.< edd/Z3e#e2 dB ed0< dZ4edB ed1< d2e e2e2f fd3d4Z5d5d6 Z6d7d8 Z7d9d: Z8d{d;d<Z9d=d> Z:d?d@ Z;dAdB Z<dCe#e	 d2e$fdDdEZ=dFdG Z>dHdI Z?dJdK Z@dLdM ZAdNdO ZBdPdQ ZCd|dRdSZDdTdU ZEdVdW ZFdXdY ZGdZd[ ZHd\d] ZId^d_ ZJd`da ZKdbdc ZLddde ZMeN	d}dfeOdge	d2eOfdhdiZPd}dje$e	eQf dge	d2dfdkdlZReS	d}dme$e	eQf dne	d2d fdodpZTd~dqdrZUdse	fdtduZVdse	fdvdwZWdxe$e	eQf d2dfdydzZXdS )PipelineConfigz7The base configuration class for a generation pipeline.	task_type 
model_pathNpipeline_config_pathTenable_autocastshould_use_guidanceg      @embedded_cfg_scale
flow_shiftFdisable_autocast)default_factory
dit_configbf16dit_precision
vae_configfp32vae_precision
vae_tilingvae_spimage_encoder_configimage_encoder_precisionru   c                   C   s   t  fS r    )r   r&   r&   r&   r'   <lambda>   s    zPipelineConfig.<lambda>.text_encoder_configsc                   C      dS )Nr{   r&   r&   r&   r&   r'   r|          text_encoder_precisionsc                   C   s   i gS r    r&   r&   r&   r&   r'   r|          text_encoder_extra_argsc                   C      i S r    r&   r&   r&   r&   r'   r|      r   image_encoder_extra_argsc                 C      |j S r    )last_hidden_stater%   imager&   r&   r'   postprocess_image      z PipelineConfig.postprocess_imagec                   C      t fS r    )rA   r&   r&   r&   r'   r|      r   preprocess_text_funcsc                   C   r   r    )rF   r&   r&   r&   r'   r|      r   postprocess_text_funcsmask_strategy_file_pathSTA_mode   skip_time_steps)defaultdmd_denoising_stepsboundary_ratior   c                 C   s$   | j jj}t||||\}}||fS r    )rt   arch_configspatial_compression_ratior   )r%   r   rc   rd   vae_scale_factorr&   r&   r'   calculate_condition_image_size   s   
z-PipelineConfig.calculate_condition_image_sizec                 C      |S r    r&   r%   sigmasnum_inference_stepsr&   r&   r'   prepare_sigmas   rB   zPipelineConfig.prepare_sigmasc                 C   s   | ||ftjjj||ffS )zh
        preprocess the condition image, returns (image, final_image_width, final_image_height)
        )resizePILImage
ResamplingLANCZOS)r%   r   target_widthtarget_height_vae_image_processorr&   r&   r'   preprocess_condition_image   s
   z)PipelineConfig.preprocess_condition_imagec                 C   s   |  ||j|jS r    )r   rc   rd   r   r&   r&   r'   prepare_calculated_size   s   z&PipelineConfig.prepare_calculated_sizec                 C   r   r    r&   )r%   r`   negr&   r&   r'   prepare_image_processor_kwargs   rB   z-PipelineConfig.prepare_image_processor_kwargsc                 C   s   | j j}|j}|j}|j}|j| }|j| }tdd|||}	d|	d d d d dd f< |	d d d d ddf }
tj	|
|dd}
tj
|
|	d d d d dd d d f gdd}	|	dd|||}	|	dd}	|	|j}	tj
|	|gdd}|S )NrG   r      )repeatsrJ   rI   rH   )rt   r   r   temporal_compression_ratio
num_framesrd   rc   rO   onesrepeat_interleaveconcatview	transposetodevice)r%   latent_conditionr`   vae_arch_configr   r   r   latent_heightlatent_widthmask_lat_sizefirst_frame_maskimage_latentsr&   r&   r'   postprocess_image_latent   s8   

$z'PipelineConfig.postprocess_image_latentc                 C   r   r    r&   )r%   noiser_   r&   r&   r'   slice_noise_pred  rB   zPipelineConfig.slice_noise_predc                 C   r   r    r&   )r%   r   r&   r&   r'   adjust_num_frames  rB   z PipelineConfig.adjust_num_framesr@   c                 C   s   ||fi |S r    r&   )r%   r@   	tokenizer
tok_kwargsr&   r&   r'   tokenize_prompt!  s   zPipelineConfig.tokenize_promptc                 C   s6   |j | jjj }|j| jjj }|| jj|||f}|S r    )rd   rt   r   r   rc   rq   num_channels_latents)r%   r`   
batch_sizer   rd   rc   rM   r&   r&   r'   prepare_latent_shape$  s   z#PipelineConfig.prepare_latent_shapec                 C   r~   )NFr&   r$   r&   r&   r'   allow_set_num_frames3  rB   z#PipelineConfig.allow_set_num_framesc                 C   sP   | j j}t|dd }|d u rt|dd }t|dd }|d u r$t|dd }||fS )Nscaling_factorshift_factor)rt   r   getattr)r%   r   dtypevaer   r   r   r&   r&   r'   get_decode_scale_and_shift6  s   z)PipelineConfig.get_decode_scale_and_shiftc                 C   r   r    r&   )r%   r_   r   r`   r&   r&   r'   maybe_pack_latentsB  rB   z!PipelineConfig.maybe_pack_latentsc                 C      d S r    r&   r%   r_   r&   r&   r'   maybe_prepare_latent_idsE  rB   z'PipelineConfig.maybe_prepare_latent_idsc                 C   r   r    r&   )r%   r   r   r&   r&   r'   postprocess_vae_encodeI  rB   z%PipelineConfig.postprocess_vae_encodec                 C   r   r    r&   )r%   r_   server_argsr   r&   r&   r'   preprocess_decodingM  rB   z"PipelineConfig.preprocess_decodingc                 C      t |dd}|S )Nr   rI   r   r   r&   r&   r'   gather_latents_for_spP     z$PipelineConfig.gather_latents_for_spc                 C   r   r    r&   )r%   r`   vae_image_processorr&   r&   r'   preprocess_vae_imageU  rB   z#PipelineConfig.preprocess_vae_imagec           	      C   s  |j r|dfS t t }}| dkr|dfS |jd }|dkrV|| dkrVtd |||  }tjg |jd d ||jdd  R |j	|j
d}tj||gdd}|jd | dksaJ t|d	|d
 }|d d d d |d d d d d d f }|dfS )NF   r   r   zIPadding latents to next multiple of SP degree, performance is sub-optimalr[   r   r   rI   zb c (n t) h w -> b c n t h wnT)enable_sequence_shardr   r   rJ   rM   loggerdebugrO   zerosr   r   rP   r
   
contiguous)	r%   r`   r_   rR   rank_in_sp_grouptime_dimrT   rU   sharded_tensorr&   r&   r'   shard_latents_for_spX  s2   
$(z#PipelineConfig.shard_latents_for_spc                 C   r   r    )prompt_embedsr%   r`   r&   r&   r'   get_pos_prompt_embedsu  r   z$PipelineConfig.get_pos_prompt_embedsc                 C   r   r    )negative_prompt_embedsr   r&   r&   r'   get_neg_prompt_embedsx  r   z$PipelineConfig.get_neg_prompt_embedsc                 C   s   t ||}|S r    )re   )r%   r_   r`   r&   r&   r'   post_denoising_loop{  s   
z"PipelineConfig.post_denoising_loopc                 C   r   r    r&   )r%   framesr   r&   r&   r'   post_decoding  rB   zPipelineConfig.post_decodingc                 C   r   r    r&   r%   r`   r   
rotary_embr   r&   r&   r'   prepare_pos_cond_kwargs  rB   z&PipelineConfig.prepare_pos_cond_kwargsc                 C   r   r    r&   r   r&   r&   r'   prepare_neg_cond_kwargs  rB   z&PipelineConfig.prepare_neg_cond_kwargsparserprefixc              	   C   sp  |  dkr| dnd}|dkr'| jd| dt|dd dtjdd	 | jd| d
t|dd dtjdd	 | jd| dt|dd dtjdd	 | jd| dt|dd dtj	dd	 | jd| dt|dd dtj
g ddd | jd| dt|dd dtjg ddd | jd| dt|dd dtjdd | jd| dt|dd d d!d" | jd| d#d$t|dd d%tjg dd&d' | jd| d(t|dd d)tjg dd*d | jd| d+ttjd,d- d.d/lm} |j| | d0d1 d.d2lm} |j| | d3d1 d.d4lm} |j| | d5d1 | S )6Nrh   .z--z
model-path-_ri   zPath to the pretrained model)typedestr   helpzpipeline-config-pathrj   zPath to the pipeline configzembedded-cfg-scalerm   zEmbedded CFG scalez
flow-shiftrn   zFlow shift parameterzdit-precisionrs   )ru   fp16rr   zPrecision for the DiT model)r   r   r   choicesr   zvae-precisionrv   zPrecision for VAEz
vae-tilingrw   zEnable VAE tiling)actionr   r   r   zvae-sprx   zEnable VAE spatial parallelism)r   r   r   ztext-encoder-precisions+r   zPrecision for each text encoder)nargsr   r   r   r   r   zimage-encoder-precisionrz   zPrecision for image encoderzdmd-denoising-stepsz>Comma-separated list of denoising steps (e.g., '1000,757,522'))r   r   r   r   )r   z
vae-config)r   )r   z
dit-configr   z	t5-config)stripadd_argumentstrreplacerf   ri   rj   floatrm   rn   rs   rv   r   rw   DEFAULT_TEXT_ENCODER_PRECISIONSrz   parse_int_listr   .sglang.multimodal_gen.configs.models.vaes.baser   add_cli_args.sglang.multimodal_gen.configs.models.dits.baser   0sglang.multimodal_gen.configs.models.encoders.t5r   )r   r   prefix_with_dotr   r   r   r&   r&   r'   r    s   



	








zPipelineConfig.add_cli_argsargsc                 C   s   |  dkr| dnd}t| ||dd t| j|| ddd t| j|| ddd | jD ]}t|trBt||| ddd q0d S )Nrh   r   T)pop_argsrt   rq   	t5_config)r   r   rt   rq   r}   
isinstancer   )r%   r  r   r
  text_encoder_configr&   r&   r'   update_config_from_dict  s$   

z&PipelineConfig.update_config_from_dictkwargsconfig_cli_prefixc                 C   s*  ddl m} | dkr| dnd}||d dp|d}||d dp,|d}|du r5td||d	 p@|d	}tj|oK|d
}ddl	m
}	 ddl m}
 |r|r|
|}|durv|\}}td| d|j d nD|||dd}|du rddl m}m} |  t| }td| d| |j}n|||dd}|du rtd| d|j}||d p|d}t|trt||	r|durd|v rddlm} |}| }t|tr|| |||d < nt|tr|}nt|tr|| |||d < ||| |S )z
        Load PipelineConfig from kwargs Dictionary.
        kwargs: dictionary of kwargs
        config_cli_prefix: prefix of CLI arguments for this PipelineConfig instance
        r   )get_model_inforh   r   ri   Npipeline_configz model_path is required in kwargspipeline_class_namez.safetensors)Flux2PipelineConfig)get_pipeline_config_classeszDetected safetensors file with z, using z" directly without model_index.jsonbackend)r  )_PIPELINE_CONFIG_REGISTRY _discover_and_register_pipelineszCould not get model info for 'zu'. If using a safetensors file, please specify a valid pipeline_class_name. Available pipelines with config classes: zB'. If using a safetensors file, please specify pipeline_class_namevae_pathzFLUX.2-Tiny-AutoEncoder)Flux2FinetunedPipelineConfigrj   )sglang.multimodal_gen.registryr  r   get
ValueErrorospathisfileendswith3sglang.multimodal_gen.configs.pipeline_configs.fluxr  r  r   infor0   r  r  listkeyspipeline_config_clsr  r   
issubclass=sglang.multimodal_gen.configs.pipeline_configs.flux_finetunedr  r  load_from_jsonrf   dictupdate_pipeline_configr  )clsr  r  r  r
  ri   pipeline_config_or_pathr  is_safetensors_filer  r  config_classesr(  r   
model_infor  r  available_pipelinesr  r  r  r&   r&   r'   from_kwargs  s   	




zPipelineConfig.from_kwargsc                 C   s   | j r
| js
tdt| jt| jkr%tdt| j dt| j dt| jt| jkr@tdt| j dt| j dt| jt| jkr[tdt| j dt| j dd S )NzXCurrently enabling vae_sp requires enabling vae_tiling, please set --vae-tiling to True.z Length of text encoder configs (z6) must be equal to length of text encoder precisions ()z;) must be equal to length of text preprocessing functions (z&Length of text postprocess functions ()rx   rw   r  r^   r}   r   r   r   r$   r&   r&   r'   check_pipeline_configx  s"   z$PipelineConfig.check_pipeline_config	file_pathc           
      C   s  t | }g }| D ]R\}}t|tr!t|}|d |||< q
t|trItdd |D rIg }|D ]}t|}|d || q3|||< q
t|tr\tdd |D r\|| q
|D ]}||d  q_t	|d}	t
j||	dd W d    d S 1 sw   Y  d S )Nr   c                 s       | ]}t |tV  qd S r    r  r   .0vr&   r&   r'   	<genexpr>      

z.PipelineConfig.dump_to_json.<locals>.<genexpr>c                 s   s    | ]}t |V  qd S r    )callable)r;  fr&   r&   r'   r=    s    wr   )indent)r   itemsr  r   r   poptupleallappendopenjsondump)
r%   r7  output_dictdel_keyskeyvalue
model_dictmodel_dictsr<  r@  r&   r&   r'   dump_to_json  s0   





"zPipelineConfig.dump_to_jsonc                 C   s@   t |}t|}W d    n1 sw   Y  | | d S r    )rH  rI  loadr-  )r%   r7  r@  input_pipeline_dictr&   r&   r'   r+    s   
zPipelineConfig.load_from_jsonsource_pipeline_dictc                 C   s   t | D ]O}|j}||v rSt| |}|| }t|tr!|| qt|trMtdd |D rMt|t|ks;J dt	||ddD ]	\}}|| qBqt
| || qt| dr_|   d S d S )Nc                 s   r8  r    r9  r:  r&   r&   r'   r=    r>  z8PipelineConfig.update_pipeline_config.<locals>.<genexpr>zFUsers shouldn't delete or add text encoder config objects in your jsonT)strict__post_init__)r   namer   r  r   update_model_configrE  rF  r^   zipsetattrhasattrrV  )r%   rT  r@  rM  current_value	new_valuetarget_configsource_configr&   r&   r'   r-    s2   


z%PipelineConfig.update_pipeline_config)F)NN)rh   )r   N)Yr0   r1   r2   r:   r   r"   rg   __annotations__ri   r  rj   rk   r4   rl   rm   r  rn   ro   r   r   rq   rs   r   rt   rv   rw   rx   r   ry   rz   r  r}   rE  r   r   r&  r,  r   r   r   r   r   r   rO   tensorr   r5   r;   r   r   intr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   staticmethodr   r  r	   r  classmethodr4  r6  rQ  r+  r-  r&   r&   r&   r'   rf      s   
  


 u

grf   c                   @   s0   e Zd ZdZdd Zdd Zdd Zdd	 Zd
S )ImagePipelineConfigzMBase config for image generation pipelines with token-like latents [B, S, D].c                 C   s&   |d u rt dd| |}|S |}|S )Ng      ?rG   )nplinspacer   r&   r&   r'   _prepare_sigmas  s   z#ImagePipelineConfig._prepare_sigmasc           	      C   s   t  t }}|jd }|| dkr<|||  }tjg |jd d ||jdd  R |j|jd}tj||gdd}t|d|d	 }|d d |d d d d f }|dfS )	NrG   r   r   r   rI   zb (n s) d -> b n s dr   T)
r   r   rM   rO   r   r   r   rP   r
   r   )	r%   r`   r_   rR   r   rS   rT   rU   r   r&   r&   r'   r     s"   
$z(ImagePipelineConfig.shard_latents_for_spc                 C   r   )NrG   rI   r   r   r&   r&   r'   r     r   z)ImagePipelineConfig.gather_latents_for_spc                 C   s   | j jj}| jjj}|jd }dt|j|d   }dt|j|d   }t	||}|
||d |d |d dd}|dddddd}|||||fS )Nr   r      r[   rG   r   )rt   r   r   rq   in_channelsrM   rb  rd   rc   re   r   permute)r%   r_   r`   r   channelsr   rd   rc   r&   r&   r'   _unpad_and_unpack_latents  s   



 z-ImagePipelineConfig._unpad_and_unpack_latentsN)r0   r1   r2   r:   rh  r   r   rm  r&   r&   r&   r'   re    s    re  c                       s,   e Zd ZdZ fddZ fddZ  ZS )SpatialImagePipelineConfiga!  Base config for spatial image pipelines (e.g. GLM-Image) with 4D latents (B, C, H', W').

    Overrides shard_latents_for_sp / gather_latents_for_sp to shard along the height dimension
    so that each SP rank gets (B, C, H'_local, W') instead of using the token-style (B, S, C) path.
    c                    s   t  }|dkr|dfS | dkrt ||S |j\}}}}|| dkrP|||  }tj|jd |jd ||jd f|j|jd}tj	||gdd}|jd }t
 }	|| }
|	|
 }||
 }|d d d d ||d d f  }|d	fS )
NrG   Fri  r   r[   r   r   rI   T)r   rJ   superr   rM   rO   r   r   r   rP   r   r   )r%   r`   r_   rR   r   h_latw_latrT   rU   r   
chunk_sizeh0h1sharded	__class__r&   r'   r     s*   
$z/SpatialImagePipelineConfig.shard_latents_for_spc                    s2   t  dkr|S | dkrt |S t|ddS )NrG   ri  r   rI   )r   rJ   ro  r   r   r   rv  r&   r'   r   "  s
   
z0SpatialImagePipelineConfig.gather_latents_for_sp)r0   r1   r2   r:   r   r   __classcell__r&   r&   rv  r'   rn     s    rn  c                   @   sZ   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< dZeed< dS )SlidingTileAttnConfigz)Configuration for sliding tile attention.   window_size   stridei@  rd   i   rc   Fpad_to_squareTuse_overlap_optimizationN)r0   r1   r2   r:   r{  rb  r`  r}  rd   rc   r~  r4   r  r&   r&   r&   r'   ry  +  s   
 ry  rN  c                 C   s   | sg S dd |  dD S )z7Parse a comma-separated string of integers into a list.c                 S   s   g | ]}t | qS r&   )rb  r   )r;  xr&   r&   r'   
<listcomp>A  s    z"parse_int_list.<locals>.<listcomp>,)split)rN  r&   r&   r'   r  =  s   r  )?rI  r   collections.abcr   dataclassesr   r   r   r   enumr   r   typingr	   numpyrf  r   rO   einopsr
   $sglang.multimodal_gen.configs.modelsr   r   r   r   -sglang.multimodal_gen.configs.models.encodersr   r	  r   4sglang.multimodal_gen.configs.sample.sampling_paramsr   #sglang.multimodal_gen.configs.utilsr   :sglang.multimodal_gen.runtime.distributed.communication_opr   rK   r   r   1sglang.multimodal_gen.runtime.models.vision_utilsr   1sglang.multimodal_gen.runtime.utils.logging_utilsr   sglang.multimodal_gen.utilsr   r   r   r0   r   r   r  r5   rA   ra  rF   rZ   re   rf   re  rn  ry  r&  rb  r  r&   r&   r&   r'   <module>   sP   $
#    65*