o
    پi#^                     @   s@  d dl Z d dlmZmZ d dlmZmZmZ d dlZd dl	Z	d dl
mZ d dlmZmZmZ d dlmZ d dlmZmZmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dl m!Z!m"Z"m#Z#m$Z$ d dl%m&Z&m'Z' d dl(m)Z) d dl*m+Z+ dede	j,fddZ-eG dd de!Z.de	j,fddZ/de	j,de	j,de0e	j, fddZ1dd Z2dd  Z3	d>de	j,d!ee	j, fd"d#Z4	$d?d%ee	j, d&e5fd'd(Z6dede	j,fd)d*Z7dede	j,fd+d,Z8eG d-d. d.eZ9eG d/d0 d0eZ:d>d1ee; d2e;fd3d4Z<d5e;fd6d7Z=d8d9 Z>eG d:d; d;e.Z?eG d<d= d=e?Z@dS )@    N)	dataclassfield)CallableListOptional)VaeImageProcessor)	DiTConfigEncoderConfig	VAEConfig)
FluxConfig)BaseEncoderOutputCLIPTextConfigT5ConfigTextEncoderConfig)TextEncoderArchConfigQwen3TextConfig_is_transformer_layer)Flux2VAEConfigFluxVAEConfig)ImagePipelineConfigModelTaskTypepreprocess_textshard_rotary_emb_for_sp)clip_postprocess_textclip_preprocess_text)_pack_latents)get_local_torch_deviceoutputsreturnc                 C   s   | j S N)last_hidden_state)r   _text_inputs r$   g/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/configs/pipeline_configs/flux.pyt5_postprocess_text%   s   r&   c                   @   sj  e Zd ZU dZdZeed< ejZ	eed< dZ
eed< dZeed< eedZeed	< eedZeed
< dZeed< edd dZeedf ed< edd dZeedf ed< edd dZeeegef df ed< edd dZeeegef df ed< edd dZee ed< dd Z dd Z!dd Z"dd Z#d d! Z$d"d# Z%d$d% Z&d&d' Z'd(d) Z(d*d+ Z)d,S )-FluxPipelineConfigz$Configuration for the FLUX pipeline.g      @embedded_cfg_scale	task_typeF
vae_tilingvae_spdefault_factory
dit_config
vae_configenable_autocastc                   C   s   t  t fS r!   )r   r   r$   r$   r$   r%   <lambda>=   s    zFluxPipelineConfig.<lambda>.text_encoder_configsc                   C      dS )N)bf16r4   r$   r$   r$   r$   r%   r1   A       text_encoder_precisionsc                   C      t tfS r!   )r   r   r$   r$   r$   r%   r1   E       preprocess_text_funcsc                   C   r7   r!   )r   r&   r$   r$   r$   r%   r1   I   r8   postprocess_text_funcsc                   C   s   t ddddddd gS )NM   
max_lengthTF)r<   padding
truncationreturn_overflowing_tokensreturn_length)dictr$   r$   r$   r%   r1   M   s   text_encoder_extra_argsc                 C   s   |  ||S r!   )_prepare_sigmas)selfsigmasnum_inference_stepsr$   r$   r%   prepare_sigmasY   s   z!FluxPipelineConfig.prepare_sigmasc                 C   sN   d|j | jjjd   }d|j| jjjd   }| jjjd }||||f}|S N      heightr/   arch_configvae_scale_factorwidthr.   in_channelsrD   batch
batch_size
num_framesrL   rO   num_channels_latentsshaper$   r$   r%   prepare_latent_shape\   s   z'FluxPipelineConfig.prepare_latent_shapec                 C   sN   d|j | jjjd   }d|j| jjjd   }| jjjd }t|||||S rH   )rL   r/   rM   rN   rO   r.   rP   r   )rD   latentsrS   rR   rL   rO   rU   r$   r$   r%   maybe_pack_latentse   s   z%FluxPipelineConfig.maybe_pack_latentsc                 C   
   |j d S N   prompt_embedsrD   rR   r$   r$   r%   get_pos_prompt_embedsn      
z(FluxPipelineConfig.get_pos_prompt_embedsc                 C   rZ   r[   negative_prompt_embedsr_   r$   r$   r%   get_neg_prompt_embedsq   ra   z(FluxPipelineConfig.get_neg_prompt_embedsc                 C   s   | j jj}t||d  }t||d  }tj||d|d}|d tj||dd d d f  |d< |d tj||dd d d f  |d< |j\}}	}
|||	 |
}|S )NrI      device).r\   ).rI   )	r/   rM   rN   inttorchzerosarangerV   reshape)rD   original_heightoriginal_widthrg   rN   rL   rO   latent_image_idslatent_image_id_heightlatent_image_id_widthlatent_image_id_channelsr$   r$   r%   _prepare_latent_image_idst   s   
  z,FluxPipelineConfig._prepare_latent_image_idsc                 C   s   t j|jd d|d}| j|||d}||\}	}
t|	}	t|
}
||\}}t j||	gddj|d}t j||
gddj|d}||fS )Nr\   re   rf   )rm   rn   rg   r   dim)ri   rj   rV   rs   forwardr   catto)rD   r^   rO   rL   rg   
rotary_embrR   txt_idsimg_idsimg_cosimg_sintxt_costxt_sincossinr$   r$   r%   get_freqs_cis   s   z FluxPipelineConfig.get_freqs_cisc                 C   s.   |  ||\}}}}}|||d ||}|S )NrJ   )_unpad_and_unpack_latentsrl   )rD   rX   rR   rS   channelsrL   rO   r$   r$   r%   post_denoising_loop   s   
z&FluxPipelineConfig.post_denoising_loopc                 C   8   |  |jd |j|j||||jr|jd dS d dS Nr\   r   )	freqs_cispooled_projections)r   r^   rO   rL   pooled_embedsrD   rR   rg   ry   dtyper$   r$   r%   prepare_pos_cond_kwargs      	
z*FluxPipelineConfig.prepare_pos_cond_kwargsc                 C   r   r   )r   rc   rO   rL   neg_pooled_embedsr   r$   r$   r%   prepare_neg_cond_kwargs   r   z*FluxPipelineConfig.prepare_neg_cond_kwargsN)*__name__
__module____qualname____doc__r(   float__annotations__r   T2Ir)   r*   boolr+   r   r   r.   r   r   r/   r
   r0   r2   tupler	   r6   strr9   r   r:   rB   listrA   rG   rW   rY   r`   rd   rs   r   r   r   r   r$   r$   r$   r%   r'   )   sD   
   		r'   rX   c           
      C   s^   | j \}}}}td}t|}t|}td}t||||}	|	d|dd}	|	S )a^  
    Generates 4D position coordinates (T, H, W, L) for latent tensors.

    Args:
        latents (torch.Tensor):
            Latent tensor of shape (B, C, H, W)

    Returns:
        torch.Tensor:
            Position IDs tensor of shape (B, H*W, 4) All batches share the same coordinate structure: T=0,
            H=[0..H-1], W=[0..W-1], L=0
    r\   r   )rV   ri   rk   cartesian_prod	unsqueezeexpand)
rX   rS   _rL   rO   thwlayer
latent_idsr$   r$   r%   _prepare_latent_ids   s   



r   xx_idsc                 C   s   g }|j | jd}t| |D ]b\}}|j\}}|dddf  tj}|dddf  tj}t|d }	t|d }
||
 | }tj|	|
 |f|j|jd}|	d|
dd|| ||	|
|ddd}|| qtj|ddS )	z9
    using position ids to scatter tokens into place
    rf   Nr\   rI   rg   r   r   r   rt   )rx   rg   ziprV   ri   int64maxrj   r   scatter_r   r   viewpermuteappendstack)r   r   x_listdataposr   chh_idsw_idsr   r   flat_idsoutr$   r$   r%   _unpack_latents_with_ids   s   
r   c                 C   s^   | j \}}}}| |||d d|d d} | dddddd} | ||d |d |d } | S )NrI   r   r\   re      rJ   )rV   r   r   rl   rX   rS   rU   rL   rO   r$   r$   r%   _patchify_latents  s   r   c                 C   sZ   | j \}}}}| ||d dd||} | dddddd} | ||d |d |d } | S )NrJ   rI   r   r\   r   re   rV   rl   r   r   r$   r$   r%   _unpatchify_latents  s   r   t_coordc                 C   sz   | j \}}}g }t|D ]+}|d u rtdn|| }td}td}	t|}
t|||	|
}|| qt|S r[   )rV   rangeri   rk   r   r   r   )r   r   BLr   out_idsir   r   r   r   coordsr$   r$   r%   _prepare_text_ids  s   



r   
   image_latentsscalec           
   	      s   t | tstdt|  d fddtdt| D }dd |D }g }t| |D ]%\}}|d}|j	\}}}t
|t|t|td}	||	 q,tj|dd}|d}|S )	Nz+Expected `image_latents` to be a list, got .c                    s   g | ]}  |  qS r$   r$   .0r   r   r$   r%   
<listcomp>9      z&_prepare_image_ids.<locals>.<listcomp>r   c                 S   s   g | ]}| d qS )r   )r   r   r$   r$   r%   r   :      r\   rt   )
isinstancer   
ValueErrortyperi   rk   lenr   squeezerV   r   r   rw   r   )
r   r   t_coordsimage_latent_idsr   r   r   rL   rO   r   r$   r   r%   _prepare_image_ids/  s"   


r   c           	         T   g d}t j fdd|D dd}|j\}}}}|dddd|||| }|S )	N)r         c                       g | ]} j | qS r$   hidden_statesr   kr   r$   r%   r   O  r   z*flux2_postprocess_text.<locals>.<listcomp>r\   rt   r   rI   re   ri   r   rV   r   rl   	r   r#   hidden_states_layersr   rS   num_channelsseq_len
hidden_dimr^   r$   r   r%   flux2_postprocess_textL  s   
r   c           	         r   )	N)	         c                    r   r$   r   r   r   r$   r%   r   ]  r   z0flux2_klein_postprocess_text.<locals>.<listcomp>r\   rt   r   rI   re   r   r   r$   r   r%   flux2_klein_postprocess_textX  s   
r   c                   @   sP   e Zd ZU edd dZeeeeef  ed< edd dZ	eed< dd Z
d	S )
Flux2MistralTextArchConfigc                   C   s   g dS )N))qkv_projq_projq)r   k_projr   )r   v_projvr$   r$   r$   r$   r%   r1   i  r8   z#Flux2MistralTextArchConfig.<lambda>r,   stacked_params_mappingc                   C   s   t gS r!   r   r$   r$   r$   r%   r1   q      _fsdp_shard_conditionsc                 C   s   ddddddd| _ d S )Nr<   T   pt)r=   r>   r<   add_special_tokensreturn_attention_maskreturn_tensors)tokenizer_kwargs)rD   r$   r$   r%   __post_init__t  s   z(Flux2MistralTextArchConfig.__post_init__N)r   r   r   r   r   r   r   r   r   r   r   r$   r$   r$   r%   r   f  s   
 r   c                   @   s    e Zd ZU eedZeed< dS )Flux2MistralTextConfigr,   rM   N)r   r   r   r   r   rM   r   r   r$   r$   r$   r%   r     s   
 r   promptssystem_messagec                    s    dd | D } fdd|D S )Nc                 S   s   g | ]}| d dqS )z[IMG] )replacer   promptr$   r$   r%   r     r   z%format_text_input.<locals>.<listcomp>c                    s0   g | ]}d d dgddd|dgdgqS )systemtext)r   r  rolecontentuserr$   r  r  r$   r%   r     s    
r$   )r   r  cleaned_txtr$   r  r%   format_text_input  s   
r  r  c                 C   s   d}t | g|dS )NzYou are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object attribution and actions without speculation.r  )r  )r  r  r$   r$   r%   flux_2_preprocess_text  s   r  c                 C   s.   | j \}}}}| |||| ddd} | S )Nr   rI   r\   r   )rX   rS   r   rL   rO   r$   r$   r%   flux2_pack_latents  s   r  c                   @   s~  e Zd ZU dZeed< ejZeed< e	dd dZ
eedf ed< e	d	d dZeedf ed
< e	dd dZeeegef df ed< e	dd dZeeegef df ed< e	edZeed< dee defddZdd Zdd Zdd Zdeeeef  fddZdefddZdd  Z d!d" Z!d#d$ Z"d%d& Z#d'd( Z$d)d* Z%d+d, Z&d-d. Z'd8d0d1Z(d2d3 Z)d4d5 Z*d6d7 Z+d/S )9Flux2PipelineConfigg      @r(   r)   c                   C   r3   N)r4   r$   r$   r$   r$   r%   r1     r5   zFlux2PipelineConfig.<lambda>r,   .r6   c                   C      t  fS r!   )r   r$   r$   r$   r%   r1     r8   r2   c                   C      t fS r!   )r  r$   r$   r$   r%   r1     r   r9   c                   C   r  r!   )r   r$   r$   r$   r%   r1     r   r:   r/   r   r    c              
   C   s,   dd |D }|j |dddddddd}|S )	Nc                 S      g | ]	}|D ]}|qqS r$   r$   r   r  pr$   r$   r%   r         z7Flux2PipelineConfig.tokenize_prompt.<locals>.<listcomp>FTr   r<   r   )add_generation_prompttokenizereturn_dictr   r=   r>   r<   )apply_chat_template)rD   r   	tokenizer
tok_kwargsinputsr$   r$   r%   tokenize_prompt  s   z#Flux2PipelineConfig.tokenize_promptc                 C   sR   d|j | jjjd   }d|j| jjjd   }| jjj}|||d |d f}|S )NrI   rK   rQ   r$   r$   r%   rW     s   
z(Flux2PipelineConfig.prepare_latent_shapec                 C   rZ   Nr   r]   r_   r$   r$   r%   r`     ra   z)Flux2PipelineConfig.get_pos_prompt_embedsc                 C   rZ   r!  rb   r_   r$   r$   r%   rd     ra   z)Flux2PipelineConfig.get_neg_prompt_embedsc           
      C   s   | j jj}|d }d}|d urK|d urK||}}|| |kr3t|||  }	t||	 }t||	 }|| | }|| | }||ksG||krK||fS d S )NrI   i   )r/   rM   rN   mathsqrtrh   )
rD   imagerO   rL   rN   multiple_oftarget_area	new_width
new_heightr   r$   r$   r%   calculate_condition_image_size  s   

z2Flux2PipelineConfig.calculate_condition_image_sizevae_image_processorc           
      C   sh   | ||ftjjj}|j\}}| jjj}|d }	||	 |	 }||	 |	 }|j	|||dd}|||ffS )NrI   crop)rL   rO   resize_mode)
resizePILImage
ResamplingLANCZOSsizer/   rM   rN   
preprocess)
rD   r$  target_widthtarget_heightr*  imgimage_widthimage_heightrN   r%  r$   r$   r%   preprocess_condition_image  s   

z.Flux2PipelineConfig.preprocess_condition_imagec                 C   sd   |j }t|g}||dd}|t }||_| |d |}|d}|d}||dd}|S )Nr\   r   )	rS   r   repeatrx   r   condition_image_latent_idsrY   r   r   )rD   latent_conditionrR   rS   r   packedr   r$   r$   r%   postprocess_image_latent  s   


z,Flux2PipelineConfig.postprocess_image_latentc                 C   s   t |j|d}|j}|jd ur |j}	tj||	gddj|d}|jdkr)|d }|jdkr2|d }||\}
}t	|
}
t	|}||\}}tj||
gddj|d}tj||gddj|d}||fS )Nrf   r\   rt   re   r   )
r   rx   r   image_latentr;  ri   rw   ndimrv   r   )rD   r^   rO   rL   rg   ry   rR   rz   r{   r   r|   r}   r~   r   r   r   r$   r$   r%   r     s    


z!Flux2PipelineConfig.get_freqs_cisc              	   C   s"   d|  |jd |j|j|||iS )Nr   r   )r   r^   rO   rL   r   r$   r$   r%   r   )  s   z+Flux2PipelineConfig.prepare_pos_cond_kwargsc                 C   s   i S r!   r$   r   r$   r$   r%   r   5  s   z+Flux2PipelineConfig.prepare_neg_cond_kwargsc                 C      t |S r!   )r  )rD   rX   rS   rR   r$   r$   r%   rY   8     z&Flux2PipelineConfig.maybe_pack_latentsc                 C   rA  r!   )r   )rD   rX   r$   r$   r%   maybe_prepare_latent_ids;  rB  z,Flux2PipelineConfig.maybe_prepare_latent_idsc                 C   s   t |}|S r!   )r   )rD   r   vaer$   r$   r%   postprocess_vae_encode>  s   z*Flux2PipelineConfig.postprocess_vae_encodec                 C   s&   t | dst |do|jdu| _| jS )zMCheck if VAE has bn attribute (cached check to avoid repeated hasattr calls)._vae_has_bn_cachebnN)hasattrrG  rF  )rD   rD  r$   r$   r%   _check_vae_has_bnC  s   
z%Flux2PipelineConfig._check_vae_has_bnNc                 C   s   |dur|  |rt|S |S )zPreprocess latents before decoding.

        Dynamically adapts based on VAE type:
        - Standard Flux2 VAE (has bn): needs unpatchify (128 channels -> 32 channels)
        - Distilled VAE (no bn): keeps patchified latents (128 channels)
        N)rI  r   )rD   rX   server_argsrD  r$   r$   r%   preprocess_decodingI  s   z'Flux2PipelineConfig.preprocess_decodingc           	      C   s   | j j}| |r1|jjdddd||}t|jj	dddd|j
 ||}d| |fS t|dr=t|jddnt|ddpHt|dd}tj|||ddddd}d| dfS )zGet scale and shift for decoding.

        Dynamically adapts based on VAE type:
        - Standard Flux2 VAE (has bn): uses BatchNorm statistics
        - Distilled VAE (no bn): uses scaling_factor from config
        r\   r   configscaling_factorNgy&1?r   )r/   rM   rI  rG  running_meanr   rx   ri   r#  running_varbatch_norm_epsrH  getattrrL  tensor)	rD   rg   r   rD  vae_arch_configlatents_bn_meanlatents_bn_stdrM  r   r$   r$   r%   get_decode_scale_and_shiftT  s&   


z.Flux2PipelineConfig.get_decode_scale_and_shiftc                 C   s   |j }t||}|S r!   )r   r   )rD   rX   rR   r   r$   r$   r%   r   q  s   
z'Flux2PipelineConfig.post_denoising_loopc                 C   s   |d d d | df }|S r[   )r2  )rD   noiserX   r$   r$   r%   slice_noise_predw  s   z$Flux2PipelineConfig.slice_noise_pred)NN),r   r   r   r(   r   r   r   TI2Ir)   r   r6   r   r   r2   r	   r9   r   r:   r   r/   r
   r   rA   r   rW   r`   rd   r   rh   r)  r   r9  r>  r   r   r   rY   rC  rE  rI  rK  rV  r   rX  r$   r$   r$   r%   r    sH   
   	


r  c                   @   s   e Zd ZU dZeed< edd dZee	df ed< edd dZ
eedf ed	< ed
d dZeee	ge	f df ed< edd dZeee	ge	f df ed< dee	 defddZdS )Flux2KleinPipelineConfigFshould_use_guidancec                   C   r3   r  r$   r$   r$   r$   r%   r1     r5   z!Flux2KleinPipelineConfig.<lambda>r,   .r6   c                   C   r  r!   r   r$   r$   r$   r%   r1     r8   r2   c                   C   r  r!   )r   r$   r$   r$   r%   r1     r   r9   c                   C   r  r!   )r   r$   r$   r$   r%   r1     r   r:   r   r    c           	         s   |rt |d trdd |D }dtdtffdd  fdd|D }t|p(i }|d	d
}|dd	}|dd}|dd}|f||||d|S )Nr   c                 S   r  r$   r$   r  r$   r$   r%   r     r  z<Flux2KleinPipelineConfig.tokenize_prompt.<locals>.<listcomp>r  r    c                    sD   d| dg}z
 j |ddddW S  ty!    j |ddd Y S w )Nr  r  FT)r  r  enable_thinking)r  r  )r  	TypeError)r  messages)r  r$   r%   _apply_chat_template  s   
zFFlux2KleinPipelineConfig.tokenize_prompt.<locals>._apply_chat_templatec                    s   g | ]} |qS r$   r$   r  )r_  r$   r%   r     s    r<   r   r=   r>   Tr   r   )r=   r>   r<   r   )r   r   r   rA   pop)	rD   r   r  r  textsr<   r=   r>   r   r$   )r_  r  r%   r     s&   z(Flux2KleinPipelineConfig.tokenize_promptN)r   r   r   r[  r   r   r   r6   r   r   r2   r	   r9   r   r:   r   rA   r   r$   r$   r$   r%   rZ  }  s   
   rZ  r!   )r   )Ar"  dataclassesr   r   typingr   r   r   r.  ri   diffusers.image_processorr   $sglang.multimodal_gen.configs.modelsr   r	   r
   .sglang.multimodal_gen.configs.models.dits.fluxr   -sglang.multimodal_gen.configs.models.encodersr   r   r   r   2sglang.multimodal_gen.configs.models.encoders.baser   3sglang.multimodal_gen.configs.models.encoders.qwen3r   8sglang.multimodal_gen.configs.models.encoders.qwen_imager   .sglang.multimodal_gen.configs.models.vaes.fluxr   r   3sglang.multimodal_gen.configs.pipeline_configs.baser   r   r   r   6sglang.multimodal_gen.configs.pipeline_configs.hunyuanr   r   9sglang.multimodal_gen.configs.pipeline_configs.qwen_imager   )sglang.multimodal_gen.runtime.distributedr   Tensorr&   r'   r   r   r   r   r   r   rh   r   r   r   r   r   r   r  r  r  r  rZ  r$   r$   r$   r%   <module>   s~     




 X