o
    پi0                     @   s   d dl Z d dlmZmZ d dlmZ d dlZd dlmZm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZmZ defddZdedejfddZG dd de	ZeG dd deZ dS )    N)	dataclassfield)Callable)	DiTConfigEncoderConfig	VAEConfig)ZImageDitConfig)BaseEncoderOutputQwen3TextConfig)FluxVAEConfig)ImagePipelineConfigModelTaskType)"sequence_model_parallel_all_gather)get_sp_parallel_rankget_sp_world_sizepromptc                 C   s   d| dg}|S )Nuser)rolecontent )r   messagesr   r   i/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/configs/pipeline_configs/zimage.pyzimage_preprocess_text   s   r   outputsreturnc                 C   s2   | j d j}|j| }| j d d |d  S )Nr   )hidden_statesdeviceattention_masktobool)r   _text_inputsr   prompt_maskr   r   r   zimage_postprocess_text!   s   r$   c                   @   s$   e Zd ZU edd dZeed< dS )TransformersModelConfigc                   C   s   i S Nr   r   r   r   r   <lambda>(   s    z TransformersModelConfig.<lambda>default_factorytokenizer_kwargsN)__name__
__module____qualname__r   r*   dict__annotations__r   r   r   r   r%   '   s   
 r%   c                   @   sh  e Zd ZU dZeed< ejZeed< e	e
dZeed< e	edZeed< e	dd dZeed	f ed
< e	dd dZeed	f ed< e	dd dZeed	f ed< dZeed< dZeed< dZeed< dee defddZedededefddZdefddZ defdd Z!d!e"j#d"ede"j#fd#d$Z$d%d& Z%d'd( Z&d)d* Z'd+d, Z(d-d. Z)d/d0 Z*d1d2 Z+d3S )4ZImagePipelineConfigFshould_use_guidance	task_typer(   
dit_config
vae_configc                   C   s   t  fS r&   r
   r   r   r   r   r'   2   s    zZImagePipelineConfig.<lambda>.text_encoder_configsc                   C      t fS r&   )r   r   r   r   r   r'   6       preprocess_text_funcsc                   C   r6   r&   )r$   r   r   r   r   r'   9   r7   postprocess_text_funcs    SEQ_LEN_MULTIPLE   
PATCH_SIZE   F_PATCH_SIZEpromptsr   c                 C   s    |j |ddddddddd	}|S )NT
max_lengthi   pt)tokenizeadd_generation_promptenable_thinkingpaddingrA   
truncationreturn_tensorsreturn_dict)apply_chat_template)selfr@   	tokenizer
tok_kwargsinputsr   r   r   tokenize_prompt@   s   z$ZImagePipelineConfig.tokenize_promptxmc                 C   s"   |dkr| S t t| | | S )Nr   )intmathceil)rP   rQ   r   r   r   _ceil_to_multipleO   s   z&ZImagePipelineConfig._ceil_to_multiplec                 C   s2  t  }t }t|dd}|dur#t|dkr#t|d }t|d }nt|j| jjj }t|j	| jjj }||k}|r?|n|}|rE|n|}	|| j
 }
|	| j
 }| |
|}|| }|| }t|ddrot|jd dnd}| || j| }|| }|| }|||||||	|
|||||||d}||_|S )	zNBuild a minimal SP plan on batch for zimage (spatial sharding + cap sharding).raw_latent_shapeN         prompt_embedsr   )sp_sizerankswap_hwHWH_effW_effH_tokW_tok	H_tok_padH_tok_localh0_tok	cap_total	cap_local	cap_start)r   r   getattrlenrR   heightr4   arch_configspatial_compression_ratiowidthr=   rU   rZ   sizer;   _zimage_sp_plan)rK   batchr[   r\   rV   r^   r_   r]   r`   ra   rb   rc   rd   re   rf   cap_lenrg   rh   ri   planr   r   r   _build_zimage_sp_planU   sZ   


z*ZImagePipelineConfig._build_zimage_sp_planc                 C   s6   t |dd }t }|d u s|d|kr| |}|S )Nrq   r[   )rj   r   getru   )rK   rr   rt   r[   r   r   r   _get_zimage_sp_plan   s
   
z(ZImagePipelineConfig._get_zimage_sp_plancaprt   c                 C   sp   |d dkr|S | d}|d }||kr(tj||dd || dgdd}|d }|d	 }||||  S )
z>cap: [L, D] -> [cap_local, D], padded by repeating last token.r[   r>   r   rg   Ndimri   rh   )rp   torchcatrepeat)rK   rx   rt   Lrg   startlocalr   r   r   
_shard_cap   s   
&zZImagePipelineConfig._shard_capc                 C   s.   t  dkr|jS | |}| |jd |gS )Nr>   r   )r   rZ   rw   r   )rK   rr   rt   r   r   r   get_pos_prompt_embeds   s   

z*ZImagePipelineConfig.get_pos_prompt_embedsc                 C   s  t  }|dks| dkr|dfS | |}|d r"|dd }|d}|| j }|d | }|| j }|dkr^|d d d d d d d	d d d f ddd|d}	tj	||	gdd
}|d | j }
|d |d  | j }|d d d d d d |
|d d f }|d |_
|dfS )Nr>   rW   Fr]   rX   rY   rd   r   ry   rz   rf   re   T)r   r{   rw   	transpose
contiguousrp   r=   r~   r|   r}   _zimage_sp_swap_hw)rK   rr   latentsr[   rt   r`   rb   pad_tokpad_latpadh0h1r   r   r   shard_latents_for_sp   s$   



4&
z)ZImagePipelineConfig.shard_latents_for_spc                 C   s.   |  }t dks| dkr|S t|ddS )Nr>   rW   rX   rz   )r   r   r{   r   )rK   r   r   r   r   gather_latents_for_sp   s   z*ZImagePipelineConfig.gather_latents_for_spc           	      C   s  |  dkrt|ddr|dd }t|dd }|d ur;|  dkr;|d d d d d d d |d d |d f }|j\}}}}}|d urf||d krf|d d d d d |d d d d d f }|d }|dkr{|d d d d d	d d d d f S |||||S )
NrW   r   FrX   rY   rV   r<   r>   r   )r{   rj   r   r   shapeview)	rK   r   rr   rV   bschannels
num_framesrl   ro   r   r   r   post_denoising_loop   s   .*"z(ZImagePipelineConfig.post_denoising_loopc           "      C   s   ddd}t  }|dkrx| |}	||	d ddfd|	d  ddf|ddd}
||
}d}|	d	 }|	d
 }||||f|	d d |	d df|ddd}|jd  | j }|rp|dd|ddd}tj|||dgdd}||}||fS |d}| | j }||| ddfd|ddd}d}|| j	j
j }|| j	j
j }| j| j}}| j}|| || || }}}|| | }| | j }||||f|| d ddf|ddd}|dd|ddd|d} tj|| gdd}!||}||!}||fS )Nc                    sJ   |d u rdd | D } fddt || D }tj|dd}tj|ddS )	Nc                 s   s    | ]}d V  qdS )r   Nr   ).0_r   r   r   	<genexpr>   s    zUZImagePipelineConfig.get_freqs_cis.<locals>.create_coordinate_grid.<locals>.<genexpr>c                    s(   g | ]\}}t j||| t j d qS ))dtyper   )r|   arangeint32)r   x0spanr   r   r   
<listcomp>   s    zVZImagePipelineConfig.get_freqs_cis.<locals>.create_coordinate_grid.<locals>.<listcomp>ij)indexingry   rz   )zipr|   meshgridstack)rp   r   r   axesgridsr   r   r   create_coordinate_grid   s   
zBZImagePipelineConfig.get_freqs_cis.<locals>.create_coordinate_gridr>   rh   ri   r   )rp   r   r   r<   re   rc   rg   rf   )r>   r>   r>   )r   r   r   rz   )r>   r   r   )NN)r   rw   flattenr   r;   r|   r}   r~   rp   r4   rm   rn   r=   r?   )"rK   rZ   ro   rl   r   
rotary_embrr   r   r[   rt   cap_pos_idscap_freqs_cisF_tokensH_tokens_localW_tokensimg_pos_idsimg_pad_lenpad_idsx_freqs_ciscap_ori_lencap_padding_lencap_padded_pos_idsFr^   r_   pHpWpFH_tokensimage_ori_lenimage_padding_lenimage_ori_pos_idsimage_padding_pos_idsimage_padded_pos_idsr   r   r   get_freqs_cis   s   


	z"ZImagePipelineConfig.get_freqs_cisc              	   C   "   d|  |jd |j|j|||iS N	freqs_cisr   r   rZ   ro   rl   rK   rr   r   r   r   r   r   r   prepare_pos_cond_kwargs2     z,ZImagePipelineConfig.prepare_pos_cond_kwargsc              	   C   r   r   r   r   r   r   r   prepare_neg_cond_kwargs>  r   z,ZImagePipelineConfig.prepare_neg_cond_kwargsN),r+   r,   r-   r1   r!   r/   r   T2Ir2   r   r   r3   r   r   r4   r   r5   tupler   r8   r   r9   r;   rR   r=   r?   liststrr.   rO   staticmethodrU   ru   rw   r|   Tensorr   r   r   r   r   r   r   r   r   r   r   r   r0   +   s<   
 <Tr0   )!rS   dataclassesr   r   typingr   r|   $sglang.multimodal_gen.configs.modelsr   r   r   0sglang.multimodal_gen.configs.models.dits.zimager   -sglang.multimodal_gen.configs.models.encodersr	   3sglang.multimodal_gen.configs.models.encoders.qwen3r   .sglang.multimodal_gen.configs.models.vaes.fluxr   3sglang.multimodal_gen.configs.pipeline_configs.baser   r   :sglang.multimodal_gen.runtime.distributed.communication_opr   8sglang.multimodal_gen.runtime.distributed.parallel_stater   r   r   r   r   r$   r%   r0   r   r   r   r   <module>   s"   