o
    ٷi                     @  s  d Z ddlmZ ddlZddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlZddlZddlm  mZ ddlmZ dd	lmZ dd
lmZmZmZmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2m3Z3 ee4Z5e
G dd dZ6dd Z7d*ddZ8e
G dd dZ9e
G d d! d!Z:d+d$d%Z;G d&d' d'ej<Z=G d(d) d)ej<Z>dS ),z-
BagelPipeline implementation for vLLM-Omni.
    )annotationsN)Iterable)deepcopy)	dataclass)isqrt)Image)nn)AutoTokenizerSiglipImageProcessorSiglipVisionConfigSiglipVisionModel)init_logger)AutoWeightsLoader)BagelConfig)DiffusionOutputOmniDiffusionConfig)get_local_device)DiffusersPipelineLoader)OmniDiffusionRequest)!download_weights_from_hf_specific   )AutoEncoderAutoEncoderParams)Bagel
NaiveCacheQwen2MoTConfigQwen2MoTForCausalLMc                   @  sb   e Zd ZU dZded< dZded< dZded< d	Zded
< dZded< dZ	ded< dZ
ded< dS )BagelGenParams2   intnum_timesteps      @floattimestep_shift      @cfg_text_scale      ?cfg_img_scale)g?      ?tuplecfg_interval        cfg_renorm_minglobalstrcfg_renorm_typeN)__name__
__module____qualname__r    __annotations__r#   r%   r'   r*   r,   r/    r4   r4   c/home/ubuntu/.local/lib/python3.10/site-packages/vllm_omni/diffusion/models/bagel/pipeline_bagel.pyr   &   s   
 r   c                 C  s   g }| j  D ]\}}t|tr|| qt|tr||7 }qg }d|vr+|d d|vr4|d d|vr=|d d|vrF|d | |}| d}| d}| d}| d}	t||||	d}
| |
|fS )Nz<|im_start|>z
<|im_end|>z<|vision_start|>z<|vision_end|>)bos_token_ideos_token_idstart_of_imageend_of_image)	special_tokens_mapitems
isinstancer.   appendlist
add_tokensconvert_tokens_to_idsdict)	tokenizerall_special_tokenskv
new_tokensnum_new_tokensr6   r7   r8   r9   new_token_idsr4   r4   r5   add_special_tokens1   s8   











rI   	od_configr   c                 C  s   dd }|S )Nc                 S  s   | S Nr4   )xr4   r4   r5   post_process_funcY   s   z6get_bagel_post_process_func.<locals>.post_process_funcr4   )rJ   rM   r4   r4   r5   get_bagel_post_process_funcW   s   rN   c                   @  &   e Zd ZU dZded< dZded< dS )_VaeCfg   r   
z_channels   
downsampleN)r0   r1   r2   rR   r3   rT   r4   r4   r4   r5   rP   _      
 rP   c                   @  rO   )_VitCfg   r   
patch_size  hidden_sizeN)r0   r1   r2   rX   r3   rZ   r4   r4   r4   r5   rV   e   rU   rV   returnr   c                   C  s    t dddddg ddddd	d

S )N      rS      )r         r`   r_   rQ   gxz,C?g=U?)

resolutionin_channelsrT   chout_chch_multnum_res_blocksrR   scale_factorshift_factor)r   r4   r4   r4   r5   default_ae_paramsk   s   ri   c                      s$   e Zd Z fddZdd Z  ZS )SiglipNaViTWrapperc                   s*   t    t|dr|j| _d S || _d S )Nvision_model)super__init__hasattrrk   )selfrk   	__class__r4   r5   rm   {   s   


zSiglipNaViTWrapper.__init__c                 C  s   | j jj}|j|jjd d}t|||j}| j j	|}|| }|
d}	|jd }
tjdd|
|
ft|jj|j|jd}| }tt|d D ]}|| }||d  }d|d||||f< qM| j j|	|d}|jdS )Nr   r   )devicedtyper+   .)inputs_embedsattention_mask)rk   
embeddingspatch_embeddingweightviewshapeFlinearbiasposition_embedding	unsqueezetorchfullfinfort   minrs   tolistrangelenencoderlast_hidden_statesqueeze)ro   packed_pixel_valuespacked_flattened_position_ids
cu_seqlens
max_seqlenpatch_embedwrL   poshidden_statesseq_lenmaskcu_seqlens_lististartendoutputsr4   r4   r5   forward   s   


(zSiglipNaViTWrapper.forward)r0   r1   r2   rm   r   __classcell__r4   r4   rp   r5   rj   z   s    rj   c                      sR   e Zd ZdZddd  fdd	Zed!ddZe d"ddZ	d#ddZ
  ZS )$BagelPipelinezBagel generation pipeline (MoT) packaged for vllm-omni diffusion engine.

    This pipeline is self-contained and uses the ported Bagel core files.
     )prefixrJ   r   r   r.   c                  s  t    || _t | _|j}tj|}|r|}nt	||j
dg}tj|d}t|dd}t|}W d    n1 s?w   Y  |dpJi }	tt|	ddt|	dd	d
}
tj|d}t|}d|_d|_|jpsd|_tj|ddd| _|dpi }tt|ddt|ddd}tj|d}t|}|jdkrd|_d|_t|| _ t!j|dd| _"| j rt#| j | _ | j j$j%j&|_&| j j$j%j'|_'t(| j\| _| _)}t*| j}t+dd | j), D }t+tt-|d|t|t|d |_.t/|| _0t1 }t2|| _3t4| j0| j t5||
|t|ddt6|d d!t7|d"dt|d#d$t|d%d&t8|d'd(d)	d*| _9t:j;|jd |j
d+dd,g| _<| =| j d S )-N*zconfig.jsonzutf-8)encoding
vae_configrR   rQ   rT   rS   )rR   rT   zllm_config.jsonTFQwen2MoTDecoderLayer)local_files_onlytrust_remote_code
vit_configrX   rW   rZ   rY   )rX   rZ   zvit_config.json      )r   c                 s  s    | ]}t |V  qd S rK   )r   ).0rE   r4   r4   r5   	<genexpr>   s    z)BagelPipeline.__init__.<locals>.<genexpr>
vocab_sizer   vit_max_num_patch_per_sideF   connector_actgelu_pytorch_tanhinterpolate_poslatent_patch_sizer_   max_latent_size    r#   r(   )	
llm_configr   r   r   r   r   r   r   r#   )language_model	vit_modelconfigr   )model_or_path	subfolderrevisionr   fall_back_to_pt)>rl   rm   rJ   r   rs   modelospathexistsr   r   joinopenjsonloadgetrP   r   r   from_json_fileqk_normtie_word_embeddingsoverride_transformer_cls_namelayer_moduler	   from_pretrainedrB   rV   r   num_hidden_layersvision_use_headr   r   r
   image_processorrj   rk   r   rZ   rX   rI   rH   r   maxvaluesgetattrr   r   r   ri   r   vaer   r   r.   boolr"   bagelr   ComponentSourceweights_sourcesto)ro   rJ   r   r   r   
model_pathcfg_pathf	bagel_cfgvae_cfg_dictvae_cfgllm_cfg_pathr   vit_cfg_dictvit_cfgvit_config_pathvit_conf_tok_lenrequired_max_id	ae_paramsrp   r4   r5   rm      s   









zBagelPipeline.__init__r   r   r   r   latenttorch.Tensorimage_shapetuple[int, int]r[   Image.Imagec                 C  s   |\}}|| j  || j  }}| j}| j}	|d|||||	}td|}|d|	|| || }t| j}
|	|
}|
|}|d d ddd dddd }t|	tj  S )Nr   znhwpqc->nchpwqg      ?r   r_      )latent_downsampler   latent_channelreshaper   einsumnext
parametersrt   r   decodeclamppermuter   	fromarrayuint8cpunumpy)r   r   r   r   HWhr   pc	vae_dtypeimager4   r4   r5   _decode_image_from_latent  s   

&z'BagelPipeline._decode_image_from_latentreqr   r   c           2        s	  t |jdkrtdd |jd }t|jd tr|n	|jd dp%d}tjj	jj
 }|jjd u rA|jjd u rA| }}n|jjd urMt|jjn|}|jjd ur[t|jjn|}||kse||krtd| d| d	| d| d
jj	 djj
 d||f}t|jdi pi }|dd}	|dd}
tt|jjpdd|	|
d}dgdgtjjjjd}t|}t|}|jj}|d urtd ||d< |jd jd }|g|d< |g|d< td t|j|jddd}nt|trd n	|dpi d}|rt|ts|g}|rdd |D }|r)jr)jr)fd d!}jj
tjj	 fd"d#  fd$d|D }|d j \}}||f}td%| d|  d&d' }jj!|d |d ||j"d(\}}}|# D ]\}}t$%|r|&j'||< qvt$j(j'j)j'j)d)kj*j+d* jj,j|d fi ||d< W d    n	1 sw   Y  ||d< ||d< jj-|d |d ||j"d(\}}}|# D ]\}}t$%|r|&j'||< qt$j(j'j)j'j)d)kj*j+d* jj.|d fi ||d< W d    n	1 sw   Y  ||d< ||d< t|}jj/|d |d |gj0j"d+\}}} t|d, 1 2 }!tj3j4}"|!|"kr^td-|! d.|" d/|# D ]\}}t$%|rt|&j'||< qbt$j(j'j)j'j)d)kj*j+d* jj5|d fi ||d< W d    n	1 sw   Y  ||d< | |d< |d0d}#jj/|d |d |#gj0j"d+\}$}%}&|$# D ]\}}t$%|r|&j'|$|< qt$j(j'j)j'j)d)kj*j+d* jj5|d fi |$|d< W d    n	1 s
w   Y  |%|d< |&|d< jj/|d |d |gj0j"d+\}'}(})|'# D ]\}}t$%|rB|&j'|'|< q0t$j(j'j)j'j)d)kj*j+d* jj5|d fi |'|d< W d    n	1 snw   Y  |(|d< |)|d< |jj6d urt$7|jj6 j'j)d1krt$j87|jj6 jj9|d |d |gj"d2}t|d, 1 2 }*tj3j4}"|*|"krtd3|* d.|" d4t|d5 : 2 }+|+dk rtd6|+ d7t|d8 1 2 },tjj	jj	 d }-|,|-krtd9|, d:|- d;| d<jj	 d=	|# D ]\}}t$%|r#|&j'||< qjj;|d |d |gd>}.jj;|d |d |gd>}/|.# D ]\}}t$%|rW|&j'|.|< qE|/# D ]\}}t$%|ro|&j'|/|< q]t$j(j'j)j'j)d)kj*j+d*D jj<dF|d |d |d |j|j|j=|j>|j?|j@|jAd?
||.d@ |.dA |.dB |.dC |/d@ |/dA |/dB |/dC dD}0W d    n	1 sw   Y  Bjj|0d |}1tC|1dES )GNr   z@This model only supports a single prompt, not a batched request.z$Taking only the first image for now.r   promptr   zRequested resolution rL   z  exceeds Bagel checkpoint limit z (max_latent_size=z, latent_downsample=z).
extra_argsr%   r$   r'   r&   r   r!   )r    r#   r%   r'   )kv_lensropespast_key_valuesz Using injected KV Cache (direct)r  r  r  z,CFG is disabled when using injected KV Cacher(   multi_modal_datar   c                 S  s$   g | ]}t |trt|n|qS r4   )r<   r.   r   r   )r   r   r4   r4   r5   
<listcomp>\  s   $ z)BagelPipeline.forward.<locals>.<listcomp>c                   s    j | ddjd S )Npt)imagesreturn_tensorsr   )r   pixel_values)img)ro   r4   r5   vit_transformsb  s   z-BagelPipeline.forward.<locals>.vit_transformsc                   s   | j dkr
| d} | j\}}t t|| d}td }t||t|| }ttt||   }ttt||   }t| }t| }||ksX||kra| ||ftj	} | S )NRGBr(   r\   )
modeconvertsizer   r   r   roundresizer   BICUBIC)r  r   r   scalemin_img_sizenew_wnew_h)max_img_sizestrider4   r5   _resize_to_strideh  s   





z0BagelPipeline.forward.<locals>._resize_to_stridec                   s   g | ]} |qS r4   r4   )r   r  )r  r4   r5   r  z  s    zimg2img: resized image to c                 S  s>   | j dkr
| d} tt|  d d }|dddS )Nr  g     _@r(   r_   r   r   )r  r  r   
from_numpynparrayr"   r   )r  arrr4   r4   r5   vae_transforms  s   

z-BagelPipeline.forward.<locals>.vae_transforms)curr_kvlens	curr_roper	  
transformsrH   r   )device_typeenabledrt   )r!  r"  promptsrB   rH   packed_text_idsz-Tokenizer/model vocab mismatch: max token id z >= embed_tokens size z. This usually means you're not using the tokenizer shipped with the Bagel checkpoint, or llm_config.vocab_size is smaller than the tokenizer vocab.negative_promptcuda)r!  r"  image_sizesrH   z:Tokenizer/model vocab mismatch (image path): max token id zP. This indicates the tokenizer token IDs do not match the checkpoint embeddings.packed_position_idsz!Invalid packed_position_ids: min=z (must be >= 0)packed_vae_position_idszEInvalid packed_vae_position_ids (latent position embedding OOB): max=z > allowed_max=z. Requested image_shape=z, max_latent_size=.)r!  r"  r*  )
r  cfg_text_past_key_valuescfg_img_past_key_valuesr    r#   r%   r'   r*   r,   r/   cfg_packed_position_idscfg_packed_query_indexescfg_key_values_lenscfg_packed_key_value_indexes)cfg_text_packed_position_idscfg_text_packed_query_indexescfg_text_key_values_lens!cfg_text_packed_key_value_indexescfg_img_packed_position_idscfg_img_packed_query_indexescfg_img_key_values_lens cfg_img_packed_key_value_indexes)outputr4   )Dr   r&  loggerwarningr<   r.   r   r   r   r   r   sampling_paramsheightwidth
ValueErrorr   r   num_inference_stepsr   r   r   r   r   r  info	key_cacher{   r    r#   r>   r   r   r  prepare_vae_imagesrH   r;   r   	is_tensorr   rs   autocasttyperJ   rt   forward_cache_update_vaeprepare_vit_imagesforward_cache_update_vitprepare_promptsrB   r   itemr   r   forward_cache_update_textseedmanual_seedr)  prepare_vae_latentr   prepare_vae_latent_cfggenerate_imager%   r'   r*   r,   r/   r   r   )2ro   r   first_promptr  max_hwr@  rA  r   r  r%   r'   
gen_paramsgen_contextcfg_text_contextcfg_img_contextinjected_kvr   image_inputr  	resized_w	resized_hr   gen_input_vaenewlens_vaenew_rope_vaerD   rE   gen_input_imgnewlens_imgnew_rope_imggeneration_inputnewlensnew_ropemax_tidemb_n
neg_prompt	neg_inputneg_newlensneg_ropecfg_img_generation_inputcfg_img_newlenscfg_img_new_ropemax_tid_imgmin_pidmax_lat_pidmax_lat_pid_allowedgeneration_input_cfg_textgeneration_input_cfg_imglatentsr  r4   )r  r  ro   r  r5   r     sJ  
(





 	











zBagelPipeline.forwardweights"Iterable[tuple[str, torch.Tensor]]set[str]c           
        s     }t| dd | D dd  D g d}t }tD ]}|D ]\}}||v r<|||| q+q'| | dd	d
dfdd  fdd}t	}	|	
| S )Nc                 S  s   i | ]
\}}|t |jqS r4   )r)   r{   )r   rD   rE   r4   r4   r5   
<dictcomp>Z  s    z.BagelPipeline.load_weights.<locals>.<dictcomp>c                 S  s   h | ]\}}t |d r|qS )weight_loader)rn   )r   namer   r4   r4   r5   	<setcomp>\  s    z-BagelPipeline.load_weights.<locals>.<setcomp>))	.qkv_projz.q_proj)r  z.k_proj)r  z.v_proj).qkv_proj_moe_genz.q_proj_moe_gen)r  z.k_proj_moe_gen)r  z.v_proj_moe_genr}  r.   r[   c                 S  sb   dD ]}|  |r| t|d  } q|  dr!d| tdd   } |  ds+|  dr/d|  } | S )N)zmodule.model.z
vae_model.zvae.zencoder.zdecoder.
startswithr   )r}  pfxr4   r4   r5   _normalize_namer  s   

z3BagelPipeline.load_weights.<locals>._normalize_nameIterable[str]c                 3  s     | }|V  dD ]}| |rd| V   nq
dD ]}| |r)d| V   nq| dr6d| V  d	S | drBd| V  d	S | drTd|tdd	  V  d	S d	S )
aL  Yield candidate parameter names in this pipeline for a checkpoint key.

            The upstream Bagel repo typically stores Bagel-core layers (time_embedder,
            latent_pos_embed, vae2llm, llm2vae, etc.) at the top-level of the model,
            while this vllm-omni integration nests them under `self.bagel`.
            )ztime_embedder.zlatent_pos_embed.zvae2llm.zllm2vae.zbagel.)z
connector.zvit_pos_embed.z
vit_model.zvision_model.zbagel.vit_model.zmodel.vision_model.r  Nr  )r}  nr  )r  r4   r5   _iter_candidate_names  s(   






z9BagelPipeline.load_weights.<locals>._iter_candidate_namesc                  3  s   d} d}d}D ]\}}| d7 } d } |D ]}|v rt |j|ks+|v r/|} n|dr|jdkr|j\}}tt|}	|	|	 t|kr|tjjkrjj	j
}
|
j||f|
_t|	j_tjdrxtjjdt|	 tjj	drt|	jj	_||f|< |} n/|dr|jdkr|d ur| }| tt|kr||}|} n|d7 }q|d ur|d7 }||fV  q	td	|| | d S )
Nr   r   z bagel.latent_pos_embed.pos_embedr_   r   r   max_num_patch_per_sidez!embeddings.patch_embedding.weightzJBagelPipeline weight filter kept %d/%d tensors (shape mismatches seen: %d))r)   r{   r   endswithndimr   r   r   rZ   latent_pos_embed	pos_embeddata	new_emptyr   rn   setattrr   r  numelr   prodtensorrz   r=  	info_once)totalkeptshape_mismatchr}  r  pickedcandnposhdimsideparamtarget_shape)r  allowedro   shapestp_aware_paramsrx  r4   r5   _filtered_weights  sZ   
 


z5BagelPipeline.load_weights.<locals>._filtered_weights)r}  r.   r[   r.   )r}  r.   r[   r  )
state_dictsetkeysr;   named_parametersr>   addreplaceupdater   load_weights)
ro   rx  state_stacked_expansionsstacked_source_namesr}  target_suffixsource_suffixr  loaderr4   )r  r  r  ro   r  r  rx  r5   r  W  s&   


6zBagelPipeline.load_weights)rJ   r   r   r.   )
r   r   r   r   r   r   r   r   r[   r   )r   r   r[   r   )rx  ry  r[   rz  )r0   r1   r2   __doc__rm   staticmethodr   r   inference_moder   r  r   r4   r4   rp   r5   r      s    g  Ar   )rJ   r   )r[   r   )?r  
__future__r   r   r   collections.abcr   copyr   dataclassesr   mathr   r   r  r   torch.nn.functionalr   
functionalr|   PILr   transformersr	   r
   r   r   vllm.loggerr    vllm.model_executor.models.utilsr   %vllm.transformers_utils.configs.bagelr   vllm_omni.diffusion.datar   r   %vllm_omni.diffusion.distributed.utilsr   1vllm_omni.diffusion.model_loader.diffusers_loaderr   vllm_omni.diffusion.requestr   2vllm_omni.model_executor.model_loader.weight_utilsr   autoencoderr   r   bagel_transformerr   r   r   r   r0   r=  r   rI   rN   rP   rV   ri   Modulerj   r   r4   r4   r4   r5   <module>   sH   

&
