o
    ۷i;                     @   s   d Z ddlZddlZddlZddlmZ ddlmZmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZ eeZ	ddedejejB fddZdd Zdd Zdd ZdddZedkrae  dS dS )zServer launcher for cache-dit.

Adapted from SGLang's server launcher:
https://github.com/sgl-project/sglang/blob/main/python/sglang/launch_server.py
    N   )normalize_quantize_type)current_platformCpuPlatform   )ModelManager)
create_app)align_cache_config)init_loggerTparsereturnc                 C   s  t  }|jdddd |jdddd |jdddd |jdddd |jdtd d	d
 |jdtd dd
 |jddddd |jdddtd d |jdtd d |jdtd d |jdddtdd |jdddtdd |jdd d!td"d |jd#d$d%td&d |jd'd(tdd |jd)d*td+d |jd,d-td.d |jd/ddd |jd0d1tdd |jd2ddd |jd3d4td g d5d6d7 |jd8td d |jd9td d |jd:d;ddd |jd<d=td g d>d? |jd@td dAd
 |jdBdCtd g dDd? |jdEdddFd |jdGdHdddId |jdJtd g dKd? |jdLddd |jdMtd dNd
 |jdOtd dPd
 |jdQtd dRd
 |jdStd dTd
 |jdUdddVd |jdWdXdddYd |jdZd[ddd\d |jd]d^ddd_d |jd`ddd |jdatd d |jdbtd d |jdctdddedfgg dgdh |jdiddjd |jdkddjd | r| n|}| rt|j|_|jd urdj|_	|j	r|jd u rdl|_|j
d ur|jsdj|_|j
dmkrdn|_
|j
dokrdp|_
|j
dqkrdr|_
|j
dskrdt|_
|S )uNz--cache
store_trueF)actiondefaultz	--compilez--compile-repeated-blocksz--max-autotunez--lora-pathzPath to LoRA weights directorytyper   helpz--lora-namez.LoRA weight filename (e.g., model.safetensors)z--disable-fuse-loraz0Disable LoRA fusion (keep LoRA weights separate)r   r   r   z--num-inference-stepsz--stepsnum_inference_steps)destr   r   z--warmup)r   r   z--repeatz--Fn-compute-blocksz--FnFn_compute_blocksr   z--Bn-compute-blocksz--BnBn_compute_blocksr   z--residual-diff-thresholdz--rdtresidual_diff_thresholdgQ?z--max-warmup-stepsz--wsz--w   z--warmup-intervalz--wiz--max-cached-stepsz--mcz--max-continuous-cached-stepsz--mcc   z--taylorseerz--taylorseer-orderz-orderz--steps-maskz--mask-policyz--scm)	Nslowsmediummfastfultrauz)Pre-defined steps computation mask policyr   r   choicesr   z--heightz--widthz
--quantizez-qz--quantize-typez--quant-type)Nfloat8float8_weight_only	float8_woint8int8_weight_onlyint8_woint4int4_weight_onlyint4_wobitsandbytes_4bitbnb_4bit)r   r   r%   z--pipeline-quant-config-pathzOPath to custom Python module that provides get_pipeline_quant_config() functionz--parallel-typez
--parallel)Ntpulyssesringz--parallel-vaez%Enable VAE parallelism if applicable.z--parallel-text-encoderz--parallel-textz.Enable text encoder parallelism if applicable.z--attn)Nflash_flash_3native_native_cudnn_sdpa_cudnnsagez--perfz--promptzOverride default promptz--negative-promptz Override default negative promptz--model-pathzOverride model pathz--image-pathzOverride image pathz--track-memoryz&Track and report peak GPU memory usagez--ulysses-anythingz--uaaz9Enable Ulysses Anything Attention for context parallelismz--ulysses-float8z--ufp8z;Enable Ulysses Attention/UAA Float8 for context parallelismz--ulysses-asyncz--uaqkvzNEnabled experimental Async QKV Projection with Ulysses for context parallelismz	--profilez--profile-namez--profile-dirz--profile-activities+CPUGPU)r;   r<   MEM)r   nargsr   r%   z--profile-with-stackTz--profile-record-shapesr'   r   r   r   r   r!   r    r#   r"   )argparseArgumentParseradd_argumentstrintfloat
parse_argsr   quantize_typequantizemask_policy
steps_mask)r   parserargs_or_parser rL   K/home/ubuntu/vllm_env/lib/python3.10/site-packages/cache_dit/serve/serve.pyget_args   sP  rN   c                  C   s   t dd} | jdtddd | jdtdd	d | jd
tddd | jdtd dd | jdtdg ddd | jddddd | jdtd dd |  }tt|dd |_|jd ur\d|_|jrg|jd u rgd|_|j	so| 
d |S )NF)r   z--hostz0.0.0.0zServer hostr   z--porti@  zServer portz	--workersr   zNumber of worker processesz--devicez)Device (cuda/cpu), auto-detect by defaultz--dtypebfloat16)float32float16rO   zModel dtyper$   z--enable-cpu-offloadr   z%Enable CPU offload (saves GPU memory)r   z--device-mapz$Device map strategy (e.g., balanced)rF   Tr'   z--model-path is required)rN   rA   rB   rC   rE   r   getattrrF   rG   
model_patherror)rJ   argsrL   rL   rM   rE      sj   


rE   c                  C   s`   dd l m}  t }tj}|  r#|  }t||t	  }||fS dt|r,|fS dfS )Nr   cpu)
torch.distributeddistributedr   is_accelerator_availabledevice_typeis_initializedget_ranktorchdevicedevice_count)dist	availablerZ   rankr^   rL   rL   rM   get_rank_device+  s   rc   c           	      C   s   dd l m} tj}tj}| jr| d| ntj}t }tj}| j	d ur9|j
|d t \}}t| ||fS dt|rB|fS dfS )Nr   ,)backendrV   )rW   rX   r   full_dist_backendr   ulysses_anythingdist_backendrY   rZ   parallel_typeinit_process_grouprc   
set_devicer]   r^   )	rU   r`   platform_full_backendcpu_full_backendre   ra   rZ   rb   r^   rL   rL   rM   maybe_init_distributed7  s"   


rn   c                 C   sl  | du rt  } t| \}}| jdur%ddlm} td| d|   tt	| j
}| j}d}|rK| j| j| j| j| j| j| jd}t| j| |d}i }| jdv rt| drb| jdurb| j|d	< nd
|d	< t| drr| jrrd|d< t| dr~| jr~d|d< t| dr| jrd|d< n| jdkr	 | j|d< | j|d< td td;i d| jd| jptjd|d|d|d| j d| j!d| j"d| jd|d | jd!| j#d"| j$d#| j%d$| j&d%| j'd&| j( }td' |)  td( | jd)v rnddlm} |*  td*| d+ |dkrTdd,l+m,}	 |	||| }
t-|
}td-| j d.| j. d/| j/  td0| j. d/| j/ d1 t0j1|| j.| j/d2d3d4 dS dd5l+m2} td6| d7| j d8 ||| dS |dkrt-|}td9| j. d/| j/  td0| j. d/| j/ d1 t0j1|| j.| j/| j3d3d4 dS t4d*| d: ddl5}	 |6d2 q)<zLaunch the serving server.Nr   zInitialized distributed: rank=z, world_size=)r   r   r   max_warmup_stepswarmup_intervalmax_cached_stepsmax_continuous_cached_steps)rS   rU   base_cache_config)r2   r3   attnattention_backendr6   rg   Texperimental_ulysses_anythingulysses_float8experimental_ulysses_float8ulysses_asyncexperimental_ulysses_asyncr1   parallel_text_encoderparallel_vaezInitializing model manager...rS   r^   torch_dtypeenable_cachecache_configenable_cpu_offload
device_mapenable_compileri   parallel_argsattn_backendrG   rF   pipeline_quant_config_path	lora_path	lora_name	fuse_lorazLoading model...zModel loaded successfully!)r1   r2   r3   zRank z&: All ranks ready, starting service...)TPCoordinatorz%Starting distributed server (rank 0, z) at http://:zAPI docs at http://z/docsr   info)hostportworkers	log_level)run_tp_workerz"Starting distributed worker (rank z, )zStarting server at http://z$: Unexpected rank in single GPU moderL   )7rE   rn   ri   rW   rX   loggerr   get_world_sizerR   r]   dtypecacher   r   r   ro   rp   rq   rr   r	   rS   hasattrrt   rg   rw   ry   r{   r|   r   r^   r   rZ   r   r   compilerG   rF   r   r   r   disable_fuse_lora
load_modelbarriercache_dit.serve.tp_workerr   r   r   r   uvicornrunr   r   warningtimesleep)rU   rb   r^   r`   r}   r~   r   r   model_managerr   coordinatorappr   r   rL   rL   rM   launch_serverN  s   







	




	

	
r   __main__)T)N)__doc__r?   r]   r   quantize.utilsr   	platformsr   r   r   r   
api_serverr   cache_alignmentr	   cache_dit.loggerr
   __name__r   boolr@   	NamespacerN   rE   rc   rn   r   rL   rL   rL   rM   <module>   s4    

 UA
 
