o
    iL                     @  s~  d Z ddlmZ ddlZddlZddlZddlZddlZej	ej	ej
eZejedZejedZejedZeeefD ]ZeejvrRejde qDejddZe d	e d	e Zerqe d	e ejd< neejd< ed
ZejeddZdZdZeedfd9ddZd:ddZ													d;d<d0d1Zd2d3 Zd4d5 Zd6d7 Z e!d8kre   dS dS )=a  
Local entry point for Veena3 TTS - runs the same FastAPI app without Modal.

Replicates what Modal's @modal.enter + @modal.asgi_app does:
1. Sets up PYTHONPATH for vendored deps (sparktts, AP-BWE)
2. Downloads model weights from HuggingFace if not present
3. Initializes TTS runtime (vLLM engine, BiCodec decoder, pipelines)
4. Serves the FastAPI app via uvicorn

Usage:
    # Basic (auto-downloads model, uses defaults)
    python -m veena3modal.local_server

    # Custom model path (skip download)
    python -m veena3modal.local_server --model-path /path/to/spark_tts_4speaker

    # Custom GPU memory + port
    python -m veena3modal.local_server --gpu-memory 0.5 --port 8080

    # With super-resolution (48kHz output)
    python -m veena3modal.local_server --enable-sr --sr-path /path/to/ap_bwe/16kto48k

    # CPU-only (no GPU, for testing only - very slow)
    python -m veena3modal.local_server --device cpu

Environment Variables (override defaults):
    MODEL_PATH              - Path to Spark TTS model directory
    HF_TOKEN                - HuggingFace token for private models
    AP_BWE_CHECKPOINT_DIR   - Path to super-resolution checkpoints
    AUTH_BYPASS_MODE        - "true" to disable auth (default for local)
    GPU_MEMORY_UTILIZATION  - vLLM GPU memory fraction (default: 0.25)
    )annotationsNexternalsparkttszAP-BWE
PYTHONPATH :zveena3.localmodelsspark_tts_4speakerzBayAreaBoys/spark_tts_4speakerzSparkAudio/Spark-TTS-0.5Bhf_repostr	local_dirhf_token
str | Nonereturnc                   s  g d}t  fdd|D rtd    S zddlm} W n ty4   td td Y nw t	j
 d	d
 td|  d z||  |g dd}td|  |W S  tyt } ztd|  W Y d}~nd}~ww | tkrtdt d z|t |d}td|  td |W S  ty } ztd|  W Y d}~nd}~ww tdt d  dt d  d	 td dS )aL  
    Download Spark TTS model from HuggingFace if not already present.

    Strategy:
    1. Try private repo (BayAreaBoys/spark_tts_4speaker) with HF token
    2. Fall back to public base model (SparkAudio/Spark-TTS-0.5B)
    3. Provide instructions for Modal volume download if both fail

    Returns the resolved local path.
    config.jsonzconfig.yamlzLLM/config.jsonc                 3  &    | ]}t jt j |V  qd S Nospathexistsjoin).0mr    3/home/ubuntu/veenaModal/veena3modal/local_server.py	<genexpr>^   s   $ z!download_model.<locals>.<genexpr>zModel already present at r   )snapshot_downloadzyhuggingface_hub not installed. Install with: pip install huggingface-hub
Or download model manually and pass --model-path   T)exist_okzAttempting download from z...)z*.binztraining_args.binzoptimizer.pt)repo_idr   tokenignore_patternszModel downloaded to zPrivate repo download failed: NzTrying public base model: )r"   r   r#   zBase model downloaded to zpUsing public base SparkTTS model (no custom speakers). For full speaker support, download your fine-tuned model.z#Public model download also failed: z
Model download failed. Options:
  1. Download from Modal volume:
     modal volume get veena3-models spark_tts_4speaker models/spark_tts_4speaker

  2. Download from HuggingFace (if you have access):
     huggingface-cli download z --local-dir z>

  3. Use base SparkTTS model:
     huggingface-cli download za

  4. Specify a custom path:
     python -m veena3modal.local_server --model-path /path/to/model)anyloggerinfohuggingface_hubr   ImportErrorerrorsysexitr   makedirs	ExceptionwarningPUBLIC_FALLBACK_REPOPRIVATE_HF_REPO)r
   r   r   marker_filesr   downloaded_pathee2r   r   r   download_modelM   sp   r6   
model_pathtuple[str, str]c                 C  s~   t j| d}t j| d}t j|r"t jt j|dr"|}n| }| }td td|  td|  ||fS )a  
    Resolve LLM and BiCodec paths from a Spark TTS model directory.

    Spark TTS model structure (from SparkAudio/Spark-TTS-0.5B):
      model_path/
        LLM/          <- Language model (vLLM loads this)
        BiCodec/      <- Audio tokenizer/decoder
        config.yaml   <- Top-level config

    Fine-tuned models may have LLM files at root (flat structure).
    This function auto-detects both layouts.

    Returns:
        (llm_path, bicodec_path) resolved paths
    LLMBiCodecr   zResolved model paths:z  LLM (vLLM):    z  BiCodec:       )r   r   r   isdirr   r&   r'   )r7   
llm_subdirbicodec_subdirllm_pathbicodec_pathr   r   r   resolve_model_paths   s   "
r@   cuda333333?Fr    devicegpu_memory_utilizationfloat	enable_srboolsr_checkpoint_dirnum_enginesintmax_num_batched_tokens
int | Nonemax_num_seqsenable_chunked_prefillbool | Noneenable_prefix_cachingdisable_log_statsenforce_eagerprecompute_speaker_globalsc                 C  s&  ddl m} t| \}}t }td td td|   td|  td|  td|  td	|d
d td|  |durXtd|  |durdtd|  |	durttd|	rodnd  |
durtd|
rdnd  |durtd|rdnd  |durtd|rdnd  td|rdnd  td|rdnd  td ||||r|nd||||||||	|
|||d}t | }td|dd td|j  td|jrdnd  td |jr|jj	rd!nd  |S )"z
    Initialize TTS runtime for local serving.

    This replaces Modal's @modal.enter lifecycle hook.
    Loads: vLLM engine, BiCodec decoder, prompt builder, pipelines.
    r   )initialize_runtime<============================================================z'Initializing Veena3 TTS Runtime (local)z  Model:       z  LLM path:    z  BiCodec:     z  Device:      z  GPU Memory:  z.0%z (total)z  Engines:     Nz  Max batched tokens: z  Max seqs:    z  Chunked prefill: enableddisabledz  Prefix caching: z  Engine stats logs: z  Enforce eager: z  Precompute globals: z  SR:          )r7   r?   rH   rC   r   rD   rF   rI   rK   rM   rN   rP   rQ   rR   rS   zRuntime initialized in z.1fsz  Model version: z  Streaming: 	availableunavailablez  Super-resolution: loaded)
 veena3modal.services.tts_runtimerT   r@   timer&   r'   model_versionstreaming_pipeline
sr_service	is_loaded)r7   rC   rD   rF   rH   r   rI   rK   rM   rN   rP   rQ   rR   rS   rT   r>   r?   startruntimeelapsedr   r   r   initialize_local_runtime   sb   



&re   c                  C  s   ddl m}  |  S )z}
    Create the FastAPI app for local serving.

    Same app factory as Modal deployment, but called from local context.
    r   
create_app)veena3modal.api.fastapi_apprg   rf   r   r   r   create_local_app  s   ri   c                  C  sR  t jdt jdd} | d}|jdttjdt	dt	 dd	 |jd
tt
dt
 dd	 |jdttjddd	 |jdddd | d}|jdtdddgdd |jdtttjdddd	 |jdtttjddd d	 |jd!td"d#d	 |jd$td"d%d	 |jd&dd"d'd( |jd)dd*d+d( |jd,dd*d-d( |jd.dd*d/d( |jd0dd*d1d( |jd2dd*d3d( | d4}|jd5dd*d6d( |jd7ttjd8d9d	 | d:}|jd;td<d=d	 |jd>tttjd?d@dAd	 |jdBtdCdDd	 |jdEdd*dFd( | dG}|jdHdd*dId( | dJ}|jdKtdLg dMdNd |  S )Oz3Parse CLI arguments for local server configuration.z8Veena3 TTS Local Server - GPU-accelerated text-to-speecha  
Examples:
  python -m veena3modal.local_server                          # Auto-download model, serve on :8000
  python -m veena3modal.local_server --port 8080              # Custom port
  python -m veena3modal.local_server --model-path ./my_model  # Use local model
  python -m veena3modal.local_server --gpu-memory 0.5         # Limit GPU memory
  python -m veena3modal.local_server --workers 2              # Multiple workers (careful: each loads model)
        )descriptionformatter_classepilogModelz--model-path
MODEL_PATHz"Path to Spark TTS model (default: ))typedefaulthelpz	--hf-repoz,HuggingFace repo to download from (default: z
--hf-tokenHF_TOKENz2HuggingFace token (default: from HF_TOKEN env var)z--skip-download
store_truez-Don't auto-download model (fail if not found))actionrr   GPUz--devicerA   cpuz$Device for inference (default: cuda))rp   rq   choicesrr   z--gpu-memoryGPU_MEMORY_UTILIZATIONz0.25z4vLLM GPU memory utilization fraction (default: 0.25)z--num-enginesNUM_ENGINES1zKNumber of vLLM engine instances (default: 1, use 2-3 for multi-engine mode)z--max-num-batched-tokensNzLOverride vLLM max_num_batched_tokens scheduler limit (default: model config)z--max-num-seqsz?Override vLLM max_num_seqs scheduler limit (default: vLLM auto)z--enable-chunked-prefillz!Force-enable vLLM chunked prefill)ru   rq   rr   z--disable-chunked-prefillFz"Force-disable vLLM chunked prefillz--disable-prefix-cachingzDisable vLLM prefix cachingz--disable-engine-stats-logsz#Disable vLLM periodic stats loggingz--enforce-eagerz(Enable eager mode (disables CUDA graphs)z--precompute-globalszWPrecompute speaker global tokens at startup (streaming-only optimization, experimental)zSuper-Resolutionz--enable-srz/Enable AP-BWE super-resolution (16kHz -> 48kHz)z	--sr-pathAP_BWE_CHECKPOINT_DIRz#Path to AP-BWE checkpoint directoryServerz--hostz0.0.0.0zBind address (default: 0.0.0.0)z--portPORT8000z Port to serve on (default: 8000)z	--workersr    z=Number of uvicorn workers (default: 1, each loads full model)z--reloadzDEnable auto-reload for development (incompatible with --workers > 1)Authz--authz;Enable API key authentication (default: disabled for local)Loggingz--log-levelINFO)DEBUGr   WARNINGERRORzLog level (default: INFO))argparseArgumentParserRawDescriptionHelpFormatteradd_argument_groupadd_argumentr   r   environgetDEFAULT_LOCAL_MODEL_DIRr1   rE   rJ   
parse_args)parsermodel_group	gpu_groupsr_groupserver_group
auth_group	log_groupr   r   r   r     s.  









r   c                    s  t  } tjtt| jddd | jsdtjd< t	d ntj
dd | j t fdd	d
D }|sN| jrEtd  d td t| j | jd | j}| jrVd}t | j| j| j| j| j| j| j| j|| jrndnd| jrtdnd| j rzdnd| j!d ddl"}t	d t	d t	d t	d| j# d| j$  t	d| j# d| j$ d t	d| j# d| j$ d t	d| j# d| j$ d t	d| j# d| j$ d  t	d |j%d!d| j#| j$| j&| j'| j( d"d# dS )$z
    Main entry point for local TTS server.

    Lifecycle:
    1. Parse args + configure logging
    2. Download model if needed
    3. Initialize TTS runtime (vLLM engine, decoders, pipelines)
    4. Create FastAPI app
    5. Run uvicorn
    z6%(asctime)s | %(levelname)-8s | %(name)s | %(message)sz%Y-%m-%d %H:%M:%S)levelformatdatefmttrueAUTH_BYPASS_MODEzAAuth bypass enabled (local mode). Use --auth to require API keys.falsec                 3  r   r   r   )r   markerr7   r   r   r     s
    
zmain.<locals>.<genexpr>r   zModel not found at z and --skip-download is setr    )r
   r   r   FNT)r7   rC   rD   rF   rH   r   rI   rK   rM   rN   rP   rQ   rR   rS   r   r   rU   z Veena3 TTS Local Server startingz  URL: http://r   z  Health: http://z/v1/tts/healthz  Generate: POST http://z/v1/tts/generatez  Metrics: http://z/v1/tts/metricsz  Docs: http://z/docsz)veena3modal.local_server:create_local_app   )factoryhostportworkersreload	log_leveltimeout_keep_alive))r   loggingbasicConfiggetattrr   authr   r   r&   r'   
setdefaultr7   r%   skip_downloadr*   r+   r,   r6   r
   r   rN   disable_chunked_prefillre   rC   
gpu_memoryrF   sr_pathrI   rK   rM   disable_prefix_cachingdisable_engine_stats_logsrR   precompute_globalsuvicornr   r   runr   r   lower)argsmodel_existsrN   r   r   r   r   main  s|   







r   __main__)r
   r   r   r   r   r   r   r   )r7   r   r   r8   )rA   rB   FNNr    NNNNNNF)r7   r   rC   r   rD   rE   rF   rG   rH   r   r   r   rI   rJ   rK   rL   rM   rL   rN   rO   rP   rO   rQ   rO   rR   rO   rS   rG   )"__doc__
__future__r   r   r   r   r+   r]   r   dirnameabspath__file__	REPO_ROOTr   EXTERNAL_DIRSPARKTTS_PATHAP_BWE_PATHinsertr   r   existing_pythonpath	new_paths	getLoggerr&   r   r1   r0   r6   r@   re   ri   r   r   __name__r   r   r   r   <module>   sd    !



O&O
 0]
