o
    %i                     @   s  U d Z ddlZddlZddlZddlZddlmZmZmZ ddl	m
Z
mZmZmZmZmZmZ ddlmZmZmZmZmZmZmZmZmZ ddlm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+ e
e,Z-da.d	d
 Z/dZ0da1ee2 e3d< da4ee5 e3d< e6 Z7da8de9fddZ:de9fddZ;dedee2 fddZ<dee2ef de9dee2e2f fddZ=de2fddZ>de2ddfddZ?dd  Z@dS )!a  
FastAPI ASGI app factory for Modal deployment.

Phase 1: expose `/v1/tts/generate` and `/v1/tts/health` while reusing existing
Veena3 inference/streaming code from the Django project.

Phase 2+: extract shared code into `veena3modal/shared/` and remove Django coupling.
    N)OptionalAnyDict)
get_loggerset_request_contextclear_request_contextlog_request_receivedlog_first_audio_emittedlog_request_completedlog_request_failed)	record_request_receivedrecord_request_completedrecord_request_failedrecord_ttfb
record_rtfrecord_audio_durationrecord_chunks_sentget_metrics_textget_content_type)get_api_validatorextract_api_keyhash_api_key)get_rate_limiter)	ErrorCodecreate_error_responseget_error_statusis_gpu_faulthandle_gpu_faultFeatureFlagsc                  C   s   t du rddlm}  |  a t S )z#Lazy load sentence store singleton.Nr   get_sentence_store)_sentence_store#veena3modal.services.sentence_storer    r    r#   veena3modal/api/fastapi_app.py_get_sentence_storeB   s   r%   z0.1.0_MODEL_VERSION_startup_timereturnc                   C   s4   t  td7 atW  d   S 1 sw   Y  dS )z5Increment in-flight counter and return current value.   N)_inflight_lock_inflight_requestsr#   r#   r#   r$   _inflight_enterT   s   $r,   c                   C   s:   t  tdtd atW  d   S 1 sw   Y  dS )z5Decrement in-flight counter and return current value.r   r)   N)r*   maxr+   r#   r#   r#   r$   _inflight_exit\   s   $r.   valuec                 C   sb   | du rdS t | tr| rdS dS t | trt| S t | tr-|  r)tt| S | dS t| S )z5Convert metric values to compact header-safe strings.Ntruefalse.3f)
isinstanceboolintstrfloat
is_integer)r/   r#   r#   r$   _serialize_header_valued   s   


r9   metricsrequest_inflightc                 C   s  i }t | ts	i } i }|dkr||d< i dddddddd	d
dddddddddddddddddddddd d!d"d#i d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdCdEdFdGdHdIdJdKdLdMdNdOdPdQdRdAdS}|dkrt| } || d< | D ]\}}|| vrq| |}t|}|dTu rq|||< |||< qdU}	|	D ]}
|
| v r| |
 ||
< qtj|dVdWdX|dY< |S )Zz
    Build detailed timing headers consumed by local benchmark scripts.

    Includes a compact JSON payload (`X-Perf-Details`) and selected explicit headers
    for easy parsing in shell scripts/tools.
    r   r;   zx-generation-msgeneration_mszx-request-inflightzx-api-preprocess-msapi_preprocess_mszx-api-generate-await-msapi_generate_await_mszx-api-postprocess-msapi_postprocess_mszx-api-total-msapi_total_mszx-api-overhead-vs-pipeline-msapi_overhead_vs_pipeline_mszx-api-overhead-vs-generation-msapi_overhead_vs_generation_mszx-timeline-first-batch-mstimeline_llm_first_batch_mszx-timeline-llm-done-mstimeline_llm_done_mszx-timeline-bicodec-done-mstimeline_bicodec_done_mszx-timeline-total-mstimeline_total_mszx-llm-token-totalllm_token_totalzx-llm-batch-countllm_batch_countzx-llm-batch-wall-msllm_batch_wall_ms_totalzx-llm-batch-gpu-msllm_batch_gpu_ms_totalzx-llm-batch-wall-ms-minllm_batch_wall_ms_minzx-llm-batch-wall-ms-maxllm_batch_wall_ms_maxzx-llm-batch-wall-ms-p50llm_batch_wall_ms_p50zx-llm-batch-gpu-ms-minllm_batch_gpu_ms_minzx-llm-batch-gpu-ms-maxllm_batch_gpu_ms_maxzx-llm-batch-gpu-ms-p50llm_batch_gpu_ms_p50zx-llm-decode-wall-msllm_decode_wall_ms_totalzx-llm-decode-gpu-msllm_decode_gpu_ms_totalzx-llm-decode-wall-ms-minllm_decode_wall_ms_minzx-llm-decode-wall-ms-maxllm_decode_wall_ms_maxzx-llm-decode-wall-ms-p50llm_decode_wall_ms_p50zx-llm-decode-gpu-ms-minllm_decode_gpu_ms_minzx-llm-decode-gpu-ms-maxllm_decode_gpu_ms_maxzx-llm-decode-gpu-ms-p50llm_decode_gpu_ms_p50zx-llm-decode-callsllm_decode_callszx-llm-decode-cpu-msbicodec_decode_cpu_mszx-llm-parse-msllm_parse_mszx-llm-parse-avg-mstokens_per_batch_mintokens_per_batch_maxtokens_per_batch_p50llm_time_per_token_msllm_time_per_batch_wall_msllm_time_per_batch_gpu_msllm_time_in_queue_msllm_scheduler_msllm_first_token_msllm_request_lifecycle_mstext_chunkedchunks_processedbicodec_decode_wall_msbicodec_decode_gpu_ms)zx-llm-tokens-per-batch-minzx-llm-tokens-per-batch-maxzx-llm-tokens-per-batch-p50zx-llm-time-per-token-mszx-llm-time-per-batch-wall-mszx-llm-time-per-batch-gpu-mszx-llm-time-in-queue-mszx-llm-scheduler-mszx-llm-first-token-mszx-llm-lifecycle-mszx-llm-text-chunkedzx-llm-chunks-processedzx-bicodec-decode-wall-mszx-bicodec-decode-gpu-mszx-bicodec-decode-cpu-msN)+prompt_build_mssampling_params_msr=   r>   r?   r@   rA   rB   timeline_markerstimeline_prompt_ready_mstimeline_sampling_ready_msrC   rD   timeline_parse_done_mstimeline_validation_done_msrE   timeline_wav_done_mstimeline_request_done_mstimeline_to_first_batch_ms#timeline_first_batch_to_llm_done_mstimeline_post_llm_ms!timeline_parse_to_bicodec_done_ms#timeline_bicodec_to_request_done_msrF   llm_prefill_wall_msllm_prefill_gpu_msllm_generation_wall_msrb   rc   llm_model_forward_msllm_model_execute_msrd   re   llm_queued_to_scheduled_msllm_scheduled_to_first_token_msllm_first_to_last_token_msllm_queued_to_last_token_msllm_prompt_token_totalsemantic_token_totalglobal_token_totalwav_pack_mspipeline_total_mstoken_validation_ms),:T)
separatorsensure_asciizx-perf-details)r3   dictitemsgetr9   jsondumps)r:   r;   perf_headersdetail_payload
header_map
header_key
metric_keyr/   
serializedextra_detail_keyskeyr#   r#   r$   _build_perf_headerss   s   
	
 !"#4

-
r   c                   C   s   t pdS )zEReturn the current model version, or 'not_loaded' if not initialized.
not_loadedr&   r#   r#   r#   r$   get_model_version   s   r   versionc                 C   s   | a dS )z$Set the model version after loading.Nr   )r   r#   r#   r$   set_model_version  s   r   c               
      s  ddl m} m m} ddlmm t a| dt	dd}|
dfdd	}|
d
fdd}|dd|f fdd}dtdtdtdtffddzddlm} || td W |S  ty } ztd|  W Y d}~|S d}~ww )z
    Factory function to create FastAPI app.
    
    Import inside factory so local tooling can import this module without FastAPI installed.
    r   )FastAPIHTTPExceptionRequest)JSONResponseResponsezVeena3 TTS (Modal)zIHigh-quality multilingual Text-to-Speech API with true streaming support.)titler   descriptionz/v1/tts/healthc                     s   ddl m}  |  }|  }|r|jnt }d}zddl}|j }W n	 t	y-   Y nw t
r6t t
 nd}|r?|r?d}n	|sC|rFd}nd} |||t|d|td|td	d
S )at  
        Health check endpoint.
        
        Returns:
            - status: "healthy" | "degraded" | "unhealthy"
            - model_loaded: whether TTS model is initialized
            - model_version: version string of loaded model
            - uptime_seconds: time since app startup
            - gpu_available: whether GPU is detected (best-effort check)
        r   tts_runtimeFNhealthydegraded   )statusmodel_loadedmodel_versionuptime_secondsgpu_availableapp_version)X-Model-VersionzX-App-Version)contentheaders)veena3modal.servicesr   is_initializedget_runtimer   r   torchcudais_availableImportErrorr'   timeround_APP_VERSION)r   r   runtimer   r   r   uptimer   r   r#   r$   
tts_health  s<   	zcreate_app.<locals>.tts_healthz/v1/tts/metricsc                      s    t  t dS )zw
        Prometheus metrics endpoint.
        
        Returns metrics in Prometheus text format for scraping.
        )r   
media_type)r   get_metrics_content_typer#   )r   r#   r$   tts_metricsR  s
   zcreate_app.<locals>.tts_metricsz/v1/tts/generaterequestc           4         sd	  ddl m}m} ddlm} tt }t }t	 }t
 rt| j}t|}|s;dttjd|dd|idS t }	t|}
|	|
}|jsg|jd	krQdnd
}|tt|j|jp^d|dd|idS t
 rt }||
\}}}|s||||}||d< dttjdt|d  d|d|dS z	|  I dH }W n ty } z	dd| dd}~ww z	|dji |W n% ty } zt|}ddd||did|idW  Y d}~S d}~ww |  sdddd|did|idS ! }d}j"r"zddl#m$   fdd}W n t%y!   t&'d Y nw j(|d}t)|d t*|t+||j,j-d t.|j,j-d  j,rR||||I dH S j-}t/ }zTzt0d!rej1nd"}t	 }j2r|j3||j4j5j6j7j8j9j:|d#
I dH \}}n|j;||j4j5j6j7j8j9|d$	I dH \}}t	 }|du ri }|| d% |d&< || d% |d'< t<t|=d(dpd||d(< |du rd)dd*d+|did|idW W t>  S t | }|=d,d} | dkr||  nd}!|? }"|"r	|"j@ntA }#|=d-j:}$||#d.t|$d/tt+|| d0t|=d1d|!d2t|=d3d4B t|=d5d4B d6}j9durNtj9|d7< d8}%||jCjDkrzdd9lEmF}&mG}'mH}(mI}) |=d-d:}*t+|d;krw|d;d n|}+||jJjDkr|&|*d<},|,K|+|*}d=}%n<||jLjDkr|'|*d<},|,K|+|*}d>}%n'||jMjDkr|( },|,K|+|*}d?}%n||jNjDkr|)|*d<},|,K|+|*}d@}%tt+||dA< t&OdB| dCt+| dD W n5 t%y } zt&'dE|  W Y d}~nd}~w ty } zt&PdF|  W Y d}~nd}~ww t	 | d% }-tQ|=d&dGp(dG}.tQ|=d'dGp3dG}/|-|dH< t<dG|-|. |/ |dI< |=dJ}0tR|0ttQfr\t<dG|-tQ|0 |dK< |=dL}1tR|1ttQfrtt<dG|-tQ|1 |dM< |StT||dN tU }2|2jV|||d4j-j4j5j6j7j8j9|=d3d4|=d1| dO t | }| dkr||  nd}!|!d2|dP< t|d% }3tW|dQ|3| |!ddR tXdQ||d4dS tY|=d1dd% |d4dT tZ|!|dU t[| |dV t&jOdWi dX|dY|dL|=dLdZ|=dZd[|=d[d\|=d\d]|=d]d^|=d^d_|=d_d`|=d`dJ|=dJda|=dadb|=dbd&|=d&d'|=d'dI|=dIdH|=dH|=dK|=d(dcdd t\  ||%|deW W t>  S  ty } z7t&]df|  t^|d)dgt|dh t_d)dg|di t\  d)ddgt||did|idW  Y d}~W t>  S d}~ww t>  w )kz
        Generate speech from text.
        
        Supports streaming (M4) and non-streaming (M3) modes.
        Currently implements non-streaming WAV generation only.
        r   )TTSGenerateRequestAudioFormatr   i  zQAPI key required. Use 'Authorization: Bearer <key>' or 'X-API-Key: <key>' header.codemessage
request_idX-Request-IDstatus_coder   r   INVALID_API_KEYi  zAuthentication failed  z!Rate limit exceeded. Retry after r)   z	 seconds.Ni  zInvalid JSON: )r   detailerrorVALIDATION_ERROR  MODEL_NOT_LOADEDz2TTS model not initialized. Please wait for warmup.)normalize_textc                    s    | j dS )N)verbose)normalize_verbose)tr   reqr#   r$   <lambda>  s    z2create_app.<locals>.tts_generate.<locals>.<lambda>z9TextNormalizer not available, skipping text normalization)normalizer_funcr   )r   text_lengthspeakerstreamformat)r   r   r   output16khz)
textr   temperaturetop_ktop_p
max_tokensrepetition_penaltyseedsample_rateoutput_sample_rate)	r   r   r   r   r   r   r   r   r     r=   r>   r;     GENERATION_FAILEDz0Audio generation failed. Check logs for details.audio_duration_secondsr   wavr1   z.2fttfb_msr2   rf   F
sr_applied)r   r   X-FormatX-Sample-RateX-StreamX-Audio-ByteszX-Audio-Secondsz	X-TTFB-msX-RTFzX-Text-ChunkedzX-SR-AppliedX-Seed	audio/wav)OpusEncoder
MP3EncoderMuLawEncoderFLACEncoderi>  ,   )r   z
audio/opusz
audio/mpegzaudio/x-wavz
audio/flacr   zEncoded audio to z: z byteszAudio encoder not available: zFormat encoding failed:         r@   r?   rF   rA   r<   rB   )r;   )r   r   r   r   r   r   r   r   r   r   r   rf   r   r   r      r   r   total_duration_msr   rtfchunks_sentr   duration_secondsr   r   ttfb_secondsr   r   r  r   r  r   request_perf_summaryr   r   rG   rH   rI   rJ   rC   rD   rE   rh   ri   )rA   r;   )extra)r   r   r   zTTS generation error: INTERNAL_ERRORr   r   
error_codeerror_messager   r  r   r#   )`veena3modal.api.schemasr   r   r   r   r6   uuiduuid4r   perf_counterr   is_auth_enabledr   r   r   r   r   r   r   r   validateis_validr  r  is_rate_limiting_enabledr   checkget_headersRATE_LIMIT_EXCEEDEDr5   r   	Exceptionr   get_resolved_speaker	normalize&veena3modal.processing.text_normalizerr   r   loggerwarningget_normalized_textr   r   lenr   r   metrics_request_receivedr,   hasattrr   chunkinggenerate_speech_chunkedr   r   r   r   r   r   r   generate_speechr-   r   r.   r   r   r   lowerWAVr/   veena3modal.audio.encoderr   r   r   r   OPUSencodeMP3MULAWFLACinfor   r7   r3   updater   r%   store_fire_and_forgetr
   metrics_request_completedr   r   r   r   	exceptionr   metrics_request_failed)4r   r   r   r   r   
start_timerequest_start_perfheaders_dictapi_key	validatorkey_hashauth_resultr   rate_limiterallowed	remainingreset_afterr   bodye	error_msgr   r   r   target_formatinflight_current	output_srgenerate_call_startaudio_bytesr:   generate_call_end
total_timeaudio_durationr  r   r   r   r   r   r   r   r   r   pcm_dataencoderr@   r=   r>   rF   r<   sentence_storer  r   r   r   _handle_streaming_requestr   r$   tts_generate^  s  




 N 



 









	








#z create_app.<locals>.tts_generater   r   r   r9  c                    s  ddl m} ddlm} ddlm} ddlm  }|du s&|j	du r5ddd	d
dididS j
|jjkrPddddj
 ddididS td z	 I dH  W n^ jy }	 zQtt|	ddpod}
tdt|
d d }tt|	di pi }tddt|	d tddd t  dddt|	ddd |d!t|d"dW  Y d}	~	S d}	~	ww  fd#d$}|r|jnt }|d%tjd&tj tttt jtt  j! d'}j"durtj"|d(< || d)||j# d*S )+z
        Handle streaming TTS request.
        
        Returns a StreamingResponse that yields WAV header + PCM chunks.
        True streaming: first bytes sent ASAP before full audio is generated.
        r   )StreamingResponse)BackgroundTask)r   r   Nr   r   STREAMING_UNAVAILABLEz#Streaming pipeline not initialized.r   r   r   i  FORMAT_NOT_IMPLEMENTEDzStreaming format 'z&' not yet implemented. Use format=wav.r   retry_after_msr   r)   g     8@g     @@snapshotr   STREAMING_OVERLOADEDr  r  z,Streaming capacity exhausted. Retry shortly.reason
overloaded)r   r   r   r\  )r   	admission)r   zRetry-Afterc                    s  i } d}d}zzj jjjjjjj dd2 zM3 dH W \}}|} |d7 }|sjd}|dd}t	|t
|d t|d	 dd
 t }|jdjjjjjjj|dd|d |V  q 6 t  }| dd}|dkr|| nd}	tdt|d	 ||	|d td|dd t|	d t|d t|d t  W n5 ty }
 z)td|
  tddt|
d tddd t  W Y d}
~
W   dS d}
~
ww W   dS   w )z)Yields audio chunks as they're generated.Fr   )r   r   r   r   r   r   r   r   enable_chunkingadmission_leaserelease_admission_leaseNr)   Tr   )r   r   chunk_size_bytesr   r  rf   )r   r   r   r   r   r   r   r   r   r   r   rf   r   r   r  r  r  r
  r  )chunksr   zStreaming error: r   STREAMING_ERRORr  r  )generate_speech_streamingr   r   r   r   r   r   r(  r   r	   r%  r   r%   r5  r   r   r
   r5   r6  r   r   r   r   r  r"  r7  r   r6   r8  release_streaming_slot)final_metricsfirst_chunk_sentchunks_countaudio_chunkr:   r   rQ  rM  rN  r  rE  r`  r   r   r   r9  r   r   r#   r$   audio_stream_generator4  s   
2

zMcreate_app.<locals>._handle_streaming_request.<locals>.audio_stream_generatorr   r0   )r   r   r   r   r   zX-Chunking-EnabledzX-Admission-Wait-mszX-Admission-Queuedr   r   )r   r   
background)$fastapi.responsesrU  starlette.backgroundrV  r  r   r   r   r   streaming_pipeliner   r,  r/   r   acquire_streaming_slotStreamingAdmissionErrorr7   getattrr-   r5   r   r   r6   r8  r   r   r   r   r(  r+  r   wait_msr4   queuedr   rf  )r   r   r   r   r9  rU  rV  r   r   excrY  retry_after_soverload_snapshotrl  r   r   r   rk  r$   rS    s   


"f
z-create_app.<locals>._handle_streaming_request)add_websocket_routesz'WebSocket support enabled at /v1/tts/wsz!WebSocket support not available: N)fastapir   r   r   rn  r   r   r   r'   r   r   postr6   r7   !veena3modal.api.websocket_handlerry  r"  r3  r   r#  )r   r   appr   r   rT  ry  rE  r#   rR  r$   
create_app
  s:   5    Pr~  )A__doc__r   r  r   	threadingtypingr   r   r   veena3modal.shared.loggingr   r   r   r   r	   r
   r   veena3modal.shared.metricsr   r&  r   r6  r   r8  r   r   r   r   r   r   r   veena3modal.api.authr   r   r   veena3modal.api.rate_limiterr   veena3modal.api.error_handlersr   r   r   r   r   r   __name__r"  r!   r%   r   r&   r6   __annotations__r'   r7   Lockr*   r+   r5   r,   r.   r9   r   r   r   r~  r#   r#   r#   r$   <module>   s6    $, 		& 