import asyncio
import aiohttp
import time
import json
import numpy as np
import re

# Endpoint every benchmark and warm-up request is POSTed to.
VLLM_URL = "http://localhost:9093/v1/completions"
# Value sent as the "model" field of each request payload.
MODEL = "Scicom-intl/Multilingual-Expressive-TTS-0.6B"
# Speaker tag interpolated into the prompt prefix: "<|im_start|>{SPEAKER}: ...".
SPEAKER = "multilingual-tts_audio_Grace"

# Sentence corpus used as synthesis input. run_benchmark cycles through it
# (TEXTS[i % len(TEXTS)]), so request i always gets the same sentence.
# Mostly English, interleaved with sentences written in other scripts
# (Devanagari, Tamil, Telugu, Bengali, Arabic-script, Malayalam).
TEXTS = [
    "Hello, my name is Grace and I am here to help you with anything you need today.",
    "The weather is beautiful outside, perfect for a walk in the park with friends.",
    "India is a diverse country with many languages, cultures, and traditions.",
    "Machine learning has transformed the way we build software applications.",
    "Please feel free to reach out if you have any questions about our services.",
    "The sunrise over the mountains was absolutely breathtaking this morning.",
    "We need to finish this project before the deadline next Friday afternoon.",
    "Technology continues to evolve at an unprecedented pace every single year.",
    "नमस्ते, मेरा नाम ग्रेस है और मैं आपकी मदद करने के लिए यहाँ हूँ।",
    "भारत एक विविधताओं से भरा देश है जहाँ अनेक भाषाएँ बोली जाती हैं।",
    "Welcome to our platform where you can learn new skills and grow professionally.",
    "The restaurant serves delicious food from various cuisines around the world.",
    "I would like to schedule a meeting for tomorrow morning at ten o'clock sharp.",
    "The library has an extensive collection of books on science and technology.",
    "Please make sure to submit your report before the end of the business day.",
    "வணக்கம், என் பெயர் கிரேஸ், நான் உங்களுக்கு உதவ இங்கே இருக்கிறேன்.",
    "Education is the most powerful weapon which you can use to change the world.",
    "The concert last night was amazing with incredible performances by all artists.",
    "We are committed to providing the best customer service experience possible.",
    "The train departs from platform number three at exactly half past seven.",
    "నమస్కారం, నా పేరు గ్రేస్ మరియు మీకు సహాయం చేయడానికి నేను ఇక్కడ ఉన్నాను.",
    "Good morning everyone, let us begin today's presentation on quarterly results.",
    "The garden is full of colorful flowers blooming in the warm spring sunshine.",
    "Please remember to bring your identification documents to the appointment.",
    "The ocean waves crashed gently against the sandy shore as the sun began to set.",
    "নমস্কার, আমার নাম গ্রেস এবং আমি আপনাকে সাহায্য করতে এখানে আছি।",
    "Innovation drives progress and helps us solve complex problems more efficiently.",
    "The museum exhibition features artwork from renowned artists across centuries.",
    "Could you please send me the updated version of the document by this evening.",
    "The children played happily in the playground while their parents watched nearby.",
    "नमस्कार, माझं नाव ग्रेस आहे आणि मी तुम्हाला मदत करण्यासाठी इथे आहे.",
    "The flight has been delayed by two hours due to unexpected weather conditions.",
    "Our team has been working hard to deliver this product on time and within budget.",
    "The city skyline looked stunning against the backdrop of the golden sunset.",
    "Thank you for your patience while we process your request as quickly as possible.",
    "The new software update includes several important security patches and features.",
    "السلام علیکم، میرا نام گریس ہے اور میں آپ کی مدد کے لیے یہاں ہوں۔",
    "Fresh fruits and vegetables are essential for maintaining a healthy balanced diet.",
    "The conference room is booked for the strategy meeting at two thirty this afternoon.",
    "നമസ്കാരം, എന്റെ പേര് ഗ്രേസ് ആണ്, നിങ്ങളെ സഹായിക്കാൻ ഞാൻ ഇവിടെയുണ്ട്.",
    "Digital transformation is reshaping industries and creating new opportunities.",
    "The hiking trail offers spectacular views of the valley and surrounding peaks.",
    "Please ensure all safety protocols are followed during the laboratory experiment.",
    "The annual company retreat will be held at the lakeside resort next month.",
    "Artificial intelligence is revolutionizing healthcare diagnosis and treatment.",
    "The local farmers market opens every Saturday morning with fresh organic produce.",
    "We appreciate your continued support and look forward to serving you again soon.",
    "The documentary explores the fascinating history of ancient civilizations worldwide.",
    "Please review the attached proposal and share your feedback at your convenience.",
    "The symphony orchestra performed a magnificent rendition of the classic composition.",
]


async def single_request(session, text, request_id):
    """Stream one TTS completion and return its latency metrics.

    Args:
        session: Open aiohttp client session used to POST to ``VLLM_URL``.
        text: Sentence to synthesize; wrapped in the speaker prompt template.
        request_id: Caller-side index. Accepted for interface compatibility;
            not used by the request itself.

    Returns:
        dict with keys:
            ``ttfb_ms``   - ms until the first SSE ``data:`` line arrived
                            (falls back to ``total_ms`` if none arrived),
            ``total_ms``  - ms until the response stream was exhausted,
            ``audio_dur`` - ``n_tokens / 50.0`` (assumes 50 audio tokens per
                            second of audio - TODO confirm against the model),
            ``n_tokens``  - number of ``<|s_N|>`` tokens in the generated text,
            ``rtf``       - ``total seconds / audio_dur``, or the sentinel
                            ``999`` when no audio tokens were produced.

    Malformed or unexpected SSE chunks are skipped (best effort). The HTTP
    status is not inspected here, so an error response simply yields zero
    tokens and the ``999`` sentinel.
    """
    prompt = f"<|im_start|>{SPEAKER}: {text}<|speech_start|>"
    payload = {
        "model": MODEL,
        "prompt": prompt,
        "max_tokens": 1024,
        "temperature": 0.7,
        "repetition_penalty": 1.15,
        "stream": True,
    }

    t_start = time.perf_counter()
    ttfb = None
    pieces = []

    async with session.post(VLLM_URL, json=payload) as resp:
        async for raw_line in resp.content:
            # A stray invalid byte should not abort the whole measurement.
            line = raw_line.decode("utf-8", errors="replace").strip()
            if not line.startswith("data: ") or line == "data: [DONE]":
                continue
            if ttfb is None:
                ttfb = (time.perf_counter() - t_start) * 1000
            try:
                chunk = json.loads(line[6:])
                # "text" may be missing or null; both count as an empty piece.
                piece = chunk["choices"][0].get("text") or ""
            except (ValueError, KeyError, IndexError, TypeError, AttributeError):
                # Unparseable or unexpectedly shaped chunk: skip it, but only
                # for these data errors - never swallow KeyboardInterrupt etc.
                continue
            if isinstance(piece, str):
                pieces.append(piece)

    t_end = time.perf_counter()
    total_ms = (t_end - t_start) * 1000
    full_text = "".join(pieces)
    audio_tokens = re.findall(r'<\|s_(\d+)\|>', full_text)
    n_tokens = len(audio_tokens)
    audio_dur = n_tokens / 50.0
    rtf = (total_ms / 1000) / audio_dur if audio_dur > 0 else 999

    return {
        # Explicit None check: a measured 0.0 is a valid TTFB, not "missing".
        "ttfb_ms": ttfb if ttfb is not None else total_ms,
        "total_ms": total_ms,
        "audio_dur": audio_dur,
        "n_tokens": n_tokens,
        "rtf": rtf,
    }


async def run_benchmark(concurrency, n_requests):
    """Fire ``n_requests`` streaming requests and print aggregate statistics.

    At most ``concurrency`` requests are in flight at once. One short
    non-streaming warm-up request is sent first and is not included in the
    statistics. Latency, RTF and average-audio statistics are computed over
    requests that produced at least one audio token; requests that produced
    none are counted and reported separately so the ``999`` RTF sentinel does
    not distort the averages.

    Args:
        concurrency: Maximum number of simultaneous requests (>= 1).
        n_requests: Total number of requests to send (>= 1).

    Raises:
        ValueError: If ``concurrency`` or ``n_requests`` is below 1.
        RuntimeError: If no request produced any audio tokens.
        aiohttp.ClientResponseError: If the warm-up request returns an HTTP
            error status.
    """
    if concurrency < 1:
        # Semaphore(0) would block every request forever.
        raise ValueError(f"concurrency must be >= 1, got {concurrency}")
    if n_requests < 1:
        raise ValueError(f"n_requests must be >= 1, got {n_requests}")

    timeout = aiohttp.ClientTimeout(total=300)
    # aiohttp's default connector allows only 100 simultaneous connections,
    # which would silently clamp higher concurrency levels and add pool wait
    # time to the measured latencies. Size the pool to the requested level.
    connector = aiohttp.TCPConnector(limit=concurrency)
    async with aiohttp.ClientSession(timeout=timeout, connector=connector) as session:
        # warmup
        prompt = f"<|im_start|>{SPEAKER}: Hello<|speech_start|>"
        payload = {"model": MODEL, "prompt": prompt, "max_tokens": 50, "temperature": 0.7, "stream": False}
        async with session.post(VLLM_URL, json=payload) as resp:
            # Fail fast on a misconfigured server/model instead of benchmarking errors.
            resp.raise_for_status()
            await resp.json()

        texts = [TEXTS[i % len(TEXTS)] for i in range(n_requests)]
        semaphore = asyncio.Semaphore(concurrency)

        async def bounded(text, rid):
            async with semaphore:
                return await single_request(session, text, rid)

        t_wall_start = time.perf_counter()
        tasks = [bounded(t, i) for i, t in enumerate(texts)]
        results = await asyncio.gather(*tasks)
        t_wall = time.perf_counter() - t_wall_start

        # Only requests that actually produced audio carry meaningful metrics.
        ok = [r for r in results if r["n_tokens"] > 0]
        n_empty = len(results) - len(ok)
        if not ok:
            raise RuntimeError(f"none of the {len(results)} requests produced audio tokens")

        ttfbs = [r["ttfb_ms"] for r in ok]
        rtfs = [r["rtf"] for r in ok]
        totals = [r["total_ms"] for r in ok]
        audio_durs = [r["audio_dur"] for r in ok]
        total_audio = sum(audio_durs)
        throughput = total_audio / t_wall

        print(f"  Concurrency: {concurrency:4d}  |  Requests: {n_requests:4d}")
        if n_empty:
            print(f"  Empty:  {n_empty} request(s) produced no audio tokens (excluded from stats)")
        print(f"  TTFB:   min={np.min(ttfbs):6.0f}ms  avg={np.mean(ttfbs):6.0f}ms  p50={np.percentile(ttfbs,50):6.0f}ms  p90={np.percentile(ttfbs,90):6.0f}ms  p95={np.percentile(ttfbs,95):6.0f}ms")
        print(f"  Total:  avg={np.mean(totals):6.0f}ms  p50={np.percentile(totals,50):6.0f}ms  p90={np.percentile(totals,90):6.0f}ms")
        print(f"  RTF:    avg={np.mean(rtfs):.3f}  p50={np.percentile(rtfs,50):.3f}  p90={np.percentile(rtfs,90):.3f}")
        print(f"  Audio:  avg={np.mean(audio_durs):.2f}s")
        print(f"  Throughput: {throughput:.1f}x realtime  |  Wall: {t_wall:.1f}s")
        print()


async def main(concurrency_levels=(50, 100, 150, 200, 300, 500)):
    """Run the benchmark once per concurrency level and print the results.

    Each level sends as many requests as its concurrency. A failure at one
    level is reported and does not stop the remaining levels.

    Args:
        concurrency_levels: Iterable of concurrency values to benchmark, in
            order. Defaults to the original fixed sweep.
    """
    print("=" * 85)
    print("vLLM HIGH CONCURRENCY BENCHMARK — A100 80GB — 0.6B model")
    print("=" * 85)

    for conc in concurrency_levels:
        print(f"\n--- Concurrency={conc} ---")
        try:
            await run_benchmark(conc, conc)
        except Exception as e:
            # Include the exception type: timeout errors have an empty str(),
            # which would otherwise print a bare "FAILED:" with no hint why.
            print(f"  FAILED: {type(e).__name__}: {e}\n")

# Run the sweep only when executed as a script, so importing this module
# (e.g. to reuse single_request) does not start a benchmark.
if __name__ == "__main__":
    asyncio.run(main())
