#!/usr/bin/env python3
"""
List all delivery2/ conversations from GCS and save to local JSONL.
Output: /root/joshtalks_delivery2_filelist/{lang}.jsonl
Each line: {"session_id": "123", "files": [{"key": "delivery2/hindi/123/left_audio.wav", "size": 0}, ...]}

Usage: python3 delivery2_list.py
"""

import asyncio, aiohttp, re, json, os, time, random
from datetime import datetime

# Bucket whose XML listing endpoint is queried.
GCS_BUCKET = "joshtalks-data-collection"
# One output JSONL file is produced per language.
LANGUAGES = ["hindi", "english", "bengali", "gujarati", "telugu"]
OUTPUT_DIR = "/root/joshtalks_delivery2_filelist"
# File names recorded for every conversation folder.
FILES_PER_CONV = [
    "left_audio.wav", "left_metadata.json", "left_transcription.json",
    "right_audio.wav", "right_metadata.json", "right_transcription.json",
]

# Proxy credentials. The OXY_USER / OXY_PASS environment variables take
# precedence; the literals are kept only as defaults so existing invocations
# keep working.
# NOTE(review): secrets committed to source control should be rotated and the
# literal defaults removed once the environment variables are provisioned.
OXY_USER = os.environ.get("OXY_USER", "user-humming_Ows6w-country-US")
OXY_PASS = os.environ.get("OXY_PASS", "mOIb_8PL7ieGppJW")
# Candidate proxy ports 8001..8021 inclusive; one is chosen at random per request.
OXY_PORTS = list(range(8001, 8022))


def get_proxy():
    """Return an authenticated proxy URL for dc.oxylabs.io on a random port.

    The port is drawn from OXY_PORTS on every call, so successive calls may
    return different URLs.
    """
    port = random.choice(OXY_PORTS)
    credentials = f"{OXY_USER}:{OXY_PASS}"
    return f"http://{credentials}@dc.oxylabs.io:{port}"


def log(msg):
    """Print *msg* to stdout prefixed with the local time as ``[HH:MM:SS]``.

    Output is flushed immediately so progress is visible when stdout is piped.
    """
    stamp = datetime.now().strftime("%H:%M:%S")
    print(f"[{stamp}] {msg}", flush=True)


async def list_prefixes(session, prefix):
    """List the immediate sub-prefixes ("folders") under *prefix* in GCS_BUCKET.

    Pages through the bucket's XML listing endpoint with ``delimiter=/`` and
    collects every ``<Prefix>`` value other than *prefix* itself.

    Args:
        session: An open ``aiohttp.ClientSession`` used for the GET requests.
        prefix: Object-name prefix to list under, e.g. ``"delivery2/hindi/"``.

    Returns:
        A list of sub-prefix strings in the order the server returned them,
        without duplicates. Listing is best-effort: if a page cannot be
        fetched after 5 attempts, or pagination cannot advance, a warning is
        logged and the prefixes collected so far are returned.
    """
    base_url = f"https://storage.googleapis.com/{GCS_BUCKET}/"
    all_prefixes = []
    seen = set()
    marker = ""
    while True:
        # Pass the query as params so aiohttp percent-encodes prefix/marker
        # instead of splicing raw text into the URL.
        params = {"prefix": prefix, "delimiter": "/", "max-keys": "5000"}
        if marker:
            params["marker"] = marker

        text = None
        failure = "no attempt made"
        for _ in range(5):
            try:
                # get_proxy() is called per attempt, so each retry may go
                # through a different proxy port.
                # NOTE(review): ssl=False disables certificate verification;
                # kept as in the original request.
                async with session.get(
                    base_url,
                    params=params,
                    proxy=get_proxy(),
                    ssl=False,
                    timeout=aiohttp.ClientTimeout(total=30),
                ) as resp:
                    status = resp.status
                    body = await resp.text()
            except (aiohttp.ClientError, asyncio.TimeoutError, OSError) as exc:
                # Only network-level failures are retried; cancellation and
                # KeyboardInterrupt propagate instead of being swallowed.
                failure = repr(exc)
            else:
                if status == 200:
                    text = body
                    break
                # An error body has no <Prefix> entries and would otherwise
                # be mistaken for an empty final page.
                failure = f"HTTP {status}"
            await asyncio.sleep(1)

        if text is None:
            log(f"WARNING: listing {prefix!r} gave up after 5 attempts ({failure}); "
                f"returning {len(all_prefixes)} prefixes collected so far")
            break

        # The response echoes the requested prefix in its own <Prefix> tag.
        subs = [s for s in re.findall(r"<Prefix>(.*?)</Prefix>", text) if s != prefix]
        for sub in subs:
            if sub not in seen:
                seen.add(sub)
                all_prefixes.append(sub)

        if "<IsTruncated>true</IsTruncated>" not in text:
            break

        # Prefer the server-provided continuation point; fall back to the
        # last sub-prefix seen on this page.
        found = re.search(r"<NextMarker>(.*?)</NextMarker>", text)
        next_marker = (found.group(1) if found else "") or (subs[-1] if subs else "")
        if not next_marker or next_marker == marker:
            # Without a new marker the same page would be fetched forever.
            log(f"WARNING: listing {prefix!r} is truncated but pagination cannot "
                f"advance; returning {len(all_prefixes)} prefixes collected so far")
            break
        marker = next_marker
    return all_prefixes


async def main():
    """Write one JSONL file per language describing each conversation folder.

    For every language in LANGUAGES, the conversation folders under
    ``delivery2/{lang}/`` are listed and written in sorted order to
    ``{OUTPUT_DIR}/{lang}.jsonl``. Each line holds the session id (the last
    path component of the folder) and one entry per name in FILES_PER_CONV
    with a placeholder size of 0.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    connector = aiohttp.TCPConnector(limit=50, ssl=False)
    async with aiohttp.ClientSession(connector=connector) as session:
        for lang in LANGUAGES:
            log(f"=== {lang} ===")
            log("Listing conversations...")

            folders = await list_prefixes(session, f"delivery2/{lang}/")
            log(f"Found {len(folders)} conversations")

            out_path = os.path.join(OUTPUT_DIR, f"{lang}.jsonl")
            with open(out_path, "w") as out:
                for folder in sorted(folders):
                    record = {
                        # The folder looks like "delivery2/<lang>/<id>/"; keep the id.
                        "session_id": folder.rstrip("/").rsplit("/", 1)[-1],
                        "files": [
                            {"key": f"{folder}{fname}", "size": 0}
                            for fname in FILES_PER_CONV
                        ],
                    }
                    out.write(json.dumps(record) + "\n")

            log(f"  {lang}: {len(folders)} conversations -> {out_path}")

    log("\n=== ALL DONE ===")


# Run the listing only when executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())
