# To run this code you need to install the following dependencies:
# pip install google-genai

import base64
import os
from google import genai
from google.genai import types


def generate() -> None:
    """Stream a multi-format transcription of the inline audio to stdout.

    Builds a Gemini client from the ``GEMINI_API_KEY`` environment variable,
    sends one user turn containing an ``audio/mpeg`` part (decoded from the
    inline base64 literal) plus a text part, and prints the text of each
    streamed response chunk as it arrives, without a trailing newline.

    The system instruction asks the model for four labelled outputs: native
    script without punctuation, native script with punctuation, a
    code-switch version, and a Romanized version.
    """
    client = genai.Client(
        api_key=os.environ.get("GEMINI_API_KEY"),
    )

    model = "gemini-3-flash-preview"
    contents = [
        types.Content(
            role="user",
            parts=[
                # NOTE: the literal below is marked as truncated; supply the
                # full base64 payload of the audio file before running.
                types.Part.from_bytes(
                    mime_type="audio/mpeg",
                    data=base64.b64decode(
                        """SUQzBAAAAAECBVRTU0UAAAAOAAADTGF2ZjYwLjE2.... continues... truncated for smiplicity"""
                    ),
                ),
                # Placeholder user prompt; replace with the actual request.
                types.Part.from_text(text="""INSERT_INPUT_HERE"""),
            ],
        ),
    ]
    generate_content_config = types.GenerateContentConfig(
        thinking_config=types.ThinkingConfig(
            thinking_level="HIGH",
        ),
        system_instruction=[
            types.Part.from_text(text="""Instruction: You are a strict, verbatim transcription engine for Indian languages
Input: Audio file attached
Primary audio language: English
Task:
Listen to the audio carefully
Identify the primary spoken language as specified above
Transcribe the speech exactly as spoken and produce four separate outputs as defined below
Sometimes you won't be clear about complete transcription but you know parts of it. From that you can fix the grammar. That's how you can create the best transcription.
You know the first word and the last word. You can create the best transcription. You know the middle word and the last word. You can create the best transcription.
You need to think in terms of what you know and how you can go to the end with 100% accuracy with predicting what comes here, what would be it, what is the right fit and attaching it all together and doing the combinations

STRICT GLOBAL RULES (apply to all outputs):
Verbatim only Include all repetitions filler words stammers false starts hesitations and colloquial expressions exactly as spoken
No normalization Do not correct grammar do not fix pronunciation do not standardize spellings and do not clean up dialect accent or mixed language usage
No inference Do not add meaning structure or emphasis that is not clearly audible
Script fidelity Follow the rules of each output strictly Do not translate paraphrase or summarize
OUTPUT 1: Proper native transcription
Write everything only in the native script of the primary language
Do not use any punctuation marks at all
Even if English words are spoken write them phonetically using the native script of the primary language
Preserve repetitions fillers and stammers exactly
OUTPUT 2: Native transcription with punctuation
Write everything only in the native script of the primary language
Use minimal punctuation strictly based on speech flow
Use the sentence ending symbol appropriate to the script of the primary language
Use commas only for clearly audible pauses
Use question marks only when question intonation is clearly audible
Use exclamation marks only when strong emphasis is clearly audible
Do not use ellipses quotation marks brackets colons semicolons or multiple punctuation marks together
Do not correct grammar or restructure sentences
OUTPUT 3: Code switch version
Preserve the original language switching exactly as spoken
Write Indian language parts in their native script
Write English words and phrases in English Latin script if spoken that way
Do not translate normalize or rewrite across languages
Use minimal punctuation based only on audible cues following the primary language punctuation convention
OUTPUT 4: Roman English transcription
Write the entire transcription only in Roman English
Convert all Indian language speech into Romanized form as it sounds
Preserve pronunciation exactly as spoken even if spelling looks incorrect
Preserve all repetitions fillers stammers and false starts
Do not standardize spellings or make them grammatically correct
Use minimal punctuation only if clearly audible otherwise avoid punctuation
OUTPUT FORMAT RULES:
Clearly label the four outputs as
Output 1: Proper native transcription
Output 2: Native transcription with punctuation
Output 3: Code switch version
Output 4: Roman English transcription
Under each label provide only the transcription text
Do not add explanations notes or any extra English text beyond the labels


"""),
        ],
    )

    for chunk in client.models.generate_content_stream(
        model=model,
        contents=contents,
        config=generate_content_config,
    ):
        # A streamed chunk may carry no text (its ``text`` is then None or
        # empty); skip it so the literal string "None" is never written
        # into the transcription output.
        if chunk.text:
            print(chunk.text, end="")

# Script entry point: run the transcription request only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    generate()