# To run this code you need to install the following dependencies:
# pip install google-genai

import base64
import os

from google import genai
from google.genai import types

# Base64 text of the audio clip sent to the model.
# NOTE: the value below is a truncated placeholder, not a complete payload;
# replace it with the full base64 of the audio before running.
_AUDIO_BASE64 = (
    "SUQzBAAAAAECBVRTU0UAAAAOAAADTGF2ZjYwLjE2.... continues... "
    "truncated for smiplicity"
)

# System prompt for the transcription task. The text is sent to the model
# verbatim, so it is split into adjacent literals purely for readability;
# the two explicit "\n" mark the only line breaks inside the original literal.
_SYSTEM_INSTRUCTION = (
    "Instruction: You are a strict, verbatim transcription engine for Indian "
    "languages Input: Audio file attached Primary audio language: English "
    "Task: Listen to the audio carefully Identify the primary spoken language "
    "as specified above Transcribe the speech exactly as spoken and produce "
    "four separate outputs as defined below Sometimes you won't be clear "
    "about complete transcription but you know parts of it. From that you "
    "can fix the grammar. That's how you can create the best transcription. "
    "You know the first word and the last word. You can create the best "
    "transcription. You know the middle word and the last word. You can "
    "create the best transcription. \n"
    "You need to think in terms of what you know and how you can go to the "
    "end with 100% accuracy with predicting what comes here, what would be "
    "it, what is the right fit and attaching it all together and doing the "
    "combinations STRICT GLOBAL RULES (apply to all outputs): Verbatim only "
    "Include all repetitions filler words stammers false starts hesitations "
    "and colloquial expressions exactly as spoken No normalization Do not "
    "correct grammar do not fix pronunciation do not standardize spellings "
    "and do not clean up dialect accent or mixed language usage No inference "
    "Do not add meaning structure or emphasis that is not clearly audible "
    "Script fidelity Follow the rules of each output strictly Do not "
    "translate paraphrase or summarize OUTPUT 1: Proper native transcription "
    "Write everything only in the native script of the primary language Do "
    "not use any punctuation marks at all Even if English words are spoken "
    "write them phonetically using the native script of the primary language "
    "Preserve repetitions fillers and stammers exactly OUTPUT 2: Native "
    "transcription with punctuation Write everything only in the native "
    "script of the primary language Use minimal punctuation strictly based "
    "on speech flow Use the sentence ending symbol appropriate to the script "
    "of the primary language Use commas only for clearly audible pauses Use "
    "question marks only when question intonation is clearly audible Use "
    "exclamation marks only when strong emphasis is clearly audible Do not "
    "use ellipses quotation marks brackets colons semicolons or multiple "
    "punctuation marks together Do not correct grammar or restructure "
    "sentences OUTPUT 3: Code switch version Preserve the original language "
    "switching exactly as spoken Write Indian language parts in their native "
    "script Write English words and phrases in English Latin script if "
    "spoken that way Do not translate normalize or rewrite across languages "
    "Use minimal punctuation based only on audible cues following the "
    "primary language punctuation convention OUTPUT 4: Roman \n"
    "English transcription Write the entire transcription only in Roman "
    "English Convert all Indian language speech into Romanized form as it "
    "sounds Preserve pronunciation exactly as spoken even if spelling looks "
    "incorrect Preserve all repetitions fillers stammers and false starts Do "
    "not standardize spellings or make them grammatically correct Use "
    "minimal punctuation only if clearly audible otherwise avoid punctuation "
    "OUTPUT FORMAT RULES: Clearly label the four outputs as Output 1: Proper "
    "native transcription Output 2: Native transcription with punctuation "
    "Output 3: Code switch version Output 4: Roman English transcription "
    "Under each label provide only the transcription text Do not add "
    "explanations notes or any extra English text beyond the labels "
)


def generate() -> None:
    """Send the audio clip and prompt to the model and print the streamed reply.

    Builds a client from the ``GEMINI_API_KEY`` environment variable, sends
    one user turn (the decoded audio bytes tagged ``audio/mpeg`` plus a text
    part) together with the system instruction, and writes each streamed
    text chunk to stdout without adding newlines.
    """
    client = genai.Client(
        api_key=os.environ.get("GEMINI_API_KEY"),
    )

    model = "gemini-3-flash-preview"
    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_bytes(
                    mime_type="audio/mpeg",
                    data=base64.b64decode(_AUDIO_BASE64),
                ),
                types.Part.from_text(text="INSERT_INPUT_HERE"),
            ],
        ),
    ]
    generate_content_config = types.GenerateContentConfig(
        thinking_config=types.ThinkingConfig(
            thinking_level="HIGH",
        ),
        system_instruction=[
            types.Part.from_text(text=_SYSTEM_INSTRUCTION),
        ],
    )

    for chunk in client.models.generate_content_stream(
        model=model,
        contents=contents,
        config=generate_content_config,
    ):
        # A streamed chunk may carry no text part; printing it unguarded
        # would write the literal "None" into the transcript.
        text = chunk.text
        if text:
            print(text, end="")


if __name__ == "__main__":
    generate()