oidlabs-com · Vaishnav2804 · May 30, 2025 · May 31, 2025 · May 31, 2025 · May 31, 2025
diff --git a/examples/example_notebook_colab.ipynb b/examples/example_notebook_colab.ipynb
@@ -114,9 +114,9 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "\r",
-      "0% [Working]\r",
-      "            \r",
+      "\r\n",
+      "0% [Working]\r\n",
+      "            \r\n",
       "Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]\n",
       "Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]\n",
       "Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]\n",
@@ -1968,6 +1968,64 @@
    "source": [
     "display(Markdown(result_md))"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Audio to Markdown (Support with Gemini)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<>:3: SyntaxWarning: invalid escape sequence '\\h'\n",
+      "<>:3: SyntaxWarning: invalid escape sequence '\\h'\n",
+      "C:\\Users\\vaish\\AppData\\Local\\Temp\\ipykernel_16776\\2861124903.py:3: SyntaxWarning: invalid escape sequence '\\h'\n",
+      "  document_path =\"inputs\\harvard.wav\"\n",
+      "e:\\Lexoid\\Lexoid\\.venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n",
+      "\u001b[32m2025-11-17 16:28:41.463\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mlexoid.core.utils\u001b[0m:\u001b[36mis_supported_file_type\u001b[0m:\u001b[36m92\u001b[0m - \u001b[34m\u001b[1mFile type: audio/wav\u001b[0m\n",
+      "\u001b[32m2025-11-17 16:28:41.463\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mlexoid.core.utils\u001b[0m:\u001b[36mrouter\u001b[0m:\u001b[36m559\u001b[0m - \u001b[34m\u001b[1mUsing LLM_PARSE because the type of file is audio.\u001b[0m\n",
+      "\u001b[32m2025-11-17 16:28:41.464\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mlexoid.api\u001b[0m:\u001b[36mwrapper\u001b[0m:\u001b[36m70\u001b[0m - \u001b[34m\u001b[1mAuto-detected parser type: ParserType.LLM_PARSE\u001b[0m\n",
+      "\u001b[32m2025-11-17 16:28:41.464\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mlexoid.api\u001b[0m:\u001b[36mparse_chunk\u001b[0m:\u001b[36m135\u001b[0m - \u001b[34m\u001b[1mUsing LLM parser\u001b[0m\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The Stale Smell of Old Beer Lingers\n",
+      "\n",
+      "- The stale smell of old beer lingers.\n",
+      "- It takes heat to bring out the odor.\n",
+      "- A cold dip restores health and zest.\n",
+      "- A salt pickle tastes fine with ham.\n",
+      "- Tacos al pastor are my favorite.\n",
+      "- A zestful food is the hot cross bun.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from lexoid.api import parse\n",
+    "\n",
+    "document_path =\"inputs/harvard.wav\"\n",
+    "parsed_md = parse(document_path, \"AUTO\",api=\"gemini\")[\"raw\"]\n",
+    "print(parsed_md)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {

diff --git a/harvard.wav b/harvard.wav
diff --git a/lexoid/core/parse_type/llm_parser.py b/lexoid/core/parse_type/llm_parser.py
@@ -11,6 +11,7 @@
 import requests
 import torch
 from anthropic import Anthropic
+from google import genai
 from huggingface_hub import InferenceClient
 from loguru import logger
 from mistralai import Mistral
@@ -21,10 +22,11 @@
 from transformers import AutoModelForVision2Seq, AutoProcessor
 
 from lexoid.core.conversion_utils import (
-    convert_image_to_pdf,
     convert_doc_to_base64_images,
+    convert_image_to_pdf,
 )
 from lexoid.core.prompt_templates import (
+    AUDIO_TO_MARKDOWN_PROMPT,
     INSTRUCTIONS_ADD_PG_BREAK,
     LLAMA_PARSER_PROMPT,
     OPENAI_USER_PROMPT,
@@ -92,7 +94,7 @@ def wrapper(*args, **kwargs):
 @retry_on_error
 def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
     mime_type = get_file_type(path)
-    if not ("image" in mime_type or "pdf" in mime_type):
+    if not ("image" in mime_type or "pdf" in mime_type or "audio" in mime_type):
         raise ValueError(
             f"Unsupported file type: {mime_type}. Only PDF and image files are supported for LLM_PARSE."
         )
@@ -107,6 +109,11 @@ def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
 
     api_provider = get_api_provider_for_model(model)
 
+    if mime_type.startswith("audio") and api_provider != "gemini":
+        raise ValueError(
+            f"Audio files are only supported with the Gemini API provider. The model '{model}' is not compatible."
+        )
+
     if api_provider == "gemini":
         return parse_with_gemini(path, **kwargs)
     elif api_provider == "local":
@@ -398,6 +405,10 @@ def process_match(match):
 def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
     # Check if the file is an image and convert to PDF if necessary
     mime_type, _ = mimetypes.guess_type(path)
+
+    if mime_type and mime_type.startswith("audio"):
+        return parse_audio_with_gemini(path, **kwargs)
+
     if mime_type and mime_type.startswith("image"):
         pdf_content = convert_image_to_pdf(path)
         mime_type = "application/pdf"
@@ -767,3 +778,37 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
             "total": sum(total_tokens for _, _, _, _, total_tokens in all_results),
         },
     }
+
+
+def parse_audio_with_gemini(path: str, **kwargs) -> Dict:
+    """Transcribe an audio file into Markdown with the Gemini API.
+
+    The file at ``path`` is uploaded through the Gemini client and sent to
+    the model together with a system prompt. The whole transcript is
+    returned as a single segment.
+
+    Args:
+        path: Path of the audio file to upload.
+        **kwargs: Must contain ``model``. Optionally ``system_prompt``
+            (an empty or missing value selects the default audio prompt),
+            ``title``, ``url`` and ``parent_title``, which are copied into
+            the result.
+
+    Returns:
+        A dict with the keys ``raw``, ``segments``, ``title``, ``url``,
+        ``parent_title``, ``recursive_docs`` and ``token_usage``.
+
+    Raises:
+        KeyError: If ``model`` is not present in ``kwargs``.
+    """
+    client = genai.Client()
+    # NOTE(review): the uploaded file is never deleted in this function;
+    # confirm whether relying on the service's own expiry is acceptable.
+    audio_file = client.files.upload(file=path)
+
+    system_prompt = kwargs.get("system_prompt")
+    if not system_prompt:
+        # The default prompt falls back to the file name for the title, so
+        # the real path has to be interpolated (hence the f-string). The
+        # leading newline keeps it apart from the prompt's last sentence.
+        system_prompt = AUDIO_TO_MARKDOWN_PROMPT + f"\nAudio file name is: {path}\n"
+
+    response = client.models.generate_content(
+        model=kwargs["model"], contents=[system_prompt, audio_file]
+    )
+    transcript = response.text
+
+    # Usage metadata (or individual counters) may be absent on a response;
+    # treat missing values as zero instead of failing on ``None + int``.
+    usage = response.usage_metadata
+    input_tokens = getattr(usage, "prompt_token_count", None) or 0
+    output_tokens = getattr(usage, "candidates_token_count", None) or 0
+
+    return {
+        "raw": transcript,
+        "segments": [
+            {
+                # Audio has no pages; everything is reported as page 0.
+                "metadata": {"page": 0},
+                "content": transcript,
+            }
+        ],
+        "title": kwargs.get("title", ""),
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+        "token_usage": {
+            "input": input_tokens,
+            "output": output_tokens,
+            "total": input_tokens + output_tokens,
+        },
+    }
diff --git a/lexoid/core/prompt_templates.py b/lexoid/core/prompt_templates.py
@@ -175,3 +175,10 @@
 LATEX_USER_PROMPT = """You are an AI agent specialized in parsing PDF documents and converting them into clean, valid LaTeX format. 
 Your goal is to produce LaTeX code that accurately represents the document's structure, content, and layout while ensuring everything fits within standard page margins.
 """
+
+# Prompt for audio inputs: asks the model for a cleaned-up, structured
+# Markdown transcript. Its last instruction falls back to the audio file
+# name for the top-level heading, so the caller is expected to supply that
+# name alongside this prompt.
+AUDIO_TO_MARKDOWN_PROMPT = """You are an expert transcription and formatting assistant. 
+Convert the provided audio into a clean, well-structured Markdown document, preserving the logical flow, sections, and any lists or numbered points mentioned in the speech. 
+Remove background noise and ignore any irrelevant sounds, side conversations, or filler words like “um” and “uh” that do not add meaning. 
+Where appropriate, use Markdown headings, bullet points, numbered lists, and bold/italic text to improve clarity and readability. 
+If the speaker mentions code, equations, or examples, format them using proper Markdown code blocks or inline code. 
+Determine whether the speaker explicitly states a clear title in the audio; if a title is stated, use it as the main top-level Markdown heading; otherwise, use the audio file name (without its extension) as the main top-level Markdown heading."""
diff --git a/lexoid/core/utils.py b/lexoid/core/utils.py
@@ -97,6 +97,7 @@ def is_supported_file_type(path: str) -> bool:
         or "presentation" in file_type
         or file_type.startswith("image/")
         or file_type.startswith("text")
+        or file_type.startswith("audio")
     ):
         return True
     return False
@@ -554,6 +555,10 @@ def router(path: str, priority: str = "speed", autoselect_llm: bool = False) ->
     ):
         return "STATIC_PARSE", None
 
+    if file_type.startswith("audio"):
+        logger.debug("Using LLM_PARSE because the type of file is audio.")
+        return "LLM_PARSE", None
+
     if priority == "accuracy":
         # If the file is a PDF without images but has hyperlinks, use STATIC_PARSE
         # Otherwise, use LLM_PARSE