Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 61 additions & 3 deletions examples/example_notebook_colab.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"0% [Working]\r",
" \r",
"\r\n",
"0% [Working]\r\n",
" \r\n",
"Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]\n",
"Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 InRelease [1,581 B]\n",
"Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]\n",
Expand Down Expand Up @@ -1968,6 +1968,64 @@
"source": [
"display(Markdown(result_md))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Audio to Markdown (Supported with Gemini)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<>:3: SyntaxWarning: invalid escape sequence '\\h'\n",
"<>:3: SyntaxWarning: invalid escape sequence '\\h'\n",
"C:\\Users\\vaish\\AppData\\Local\\Temp\\ipykernel_16776\\2861124903.py:3: SyntaxWarning: invalid escape sequence '\\h'\n",
" document_path =\"inputs\\harvard.wav\"\n",
"e:\\Lexoid\\Lexoid\\.venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"\u001b[32m2025-11-17 16:28:41.463\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mlexoid.core.utils\u001b[0m:\u001b[36mis_supported_file_type\u001b[0m:\u001b[36m92\u001b[0m - \u001b[34m\u001b[1mFile type: audio/wav\u001b[0m\n",
"\u001b[32m2025-11-17 16:28:41.463\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mlexoid.core.utils\u001b[0m:\u001b[36mrouter\u001b[0m:\u001b[36m559\u001b[0m - \u001b[34m\u001b[1mUsing LLM_PARSE because the type of file is audio.\u001b[0m\n",
"\u001b[32m2025-11-17 16:28:41.464\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mlexoid.api\u001b[0m:\u001b[36mwrapper\u001b[0m:\u001b[36m70\u001b[0m - \u001b[34m\u001b[1mAuto-detected parser type: ParserType.LLM_PARSE\u001b[0m\n",
"\u001b[32m2025-11-17 16:28:41.464\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mlexoid.api\u001b[0m:\u001b[36mparse_chunk\u001b[0m:\u001b[36m135\u001b[0m - \u001b[34m\u001b[1mUsing LLM parser\u001b[0m\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"The Stale Smell of Old Beer Lingers\n",
"\n",
"- The stale smell of old beer lingers.\n",
"- It takes heat to bring out the odor.\n",
"- A cold dip restores health and zest.\n",
"- A salt pickle tastes fine with ham.\n",
"- Tacos al pastor are my favorite.\n",
"- A zestful food is the hot cross bun.\n"
]
}
],
"source": [
"from lexoid.api import parse\n",
"\n",
"document_path =\"inputs\\harvard.wav\"\n",
Copy link

Copilot AI Nov 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The path string uses a single backslash which Python interprets as an escape sequence, causing a SyntaxWarning (visible in the output at lines 1988-1991). Use either a raw string (r"inputs\harvard.wav") or forward slashes ("inputs/harvard.wav") to avoid this warning.

Copilot uses AI. Check for mistakes.
"parsed_md = parse(document_path, \"AUTO\",api=\"gemini\")[\"raw\"]\n",
Copy link

Copilot AI Nov 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing space after comma in the function call. Should be: parse(document_path, "AUTO", api="gemini")

Copilot uses AI. Check for mistakes.
"print(parsed_md)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
Binary file added harvard.wav
Binary file not shown.
51 changes: 48 additions & 3 deletions lexoid/core/parse_type/llm_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import requests
import torch
from anthropic import Anthropic
from google import genai
Copy link

Copilot AI Nov 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code imports from google import genai and uses genai.Client(), which appears to be from the google-genai package. However, the pyproject.toml file specifies google-generativeai (a different package) as the dependency. According to the PR description TODO, google-genai needs to be added via poetry. Either add the correct package to pyproject.toml, or update the code to use the existing google-generativeai package API.

Suggested change
from google import genai
import google.generativeai as genai

Copilot uses AI. Check for mistakes.
from huggingface_hub import InferenceClient
from loguru import logger
from mistralai import Mistral
Expand All @@ -21,10 +22,11 @@
from transformers import AutoModelForVision2Seq, AutoProcessor

from lexoid.core.conversion_utils import (
convert_image_to_pdf,
convert_doc_to_base64_images,
convert_image_to_pdf,
)
from lexoid.core.prompt_templates import (
AUDIO_TO_MARKDOWN_PROMPT,
INSTRUCTIONS_ADD_PG_BREAK,
LLAMA_PARSER_PROMPT,
OPENAI_USER_PROMPT,
Expand Down Expand Up @@ -92,9 +94,9 @@ def wrapper(*args, **kwargs):
@retry_on_error
def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
mime_type = get_file_type(path)
if not ("image" in mime_type or "pdf" in mime_type):
if not ("image" in mime_type or "pdf" in mime_type or "audio" in mime_type):
raise ValueError(
f"Unsupported file type: {mime_type}. Only PDF and image files are supported for LLM_PARSE."
f"Unsupported file type: {mime_type}. Only PDF, image, and audio files are supported for LLM_PARSE."
)
if "api_provider" in kwargs:
if kwargs["api_provider"] == "local":
Expand All @@ -107,6 +109,11 @@ def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:

api_provider = get_api_provider_for_model(model)

if mime_type.startswith("audio") and api_provider != "gemini":
raise ValueError(
f"Audio files are only supported with the Gemini API provider. The model '{model}' is not compatible."
)

if api_provider == "gemini":
return parse_with_gemini(path, **kwargs)
elif api_provider == "local":
Expand Down Expand Up @@ -398,6 +405,10 @@ def process_match(match):
def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
# Check if the file is an image and convert to PDF if necessary
mime_type, _ = mimetypes.guess_type(path)

if mime_type and mime_type.startswith("audio"):
return parse_audio_with_gemini(path, **kwargs)

if mime_type and mime_type.startswith("image"):
pdf_content = convert_image_to_pdf(path)
mime_type = "application/pdf"
Expand Down Expand Up @@ -767,3 +778,37 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
"total": sum(total_tokens for _, _, _, _, total_tokens in all_results),
},
}


def parse_audio_with_gemini(path: str, **kwargs) -> Dict:
    """Transcribe an audio file to Markdown with a Gemini model.

    The file at ``path`` is uploaded through the client's file API and then
    passed, together with the prompt, to ``generate_content``.

    Args:
        path: Path of the audio file to upload.
        **kwargs: ``model`` (required) is the model name passed to
            ``generate_content``. ``system_prompt`` (optional) replaces the
            default prompt when it is a non-empty string. ``title``, ``url``
            and ``parent_title`` (optional) are copied into the result and
            default to ``""``.

    Returns:
        A dict with ``raw`` (the response text), ``segments`` (a single entry
        holding the same text under page 0), ``title``, ``url``,
        ``parent_title``, an empty ``recursive_docs`` list and a
        ``token_usage`` dict with ``input``, ``output`` and ``total`` counts.

    Raises:
        KeyError: If ``model`` is missing from ``kwargs``.
    """
    client = genai.Client()
    audio_file = client.files.upload(file=path)

    system_prompt = kwargs.get("system_prompt")
    if system_prompt is None or system_prompt == "":
        # The default prompt falls back to the audio file name for the main
        # heading, so the name must actually be interpolated (f-string) and
        # kept on its own line rather than glued to the prompt's last sentence.
        system_prompt = f"{AUDIO_TO_MARKDOWN_PROMPT}\nAudio file name is: {path}\n"

    response = client.models.generate_content(
        model=kwargs["model"], contents=[system_prompt, audio_file]
    )

    # NOTE(review): the SDK appears to type the usage metadata and its counts
    # as optional -- confirm. Missing values are reported as 0 so that the
    # total below never fails on None.
    usage = response.usage_metadata
    input_tokens = getattr(usage, "prompt_token_count", None) or 0
    output_tokens = getattr(usage, "candidates_token_count", None) or 0

    return {
        "raw": response.text,
        "segments": [
            {
                "metadata": {"page": 0},
                "content": response.text,
            }
        ],
        "title": kwargs.get("title", ""),
        "url": kwargs.get("url", ""),
        "parent_title": kwargs.get("parent_title", ""),
        "recursive_docs": [],
        "token_usage": {
            "input": input_tokens,
            "output": output_tokens,
            "total": input_tokens + output_tokens,
        },
    }
7 changes: 7 additions & 0 deletions lexoid/core/prompt_templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,10 @@
LATEX_USER_PROMPT = """You are an AI agent specialized in parsing PDF documents and converting them into clean, valid LaTeX format.
Your goal is to produce LaTeX code that accurately represents the document's structure, content, and layout while ensuring everything fits within standard page margins.
"""

# Default instruction prompt for turning an audio recording into Markdown.
# The last instruction falls back to the audio file name for the top-level
# heading, so whoever sends this prompt must also tell the model that name.
# The text has no trailing newline; add a separator before appending to it.
AUDIO_TO_MARKDOWN_PROMPT = """You are an expert transcription and formatting assistant.
Convert the provided audio into a clean, well-structured Markdown document, preserving the logical flow, sections, and any lists or numbered points mentioned in the speech.
Remove background noise and ignore any irrelevant sounds, side conversations, or filler words like “um” and “uh” that do not add meaning.
Where appropriate, use Markdown headings, bullet points, numbered lists, and bold/italic text to improve clarity and readability.
If the speaker mentions code, equations, or examples, format them using proper Markdown code blocks or inline code.
Determine whether the speaker explicitly states a clear title in the audio; if a title is stated, use it as the main top-level Markdown heading; otherwise, use the audio file name (without its extension) as the main top-level Markdown heading."""
5 changes: 5 additions & 0 deletions lexoid/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ def is_supported_file_type(path: str) -> bool:
or "presentation" in file_type
or file_type.startswith("image/")
or file_type.startswith("text")
or file_type.startswith("audio")
):
return True
return False
Expand Down Expand Up @@ -554,6 +555,10 @@ def router(path: str, priority: str = "speed", autoselect_llm: bool = False) ->
):
return "STATIC_PARSE", None

if file_type.startswith("audio"):
logger.debug("Using LLM_PARSE because the type of file is audio.")
return "LLM_PARSE", None

if priority == "accuracy":
# If the file is a PDF without images but has hyperlinks, use STATIC_PARSE
# Otherwise, use LLM_PARSE
Expand Down