diff --git a/examples/patient_intake_extraction_dspy/.env.example b/examples/patient_intake_extraction_dspy/.env.example new file mode 100644 index 000000000..5ce3a5a3b --- /dev/null +++ b/examples/patient_intake_extraction_dspy/.env.example @@ -0,0 +1,4 @@ +# Postgres database address for cocoindex +COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex + +GEMINI_API_KEY= diff --git a/examples/patient_intake_extraction_dspy/.gitignore b/examples/patient_intake_extraction_dspy/.gitignore new file mode 100644 index 000000000..4c49bd78f --- /dev/null +++ b/examples/patient_intake_extraction_dspy/.gitignore @@ -0,0 +1 @@ +.env diff --git a/examples/patient_intake_extraction_dspy/README.md b/examples/patient_intake_extraction_dspy/README.md new file mode 100644 index 000000000..149706250 --- /dev/null +++ b/examples/patient_intake_extraction_dspy/README.md @@ -0,0 +1,72 @@ +# Extract structured data from patient intake forms with DSPy + +[![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) +We appreciate a star ⭐ at [CocoIndex GitHub](https://github.com/cocoindex-io/cocoindex) if this is helpful. + +This example shows how to use [DSPy](https://github.com/stanfordnlp/dspy) with Gemini 2.5 Flash (vision model) to extract structured data from patient intake PDFs. DSPy provides a programming model for building AI systems using language models as building blocks. 
+ +- **Pydantic Models** (`main.py`) - Defines the data structure using Pydantic for type safety +- **DSPy Module** (`main.py`) - Defines the extraction signature and module using DSPy's ChainOfThought with vision support +- **CocoIndex Flow** (`main.py`) - Wraps DSPy in a custom function, provides the flow to process files incrementally + +## Key Features + +- **Native PDF Support**: Converts PDFs to images and processes directly with vision models +- **DSPy Vision Integration**: Uses DSPy's `Image` type with `ChainOfThought` for visual document understanding +- **Structured Outputs**: Pydantic models ensure type-safe, validated extraction +- **No Text Extraction Required**: Directly processes PDF images without intermediate markdown conversion +- **Incremental Processing**: CocoIndex handles batching and caching automatically +- **PostgreSQL Storage**: Results stored in a structured database table + +## Prerequisites + +1. [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one. + +2. Install dependencies + + ```sh + pip install -U cocoindex dspy-ai pydantic pymupdf + ``` + +3. Create a `.env` file. You can copy it from `.env.example` first: + + ```sh + cp .env.example .env + ``` + + Then edit the file to fill in your `GEMINI_API_KEY`. + +## Run + +Update index: + +```sh +cocoindex update main +``` + +## How It Works + +The example demonstrates DSPy vision integration with CocoIndex: + +1. **Pydantic Models**: Define the structured schema (Patient, Contact, Address, etc.) +2. **DSPy Signature**: Declares input (`list[dspy.Image]`) and output (Patient model) fields +3. **DSPy Module**: Uses `ChainOfThought` with vision capabilities to reason about extraction from images +4. **Single-Step Extraction**: + - The extractor receives PDF bytes directly + - Internally converts PDF pages to DSPy Image objects using PyMuPDF + - Processes images with vision model + - Returns Pydantic model directly +5. 
**CocoIndex Flow**: + - Loads PDFs from a local directory as binary + - Applies a single transform: PDF bytes → Patient data + - Stores results in PostgreSQL + +## CocoInsight + +I used CocoInsight (currently in free beta) to troubleshoot the index generation and understand the data lineage of the pipeline. It just connects to your local CocoIndex server, with zero pipeline data retention. Run the following command to start CocoInsight: + +```sh +cocoindex server -ci main +``` + +Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight). diff --git a/examples/patient_intake_extraction_dspy/data/README.md b/examples/patient_intake_extraction_dspy/data/README.md new file mode 100644 index 000000000..43f941f6a --- /dev/null +++ b/examples/patient_intake_extraction_dspy/data/README.md @@ -0,0 +1,4 @@ +## Note: +Example files here are purely artificial and not real, for testing purposes only. +Please do not use these examples for any other purpose. + diff --git a/examples/patient_intake_extraction_dspy/data/patient_forms/Patient_Intake_Form_David_Artificial.pdf b/examples/patient_intake_extraction_dspy/data/patient_forms/Patient_Intake_Form_David_Artificial.pdf new file mode 100644 index 000000000..5fd43832d Binary files /dev/null and b/examples/patient_intake_extraction_dspy/data/patient_forms/Patient_Intake_Form_David_Artificial.pdf differ diff --git a/examples/patient_intake_extraction_dspy/data/patient_forms/Patient_Intake_Form_Emily_Artificial.pdf b/examples/patient_intake_extraction_dspy/data/patient_forms/Patient_Intake_Form_Emily_Artificial.pdf new file mode 100644 index 000000000..09cff13c8 Binary files /dev/null and b/examples/patient_intake_extraction_dspy/data/patient_forms/Patient_Intake_Form_Emily_Artificial.pdf differ diff --git a/examples/patient_intake_extraction_dspy/data/patient_forms/Patient_Intake_Form_Joe_Artificial.pdf b/examples/patient_intake_extraction_dspy/data/patient_forms/Patient_Intake_Form_Joe_Artificial.pdf new 
"""Patient intake extraction example: PDF forms -> structured rows via DSPy.

The module defines:

* Pydantic models describing the data pulled from an intake form,
* a DSPy signature/module that asks a vision-capable LM to fill a ``Patient``
  from page images,
* a CocoIndex function that renders a PDF to images and runs the module,
* a CocoIndex flow that reads PDFs from ``data/patient_forms`` and exports the
  results to Postgres.
"""

import datetime

import dspy
from pydantic import BaseModel, Field
import fitz  # PyMuPDF

import cocoindex

# Zoom factor applied on both axes when rasterising a PDF page; 2 renders at
# twice the default resolution so small form text stays legible to the model.
_RENDER_ZOOM = 2


# Pydantic models for DSPy structured outputs.
#
# NOTE: the models are documented with ``#`` comments rather than class
# docstrings on purpose. Pydantic copies a model's docstring into the
# ``description`` of its JSON schema, so adding docstrings here could change
# the schema text handed to the LM (and therefore cached extraction results).


# A person to reach on the patient's behalf (used for the emergency contact).
class Contact(BaseModel):
    name: str
    phone: str
    relationship: str


# A postal address; shared by ``Patient`` and ``Pharmacy``.
class Address(BaseModel):
    street: str
    city: str
    state: str
    zip_code: str


# The patient's pharmacy, including where it is located.
class Pharmacy(BaseModel):
    name: str
    phone: str
    address: Address


# Insurance coverage details; ``group_number`` is the only optional field.
class Insurance(BaseModel):
    provider: str
    policy_number: str
    group_number: str | None = None
    policyholder_name: str
    relationship_to_patient: str


# A medical condition listed on the form and whether it was diagnosed.
class Condition(BaseModel):
    name: str
    diagnosed: bool


# A medication the patient currently takes.
class Medication(BaseModel):
    name: str
    dosage: str


# A single allergy entry.
class Allergy(BaseModel):
    name: str


# A past surgery; ``date`` is kept as a plain string, not a parsed date.
class Surgery(BaseModel):
    name: str
    date: str


# Top-level record extracted from one intake form.
#
# List-valued fields default to empty lists and the ``| None`` fields default
# to ``None``, so a form that omits them still validates. ``dob`` is typed as
# ``datetime.date`` whereas ``consent_date`` is a plain optional string.
class Patient(BaseModel):
    name: str
    dob: datetime.date
    gender: str
    address: Address
    phone: str
    email: str
    preferred_contact_method: str
    emergency_contact: Contact
    insurance: Insurance | None = None
    reason_for_visit: str
    symptoms_duration: str
    past_conditions: list[Condition] = Field(default_factory=list)
    current_medications: list[Medication] = Field(default_factory=list)
    allergies: list[Allergy] = Field(default_factory=list)
    surgeries: list[Surgery] = Field(default_factory=list)
    occupation: str | None = None
    pharmacy: Pharmacy | None = None
    consent_given: bool
    consent_date: str | None = None


# DSPy Signature for patient information extraction from images.
# The class docstring and the ``desc`` strings below are part of the prompt
# DSPy builds, so they are runtime text: edit them only deliberately.
class PatientExtractionSignature(dspy.Signature):
    """Extract structured patient information from a medical intake form image."""

    form_images: list[dspy.Image] = dspy.InputField(
        desc="Images of the patient intake form pages"
    )
    patient: Patient = dspy.OutputField(
        desc="Extracted patient information with all available fields filled"
    )


class PatientExtractor(dspy.Module):
    """DSPy module for extracting patient information from intake form images."""

    def __init__(self) -> None:
        super().__init__()
        # ChainOfThought wraps the signature so the LM reasons before answering.
        self.extract = dspy.ChainOfThought(PatientExtractionSignature)

    def forward(self, form_images: list[dspy.Image]) -> Patient:
        """Run the predictor on ``form_images`` and return its ``patient`` output."""
        result = self.extract(form_images=form_images)
        return result.patient  # type: ignore


def _render_pdf_pages(pdf_content: bytes) -> list[dspy.Image]:
    """Render every page of an in-memory PDF into a PNG-backed ``dspy.Image``.

    Args:
        pdf_content: Raw bytes of a PDF document.

    Returns:
        One ``dspy.Image`` per page, in page order.
    """
    # Opening the document as a context manager guarantees it is closed even
    # when rendering a page raises; the previous explicit ``close()`` call was
    # skipped on any exception and leaked the document handle.
    with fitz.open(stream=pdf_content, filetype="pdf") as pdf_doc:
        # The scaling matrix is identical for every page, so build it once.
        zoom = fitz.Matrix(_RENDER_ZOOM, _RENDER_ZOOM)
        return [
            dspy.Image(page.get_pixmap(matrix=zoom).tobytes("png"))
            for page in pdf_doc
        ]


# ``behavior_version`` is intentionally left at 1: the output for a given PDF
# is unchanged by the resource-handling fix, so cached results stay valid.
@cocoindex.op.function(cache=True, behavior_version=1)
def extract_patient(pdf_content: bytes) -> Patient:
    """Extract patient information from PDF content.

    Args:
        pdf_content: Raw bytes of a patient intake form PDF.

    Returns:
        The ``Patient`` produced by the DSPy extractor for the rendered pages.
    """
    # Convert PDF pages to DSPy Image objects
    form_images = _render_pdf_pages(pdf_content)

    # Extract patient information using DSPy with vision
    extractor = PatientExtractor()
    patient = extractor(form_images=form_images)

    return patient  # type: ignore


@cocoindex.flow_def(name="PatientIntakeExtractionDSPy")
def patient_intake_extraction_dspy_flow(
    flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
) -> None:
    """
    Define a flow that extracts patient information from intake forms using DSPy.

    This flow:
    1. Reads patient intake PDFs as binary
    2. Uses DSPy with vision models to extract structured patient information
       (PDF to image conversion happens inside ``extract_patient``)
    3. Stores the results in a Postgres database
    """
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="data/patient_forms", binary=True)
    )

    patients_index = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        # Extract patient information directly from PDF using DSPy with vision
        # (PDF->Image conversion happens inside the extractor)
        doc["patient_info"] = doc["content"].transform(extract_patient)

        # Collect the extracted patient information
        patients_index.collect(
            filename=doc["filename"],
            patient_info=doc["patient_info"],
        )

    # Export to Postgres, one row per source file.
    # NOTE(review): newer CocoIndex documentation refers to export targets as
    # ``cocoindex.targets``; confirm ``cocoindex.storages`` is still exported
    # by the version pinned for this example.
    patients_index.export(
        "patients",
        cocoindex.storages.Postgres(table_name="patients_info_dspy"),
        primary_key_fields=["filename"],
    )


@cocoindex.settings
def cocoindex_settings() -> cocoindex.Settings:
    """Configure DSPy's language model, then load CocoIndex settings from env."""
    # Configure the model used in DSPy
    lm = dspy.LM("gemini/gemini-2.5-flash")
    dspy.configure(lm=lm)

    return cocoindex.Settings.from_env()