|
| 1 | +import datetime |
| 2 | + |
| 3 | +import dspy |
| 4 | +from pydantic import BaseModel, Field |
| 5 | +import fitz # PyMuPDF |
| 6 | + |
| 7 | +import cocoindex |
| 8 | + |
| 9 | + |
| 10 | +# Pydantic models for DSPy structured outputs |
# A person who can be contacted about the patient; used as the type of
# `Patient.emergency_contact`. All three fields are required free-form strings.
# (Kept as `#` comments rather than a docstring so the generated schema is unchanged.)
class Contact(BaseModel):
    name: str
    phone: str
    relationship: str
| 15 | + |
| 16 | + |
# Postal address shared by `Patient.address` and `Pharmacy.address`.
# Every component is a required string; no format validation is applied here.
class Address(BaseModel):
    street: str
    city: str
    state: str
    zip_code: str
| 22 | + |
| 23 | + |
# Pharmacy record referenced by the optional `Patient.pharmacy` field.
class Pharmacy(BaseModel):
    name: str
    phone: str
    address: Address  # nested structured address, required
| 28 | + |
| 29 | + |
# Insurance details referenced by the optional `Patient.insurance` field.
class Insurance(BaseModel):
    provider: str
    policy_number: str
    group_number: str | None = None  # the only optional field; defaults to None
    policyholder_name: str
    relationship_to_patient: str
| 36 | + |
| 37 | + |
# One entry of `Patient.past_conditions`.
class Condition(BaseModel):
    name: str
    # NOTE(review): presumably whether the condition was formally diagnosed;
    # the exact meaning is not defined in this file - confirm.
    diagnosed: bool
| 41 | + |
| 42 | + |
# One entry of `Patient.current_medications`.
class Medication(BaseModel):
    name: str
    dosage: str  # free-form text; not parsed into amount/unit here
| 46 | + |
| 47 | + |
# One entry of `Patient.allergies`; carries only the allergy name.
class Allergy(BaseModel):
    name: str
| 50 | + |
| 51 | + |
# One entry of `Patient.surgeries`.
class Surgery(BaseModel):
    name: str
    date: str  # stored as a plain string (unlike `Patient.dob`, which is a date)
| 55 | + |
| 56 | + |
# Top-level record for one intake form; this is the output type of
# `PatientExtractionSignature.patient` and the return type of `extract_patient`.
# Field order is kept exactly as declared because it determines the schema order.
# (Documented with `#` comments so the generated schema text is unchanged.)
class Patient(BaseModel):
    # --- identity and contact details (all required) ---
    name: str
    dob: datetime.date  # the only field typed as a real date
    gender: str
    address: Address
    phone: str
    email: str
    preferred_contact_method: str
    emergency_contact: Contact

    # --- coverage (optional, defaults to None) ---
    insurance: Insurance | None = None

    # --- visit details (required) ---
    reason_for_visit: str
    symptoms_duration: str

    # --- medical history; each list defaults to a fresh empty list ---
    past_conditions: list[Condition] = Field(default_factory=list)
    current_medications: list[Medication] = Field(default_factory=list)
    allergies: list[Allergy] = Field(default_factory=list)
    surgeries: list[Surgery] = Field(default_factory=list)

    # --- optional extras (default to None) ---
    occupation: str | None = None
    pharmacy: Pharmacy | None = None

    # --- consent ---
    consent_given: bool
    consent_date: str | None = None  # plain string, optional
| 77 | + |
| 78 | + |
| 79 | +# DSPy Signature for patient information extraction from images |
class PatientExtractionSignature(dspy.Signature):
    """Extract structured patient information from a medical intake form image."""

    # NOTE: the docstring above and the `desc` strings below are part of the
    # signature definition, so their text is kept verbatim.

    # Input: one image per page of the form.
    form_images: list[dspy.Image] = dspy.InputField(
        desc="Images of the patient intake form pages",
    )

    # Output: a single structured `Patient` record.
    patient: Patient = dspy.OutputField(
        desc="Extracted patient information with all available fields filled",
    )
| 89 | + |
| 90 | + |
class PatientExtractor(dspy.Module):
    """DSPy module for extracting patient information from intake form images."""

    def __init__(self) -> None:
        super().__init__()
        # Chain-of-thought predictor driven by PatientExtractionSignature.
        self.extract = dspy.ChainOfThought(PatientExtractionSignature)

    def forward(self, form_images: list[dspy.Image]) -> Patient:
        """Run the predictor on the page images and return its `patient` output.

        Args:
            form_images: Images of the intake form pages.

        Returns:
            The `patient` attribute of the prediction.
        """
        prediction = self.extract(form_images=form_images)
        return prediction.patient  # type: ignore
| 102 | + |
| 103 | + |
@cocoindex.op.function(cache=True, behavior_version=1)
def extract_patient(pdf_content: bytes) -> Patient:
    """Extract patient information from PDF content.

    Every page of the PDF is rendered to a PNG image, wrapped in a
    ``dspy.Image`` and handed to a ``PatientExtractor``.

    Args:
        pdf_content: Raw bytes of the PDF document.

    Returns:
        The ``Patient`` record produced by the extractor.
    """
    # Open the document in a ``with`` block so it is closed even when
    # rendering a page raises; the previous explicit ``close()`` call was
    # skipped on any exception, leaking the document.
    with fitz.open(stream=pdf_content, filetype="pdf") as pdf_doc:
        form_images = [
            # Render at 2x resolution for better quality, encode as PNG bytes,
            # then wrap the bytes in a DSPy Image.
            dspy.Image(page.get_pixmap(matrix=fitz.Matrix(2, 2)).tobytes("png"))
            for page in pdf_doc
        ]

    # Extract patient information using DSPy with vision
    extractor = PatientExtractor()
    patient = extractor(form_images=form_images)

    return patient  # type: ignore
| 127 | + |
| 128 | + |
@cocoindex.flow_def(name="PatientIntakeExtractionDSPy")
def patient_intake_extraction_dspy_flow(
    flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
) -> None:
    """
    Define the flow that turns intake-form PDFs into stored patient records.

    Steps:
    1. Read the files under ``data/patient_forms`` as binary content.
    2. Transform each file's content with ``extract_patient`` (the PDF to image
       conversion happens inside that function).
    3. Collect ``filename`` and ``patient_info`` per file and export them to
       Postgres, keyed by ``filename``.
    """
    source = cocoindex.sources.LocalFile(path="data/patient_forms", binary=True)
    data_scope["documents"] = flow_builder.add_source(source)

    collector = data_scope.add_collector()

    with data_scope["documents"].row() as row:
        # Run the DSPy-based extractor directly on the raw PDF bytes.
        row["patient_info"] = row["content"].transform(extract_patient)

        # Record one output row per input file.
        collector.collect(
            filename=row["filename"],
            patient_info=row["patient_info"],
        )

    # Export the collected rows to Postgres.
    target = cocoindex.storages.Postgres(table_name="patients_info_dspy")
    collector.export(
        "patients",
        target,
        primary_key_fields=["filename"],
    )
| 165 | + |
| 166 | + |
@cocoindex.settings
def cocoindex_settings() -> cocoindex.Settings:
    """Configure DSPy's language model, then return settings loaded from the environment.

    Returns:
        The result of ``cocoindex.Settings.from_env()``.
    """
    # Side effect: set the model DSPy uses for all subsequent calls.
    dspy.configure(lm=dspy.LM("gemini/gemini-2.5-flash"))

    return cocoindex.Settings.from_env()
0 commit comments