diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..ef32c5b Binary files /dev/null and b/.DS_Store differ diff --git a/README.md b/README.md index 42862e3..b717bec 100644 --- a/README.md +++ b/README.md @@ -55,3 +55,34 @@ __Contributors:__ - Vincent Harkins (@vharkins1) - Marc Vergés (@marcvergees) - Jan Sans + + +## Local Development Setup (Beginner Friendly) + +1. Clone your fork and enter project folder: + + - git clone + cd FireForm (Terminal) + +2. Create virtual environment: + + - python3 -m venv venv + source venv/bin/activate + +3. Install dependencies: + +4. Initialize database tables: + +5. Run backend server: + +6. Open Swagger UI in browser: (http://127.0.0.1:8000/docs) + +### Common Errors + +- `sqlite3.OperationalError: no such table` +→ Run database initialization step. + +- `Could not connect to Ollama` +→ Ensure Ollama server is running locally. + + diff --git a/api/db/models.py b/api/db/models.py index f76c93b..d237f82 100644 --- a/api/db/models.py +++ b/api/db/models.py @@ -15,4 +15,5 @@ class FormSubmission(SQLModel, table=True): template_id: int input_text: str output_pdf_path: str - created_at: datetime = Field(default_factory=datetime.utcnow) \ No newline at end of file + requires_review: bool = False + created_at: datetime = Field(default_factory=datetime.utcnow) diff --git a/api/routes/forms.py b/api/routes/forms.py index cee5356..7743fec 100644 --- a/api/routes/forms.py +++ b/api/routes/forms.py @@ -1,27 +1,23 @@ -from fastapi import APIRouter, Depends -from sqlmodel import Session -from api.deps import get_db -from api.schemas.forms import FormFill, FormFillResponse -from api.db.repositories import create_form, get_template -from api.db.models import FormSubmission -from api.errors.base import AppError -from src.controller import Controller +from pydantic import BaseModel, Field, field_validator -router = APIRouter(prefix="/forms", tags=["forms"]) +class FormFill(BaseModel): + template_id: int + input_text: str = Field(..., min_length=1, max_length=50000) -@router.post("/fill", response_model=FormFillResponse) -def fill_form(form: FormFill, db: Session = Depends(get_db)): - fetched_template = get_template(db, form.template_id) - if not fetched_template: - raise AppError("Template not found", status_code=404) + @field_validator("input_text") + @classmethod + def validate_input_text(cls, v): + stripped = v.strip() + if not stripped: + raise ValueError("Input text cannot be empty or only whitespace") + return stripped - controller = Controller() - path = controller.fill_form( - user_input=form.input_text, - fields=fetched_template.fields, - pdf_form_path=fetched_template.pdf_path, - ) - submission = FormSubmission(**form.model_dump(), output_pdf_path=path) - return create_form(db, submission) +class FormFillResponse(BaseModel): + id: int + template_id: int + input_text: str + output_pdf_path: str + + model_config = {"from_attributes": True} \ No newline at end of file diff --git a/src/controller.py b/src/controller.py index d31ec9c..c761780 100644 --- a/src/controller.py +++ b/src/controller.py @@ -5,7 +5,14 @@ def __init__(self): self.file_manipulator = FileManipulator() def fill_form(self, user_input: str, fields: list, pdf_form_path: str): - return self.file_manipulator.fill_form(user_input, fields, pdf_form_path) + path, review_flag = self.file_manipulator.fill_form( + user_input=user_input, + fields=fields, + pdf_form_path=pdf_form_path + ) + return path, review_flag + + def create_template(self, pdf_path: str): return self.file_manipulator.create_template(pdf_path) \ No newline at end of file diff --git a/src/file_manipulator.py b/src/file_manipulator.py index e499c89..a8a723b 100644 --- a/src/file_manipulator.py +++ b/src/file_manipulator.py @@ -1,7 +1,10 @@ import os +import logging from src.filler import Filler from src.llm import LLM +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) class FileManipulator: def __init__(self): @@ -28,24 +31,43 @@ def fill_form(self, user_input: str, fields: list, pdf_form_path: str): It receives the raw data, runs the PDF filling logic, and returns the path to the newly created file. """ - print("[1] Received request from frontend.") + logger.info("[1] Received request from frontend.") print(f"[2] PDF template path: {pdf_form_path}") if not os.path.exists(pdf_form_path): print(f"Error: PDF template not found at {pdf_form_path}") return None # Or raise an exception - print("[3] Starting extraction and PDF filling process...") + logger.info("[3] Starting extraction...") try: self.llm._target_fields = fields self.llm._transcript_text = user_input - output_name = self.filler.fill_form(pdf_form=pdf_form_path, llm=self.llm) + + success = self.llm.extract_structured_safe() + + if not success: + print("Structured extraction failed → fallback to old extraction") + self.llm.main_loop() + + output_name = self.filler.fill_form( + pdf_form=pdf_form_path, + llm=self.llm + ) + + from src.utils.validation import requires_review + + extracted_data = self.llm.get_data() + + review_flag = requires_review( + extracted_data, + fields.keys() + ) print("\n----------------------------------") print("✅ Process Complete.") print(f"Output saved to: {output_name}") - return output_name + return output_name, review_flag except Exception as e: print(f"An error occurred during PDF generation: {e}") diff --git a/src/llm.py b/src/llm.py index 3621187..46e3848 100644 --- a/src/llm.py +++ b/src/llm.py @@ -1,109 +1,167 @@ import json +import logging import os +import time import requests -from requests.exceptions import Timeout, RequestException + +logger = logging.getLogger("fireform.llm") + +# Configuration constants +LLM_REQUEST_TIMEOUT_SECONDS = 120 +LLM_MAX_RETRIES = 3 +LLM_RETRY_BASE_DELAY_SECONDS = 2 class LLM: def __init__(self, transcript_text=None, target_fields=None, json=None): if json is None: json = {} - self._transcript_text = transcript_text # str - self._target_fields = target_fields # List, contains the template field. - self._json = json # dictionary + self._transcript_text = transcript_text + self._target_fields = target_fields + self._json = json def type_check_all(self): if type(self._transcript_text) is not str: raise TypeError( - f"ERROR in LLM() attributes ->\ - Transcript must be text. Input:\n\ttranscript_text: {self._transcript_text}" + f"ERROR in LLM() attributes -> " + f"Transcript must be text. Input:\n\ttranscript_text: {self._transcript_text}" ) elif type(self._target_fields) is not list: raise TypeError( - f"ERROR in LLM() attributes ->\ - Target fields must be a list. Input:\n\ttarget_fields: {self._target_fields}" + f"ERROR in LLM() attributes -> " + f"Target fields must be a list. Input:\n\ttarget_fields: {self._target_fields}" ) def build_prompt(self, current_field): """ - This method is in charge of the prompt engineering. It creates a specific prompt for each target field. - @params: current_field -> represents the current element of the json that is being prompted. + Creates a specific prompt for each target field. """ - prompt = f""" + prompt = f""" SYSTEM PROMPT: - You are an AI assistant designed to help fillout json files with information extracted from transcribed voice recordings. - You will receive the transcription, and the name of the JSON field whose value you have to identify in the context. Return - only a single string containing the identified value for the JSON field. + You are an AI assistant designed to help fillout json files with information extracted from transcribed voice recordings. + You will receive the transcription, and the name of the JSON field whose value you have to identify in the context. Return + only a single string containing the identified value for the JSON field. If the field name is plural, and you identify more than one possible value in the text, return both separated by a ";". If you don't identify the value in the provided text, return "-1". --- DATA: Target JSON field to find in text: {current_field} - + TEXT: {self._transcript_text} """ - return prompt - def main_loop(self): - timeout = 30 - max_retries = 3 + def _call_ollama(self, prompt, field_name): + """ + Send a prompt to Ollama with timeout and retry logic. + """ + ollama_host = os.getenv("OLLAMA_HOST", "http://localhost:11434").rstrip("/") + ollama_url = f"{ollama_host}/api/generate" - # self.type_check_all() - total_fields = len(self._target_fields) - for i, field in enumerate(self._target_fields.keys(), 1): - prompt = self.build_prompt(field) - # print(prompt) - # ollama_url = "http://localhost:11434/api/generate" - ollama_host = os.getenv("OLLAMA_HOST", "http://localhost:11434").rstrip("/") - ollama_url = f"{ollama_host}/api/generate" - - payload = { - "model": "mistral", - "prompt": prompt, - "stream": False, # don't really know why --> look into this later. - } - - json_data = None + payload = { + "model": "mistral", + "prompt": prompt, + "stream": False, + } + + last_exception = None + + for attempt in range(1, LLM_MAX_RETRIES + 1): try: - for attempt in range(max_retries): - try: - response = requests.post(ollama_url, json=payload, timeout=timeout) - response.raise_for_status() - json_data = response.json() - break - except Timeout: - print(f"Ollama request timed out (attempt {attempt+1})") - except RequestException as e: - print(f"Ollama request failed: {e}") - except requests.exceptions.ConnectionError: - raise ConnectionError( - f"Could not connect to Ollama at {ollama_url}. " - "Please ensure Ollama is running and accessible." + logger.info( + "LLM request for field '%s' (attempt %d/%d)", + field_name, + attempt, + LLM_MAX_RETRIES, + ) + + response = requests.post( + ollama_url, + json=payload, + timeout=LLM_REQUEST_TIMEOUT_SECONDS, ) - except requests.exceptions.HTTPError as e: - raise RuntimeError(f"Ollama returned an error: {e}") - - if json_data is None: - raise RuntimeError("Failed to get response from Ollama after retries.") - else: - # parse response - parsed_response = json_data["response"] - # print(parsed_response) - self.add_response_to_json(field, parsed_response) - print(f"[{i}/{total_fields}] Extracted data for field '{field}' successfully.") - - print("----------------------------------") - print("\t[LOG] Resulting JSON created from the input text:") - print(json.dumps(self._json, indent=2)) - print("--------- extracted data ---------") + response.raise_for_status() + + json_data = response.json() + result = json_data["response"] + + logger.info( + "LLM response for field '%s': %s", + field_name, + result[:100] if len(result) > 100 else result, + ) + + return result + + except requests.exceptions.Timeout as exc: + last_exception = exc + logger.warning( + "LLM request timed out for field '%s' (attempt %d/%d)", + field_name, + attempt, + LLM_MAX_RETRIES, + ) + + except requests.exceptions.ConnectionError as exc: + last_exception = exc + logger.warning( + "Cannot connect to Ollama for field '%s' (attempt %d/%d)", + field_name, + attempt, + LLM_MAX_RETRIES, + ) + + except requests.exceptions.HTTPError as exc: + last_exception = exc + if response.status_code >= 500: + logger.warning( + "Ollama server error %d for field '%s' (attempt %d/%d)", + response.status_code, + field_name, + attempt, + LLM_MAX_RETRIES, + ) + else: + # Client errors (4xx) should not be retried + raise RuntimeError( + f"Ollama returned client error {response.status_code} " + f"for field '{field_name}': {exc}" + ) from exc + + # Exponential backoff before retry + if attempt < LLM_MAX_RETRIES: + delay = LLM_RETRY_BASE_DELAY_SECONDS * (2 ** (attempt - 1)) + logger.info("Retrying in %d seconds...", delay) + time.sleep(delay) + + # All retries exhausted + raise RuntimeError( + f"LLM extraction failed for field '{field_name}' after " + f"{LLM_MAX_RETRIES} attempts: {last_exception}" + ) + + def main_loop(self): + """ + Iterate over all target fields, extract values from the LLM, + and build the result JSON. + """ + logger.info( + "Starting LLM extraction for %d fields", + len(self._target_fields) if self._target_fields else 0, + ) + + for field in self._target_fields.keys(): + prompt = self.build_prompt(field) + parsed_response = self._call_ollama(prompt, field_name=field) + self.add_response_to_json(field, parsed_response) + + logger.info("LLM extraction complete. Result:\n%s", json.dumps(self._json, indent=2)) return self def add_response_to_json(self, field, value): """ - this method adds the following value under the specified field, - or under a new field if the field doesn't exist, to the json dict + Adds the extracted value under the specified field in the JSON dict. """ value = value.strip().replace('"', "") parsed_value = None @@ -123,27 +181,26 @@ def add_response_to_json(self, field, value): def handle_plural_values(self, plural_value): """ - This method handles plural values. - Takes in strings of the form 'value1; value2; value3; ...; valueN' - returns a list with the respective values -> [value1, value2, value3, ..., valueN] + Handles plural values separated by semicolons. + 'value1; value2; value3' → ['value1', 'value2', 'value3'] """ if ";" not in plural_value: raise ValueError( f"Value is not plural, doesn't have ; separator, Value: {plural_value}" ) - print( - f"\t[LOG]: Formating plural values for JSON, [For input {plural_value}]..." - ) + logger.debug("Formatting plural values for input: %s", plural_value) values = plural_value.split(";") - # Remove trailing leading whitespace for i in range(len(values)): - values[i] = values[i].lstrip() + current = i + 1 + if current < len(values): + clean_value = values[current].lstrip() + values[current] = clean_value - print(f"\t[LOG]: Resulting formatted list of values: {values}") + logger.debug("Resulting formatted list: %s", values) return values def get_data(self): - return self._json + return self._json \ No newline at end of file diff --git a/src/schemas/incident-schema.py b/src/schemas/incident-schema.py new file mode 100644 index 0000000..054054a --- /dev/null +++ b/src/schemas/incident-schema.py @@ -0,0 +1,6 @@ +INCIDENT_SCHEMA = { + "location": "", + "time": "", + "severity": "", + "description": "" +} \ No newline at end of file diff --git a/src/utils/validation.py b/src/utils/validation.py new file mode 100644 index 0000000..5ecc5c1 --- /dev/null +++ b/src/utils/validation.py @@ -0,0 +1,11 @@ +def requires_review(data: dict, required_fields: list): + for field in required_fields: + value = data.get(field) + + if value is None: + return True + + if isinstance(value, str) and value.strip() in ["", "-1"]: + return True + + return False \ No newline at end of file diff --git a/test.pdf b/test.pdf new file mode 100644 index 0000000..e69de29