
 '''

-import pathlib
-import pickle as pkl
-import re
+import os
+
+import torch
+from transformers import AutoModelForTokenClassification, AutoTokenizer

 import harmony
-from harmony.parsing.util.feature_extraction import convert_text_to_features
 from harmony.parsing.util.tika_wrapper import parse_pdf_to_plain_text
 from harmony.schemas.requests.text import RawFile, Instrument

-model_containing_folder = pathlib.Path(__file__).parent.resolve()
+# Disable tokenizer parallelism
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+
+def group_token_spans_by_class(tokens, classes,
+                               tokenizer=AutoTokenizer.from_pretrained("harmonydata/debertaV2_pdfparser")) -> dict:
43+ """
44+ Given a list of tokens, and a list of predicted classes
45+ for each token, create a dictionary to hold each
46+ span of tokens.
47+ Example:
48+ > group_token_spans_by_classes(['▁how', '▁are', '▁you', '?', '▁1'],
49+ ['question', 'question', 'question', 'question', 'answer'],
50+ bert_tokenizer)
51+ > {"question":["How are you?"], "answer":["1"]}
52+ Notice that some tokens begin with ▁ (ASCII 9601) instead of _ (ASCII 95)
53+ :param tokens: List of tokens
54+ :type tokens: List[str]
55+ :param classes: List of predicted classes
56+ :type classes: List[str]
57+ :param tokenizer: Tokenizer (defaulted to harmonydata/debertaV2_pdfparser)
58+ :return: Dictionary of each span relative to its class
59+ """
+    grouped_spans = {"answer": [], "question": [], "other": []}
+    span = []
+    prev_cls = None
+
+    for token, cls in zip(tokens, classes):
+        if cls != prev_cls and span:
+            grouped_spans[prev_cls].append(tokenizer.convert_tokens_to_string(span))
+            span = []
+        span.append(token)
+        prev_cls = cls
+    # Append the final span to its class's list
+    if span:
+        grouped_spans[prev_cls].append(tokenizer.convert_tokens_to_string(span))
+
+    return grouped_spans

-with open(f"{model_containing_folder}/20240719_pdf_question_extraction_sklearn_crf_model.pkl", "rb") as f:
-    crf_text_model = pkl.load(f)

-# Predict method is taken from the training repo. Use the training repo as the master copy of the predict method.
-# All training code is in https://github.com/harmonydata/pdf-questionnaire-extraction
 def predict(test_text):
-    token_texts, token_start_char_indices, token_end_char_indices, token_properties = convert_text_to_features(
-        test_text)
-
-    X = []
-    X.append(token_properties)
-
-    y_pred = crf_text_model.predict(X)
+    # Load fine-tuned huggingface model and tokenizer
+    model = AutoModelForTokenClassification.from_pretrained("harmonydata/debertaV2_pdfparser")
+    tokenizer = AutoTokenizer.from_pretrained("harmonydata/debertaV2_pdfparser")

-    questions_from_text = []
+    # Tokenize input text
+    tokenized_texts = tokenizer(test_text, return_tensors="pt")

-    tokens_already_used = set()
+    # Inference with tokenized input text
+    with torch.no_grad():
+        logits = model(**tokenized_texts).logits

-    last_token_category = "O"
+    # Retrieve predicted class for each token
+    predictions = torch.argmax(logits, dim=2)
+    predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]

-    for idx in range(len(X[0])):
+    # Get input IDs (tensor) and convert to list
+    input_ids = tokenized_texts["input_ids"][0].tolist()
+    # Convert input IDs to tokens
+    decoded_tokenized_texts = tokenizer.convert_ids_to_tokens(input_ids)

-        if y_pred[0][idx] != "O" and idx not in tokens_already_used:
-            if last_token_category == "O" or y_pred[0][idx] == "B":
-                start_idx = token_start_char_indices[idx]
-                end_idx = len(test_text)
-                for j in range(idx + 1, len(X[0])):
-                    if y_pred[0][j] == "O" or y_pred[0][j] == "B":
-                        end_idx = token_end_char_indices[j - 1]
-                        break
-                    tokens_already_used.add(j)
+    # Remove the leading [CLS] and trailing [SEP] tokens from the decoded
+    # tokens and from the list of predictions
+    predicted_token_class = predicted_token_class[1:-1]
+    decoded_tokenized_texts = decoded_tokenized_texts[1:-1]

-                question_text = test_text[start_idx:end_idx]
-                question_text = re.sub(r'\s+', ' ', question_text)
-                question_text = question_text.strip()
-                questions_from_text.append(question_text)
+    grouped_tokens = group_token_spans_by_class(decoded_tokenized_texts, predicted_token_class, tokenizer)

-        last_token_category = y_pred[0][idx]
-
-    return questions_from_text
+    return grouped_tokens


 def convert_pdf_to_instruments(file: RawFile) -> Instrument:
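
For reference, here is a minimal, self-contained sketch of the span-grouping logic introduced in the hunk above. The StubTokenizer class is hypothetical, a stand-in for the real harmonydata/debertaV2_pdfparser tokenizer whose convert_tokens_to_string joins SentencePiece tokens and maps the ▁ word-boundary marker (U+2581) back to spaces, so the snippet runs without downloading the model; the helper body is reproduced from the diff to keep the sketch self-contained.

# Hypothetical stand-in for the model's SentencePiece tokenizer:
# convert_tokens_to_string() joins tokens and maps the ▁ marker (U+2581)
# back to ordinary spaces.
class StubTokenizer:
    def convert_tokens_to_string(self, tokens):
        return "".join(tokens).replace("\u2581", " ").strip()


def group_token_spans_by_class(tokens, classes, tokenizer):
    # Same logic as the committed helper: accumulate consecutive
    # same-class tokens into spans, flushing each span when the class changes.
    grouped_spans = {"answer": [], "question": [], "other": []}
    span, prev_cls = [], None
    for token, cls in zip(tokens, classes):
        if cls != prev_cls and span:
            grouped_spans[prev_cls].append(tokenizer.convert_tokens_to_string(span))
            span = []
        span.append(token)
        prev_cls = cls
    if span:
        grouped_spans[prev_cls].append(tokenizer.convert_tokens_to_string(span))
    return grouped_spans


tokens = ["\u2581how", "\u2581are", "\u2581you", "?", "\u25811"]
classes = ["question", "question", "question", "question", "answer"]
print(group_token_spans_by_class(tokens, classes, StubTokenizer()))
# {'answer': ['1'], 'question': ['how are you?'], 'other': []}
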
@@ -87,8 +114,12 @@ def convert_pdf_to_instruments(file: RawFile) -> Instrument:
     if not file.text_content:
         file.text_content = parse_pdf_to_plain_text(file.content)  # call Tika to convert the PDF to plain text

-    questions_from_text = predict(file.text_content)
+    # Run prediction to extract questions and answers from the file's text content
+    questions_answers_from_text = predict(file.text_content)
+
+    questions_from_text = questions_answers_from_text["question"]
+    answers_from_text = questions_answers_from_text["answer"]

-    instrument = harmony.create_instrument_from_list(questions_from_text, instrument_name=file.file_name,
+    instrument = harmony.create_instrument_from_list(questions_from_text, answers_from_text, instrument_name=file.file_name,
                                                      file_name=file.file_name)
     return [instrument]
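
With this change, predict() returns a dict of class-keyed spans instead of a flat list of question strings, and convert_pdf_to_instruments pulls the question and answer lists out separately. A rough usage sketch of the new contract (the first call downloads harmonydata/debertaV2_pdfparser from the Hugging Face Hub; the spans shown in comments are illustrative assumptions, not actual model output):

# Hypothetical caller exercising the new predict() return shape.
spans = predict("How often do you feel nervous? 1 Not at all 2 Several days")
questions = spans["question"]  # e.g. ["How often do you feel nervous?"]
answers = spans["answer"]      # e.g. ["1 Not at all", "2 Several days"]
# Both lists are then passed to harmony.create_instrument_from_list()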