diff --git a/finetuning/adapters/pii-redactor-3b/0000100_adapters.safetensors b/finetuning/adapters/pii-redactor-3b/0000100_adapters.safetensors new file mode 100644 index 0000000..c5da0de Binary files /dev/null and b/finetuning/adapters/pii-redactor-3b/0000100_adapters.safetensors differ diff --git a/finetuning/adapters/pii-redactor-3b/0000200_adapters.safetensors b/finetuning/adapters/pii-redactor-3b/0000200_adapters.safetensors new file mode 100644 index 0000000..ad2c21c Binary files /dev/null and b/finetuning/adapters/pii-redactor-3b/0000200_adapters.safetensors differ diff --git a/finetuning/adapters/pii-redactor-3b/0000300_adapters.safetensors b/finetuning/adapters/pii-redactor-3b/0000300_adapters.safetensors new file mode 100644 index 0000000..f707b96 Binary files /dev/null and b/finetuning/adapters/pii-redactor-3b/0000300_adapters.safetensors differ diff --git a/finetuning/adapters/pii-redactor-3b/0000400_adapters.safetensors b/finetuning/adapters/pii-redactor-3b/0000400_adapters.safetensors new file mode 100644 index 0000000..2c7ff74 Binary files /dev/null and b/finetuning/adapters/pii-redactor-3b/0000400_adapters.safetensors differ diff --git a/finetuning/adapters/pii-redactor-3b/0000500_adapters.safetensors b/finetuning/adapters/pii-redactor-3b/0000500_adapters.safetensors new file mode 100644 index 0000000..fdd396b Binary files /dev/null and b/finetuning/adapters/pii-redactor-3b/0000500_adapters.safetensors differ diff --git a/finetuning/adapters/pii-redactor-3b/0000600_adapters.safetensors b/finetuning/adapters/pii-redactor-3b/0000600_adapters.safetensors new file mode 100644 index 0000000..5c58917 Binary files /dev/null and b/finetuning/adapters/pii-redactor-3b/0000600_adapters.safetensors differ diff --git a/finetuning/adapters/pii-redactor-3b/0000700_adapters.safetensors b/finetuning/adapters/pii-redactor-3b/0000700_adapters.safetensors new file mode 100644 index 0000000..80ab48a Binary files /dev/null and b/finetuning/adapters/pii-redactor-3b/0000700_adapters.safetensors differ diff --git a/finetuning/adapters/pii-redactor-3b/0000800_adapters.safetensors b/finetuning/adapters/pii-redactor-3b/0000800_adapters.safetensors new file mode 100644 index 0000000..8a3cbd1 Binary files /dev/null and b/finetuning/adapters/pii-redactor-3b/0000800_adapters.safetensors differ diff --git a/finetuning/adapters/pii-redactor-3b/0000900_adapters.safetensors b/finetuning/adapters/pii-redactor-3b/0000900_adapters.safetensors new file mode 100644 index 0000000..89b06af Binary files /dev/null and b/finetuning/adapters/pii-redactor-3b/0000900_adapters.safetensors differ diff --git a/finetuning/adapters/pii-redactor-3b/0001000_adapters.safetensors b/finetuning/adapters/pii-redactor-3b/0001000_adapters.safetensors new file mode 100644 index 0000000..f980d55 Binary files /dev/null and b/finetuning/adapters/pii-redactor-3b/0001000_adapters.safetensors differ diff --git a/finetuning/adapters/pii-redactor-3b/adapter_config.json b/finetuning/adapters/pii-redactor-3b/adapter_config.json new file mode 100644 index 0000000..c47e2c1 --- /dev/null +++ b/finetuning/adapters/pii-redactor-3b/adapter_config.json @@ -0,0 +1,40 @@ +{ + "adapter_path": "adapters/pii-redactor-3b", + "batch_size": 2, + "config": null, + "data": "data", + "fine_tune_type": "lora", + "grad_accumulation_steps": 1, + "grad_checkpoint": false, + "iters": 1000, + "learning_rate": 1e-05, + "lora_parameters": { + "rank": 8, + "dropout": 0.0, + "scale": 20.0 + }, + "lr_schedule": null, + "mask_prompt": false, + "max_seq_length": 2048, + "model": "mlx-community/Llama-3.2-3B-Instruct-4bit", + "num_layers": 8, + "optimizer": "adam", + "optimizer_config": { + "adam": {}, + "adamw": {}, + "muon": {}, + "sgd": {}, + "adafactor": {} + }, + "project_name": null, + "report_to": null, + "resume_adapter_file": null, + "save_every": 100, + "seed": 0, + "steps_per_eval": 100, + "steps_per_report": 10, + "test": false, + "test_batches": 500, + "train": true, + "val_batches": 25 +} \ No newline at end of file diff --git a/finetuning/adapters/pii-redactor-3b/adapters.safetensors b/finetuning/adapters/pii-redactor-3b/adapters.safetensors new file mode 100644 index 0000000..f980d55 Binary files /dev/null and b/finetuning/adapters/pii-redactor-3b/adapters.safetensors differ diff --git a/finetuning/redact.py b/finetuning/redact.py index 4b923a8..a682090 100755 --- a/finetuning/redact.py +++ b/finetuning/redact.py @@ -1,22 +1,144 @@ +#!/usr/bin/env python3 """Simple PII redactor""" import sys from mlx_lm import load, generate print("Loading model...", file=sys.stderr) model, tokenizer = load( - "mlx-community/Llama-3.2-1B-Instruct-4bit", - adapter_path="adapters/pii-redactor" + "mlx-community/Llama-3.2-3B-Instruct-4bit", + adapter_path="adapters/pii-redactor-3b" ) +# Get input if len(sys.argv) > 1: text = " ".join(sys.argv[1:]) else: text = sys.stdin.read().strip() -system_prompt = """You are a privacy compliance officer. Redact PII and output JSON: +system_prompt = """You are a **privacy compliance officer** responsible for redacting **personally identifiable information (PII)** from **Redmine support tickets** before they are shared on public platforms like Discord. + + Your primary goal is to **remove PII while preserving ticket clarity and structure** so that it remains **useful for troubleshooting and public discussion.** + ALWAYS REMEMBER: never summarize a ticket. redact per instructions but never summarize or shorten excessively + + --- + + ## **Strict Redaction Rules:** + Apply these redaction rules carefully: + + ### **1. Names** + Replace all personal names with placeholders + Note: If there's only a first name, leave as is. If both first and last names are present, redact the last name only. + using **original firstname [LastNameN]**, where **N** is a unique number per name within the ticket. + + - Example: + - **Original:** *Chris Caputo reported an issue with network latency.* + - **Redacted:** *Chris [LastName1] reported an issue with network latency.* + + - Example: + - **Original:** *John reported an issue with network latency.* + - **Redacted:** *John reported an issue with network latency.* + + - Example (Multiple people): + - **Original:** *Esther Jang contacted Alice about the issue.* + - **Redacted:** *Esther [LastName1] contacted Alice about the issue.* + + - Example (Multiple people): + - **Original:** *Esther Jang contacted Alice Beatrice about the issue.* + - **Redacted:** *Esther [LastName1] contacted Alice [LastName2] about the issue.* + + ### **2. Emails** + Replace all email addresses with **[EmailN]**, ensuring correct numbering within the ticket. + + - Example: + - **Original:** *Please contact john.doe@example.com for assistance.* + - **Redacted:** *Please contact [Email1] for assistance.* + + - **Original:** *infrared@cs.washington.edu requested access to the document.* + - **Redacted:** *[Email2] requested access to the document.* + + ### **3. Phone Numbers** + Replace all phone numbers with **[PhoneN]**. + + - Example: + - **Original:** *Call us at +1 (555) 123-4567 for support.* + - **Redacted:** *Call us at [Phone1] for support.* + + ### **4. Physical Addresses** + Replace all physical addresses (including partial ones) with **[AddressN]**. + + - Example: + - **Original:** *The router is located at 1234 Elm St, Seattle, WA 98101.* + - **Redacted:** *The router is located at [Address1].* + + ### **5. IP & MAC Addresses** + Replace all IP addresses (both IPv4 and IPv6) and MAC addresses with **[IPN]** and **[MACN]**, respectively. + + - Example: + - **Original:** *Router IP: 192.168.1.100, IPv6: 2001:db8::ff00:42:8329.* + - **Redacted:** *Router IP: [IP1], IPv6: [IP2].* + + - **Original:** *Device MAC: AA:BB:CC:DD:EE:FF.* + - **Redacted:** *Device MAC: [MAC1].* + + ### **6. Public Keys & Login Credentials** + Replace cryptographic keys (such as SSH, PGP, and API keys) with **[PublicKeyN]**. + + - Example: + - **Original:** *PGP Key: 1280 AFA4 DD14 589B.* + - **Redacted:** *PGP Key: [PublicKey1].* + + Replace **any login credentials** (usernames and passwords) with **[UsernameN]** and **[PasswordN]**. + + - Example: + - **Original:** *Username: admin, Password: pass123!* + - **Redacted:** *Username: [Username1], Password: [Password1].* + + ### **7. Links & URLs** + - Replace **personal document-sharing links** with **[Document Link]**. + - Keep **institutional URLs** **(such as Seattle Community Network pages and PeeringDB links)** intact. + + - Example: + - **Original:** *Google Doc: https://docs.google.com/document/d/xyz123.* + - **Redacted:** *Google Doc: [Document Link].* + + - **Original:** *Seattle IX route servers: https://www.seattleix.net/route-servers.* + - **Redacted:** *Seattle IX route servers: https://www.seattleix.net/route-servers.* (Kept because it is public knowledge) + + ### **8. Message Context Preservation** + - **DO NOT** modify technical details, ticket metadata, or non-PII network-related content. + - **DO NOT** over-redact information that is **public knowledge or operationally relevant**. + + --- + + ## **Handling Multi-User Interactions** + When a ticket has **multiple users**, assign **unique placeholders per person** to maintain readability: + + - **Original:** +``` + Chris Caputo: Please fix this issue. + Esther Jang: I have escalated this to IT. + Chris Caputo: Thank you! +``` + - **Redacted:** +``` + [FirstName1] [LastName1]: Please fix this issue. + [FirstName2] [LastName2]: I have escalated this to IT. + [FirstName1] [LastName1]: Thank you! +``` + + --- + + ALWAYS REMEMBER: never summarize a ticket. redact per instructions but never summarize or shorten excessively + +You must output in JSON format with: { - "redacted_text": "text with [LastNameN], [EmailN], [PhoneN], [AddressN], [IPN] placeholders", - "properties_redacted": {"lastname1": "original", "email1": "original", ...} + "redacted_text": "the fully redacted text", + "properties_redacted": { + "lastname1": "original last name", + "email1": "original email", + "ip1": "original IP", + ... + } }""" messages = [ @@ -24,8 +146,14 @@ {"role": "user", "content": f"Redact all PII from this text and output in JSON format:\n\n{text}"} ] +# Generate prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) +result = generate(model, tokenizer, prompt=prompt, max_tokens=1024) -result = generate(model, tokenizer, prompt=prompt, max_tokens=512) - -print(result) +# Extract JSON only +if '{' in result: + json_start = result.find('{') + json_end = result.rfind('}') + 1 + print(result[json_start:json_end]) +else: + print(result)