Skip to content
Merged

Dev #74

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
<a href="https://twitter.com/brootware"><img src="https://img.shields.io/twitter/follow/brootware?style=social" alt="Twitter Follow"></a>
<img alt="PyPI - Python Version" src="https://img.shields.io/pypi/pyversions/pyredactkit"> <img alt="PyPI" src="https://img.shields.io/pypi/v/pyredactkit">
<a href="https://sonarcloud.io/summary/new_code?id=brootware_PyRedactKit"><img src="https://sonarcloud.io/api/project_badges/measure?project=brootware_PyRedactKit&metric=alert_status" alt="reliability rating"></a>
<img alt="GitHub Workflow Status" src="https://img.shields.io/github/workflow/status/brootware/pyredactkit/CI?label=CI&branch=dev">
<img alt="GitHub Workflow Status" src="https://img.shields.io/github/workflow/status/brootware/pyredactkit/CI?label=CI&branch=main">
</p>

## Features
Expand Down
92 changes: 14 additions & 78 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 1 addition & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "pyredactkit"
version = "0.4.0"
version = "1.0.0"
description = "Python cli tool to redact sensitive data"
authors = ["brootware <brootware@outlook.com>"]
license = "GPL-3.0-or-later"
Expand All @@ -25,8 +25,6 @@ classifiers = [

[tool.poetry.dependencies]
python = "^3.7"
nltk = "^3.7"
numpy = "<1.22.0"

[tool.poetry.dev-dependencies]
pytest = "^7.1.2"
Expand All @@ -41,8 +39,6 @@ py = "^1.11.0"
pyparsing = "^3.0.8"
tomli = "^2.0.1"
tqdm = "^4.64.0"
nltk = "^3.7"
numpy = "<1.22.0"
rich = "^12.4.0"
mypy = "^0.961"
flake8 = "^4.0.1"
Expand Down
25 changes: 0 additions & 25 deletions pyredactkit/identifiers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@

""" Data identifier class implementation """
import nltk


class Identifier:
Expand Down Expand Up @@ -45,27 +44,3 @@ class Identifier:
def __init__(self) -> None:
return None

def names(self, data: str) -> list:
    """ Identify names and return them from the supplied data

    The text is tokenized, tagged with parts of speech and passed through
    NLTK's named-entity chunker; every subtree labelled ``PERSON`` is
    joined into a single space-separated name.

    Args:
        data (str): data in alpha-numeric format

    Returns:
        name_list (list): unique names identified from the supplied data,
            in order of first appearance
    """
    words = nltk.word_tokenize(data)
    part_of_speech_tagsets = nltk.pos_tag(words)
    named_ent = nltk.ne_chunk(part_of_speech_tagsets, binary=False)

    # leaf[0] is presumably the token text of a (word, tag) pair produced
    # by pos_tag -- TODO confirm against the NLTK version in use.
    person_names = (
        ' '.join(leaf[0] for leaf in subtree.leaves())
        for subtree in named_ent.subtrees()
        if subtree.label() == 'PERSON'
    )

    # dict.fromkeys drops repeated names while keeping first-seen order,
    # replacing the per-item `not in list` scan.
    return list(dict.fromkeys(person_names))
53 changes: 53 additions & 0 deletions tests/test_runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import pytest
from pyredactkit import runner as Runner


@pytest.fixture
def mocker_text_file(mocker):
    """Patch ``builtins.open`` with a ``mock_open`` built from fixed read data."""
    fake_open = mocker.mock_open(read_data="Message to write on file to be written")
    mocker.patch("builtins.open", fake_open)


def test_is_it_file(mocker_text_file, tmp_path):
    """Assert that ``is_it_file`` returns False for a plain sentence."""
    outcome = Runner.is_it_file('This is a test string')
    assert outcome is False, "is_it_file function should return False for this string"


def test_recursive_file_search(mocker_text_file, tmp_path):
    """Assert that ``recursive_file_search`` yields an empty set for a plain sentence."""
    found = Runner.recursive_file_search('This is a test string', 'txt', True)
    assert found == set(), "recursive_file_search function should return an empty set"


def test_api_identify_sensitive_data(mocker_text_file, tmp_path):
    """Check ``api_identify_sensitive_data`` on a mixed sample and on a plain sentence."""
    sample_text = """this is my IP: 102.23.5.1
My router is : 10.10.10.1
71.159.188.33
81.141.167.45
165.65.59.139
64.248.67.225
https://tech.gov.sg
My email is harold@mail.com
this is my IP: 102.23.5.1
My router is: 10.10.10.1
71.159.188.33
81.141.167.45
165.65.59.139
64.248.67.225
Base64 data
QVBJX1RPS0VO
UzNjcjN0UGFzc3dvcmQ=
U3VwM3JTM2NyZXRQQHNzd29yZA==
Singapore NRIC
G0022121F
F2121200F
G1021022E
S1022221L
G1222221C
S0000212Q
F2120212E
S0021001P
"""
    # The same six addresses appear twice in the sample, so they are
    # expected twice in the result.
    ip_addresses = [
        '102.23.5.1', '10.10.10.1', '71.159.188.33',
        '81.141.167.45', '165.65.59.139', '64.248.67.225',
    ]
    nric_numbers = [
        'G0022121F', 'F2121200F', 'G1021022E', 'S1022221L',
        'G1222221C', 'S0000212Q', 'F2120212E', 'S0021001P',
    ]
    # The Base64 lines of the sample are not part of the expected result.
    expected = [
        *ip_addresses,
        'https://tech.gov.sg', 'harold@mail.com', 'mail.com',
        *ip_addresses,
        *nric_numbers,
    ]

    detected = Runner.api_identify_sensitive_data(sample_text)
    assert detected == expected, "api_identify_sensitive_data function should return a list of sensitive data"

    nothing_found = Runner.api_identify_sensitive_data('This is a test string')
    assert nothing_found == [], "api_identify_sensitive_data function should return an empty list"
18 changes: 9 additions & 9 deletions tools/install_nltk_popular.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
#!/usr/bin/env python
# Helper script: fetch NLTK's 'popular' download collection.
import nltk
import ssl
# import nltk
# import ssl

# If this ssl module exposes `_create_unverified_context`, install it as the
# default HTTPS context factory before downloading; otherwise leave the
# default untouched.
# NOTE(review): this changes the process-wide HTTPS default -- presumably a
# workaround for certificate errors during the download; confirm it is
# still wanted.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
# try:
# _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
# pass
# else:
# ssl._create_default_https_context = _create_unverified_https_context

# Download the collection identified by 'popular'.
nltk.download('popular')
# nltk.download('popular')