From fe2581e3d05513175fff0090f24434f6a7613ee0 Mon Sep 17 00:00:00 2001 From: Averi Kitsch Date: Tue, 13 Mar 2018 16:35:53 -0700 Subject: [PATCH 1/5] deid samples --- dlp/deid.py | 548 +++++++++++++++++++++++++++++++++++++++++++++++ dlp/deid_test.py | 168 +++++++++++++++ 2 files changed, 716 insertions(+) create mode 100644 dlp/deid.py create mode 100644 dlp/deid_test.py diff --git a/dlp/deid.py b/dlp/deid.py new file mode 100644 index 00000000000..2e17716d455 --- /dev/null +++ b/dlp/deid.py @@ -0,0 +1,548 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample app that uses the Data Loss Prevention API for deidentifying +sensitive data.""" + +from __future__ import print_function + +import argparse + +import os + + +# [START deidentify_with_mask] +def deidentify_with_mask(parent, string, masking_character=None, + number_to_mask=0): + """Uses the Data Loss Prevention API to deidentify sensitive data in a + string by masking it with a character. + Args: + item: The string to deidentify (will be treated as text). + masking_character: The character to mask matching sensitive data with. + number_to_mask: The maximum number of sensitive characters to mask in + a match. If omitted the request or set to 0, the API will mask any + mathcing characters. + Returns: + None; the response from the API is printed to the terminal. 
+ """ + + # Import the client library + import google.cloud.dlp + + # Instantiate a client + google.cloud.dlp.DlpServiceClient.SERVICE_ADDRESS = 'autopush-dlp.sandbox.googleapis.com' # DO NOT SUBMIT + dlp = google.cloud.dlp.DlpServiceClient() + + # Add parent + parent = dlp.project_path(parent) + + # Construct deidentify configuration dictionary + deidentify_config = { + 'info_type_transformations': { + 'transformations': [ + { + 'primitive_transformation': { + 'character_mask_config': { + 'masking_character': masking_character, + 'number_to_mask': number_to_mask if + number_to_mask >= 0 else 0 + } + } + } + ] + } + } + + # Construct item + item = {'value': string} + + # Call the API + response = dlp.deidentify_content(parent, + deidentify_config=deidentify_config, + item=item) + + # Print out the results. + print(response.item.value) +# [END deidentify_with_mask] + + +# [START deidentify_with_fpe] +def deidentify_with_fpe(parent, string, alphabet=None, + surrogate_type=None, key_name=None, wrapped_key=None): + """Uses the Data Loss Prevention API to deidentify sensitive data in a + string using Format Preserving Encryption (FPE). + Args: + item: The string to deidentify (will be treated as text). + Example: string = 'My SSN is 372819127' + alphabet: The set of characters to replace sensitive ones with. For + more information, see https://cloud.google.com/dlp/docs/reference/ + rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet + surrogate_type: The name of the surrogate custom info type to use. Only + necessary if you want to reverse the deidentification process. Can + be essentially any arbitrary string, as long as it doesn't appear + in your dataset otherwise. + key_name: The name of the Cloud KMS key used to encrypt ('wrap') the + AES-256 key. Example: + key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/ + keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME' + wrapped_key: The encrypted ('wrapped') AES-256 key to use. 
This key + should be encrypted using the Cloud KMS key specified by key_name. + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client + google.cloud.dlp.DlpServiceClient.SERVICE_ADDRESS = 'autopush-dlp.sandbox.googleapis.com' # DO NOT SUBMIT + dlp = google.cloud.dlp.DlpServiceClient() + + # Add parent + parent = dlp.project_path(parent) + + # Wrapped key can not be base64 encoded + import base64 + wrapped_key = base64.b64decode(wrapped_key) + + # Construct FPE configuration dictionary + crypto_replace_ffx_fpe_config = { + 'crypto_key': { + 'kms_wrapped': { + 'wrapped_key': wrapped_key, + 'crypto_key_name': key_name + } + }, + 'common_alphabet': alphabet + } + + # Add surrogate type + if surrogate_type: + crypto_replace_ffx_fpe_config['surrogate_info_type'] = { + 'name': surrogate_type + } + + # Construct deidentify configuration dictionary + deidentify_config = { + 'info_type_transformations': { + 'transformations': [ + { + 'primitive_transformation': { + 'crypto_replace_ffx_fpe_config': + crypto_replace_ffx_fpe_config + } + } + ] + } + } + + # Convert string to item + item = {'value': string} + + # Call the API + response = dlp.deidentify_content(parent, + deidentify_config=deidentify_config, + item=item) + # Print results + print(response.item.value) +# [END deidentify_with_fpe] + + +# [START reidentify_with_fpe] +def reidentify_with_fpe(parent, string, alphabet=None, + surrogate_type=None, key_name=None, wrapped_key=None): + """Uses the Data Loss Prevention API to reidentify sensitive data in a + string that was encrypted by Format Preserving Encryption (FPE). + Args: + item: The string to deidentify (will be treated as text). + alphabet: The set of characters to replace sensitive ones with. 
For + more information, see https://cloud.google.com/dlp/docs/reference/ + rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet + surrogate_type: The name of the surrogate custom info type used + during the encryption process. + key_name: The name of the Cloud KMS key used to encrypt ('wrap') the + AES-256 key. Example: + key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/ + keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME' + wrapped_key: The encrypted ('wrapped') AES-256 key to use. This key + should be encrypted using the Cloud KMS key specified by key_name. + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client + google.cloud.dlp.DlpServiceClient.SERVICE_ADDRESS = 'autopush-dlp.sandbox.googleapis.com' + dlp = google.cloud.dlp.DlpServiceClient() + + # Add parent + parent = dlp.project_path(parent) + + # Wrapped key can not be base64 encoded + import base64 + wrapped_key = base64.b64decode(wrapped_key) + + # Construct reidentify configuration dictionary + reidentify_config = { + 'info_type_transformations': { + 'transformations': [ + { + 'primitive_transformation': { + 'crypto_replace_ffx_fpe_config': { + 'crypto_key': { + 'kms_wrapped': { + 'wrapped_key': wrapped_key, + 'crypto_key_name': key_name + } + }, + 'common_alphabet': alphabet, + 'surrogate_info_type': { + 'name': surrogate_type + } + } + } + } + ] + } + } + + inspect_config = { + 'custom_info_types': [ + { + 'info_type': { + 'name': surrogate_type + }, + 'surrogate_type': { + } + } + ] + } + + # Convert string to item + item = {'value': string} + + # Call the API + response = dlp.reidentify_content(parent, + inspect_config=inspect_config, + reidentify_config=reidentify_config, + item=item) + # Print results + print(response.item.value) +# [END reidentify_with_fpe] + + +# [START deidentify_with_date_shift] +def deidentify_with_date_shift(parent, input_csv_file=None, 
output_csv_file=None, date_fields=None, + lower_bound_days=None, upper_bound_days=None, + context_field_id=None, wrapped_key=None, + key_name=None): + """Uses the Data Loss Prevention API to deidentify dates in a CSV file by + pseudorandomly shifting them. + Args: + input_csv_file: The path to the CSV file to deidentify. The first row + of the file must specify column names, and all other rows must + contain valid values. + output_csv_file: The path to save the date-shifted CSV file. + date_fields: The list of (date) fields in the CSV file to date shift. + Example: ['birth_date', 'register_date'] + lower_bound_days: The maximum number of days to shift a date backward + upper_bound_days: The maximum number of days to shift a date forward + context_field_id: (Optional) The column to determine date shift amount + based on. If this is not specified, a random shift amount will be + used for every row. If this is specified, then 'wrapped_key' and + 'key_name' must also be set. Example: + context_field_id = 'user_id' + key_name: (Optional) The name of the Cloud KMS key used to encrypt + ('wrap') the AES-256 key. Example: + key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/ + keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME' + wrapped_key: (Optional) The encrypted ('wrapped') AES-256 key to use. + This key should be encrypted using the Cloud KMS key specified by + key_name. + Returns: + None; results are written to output_csv_file and a status is printed. 
+ """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client + google.cloud.dlp.DlpServiceClient.SERVICE_ADDRESS = 'autopush-dlp.sandbox.googleapis.com' # DO NOT SUBMIT + dlp = google.cloud.dlp.DlpServiceClient() + + # Add parent + parent = dlp.project_path(parent) + + # Convert date field list to Protobuf type + def map_fields(field): + return {'name': field} + + if date_fields: + date_fields = map(map_fields, date_fields) + else: + date_fields = [] + + # Read and parse the CSV file + import csv + from datetime import datetime + file = [] + with open(input_csv_file, 'rb') as csvfile: + reader = csv.reader(csvfile) + for row in reader: + file.append(row) + + # Helper function for converting CSV rows to Protobuf types + def map_headers(header): + return {'name': header} + + def map_data(value): + try: + date = datetime.strptime(value, '%m/%d/%Y') + return { + 'date_value': { + 'year': date.year, + 'month': date.month, + 'day': date.day + } + } + except ValueError: + return {'string_value': value} + + def map_rows(row): + return {'values': map(map_data, row)} + + csv_headers = map(map_headers, file[0]) + csv_rows = map(map_rows, file[1:]) + + # Construct the table dict + table_item = { + 'table': { + 'headers': csv_headers, + 'rows': csv_rows + } + } + + # Construct date shift config + date_shift_config = { + 'lower_bound_days': lower_bound_days, + 'upper_bound_days': upper_bound_days + } + + # If using Cloud KMS key + if context_field_id and key_name and wrapped_key: + import base64 + date_shift_config['context'] = {'name': context_field_id} + date_shift_config['crypto_key'] = { + 'kms_wrapped': { + 'wrapped_key': base64.b64decode(wrapped_key), + 'crypto_key_name': key_name + } + } + elif context_field_id or key_name or wrapped_key: + raise StandardError("""You must set either ALL or NONE of + [context_field_id, key_name, wrapped_key]!""") + + # Construct Deidentify Config + deidentify_config = { + 'record_transformations': { + 
'field_transformations': [ + { + 'fields': date_fields, + 'primitive_transformation': { + 'date_shift_config': date_shift_config + } + } + ] + } + } + + # Write to CSV helper methods + def write_header(header): + return header.name + + def write_data(data): + return data.string_value or '%s/%s/%s' % (data.date_value.month, + data.date_value.day, + data.date_value.year) + + # Call the API + response = dlp.deidentify_content(parent, + deidentify_config=deidentify_config, + item=table_item) + # Write results to CSV file + with open(output_csv_file, 'wb') as csvfile: + write_file = csv.writer(csvfile, delimiter=',') + write_file.writerow(map(write_header, response.item.table.headers)) + for row in response.item.table.rows: + write_file.writerow(map(write_data, row.values)) + # Print status + print('Successfully saved date-shift output to {}'.format( + output_csv_file)) +# [END deidentify_with_date_shift] + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=__doc__) + subparsers = parser.add_subparsers( + dest='content', help='Select how to submit content to the API.') + + mask_parser = subparsers.add_parser( + 'deid_mask', + help='Deidentify sensitive data in a string by masking it with a ' + 'character.') + mask_parser.add_argument('item', help='The string to deidentify.') + mask_parser.add_argument( + '-n', '--number_to_mask', type=int, + help='The maximum number of sensitive characters to mask in a match. ' + 'If omitted the request or set to 0, the API will mask any mathcing ' + 'characters.') + mask_parser.add_argument( + '-m', '--masking_character', + help='The character to mask matching sensitive data with.') + mask_parser.add_argument( + '-p', '--project', default=os.environ['GCLOUD_PROJECT']) + + fpe_parser = subparsers.add_parser( + 'deid_fpe', + help='Deidentify sensitive data in a string using Format Preserving ' + 'Encryption (FPE).') + fpe_parser.add_argument( + 'item', + help='The string to deidentify. 
' + 'Example: string = \'My SSN is 372819127\'') + fpe_parser.add_argument( + 'key_name', + help='The name of the Cloud KMS key used to encrypt (\'wrap\') the ' + 'AES-256 key. Example: ' + 'key_name = \'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/' + 'keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME\'') + fpe_parser.add_argument( + 'wrapped_key', + help='The encrypted (\'wrapped\') AES-256 key to use. This key should ' + 'be encrypted using the Cloud KMS key specified by key_name.') + fpe_parser.add_argument( + '-a', '--alphabet', default='ALPHA_NUMERIC', + help='The set of characters to replace sensitive ones with. Commonly ' + 'used subsets of the alphabet include "NUMERIC", "HEXADECIMAL", ' + '"UPPER_CASE_ALPHA_NUMERIC", "ALPHA_NUMERIC", ' + '"FFX_COMMON_NATIVE_ALPHABET_UNSPECIFIED"') + fpe_parser.add_argument( + '-s', '--surrogate_type', + help='The name of the surrogate custom info type to use. Only ' + 'necessary if you want to reverse the deidentification process. Can ' + 'be essentially any arbitrary string, as long as it doesn\'t appear ' + 'in your dataset otherwise.') + fpe_parser.add_argument( + '-p', '--project', default=os.environ['GCLOUD_PROJECT']) + + reid_parser = subparsers.add_parser( + 'reid_fpe', + help='Reidentify sensitive data in a string using Format Preserving ' + 'Encryption (FPE).') + reid_parser.add_argument( + 'item', + help='The string to deidentify. ' + 'Example: string = \'My SSN is 372819127\'') + reid_parser.add_argument( + 'surrogate_type', + help='The name of the surrogate custom info type to use. Only ' + 'necessary if you want to reverse the deidentification process. Can ' + 'be essentially any arbitrary string, as long as it doesn\'t appear ' + 'in your dataset otherwise.') + reid_parser.add_argument( + 'key_name', + help='The name of the Cloud KMS key used to encrypt (\'wrap\') the ' + 'AES-256 key. 
Example: ' + 'key_name = \'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/' + 'keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME\'') + reid_parser.add_argument( + 'wrapped_key', + help='The encrypted (\'wrapped\') AES-256 key to use. This key should ' + 'be encrypted using the Cloud KMS key specified by key_name.') + reid_parser.add_argument( + '-a', '--alphabet', default='ALPHA_NUMERIC', + help='The set of characters to replace sensitive ones with. Commonly ' + 'used subsets of the alphabet include "NUMERIC", "HEXADECIMAL", ' + '"UPPER_CASE_ALPHA_NUMERIC", "ALPHA_NUMERIC", ' + '"FFX_COMMON_NATIVE_ALPHABET_UNSPECIFIED"') + reid_parser.add_argument( + '-p', '--project', default=os.environ['GCLOUD_PROJECT']) + + date_shift_parser = subparsers.add_parser( + 'deid_date_shift', + help='Deidentify dates in a CSV file by pseudorandomly shifting them.') + date_shift_parser.add_argument( + 'input_csv_file', + help='The path to the CSV file to deidentify. The first row of the ' + 'file must specify column names, and all other rows must contain ' + 'valid values.') + date_shift_parser.add_argument( + 'output_csv_file', + help='The path to save the date-shifted CSV file.') + date_shift_parser.add_argument( + 'lower_bound_days', type=int, + help='The maximum number of days to shift a date backward') + date_shift_parser.add_argument( + 'upper_bound_days', type=int, + help='The maximum number of days to shift a date forward') + date_shift_parser.add_argument( + 'date_fields', nargs='+', + help='The list of date fields in the CSV file to date shift. Example: ' + '[\'birth_date\', \'register_date\']') + date_shift_parser.add_argument( + '--context_field_id', + help='(Optional) The column to determine date shift amount based on. ' + 'If this is not specified, a random shift amount will be used for ' + 'every row. 
If this is specified, then \'wrappedKey\' and \'keyName\' ' + 'must also be set.') + date_shift_parser.add_argument( + '--key_name', + help='(Optional) The name of the Cloud KMS key used to encrypt ' + '(\'wrap\') the AES-256 key. Example: ' + 'key_name = \'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/' + 'keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME\'') + date_shift_parser.add_argument( + '--wrapped_key', + help='(Optional) The encrypted (\'wrapped\') AES-256 key to use. This ' + 'key should be encrypted using the Cloud KMS key specified by' + 'key_name.') + date_shift_parser.add_argument( + '-p', '--project', default=os.environ['GCLOUD_PROJECT']) + + args = parser.parse_args() + + if args.content == 'deid_mask': + deidentify_with_mask(args.project, args.item, + masking_character=args.masking_character, + number_to_mask=args.number_to_mask) + elif args.content == 'deid_fpe': + deidentify_with_fpe(args.project, args.item, alphabet=args.alphabet, + wrapped_key=args.wrapped_key, + key_name=args.key_name, + surrogate_type=args.surrogate_type) + elif args.content == 'reid_fpe': + reidentify_with_fpe(args.project, args.item, + surrogate_type=args.surrogate_type, + wrapped_key=args.wrapped_key, + key_name=args.key_name, alphabet=args.alphabet) + elif args.content == 'deid_date_shift': + deidentify_with_date_shift(args.project, + input_csv_file=args.input_csv_file, + output_csv_file=args.output_csv_file, + lower_bound_days=args.lower_bound_days, + upper_bound_days=args.upper_bound_days, + date_fields=args.date_fields, + context_field_id=args.context_field_id, + wrapped_key=args.wrapped_key, + key_name=args.key_name) diff --git a/dlp/deid_test.py b/dlp/deid_test.py new file mode 100644 index 00000000000..0f5accdeb1e --- /dev/null +++ b/dlp/deid_test.py @@ -0,0 +1,168 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pytest + +import deid + +harmful_string = 'My SSN is 372819127' +parent = os.environ['GCLOUD_PROJECT'] +wrapped_key = os.environ['DLP_DEID_WRAPPED_KEY'] +key_name = os.environ['DLP_DEID_KEY_NAME'] +surrogate_type = 'SSN_TOKEN' +csv_file = 'resources/dates.csv' +output_csv_file = 'resources/temp.results.csv' +date_shifted_amount = 30 +date_fields = ['birth_date', 'register_date'] +csv_context_field = 'name' + + +# [START Deidentify with masking] +def test_deidentify_with_mask(capsys): + harmful_string = 'My SSN is 372819127' + + deid.deidentify_with_mask(parent, harmful_string) + + out, _ = capsys.readouterr() + assert 'My SSN is *********' in out + + +def test_deidentify_with_mask_ignore_insensitive_data(capsys): + harmless_string = 'My favorite color is blue' + + deid.deidentify_with_mask(parent, harmless_string) + + out, _ = capsys.readouterr() + assert harmless_string in out + + +def test_deidentify_with_mask_masking_character_specified(capsys): + harmful_string = 'My SSN is 372819127' + + deid.deidentify_with_mask(parent, harmful_string, masking_character='#') + + out, _ = capsys.readouterr() + assert 'My SSN is #########' in out + + +def test_deidentify_with_mask_masking_number_specified(capsys): + harmful_string = 'My SSN is 372819127' + + deid.deidentify_with_mask(parent, harmful_string, number_to_mask=7) + + out, _ = capsys.readouterr() + assert 'My SSN is *******27' in out + + +def test_deidentify_with_mask_handles_masking_number_error(capsys): + harmful_string = 'My SSN is 372819127' + + deid.deidentify_with_mask(parent, 
harmful_string, number_to_mask=-3) + + out, _ = capsys.readouterr() + assert 'My SSN is *********' in out +# [END deidentify_with_mask] + + +# [START Deidentify with FPE] +def test_deidentify_with_fpe(capsys): + harmful_string = 'My SSN is 372819127' + + deid.deidentify_with_fpe(parent, harmful_string, alphabet='NUMERIC', + wrapped_key=wrapped_key, key_name=key_name) + + out, _ = capsys.readouterr() + assert 'My SSN is' in out + assert '372819127' not in out + + +def test_deidentify_with_fpe_uses_surrogate_info_types(capsys): + harmful_string = 'My SSN is 372819127' + + deid.deidentify_with_fpe(parent, harmful_string, alphabet='NUMERIC', + wrapped_key=wrapped_key, key_name=key_name, + surrogate_type=surrogate_type) + + out, _ = capsys.readouterr() + assert 'My SSN is SSN_TOKEN' in out + assert '372819127' not in out + + +def test_deidentify_with_fpe_ignores_insensitive_data(capsys): + harmless_string = 'My favorite color is blue' + + deid.deidentify_with_fpe(parent, harmless_string, alphabet='NUMERIC', + wrapped_key=wrapped_key, key_name=key_name) + + out, _ = capsys.readouterr() + assert harmless_string in out +# [END Deidentify with FPE] + + +# [START Deidentify with date shift] +def test_deidentify_with_date_shift(capsys): + deid.deidentify_with_date_shift(parent, input_csv_file=csv_file, + output_csv_file=output_csv_file, + lower_bound_days=date_shifted_amount, + upper_bound_days=date_shifted_amount, + date_fields=date_fields) + + out, _ = capsys.readouterr() + + assert 'Successful' in out + # read in csv?? 
+ + +def test_deidentify_with_date_shift_using_context_field(capsys): + deid.deidentify_with_date_shift(parent, input_csv_file=csv_file, + output_csv_file=output_csv_file, + lower_bound_days=date_shifted_amount, + upper_bound_days=date_shifted_amount, + date_fields=date_fields, + context_field_id=csv_context_field, + wrapped_key=wrapped_key, + key_name=key_name) + + out, _ = capsys.readouterr() + + assert 'Successful' in out + + +def test_deidentify_with_date_shift_requires_all_fields(): + with pytest.raises(StandardError): + deid.deidentify_with_date_shift(parent, input_csv_file=csv_file, + output_csv_file=output_csv_file, + lower_bound_days=date_shifted_amount, + upper_bound_days=date_shifted_amount, + date_fields=date_fields, + context_field_id=csv_context_field, + key_name=key_name) +# [END Deidentify with date shift] + + +# [START Reidentify with FPE] +def test_reidentify_with_fpe(capsys): + labeled_fpe_string = 'My SSN is SSN_TOKEN(9):731997681' + + deid.reidentify_with_fpe(parent, labeled_fpe_string, + surrogate_type=surrogate_type, + wrapped_key=wrapped_key, key_name=key_name, + alphabet='NUMERIC') + + out, _ = capsys.readouterr() + + assert harmful_string in out +# [END Reidentify with FPE] From 28c51fcd5c7b5f91b8a20552ac0329b5e35e594f Mon Sep 17 00:00:00 2001 From: Averi Kitsch Date: Tue, 13 Mar 2018 16:36:39 -0700 Subject: [PATCH 2/5] added csv file --- dlp/resources/dates.csv | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 dlp/resources/dates.csv diff --git a/dlp/resources/dates.csv b/dlp/resources/dates.csv new file mode 100644 index 00000000000..056fccb328e --- /dev/null +++ b/dlp/resources/dates.csv @@ -0,0 +1,5 @@ +name,birth_date,register_date,credit_card +Ann,01/01/1970,07/21/1996,4532908762519852 +James,03/06/1988,04/09/2001,4301261899725540 +Dan,08/14/1945,11/15/2011,4620761856015295 +Laura,11/03/1992,01/04/2017,4564981067258901 \ No newline at end of file From 1af703f51b83c76efd871ba906e101fb2bb22e4e Mon Sep 17 00:00:00 2001 From: 
Averi Kitsch Date: Wed, 14 Mar 2018 12:21:28 -0700 Subject: [PATCH 3/5] pull request comment changes --- dlp/deid.py | 55 +++++++-------- dlp/deid_test.py | 173 ++++++++++++++++++++++++----------------------- 2 files changed, 115 insertions(+), 113 deletions(-) diff --git a/dlp/deid.py b/dlp/deid.py index 2e17716d455..844efe60b74 100644 --- a/dlp/deid.py +++ b/dlp/deid.py @@ -12,13 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Sample app that uses the Data Loss Prevention API for deidentifying -sensitive data.""" +"""Usages of the Data Loss Prevention API for deidentifying sensitive data.""" from __future__ import print_function import argparse - import os @@ -31,8 +29,8 @@ def deidentify_with_mask(parent, string, masking_character=None, item: The string to deidentify (will be treated as text). masking_character: The character to mask matching sensitive data with. number_to_mask: The maximum number of sensitive characters to mask in - a match. If omitted the request or set to 0, the API will mask any - mathcing characters. + a match. If omitted or set to zero, the API will default to no + maximum. Returns: None; the response from the API is printed to the terminal. """ @@ -41,10 +39,9 @@ def deidentify_with_mask(parent, string, masking_character=None, import google.cloud.dlp # Instantiate a client - google.cloud.dlp.DlpServiceClient.SERVICE_ADDRESS = 'autopush-dlp.sandbox.googleapis.com' # DO NOT SUBMIT dlp = google.cloud.dlp.DlpServiceClient() - # Add parent + # Convert the project id into a full resource id. 
parent = dlp.project_path(parent) # Construct deidentify configuration dictionary @@ -68,9 +65,8 @@ def deidentify_with_mask(parent, string, masking_character=None, item = {'value': string} # Call the API - response = dlp.deidentify_content(parent, - deidentify_config=deidentify_config, - item=item) + response = dlp.deidentify_content( + parent, deidentify_config=deidentify_config, item=item) # Print out the results. print(response.item.value) @@ -105,10 +101,9 @@ def deidentify_with_fpe(parent, string, alphabet=None, import google.cloud.dlp # Instantiate a client - google.cloud.dlp.DlpServiceClient.SERVICE_ADDRESS = 'autopush-dlp.sandbox.googleapis.com' # DO NOT SUBMIT dlp = google.cloud.dlp.DlpServiceClient() - # Add parent + # Convert the project id into a full resource id. parent = dlp.project_path(parent) # Wrapped key can not be base64 encoded @@ -150,9 +145,9 @@ def deidentify_with_fpe(parent, string, alphabet=None, item = {'value': string} # Call the API - response = dlp.deidentify_content(parent, - deidentify_config=deidentify_config, - item=item) + response = dlp.deidentify_content( + parent, deidentify_config=deidentify_config, item=item) + # Print results print(response.item.value) # [END deidentify_with_fpe] @@ -183,10 +178,9 @@ def reidentify_with_fpe(parent, string, alphabet=None, import google.cloud.dlp # Instantiate a client - google.cloud.dlp.DlpServiceClient.SERVICE_ADDRESS = 'autopush-dlp.sandbox.googleapis.com' dlp = google.cloud.dlp.DlpServiceClient() - # Add parent + # Convert the project id into a full resource id. 
parent = dlp.project_path(parent) # Wrapped key can not be base64 encoded @@ -233,10 +227,12 @@ def reidentify_with_fpe(parent, string, alphabet=None, item = {'value': string} # Call the API - response = dlp.reidentify_content(parent, - inspect_config=inspect_config, - reidentify_config=reidentify_config, - item=item) + response = dlp.reidentify_content( + parent, + inspect_config=inspect_config, + reidentify_config=reidentify_config, + item=item) + # Print results print(response.item.value) # [END reidentify_with_fpe] @@ -278,10 +274,9 @@ def deidentify_with_date_shift(parent, input_csv_file=None, import google.cloud.dlp # Instantiate a client - google.cloud.dlp.DlpServiceClient.SERVICE_ADDRESS = 'autopush-dlp.sandbox.googleapis.com' # DO NOT SUBMIT dlp = google.cloud.dlp.DlpServiceClient() - # Add parent + # Convert the project id into a full resource id. parent = dlp.project_path(parent) # Convert date field list to Protobuf type @@ -296,11 +291,11 @@ def map_fields(field): # Read and parse the CSV file import csv from datetime import datetime - file = [] + f = [] with open(input_csv_file, 'rb') as csvfile: reader = csv.reader(csvfile) for row in reader: - file.append(row) + f.append(row) # Helper function for converting CSV rows to Protobuf types def map_headers(header): @@ -322,8 +317,8 @@ def map_data(value): def map_rows(row): return {'values': map(map_data, row)} - csv_headers = map(map_headers, file[0]) - csv_rows = map(map_rows, file[1:]) + csv_headers = map(map_headers, f[0]) + csv_rows = map(map_rows, f[1:]) # Construct the table dict table_item = { @@ -377,9 +372,9 @@ def write_data(data): data.date_value.year) # Call the API - response = dlp.deidentify_content(parent, - deidentify_config=deidentify_config, - item=table_item) + response = dlp.deidentify_content( + parent, deidentify_config=deidentify_config, item=table_item) + # Write results to CSV file with open(output_csv_file, 'wb') as csvfile: write_file = csv.writer(csvfile, delimiter=',') diff 
--git a/dlp/deid_test.py b/dlp/deid_test.py index 0f5accdeb1e..f7456e055e3 100644 --- a/dlp/deid_test.py +++ b/dlp/deid_test.py @@ -13,76 +13,70 @@ # limitations under the License. import os +import shutil +import tempfile import pytest import deid -harmful_string = 'My SSN is 372819127' -parent = os.environ['GCLOUD_PROJECT'] -wrapped_key = os.environ['DLP_DEID_WRAPPED_KEY'] -key_name = os.environ['DLP_DEID_KEY_NAME'] -surrogate_type = 'SSN_TOKEN' -csv_file = 'resources/dates.csv' -output_csv_file = 'resources/temp.results.csv' -date_shifted_amount = 30 -date_fields = ['birth_date', 'register_date'] -csv_context_field = 'name' +HARMFUL_STRING = 'My SSN is 372819127' +HARMLESS_STRING = 'My favorite color is blue' +GCLOUD_PROJECT = os.getenv('GCLOUD_PROJECT') +WRAPPED_KEY = os.environ['DLP_DEID_WRAPPED_KEY'] +KEY_NAME = os.environ['DLP_DEID_KEY_NAME'] +SURROGATE_TYPE = 'SSN_TOKEN' +CSV_FILE = os.path.join(os.path.dirname(__file__), 'resources/dates.csv') +DATE_SHIFTED_AMOUNT = 30 +DATE_FIELDS = ['birth_date', 'register_date'] +CSV_CONTEXT_FIELD = 'name' -# [START Deidentify with masking] -def test_deidentify_with_mask(capsys): - harmful_string = 'My SSN is 372819127' +@pytest.fixture(scope='module') +def tempdir(): + tempdir = tempfile.mkdtemp() + yield tempdir + shutil.rmtree(tempdir) + - deid.deidentify_with_mask(parent, harmful_string) +def test_deidentify_with_mask(capsys): + deid.deidentify_with_mask(GCLOUD_PROJECT, HARMFUL_STRING) out, _ = capsys.readouterr() assert 'My SSN is *********' in out def test_deidentify_with_mask_ignore_insensitive_data(capsys): - harmless_string = 'My favorite color is blue' - - deid.deidentify_with_mask(parent, harmless_string) + deid.deidentify_with_mask(GCLOUD_PROJECT, HARMLESS_STRING) out, _ = capsys.readouterr() - assert harmless_string in out + assert HARMLESS_STRING in out def test_deidentify_with_mask_masking_character_specified(capsys): - harmful_string = 'My SSN is 372819127' - - deid.deidentify_with_mask(parent, 
harmful_string, masking_character='#') + deid.deidentify_with_mask( + GCLOUD_PROJECT, + HARMFUL_STRING, + masking_character='#') out, _ = capsys.readouterr() assert 'My SSN is #########' in out def test_deidentify_with_mask_masking_number_specified(capsys): - harmful_string = 'My SSN is 372819127' - - deid.deidentify_with_mask(parent, harmful_string, number_to_mask=7) + deid.deidentify_with_mask(GCLOUD_PROJECT, HARMFUL_STRING, number_to_mask=7) out, _ = capsys.readouterr() assert 'My SSN is *******27' in out -def test_deidentify_with_mask_handles_masking_number_error(capsys): - harmful_string = 'My SSN is 372819127' - - deid.deidentify_with_mask(parent, harmful_string, number_to_mask=-3) - - out, _ = capsys.readouterr() - assert 'My SSN is *********' in out -# [END deidentify_with_mask] - - -# [START Deidentify with FPE] def test_deidentify_with_fpe(capsys): - harmful_string = 'My SSN is 372819127' - - deid.deidentify_with_fpe(parent, harmful_string, alphabet='NUMERIC', - wrapped_key=wrapped_key, key_name=key_name) + deid.deidentify_with_fpe( + GCLOUD_PROJECT, + HARMFUL_STRING, + alphabet='NUMERIC', + wrapped_key=WRAPPED_KEY, + key_name=KEY_NAME) out, _ = capsys.readouterr() assert 'My SSN is' in out @@ -90,11 +84,13 @@ def test_deidentify_with_fpe(capsys): def test_deidentify_with_fpe_uses_surrogate_info_types(capsys): - harmful_string = 'My SSN is 372819127' - - deid.deidentify_with_fpe(parent, harmful_string, alphabet='NUMERIC', - wrapped_key=wrapped_key, key_name=key_name, - surrogate_type=surrogate_type) + deid.deidentify_with_fpe( + GCLOUD_PROJECT, + HARMFUL_STRING, + alphabet='NUMERIC', + wrapped_key=WRAPPED_KEY, + key_name=KEY_NAME, + surrogate_type=SURROGATE_TYPE) out, _ = capsys.readouterr() assert 'My SSN is SSN_TOKEN' in out @@ -102,67 +98,78 @@ def test_deidentify_with_fpe_uses_surrogate_info_types(capsys): def test_deidentify_with_fpe_ignores_insensitive_data(capsys): - harmless_string = 'My favorite color is blue' - - deid.deidentify_with_fpe(parent, 
harmless_string, alphabet='NUMERIC', - wrapped_key=wrapped_key, key_name=key_name) + deid.deidentify_with_fpe( + GCLOUD_PROJECT, + HARMLESS_STRING, + alphabet='NUMERIC', + wrapped_key=WRAPPED_KEY, + key_name=KEY_NAME) out, _ = capsys.readouterr() - assert harmless_string in out -# [END Deidentify with FPE] + assert HARMLESS_STRING in out + +def test_deidentify_with_date_shift(tempdir, capsys): + output_filepath = os.path.join(tempdir, 'dates-shifted.csv') -# [START Deidentify with date shift] -def test_deidentify_with_date_shift(capsys): - deid.deidentify_with_date_shift(parent, input_csv_file=csv_file, - output_csv_file=output_csv_file, - lower_bound_days=date_shifted_amount, - upper_bound_days=date_shifted_amount, - date_fields=date_fields) + deid.deidentify_with_date_shift( + GCLOUD_PROJECT, + input_csv_file=CSV_FILE, + output_csv_file=output_filepath, + lower_bound_days=DATE_SHIFTED_AMOUNT, + upper_bound_days=DATE_SHIFTED_AMOUNT, + date_fields=DATE_FIELDS) out, _ = capsys.readouterr() assert 'Successful' in out - # read in csv?? 
-def test_deidentify_with_date_shift_using_context_field(capsys): - deid.deidentify_with_date_shift(parent, input_csv_file=csv_file, - output_csv_file=output_csv_file, - lower_bound_days=date_shifted_amount, - upper_bound_days=date_shifted_amount, - date_fields=date_fields, - context_field_id=csv_context_field, - wrapped_key=wrapped_key, - key_name=key_name) +def test_deidentify_with_date_shift_using_context_field(tempdir, capsys): + output_filepath = os.path.join(tempdir, 'dates-shifted.csv') + + deid.deidentify_with_date_shift( + GCLOUD_PROJECT, + input_csv_file=CSV_FILE, + output_csv_file=output_filepath, + lower_bound_days=DATE_SHIFTED_AMOUNT, + upper_bound_days=DATE_SHIFTED_AMOUNT, + date_fields=DATE_FIELDS, + context_field_id=CSV_CONTEXT_FIELD, + wrapped_key=WRAPPED_KEY, + key_name=KEY_NAME) out, _ = capsys.readouterr() assert 'Successful' in out -def test_deidentify_with_date_shift_requires_all_fields(): +def test_deidentify_with_date_shift_requires_all_fields(tempdir): + output_filepath = os.path.join(tempdir, 'dates-shifted.csv') + with pytest.raises(StandardError): - deid.deidentify_with_date_shift(parent, input_csv_file=csv_file, - output_csv_file=output_csv_file, - lower_bound_days=date_shifted_amount, - upper_bound_days=date_shifted_amount, - date_fields=date_fields, - context_field_id=csv_context_field, - key_name=key_name) -# [END Deidentify with date shift] + deid.deidentify_with_date_shift( + GCLOUD_PROJECT, + input_csv_file=CSV_FILE, + output_csv_file=output_filepath, + lower_bound_days=DATE_SHIFTED_AMOUNT, + upper_bound_days=DATE_SHIFTED_AMOUNT, + date_fields=DATE_FIELDS, + context_field_id=CSV_CONTEXT_FIELD, + key_name=KEY_NAME) -# [START Reidentify with FPE] def test_reidentify_with_fpe(capsys): labeled_fpe_string = 'My SSN is SSN_TOKEN(9):731997681' - deid.reidentify_with_fpe(parent, labeled_fpe_string, - surrogate_type=surrogate_type, - wrapped_key=wrapped_key, key_name=key_name, - alphabet='NUMERIC') + deid.reidentify_with_fpe( + 
GCLOUD_PROJECT, + labeled_fpe_string, + surrogate_type=SURROGATE_TYPE, + wrapped_key=WRAPPED_KEY, + key_name=KEY_NAME, + alphabet='NUMERIC') out, _ = capsys.readouterr() - assert harmful_string in out -# [END Reidentify with FPE] + assert HARMFUL_STRING in out From 3ed824b814a1911d3f13ee4a5f87595c72de860a Mon Sep 17 00:00:00 2001 From: Averi Kitsch Date: Wed, 14 Mar 2018 12:49:16 -0700 Subject: [PATCH 4/5] Updated project id as first positional argument --- dlp/deid.py | 70 +++++++++++++++++++++++++----------------------- dlp/deid_test.py | 4 +-- 2 files changed, 38 insertions(+), 36 deletions(-) diff --git a/dlp/deid.py b/dlp/deid.py index 844efe60b74..af6e5825b93 100644 --- a/dlp/deid.py +++ b/dlp/deid.py @@ -12,16 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Usages of the Data Loss Prevention API for deidentifying sensitive data.""" +"""Uses of the Data Loss Prevention API for deidentifying sensitive data.""" from __future__ import print_function import argparse -import os -# [START deidentify_with_mask] -def deidentify_with_mask(parent, string, masking_character=None, +def deidentify_with_mask(project, string, masking_character=None, number_to_mask=0): """Uses the Data Loss Prevention API to deidentify sensitive data in a string by masking it with a character. @@ -42,7 +40,7 @@ def deidentify_with_mask(parent, string, masking_character=None, dlp = google.cloud.dlp.DlpServiceClient() # Convert the project id into a full resource id. 
- parent = dlp.project_path(parent) + parent = dlp.project_path(project) # Construct deidentify configuration dictionary deidentify_config = { @@ -52,8 +50,7 @@ def deidentify_with_mask(parent, string, masking_character=None, 'primitive_transformation': { 'character_mask_config': { 'masking_character': masking_character, - 'number_to_mask': number_to_mask if - number_to_mask >= 0 else 0 + 'number_to_mask': number_to_mask } } } @@ -70,17 +67,14 @@ def deidentify_with_mask(parent, string, masking_character=None, # Print out the results. print(response.item.value) -# [END deidentify_with_mask] -# [START deidentify_with_fpe] -def deidentify_with_fpe(parent, string, alphabet=None, +def deidentify_with_fpe(project, string, alphabet=None, surrogate_type=None, key_name=None, wrapped_key=None): """Uses the Data Loss Prevention API to deidentify sensitive data in a string using Format Preserving Encryption (FPE). Args: item: The string to deidentify (will be treated as text). - Example: string = 'My SSN is 372819127' alphabet: The set of characters to replace sensitive ones with. For more information, see https://cloud.google.com/dlp/docs/reference/ rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet @@ -104,9 +98,10 @@ def deidentify_with_fpe(parent, string, alphabet=None, dlp = google.cloud.dlp.DlpServiceClient() # Convert the project id into a full resource id. - parent = dlp.project_path(parent) + parent = dlp.project_path(project) - # Wrapped key can not be base64 encoded + # The wrapped key is base64-encoded, but the library expects a binary + # string, so decode it here. 
import base64 wrapped_key = base64.b64decode(wrapped_key) @@ -150,11 +145,9 @@ def deidentify_with_fpe(parent, string, alphabet=None, # Print results print(response.item.value) -# [END deidentify_with_fpe] -# [START reidentify_with_fpe] -def reidentify_with_fpe(parent, string, alphabet=None, +def reidentify_with_fpe(project, string, alphabet=None, surrogate_type=None, key_name=None, wrapped_key=None): """Uses the Data Loss Prevention API to reidentify sensitive data in a string that was encrypted by Format Preserving Encryption (FPE). @@ -181,9 +174,10 @@ def reidentify_with_fpe(parent, string, alphabet=None, dlp = google.cloud.dlp.DlpServiceClient() # Convert the project id into a full resource id. - parent = dlp.project_path(parent) + parent = dlp.project_path(project) - # Wrapped key can not be base64 encoded + # The wrapped key is base64-encoded, but the library expects a binary + # string, so decode it here. import base64 wrapped_key = base64.b64decode(wrapped_key) @@ -235,11 +229,9 @@ def reidentify_with_fpe(parent, string, alphabet=None, # Print results print(response.item.value) -# [END reidentify_with_fpe] -# [START deidentify_with_date_shift] -def deidentify_with_date_shift(parent, input_csv_file=None, +def deidentify_with_date_shift(project, input_csv_file=None, output_csv_file=None, date_fields=None, lower_bound_days=None, upper_bound_days=None, context_field_id=None, wrapped_key=None, @@ -277,7 +269,7 @@ def deidentify_with_date_shift(parent, input_csv_file=None, dlp = google.cloud.dlp.DlpServiceClient() # Convert the project id into a full resource id. - parent = dlp.project_path(parent) + parent = dlp.project_path(project) # Convert date field list to Protobuf type def map_fields(field): @@ -317,6 +309,8 @@ def map_data(value): def map_rows(row): return {'values': map(map_data, row)} + # Using the helper functions, convert CSV rows to protobuf-compatible + # dictionaries. 
csv_headers = map(map_headers, f[0]) csv_rows = map(map_rows, f[1:]) @@ -334,7 +328,9 @@ def map_rows(row): 'upper_bound_days': upper_bound_days } - # If using Cloud KMS key + # If using a Cloud KMS key, add it to the date_shift_config. + # The wrapped key is base64-encoded, but the library expects a binary + # string, so decode it here. if context_field_id and key_name and wrapped_key: import base64 date_shift_config['context'] = {'name': context_field_id} @@ -345,7 +341,7 @@ def map_rows(row): } } elif context_field_id or key_name or wrapped_key: - raise StandardError("""You must set either ALL or NONE of + raise ValueError("""You must set either ALL or NONE of [context_field_id, key_name, wrapped_key]!""") # Construct Deidentify Config @@ -384,34 +380,40 @@ def write_data(data): # Print status print('Successfully saved date-shift output to {}'.format( output_csv_file)) -# [END deidentify_with_date_shift] if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__) subparsers = parser.add_subparsers( dest='content', help='Select how to submit content to the API.') + subparsers.required = True mask_parser = subparsers.add_parser( 'deid_mask', help='Deidentify sensitive data in a string by masking it with a ' 'character.') + mask_parser.add_argument( + 'project', + help='The Google Cloud project id to use as a parent resource.') mask_parser.add_argument('item', help='The string to deidentify.') mask_parser.add_argument( - '-n', '--number_to_mask', type=int, + '-n', '--number_to_mask', + type=int, + default=0, help='The maximum number of sensitive characters to mask in a match. 
' 'If omitted the request or set to 0, the API will mask any mathcing ' 'characters.') mask_parser.add_argument( '-m', '--masking_character', help='The character to mask matching sensitive data with.') - mask_parser.add_argument( - '-p', '--project', default=os.environ['GCLOUD_PROJECT']) fpe_parser = subparsers.add_parser( 'deid_fpe', help='Deidentify sensitive data in a string using Format Preserving ' 'Encryption (FPE).') + fpe_parser.add_argument( + 'project', + help='The Google Cloud project id to use as a parent resource.') fpe_parser.add_argument( 'item', help='The string to deidentify. ' @@ -438,13 +440,14 @@ def write_data(data): 'necessary if you want to reverse the deidentification process. Can ' 'be essentially any arbitrary string, as long as it doesn\'t appear ' 'in your dataset otherwise.') - fpe_parser.add_argument( - '-p', '--project', default=os.environ['GCLOUD_PROJECT']) reid_parser = subparsers.add_parser( 'reid_fpe', help='Reidentify sensitive data in a string using Format Preserving ' 'Encryption (FPE).') + reid_parser.add_argument( + 'project', + help='The Google Cloud project id to use as a parent resource.') reid_parser.add_argument( 'item', help='The string to deidentify. ' @@ -471,12 +474,13 @@ def write_data(data): 'used subsets of the alphabet include "NUMERIC", "HEXADECIMAL", ' '"UPPER_CASE_ALPHA_NUMERIC", "ALPHA_NUMERIC", ' '"FFX_COMMON_NATIVE_ALPHABET_UNSPECIFIED"') - reid_parser.add_argument( - '-p', '--project', default=os.environ['GCLOUD_PROJECT']) date_shift_parser = subparsers.add_parser( 'deid_date_shift', help='Deidentify dates in a CSV file by pseudorandomly shifting them.') + date_shift_parser.add_argument( + 'project', + help='The Google Cloud project id to use as a parent resource.') date_shift_parser.add_argument( 'input_csv_file', help='The path to the CSV file to deidentify. The first row of the ' @@ -512,8 +516,6 @@ def write_data(data): help='(Optional) The encrypted (\'wrapped\') AES-256 key to use. 
This ' 'key should be encrypted using the Cloud KMS key specified by' 'key_name.') - date_shift_parser.add_argument( - '-p', '--project', default=os.environ['GCLOUD_PROJECT']) args = parser.parse_args() diff --git a/dlp/deid_test.py b/dlp/deid_test.py index f7456e055e3..8d8fdc6a02c 100644 --- a/dlp/deid_test.py +++ b/dlp/deid_test.py @@ -23,8 +23,8 @@ HARMFUL_STRING = 'My SSN is 372819127' HARMLESS_STRING = 'My favorite color is blue' GCLOUD_PROJECT = os.getenv('GCLOUD_PROJECT') -WRAPPED_KEY = os.environ['DLP_DEID_WRAPPED_KEY'] -KEY_NAME = os.environ['DLP_DEID_KEY_NAME'] +WRAPPED_KEY = os.getenv('DLP_DEID_WRAPPED_KEY') +KEY_NAME = os.getenv('DLP_DEID_KEY_NAME') SURROGATE_TYPE = 'SSN_TOKEN' CSV_FILE = os.path.join(os.path.dirname(__file__), 'resources/dates.csv') DATE_SHIFTED_AMOUNT = 30 From ebcb63ef142321c6588051e9195a565b86eee6cd Mon Sep 17 00:00:00 2001 From: Averi Kitsch Date: Wed, 14 Mar 2018 17:19:42 -0700 Subject: [PATCH 5/5] added project to argument list --- dlp/deid.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dlp/deid.py b/dlp/deid.py index af6e5825b93..631e9d02c58 100644 --- a/dlp/deid.py +++ b/dlp/deid.py @@ -24,6 +24,7 @@ def deidentify_with_mask(project, string, masking_character=None, """Uses the Data Loss Prevention API to deidentify sensitive data in a string by masking it with a character. Args: + project: The Google Cloud project id to use as a parent resource. item: The string to deidentify (will be treated as text). masking_character: The character to mask matching sensitive data with. number_to_mask: The maximum number of sensitive characters to mask in @@ -74,6 +75,7 @@ def deidentify_with_fpe(project, string, alphabet=None, """Uses the Data Loss Prevention API to deidentify sensitive data in a string using Format Preserving Encryption (FPE). Args: + project: The Google Cloud project id to use as a parent resource. item: The string to deidentify (will be treated as text). 
alphabet: The set of characters to replace sensitive ones with. For more information, see https://cloud.google.com/dlp/docs/reference/ @@ -152,6 +154,7 @@ def reidentify_with_fpe(project, string, alphabet=None, """Uses the Data Loss Prevention API to reidentify sensitive data in a string that was encrypted by Format Preserving Encryption (FPE). Args: + project: The Google Cloud project id to use as a parent resource. item: The string to deidentify (will be treated as text). alphabet: The set of characters to replace sensitive ones with. For more information, see https://cloud.google.com/dlp/docs/reference/ @@ -239,6 +242,7 @@ def deidentify_with_date_shift(project, input_csv_file=None, """Uses the Data Loss Prevention API to deidentify dates in a CSV file by pseudorandomly shifting them. Args: + project: The Google Cloud project id to use as a parent resource. input_csv_file: The path to the CSV file to deidentify. The first row of the file must specify column names, and all other rows must contain valid values.