diff --git a/dlp/deid.py b/dlp/deid.py
index 55882faaa97..98b41488267 100644
--- a/dlp/deid.py
+++ b/dlp/deid.py
@@ -20,7 +20,7 @@


 # [START dlp_deidentify_masking]
-def deidentify_with_mask(project, string, masking_character=None,
+def deidentify_with_mask(project, string, info_types, masking_character=None,
                          number_to_mask=0):
     """Uses the Data Loss Prevention API to deidentify sensitive data in a
     string by masking it with a character.
@@ -44,6 +44,11 @@ def deidentify_with_mask(project, string, masking_character=None,
     # Convert the project id into a full resource id.
     parent = dlp.project_path(project)

+    # Construct inspect configuration dictionary
+    inspect_config = {
+        'info_types': [{'name': info_type} for info_type in info_types]
+    }
+
     # Construct deidentify configuration dictionary
     deidentify_config = {
         'info_type_transformations': {
@@ -65,7 +70,8 @@ def deidentify_with_mask(project, string, masking_character=None,

     # Call the API
     response = dlp.deidentify_content(
-        parent, deidentify_config=deidentify_config, item=item)
+        parent, inspect_config=inspect_config,
+        deidentify_config=deidentify_config, item=item)

     # Print out the results.
     print(response.item.value)
@@ -73,7 +79,7 @@ def deidentify_with_mask(project, string, masking_character=None,


 # [START dlp_deidentify_fpe]
-def deidentify_with_fpe(project, string, alphabet=None,
+def deidentify_with_fpe(project, string, info_types, alphabet=None,
                         surrogate_type=None, key_name=None, wrapped_key=None):
     """Uses the Data Loss Prevention API to deidentify sensitive data in a
     string using Format Preserving Encryption (FPE).
@@ -127,6 +133,11 @@ def deidentify_with_fpe(project, string, alphabet=None,
             'name': surrogate_type
         }

+    # Construct inspect configuration dictionary
+    inspect_config = {
+        'info_types': [{'name': info_type} for info_type in info_types]
+    }
+
     # Construct deidentify configuration dictionary
     deidentify_config = {
         'info_type_transformations': {
@@ -146,7 +157,8 @@ def deidentify_with_fpe(project, string, alphabet=None,

     # Call the API
     response = dlp.deidentify_content(
-        parent, deidentify_config=deidentify_config, item=item)
+        parent, inspect_config=inspect_config,
+        deidentify_config=deidentify_config, item=item)

     # Print results
     print(response.item.value)
@@ -404,6 +416,13 @@ def write_data(data):
         'deid_mask',
         help='Deidentify sensitive data in a string by masking it with a '
              'character.')
+    mask_parser.add_argument(
+        '--info_types', action='append',
+        help='Strings representing info types to look for. A full list of '
+             'info categories and types is available from the API. Examples '
+             'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
+             'If unspecified, the three above examples will be used.',
+        default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
     mask_parser.add_argument(
         'project',
         help='The Google Cloud project id to use as a parent resource.')
@@ -423,6 +442,13 @@ def write_data(data):
         'deid_fpe',
         help='Deidentify sensitive data in a string using Format Preserving '
              'Encryption (FPE).')
+    fpe_parser.add_argument(
+        '--info_types', action='append',
+        help='Strings representing info types to look for. A full list of '
+             'info categories and types is available from the API. Examples '
+             'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
+             'If unspecified, the three above examples will be used.',
+        default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
     fpe_parser.add_argument(
         'project',
         help='The Google Cloud project id to use as a parent resource.')
@@ -532,11 +558,12 @@ def write_data(data):
     args = parser.parse_args()

     if args.content == 'deid_mask':
-        deidentify_with_mask(args.project, args.item,
+        deidentify_with_mask(args.project, args.item, args.info_types,
                              masking_character=args.masking_character,
                              number_to_mask=args.number_to_mask)
     elif args.content == 'deid_fpe':
-        deidentify_with_fpe(args.project, args.item, alphabet=args.alphabet,
+        deidentify_with_fpe(args.project, args.item, args.info_types,
+                            alphabet=args.alphabet,
                             wrapped_key=args.wrapped_key,
                             key_name=args.key_name,
                             surrogate_type=args.surrogate_type)
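Note: info_types is added as a required positional parameter, so direct callers of these two functions must be updated; the CLI is unaffected because the new --info_types flags supply a default. A minimal usage sketch of the updated masking entry point; the project id and input string are placeholders, not values from this change:

    # Sketch only: assumes it runs from the dlp/ sample directory with
    # application-default credentials; 'my-project' is a placeholder.
    import deid

    deid.deidentify_with_mask(
        'my-project',
        'My SSN is 372819127',
        ['US_SOCIAL_SECURITY_NUMBER'],  # the new required argument
        masking_character='*',
        number_to_mask=0)  # 0, the default, masks every matched character
    # Prints: My SSN is *********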
diff --git a/dlp/deid_test.py b/dlp/deid_test.py
index 70e8290c067..f6bce36e79a 100644
--- a/dlp/deid_test.py
+++ b/dlp/deid_test.py
@@ -43,7 +43,8 @@ def tempdir():


 def test_deidentify_with_mask(capsys):
-    deid.deidentify_with_mask(GCLOUD_PROJECT, HARMFUL_STRING)
+    deid.deidentify_with_mask(GCLOUD_PROJECT, HARMFUL_STRING,
+                              ['US_SOCIAL_SECURITY_NUMBER'])

     out, _ = capsys.readouterr()
     assert 'My SSN is *********' in out
@@ -60,6 +61,7 @@ def test_deidentify_with_mask_masking_character_specified(capsys):
     deid.deidentify_with_mask(
         GCLOUD_PROJECT,
         HARMFUL_STRING,
+        ['US_SOCIAL_SECURITY_NUMBER'],
         masking_character='#')

     out, _ = capsys.readouterr()
@@ -67,7 +69,9 @@ def test_deidentify_with_mask_masking_character_specified(capsys):


 def test_deidentify_with_mask_masking_number_specified(capsys):
-    deid.deidentify_with_mask(GCLOUD_PROJECT, HARMFUL_STRING, number_to_mask=7)
+    deid.deidentify_with_mask(GCLOUD_PROJECT, HARMFUL_STRING,
+                              ['US_SOCIAL_SECURITY_NUMBER'],
+                              number_to_mask=7)

     out, _ = capsys.readouterr()
     assert 'My SSN is *******27' in out
@@ -77,6 +81,7 @@ def test_deidentify_with_fpe(capsys):
     deid.deidentify_with_fpe(
         GCLOUD_PROJECT,
         HARMFUL_STRING,
+        ['US_SOCIAL_SECURITY_NUMBER'],
         alphabet='NUMERIC',
         wrapped_key=WRAPPED_KEY,
         key_name=KEY_NAME)
@@ -90,6 +95,7 @@ def test_deidentify_with_fpe_uses_surrogate_info_types(capsys):
     deid.deidentify_with_fpe(
         GCLOUD_PROJECT,
         HARMFUL_STRING,
+        ['US_SOCIAL_SECURITY_NUMBER'],
         alphabet='NUMERIC',
         wrapped_key=WRAPPED_KEY,
         key_name=KEY_NAME,
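Note: the tests now pin a single info type, US_SOCIAL_SECURITY_NUMBER, so detection is deterministic instead of depending on API defaults. For reference, a sketch of the equivalent direct FPE call; every value below is a placeholder, and KEY_NAME and WRAPPED_KEY must reference a real Cloud KMS key and an AES key wrapped by it, as the test fixtures do:

    # Sketch only; all values are illustrative placeholders.
    import deid

    KEY_NAME = ('projects/my-project/locations/global/keyRings/my-keyring/'
                'cryptoKeys/my-key')
    WRAPPED_KEY = '...'  # base64 of an AES key wrapped by KEY_NAME

    deid.deidentify_with_fpe(
        'my-project',
        'My SSN is 372819127',
        ['US_SOCIAL_SECURITY_NUMBER'],  # the new required argument
        alphabet='NUMERIC',
        wrapped_key=WRAPPED_KEY,
        key_name=KEY_NAME)
    # Prints the input with the SSN replaced by a same-length numeric token.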
diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py
index 3b2d5d4a60b..aedc002d465 100644
--- a/dlp/inspect_content.py
+++ b/dlp/inspect_content.py
@@ -23,6 +23,7 @@

 # [START dlp_inspect_string]
 def inspect_string(project, content_string, info_types,
+                   custom_dictionaries=None, custom_regexes=None,
                    min_likelihood=None, max_findings=None, include_quote=True):
     """Uses the Data Loss Prevention API to analyze strings for protected data.
     Args:
@@ -50,10 +51,29 @@ def inspect_string(project, content_string, info_types,
     # dictionaries (protos are also accepted).
     info_types = [{'name': info_type} for info_type in info_types]

+    # Prepare custom_info_types by parsing the dictionary word lists and
+    # regex patterns.
+    if custom_dictionaries is None:
+        custom_dictionaries = []
+    dictionaries = [{
+        'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
+        'dictionary': {
+            'word_list': {'words': custom_dict.split(',')}
+        }
+    } for i, custom_dict in enumerate(custom_dictionaries)]
+    if custom_regexes is None:
+        custom_regexes = []
+    regexes = [{
+        'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
+        'regex': {'pattern': custom_regex}
+    } for i, custom_regex in enumerate(custom_regexes)]
+    custom_info_types = dictionaries + regexes
+
     # Construct the configuration dictionary. Keys which are None may
     # optionally be omitted entirely.
     inspect_config = {
         'info_types': info_types,
+        'custom_info_types': custom_info_types,
         'min_likelihood': min_likelihood,
         'include_quote': include_quote,
         'limits': {'max_findings_per_request': max_findings},
@@ -85,6 +105,7 @@ def inspect_string(project, content_string, info_types,

 # [START dlp_inspect_file]
 def inspect_file(project, filename, info_types, min_likelihood=None,
+                 custom_dictionaries=None, custom_regexes=None,
                  max_findings=None, include_quote=True, mime_type=None):
     """Uses the Data Loss Prevention API to analyze a file for protected data.
     Args:
@@ -118,10 +139,29 @@ def inspect_file(project, filename, info_types, min_likelihood=None,
         info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']
     info_types = [{'name': info_type} for info_type in info_types]

+    # Prepare custom_info_types by parsing the dictionary word lists and
+    # regex patterns.
+    if custom_dictionaries is None:
+        custom_dictionaries = []
+    dictionaries = [{
+        'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
+        'dictionary': {
+            'word_list': {'words': custom_dict.split(',')}
+        }
+    } for i, custom_dict in enumerate(custom_dictionaries)]
+    if custom_regexes is None:
+        custom_regexes = []
+    regexes = [{
+        'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
+        'regex': {'pattern': custom_regex}
+    } for i, custom_regex in enumerate(custom_regexes)]
+    custom_info_types = dictionaries + regexes
+
     # Construct the configuration dictionary. Keys which are None may
     # optionally be omitted entirely.
     inspect_config = {
         'info_types': info_types,
+        'custom_info_types': custom_info_types,
         'min_likelihood': min_likelihood,
         'limits': {'max_findings_per_request': max_findings},
     }
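Note: to make the new plumbing concrete, this is the custom_info_types payload the two comprehensions build for illustrative inputs (a hand-written sketch, not captured API traffic):

    custom_dictionaries = ['Gary Smith,Jane Doe']
    custom_regexes = ['\\w+@\\w+\\.com']

    # The code above turns those inputs into:
    custom_info_types = [
        {
            'info_type': {'name': 'CUSTOM_DICTIONARY_0'},
            'dictionary': {'word_list': {'words': ['Gary Smith', 'Jane Doe']}},
        },
        {
            'info_type': {'name': 'CUSTOM_REGEX_0'},
            'regex': {'pattern': '\\w+@\\w+\\.com'},
        },
    ]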
@@ -168,8 +208,9 @@ def inspect_file(project, filename, info_types, min_likelihood=None,

 # [START dlp_inspect_gcs]
 def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id,
-                     info_types, min_likelihood=None, max_findings=None,
-                     timeout=300):
+                     info_types, custom_dictionaries=None,
+                     custom_regexes=None, min_likelihood=None,
+                     max_findings=None, timeout=300):
     """Uses the Data Loss Prevention API to analyze a file on GCS.
     Args:
         project: The Google Cloud project id to use as a parent resource.
@@ -211,10 +252,29 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id,
         info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']
     info_types = [{'name': info_type} for info_type in info_types]

+    # Prepare custom_info_types by parsing the dictionary word lists and
+    # regex patterns.
+    if custom_dictionaries is None:
+        custom_dictionaries = []
+    dictionaries = [{
+        'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
+        'dictionary': {
+            'word_list': {'words': custom_dict.split(',')}
+        }
+    } for i, custom_dict in enumerate(custom_dictionaries)]
+    if custom_regexes is None:
+        custom_regexes = []
+    regexes = [{
+        'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
+        'regex': {'pattern': custom_regex}
+    } for i, custom_regex in enumerate(custom_regexes)]
+    custom_info_types = dictionaries + regexes
+
     # Construct the configuration dictionary. Keys which are None may
     # optionally be omitted entirely.
     inspect_config = {
         'info_types': info_types,
+        'custom_info_types': custom_info_types,
         'min_likelihood': min_likelihood,
         'limits': {'max_findings_per_request': max_findings},
     }
@@ -293,8 +353,10 @@ def callback(message):

 # [START dlp_inspect_datastore]
 def inspect_datastore(project, datastore_project, kind,
-                      topic_id, subscription_id, info_types, namespace_id=None,
-                      min_likelihood=None, max_findings=None, timeout=300):
+                      topic_id, subscription_id, info_types,
+                      custom_dictionaries=None, custom_regexes=None,
+                      namespace_id=None, min_likelihood=None,
+                      max_findings=None, timeout=300):
     """Uses the Data Loss Prevention API to analyze Datastore data.
     Args:
         project: The Google Cloud project id to use as a parent resource.
@@ -336,10 +398,29 @@ def inspect_datastore(project, datastore_project, kind,
         info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']
     info_types = [{'name': info_type} for info_type in info_types]

+    # Prepare custom_info_types by parsing the dictionary word lists and
+    # regex patterns.
+    if custom_dictionaries is None:
+        custom_dictionaries = []
+    dictionaries = [{
+        'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
+        'dictionary': {
+            'word_list': {'words': custom_dict.split(',')}
+        }
+    } for i, custom_dict in enumerate(custom_dictionaries)]
+    if custom_regexes is None:
+        custom_regexes = []
+    regexes = [{
+        'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
+        'regex': {'pattern': custom_regex}
+    } for i, custom_regex in enumerate(custom_regexes)]
+    custom_info_types = dictionaries + regexes
+
     # Construct the configuration dictionary. Keys which are None may
     # optionally be omitted entirely.
     inspect_config = {
         'info_types': info_types,
+        'custom_info_types': custom_info_types,
         'min_likelihood': min_likelihood,
         'limits': {'max_findings_per_request': max_findings},
     }
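Note: the same fifteen-line preparation block now appears in every inspect_* function. These samples inline code deliberately so that each region-tag snippet stays self-contained, but if that constraint were ever relaxed, a module-level helper could absorb the duplication. A possible sketch, not part of this change:

    def build_custom_info_types(custom_dictionaries=None, custom_regexes=None):
        """Hypothetical helper equivalent to the repeated block above."""
        dictionaries = [{
            'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
            'dictionary': {'word_list': {'words': custom_dict.split(',')}}
        } for i, custom_dict in enumerate(custom_dictionaries or [])]
        regexes = [{
            'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
            'regex': {'pattern': custom_regex}
        } for i, custom_regex in enumerate(custom_regexes or [])]
        return dictionaries + regexes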
@@ -424,6 +505,7 @@ def callback(message):
 # [START dlp_inspect_bigquery]
 def inspect_bigquery(project, bigquery_project, dataset_id, table_id,
                      topic_id, subscription_id, info_types,
+                     custom_dictionaries=None, custom_regexes=None,
                      min_likelihood=None, max_findings=None, timeout=300):
     """Uses the Data Loss Prevention API to analyze BigQuery data.
     Args:
@@ -467,10 +549,29 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id,
         info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']
     info_types = [{'name': info_type} for info_type in info_types]

+    # Prepare custom_info_types by parsing the dictionary word lists and
+    # regex patterns.
+    if custom_dictionaries is None:
+        custom_dictionaries = []
+    dictionaries = [{
+        'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
+        'dictionary': {
+            'word_list': {'words': custom_dict.split(',')}
+        }
+    } for i, custom_dict in enumerate(custom_dictionaries)]
+    if custom_regexes is None:
+        custom_regexes = []
+    regexes = [{
+        'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
+        'regex': {'pattern': custom_regex}
+    } for i, custom_regex in enumerate(custom_regexes)]
+    custom_info_types = dictionaries + regexes
+
     # Construct the configuration dictionary. Keys which are None may
     # optionally be omitted entirely.
     inspect_config = {
         'info_types': info_types,
+        'custom_info_types': custom_info_types,
         'min_likelihood': min_likelihood,
         'limits': {'max_findings_per_request': max_findings},
     }
@@ -571,6 +672,17 @@ def callback(message):
              'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
              'If unspecified, the three above examples will be used.',
         default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
+    parser_string.add_argument(
+        '--custom_dictionaries', action='append',
+        help='Strings representing comma-delimited lists of dictionary words'
+             ' to search for as custom info types. Each string is a comma-'
+             'delimited list of words representing a distinct dictionary.',
+        default=None)
+    parser_string.add_argument(
+        '--custom_regexes', action='append',
+        help='Strings representing regex patterns to search for as custom '
+             'info types.',
+        default=None)
     parser_string.add_argument(
         '--min_likelihood',
         choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
@@ -600,6 +712,17 @@ def callback(message):
              'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
              'If unspecified, the three above examples will be used.',
         default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
+    parser_file.add_argument(
+        '--custom_dictionaries', action='append',
+        help='Strings representing comma-delimited lists of dictionary words'
+             ' to search for as custom info types. Each string is a comma-'
+             'delimited list of words representing a distinct dictionary.',
+        default=None)
+    parser_file.add_argument(
+        '--custom_regexes', action='append',
+        help='Strings representing regex patterns to search for as custom '
+             'info types.',
+        default=None)
     parser_file.add_argument(
         '--min_likelihood',
         choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
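Note: one argparse subtlety with these flags: action='append' appends to the default, so the pre-seeded --info_types default list is extended rather than replaced, which is slightly at odds with the "If unspecified, the three above examples will be used" wording. The new custom flags avoid this by defaulting to None. A self-contained demonstration:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--info_types', action='append',
                        default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
    parser.add_argument('--custom_regexes', action='append', default=None)
    args = parser.parse_args(['--info_types', 'PHONE_NUMBER',
                              '--custom_regexes', '\\d{4}'])

    print(args.info_types)      # ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS',
                                #  'PHONE_NUMBER']  (default list is extended)
    print(args.custom_regexes)  # ['\\d{4}']  (None default is replaced)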
@@ -648,6 +771,17 @@ def callback(message):
              'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
              'If unspecified, the three above examples will be used.',
         default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
+    parser_gcs.add_argument(
+        '--custom_dictionaries', action='append',
+        help='Strings representing comma-delimited lists of dictionary words'
+             ' to search for as custom info types. Each string is a comma-'
+             'delimited list of words representing a distinct dictionary.',
+        default=None)
+    parser_gcs.add_argument(
+        '--custom_regexes', action='append',
+        help='Strings representing regex patterns to search for as custom '
+             'info types.',
+        default=None)
     parser_gcs.add_argument(
         '--min_likelihood',
         choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
@@ -692,6 +826,17 @@ def callback(message):
              'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
              'If unspecified, the three above examples will be used.',
         default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
+    parser_datastore.add_argument(
+        '--custom_dictionaries', action='append',
+        help='Strings representing comma-delimited lists of dictionary words'
+             ' to search for as custom info types. Each string is a comma-'
+             'delimited list of words representing a distinct dictionary.',
+        default=None)
+    parser_datastore.add_argument(
+        '--custom_regexes', action='append',
+        help='Strings representing regex patterns to search for as custom '
+             'info types.',
+        default=None)
     parser_datastore.add_argument(
         '--namespace_id',
         help='The Datastore namespace to use, if applicable.')
@@ -742,6 +887,17 @@ def callback(message):
              'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
              'If unspecified, the three above examples will be used.',
         default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
+    parser_bigquery.add_argument(
+        '--custom_dictionaries', action='append',
+        help='Strings representing comma-delimited lists of dictionary words'
+             ' to search for as custom info types. Each string is a comma-'
+             'delimited list of words representing a distinct dictionary.',
+        default=None)
+    parser_bigquery.add_argument(
+        '--custom_regexes', action='append',
+        help='Strings representing regex patterns to search for as custom '
+             'info types.',
+        default=None)
     parser_bigquery.add_argument(
         '--min_likelihood',
         choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
@@ -762,12 +918,16 @@ def callback(message):
     if args.content == 'string':
         inspect_string(
             args.project, args.item, args.info_types,
+            custom_dictionaries=args.custom_dictionaries,
+            custom_regexes=args.custom_regexes,
             min_likelihood=args.min_likelihood,
             max_findings=args.max_findings,
             include_quote=args.include_quote)
     elif args.content == 'file':
         inspect_file(
             args.project, args.filename, args.info_types,
+            custom_dictionaries=args.custom_dictionaries,
+            custom_regexes=args.custom_regexes,
             min_likelihood=args.min_likelihood,
             max_findings=args.max_findings,
             include_quote=args.include_quote,
@@ -777,6 +937,8 @@ def callback(message):
             args.project, args.bucket, args.filename,
             args.topic_id, args.subscription_id,
             args.info_types,
+            custom_dictionaries=args.custom_dictionaries,
+            custom_regexes=args.custom_regexes,
             min_likelihood=args.min_likelihood,
             max_findings=args.max_findings,
             timeout=args.timeout)
@@ -785,6 +947,8 @@ def callback(message):
             args.project, args.datastore_project, args.kind,
             args.topic_id, args.subscription_id,
             args.info_types,
+            custom_dictionaries=args.custom_dictionaries,
+            custom_regexes=args.custom_regexes,
             namespace_id=args.namespace_id,
             min_likelihood=args.min_likelihood,
             max_findings=args.max_findings,
@@ -794,6 +958,8 @@ def callback(message):
             args.project, args.bigquery_project, args.dataset_id,
             args.table_id, args.topic_id, args.subscription_id,
             args.info_types,
+            custom_dictionaries=args.custom_dictionaries,
+            custom_regexes=args.custom_regexes,
             min_likelihood=args.min_likelihood,
             max_findings=args.max_findings,
             timeout=args.timeout)
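Note: putting the pieces together, a direct-call sketch that mirrors the new string test below; 'my-project' is a placeholder, and the empty info_types list restricts detection to the custom types only:

    import inspect_content

    inspect_content.inspect_string(
        'my-project',
        'My name is Gary Smith and my email is gary@example.com',
        [],  # no built-in info types; custom types only
        custom_dictionaries=['Gary Smith'],
        custom_regexes=['\\w+@\\w+\\.com'],
        include_quote=True)
    # Findings are reported as CUSTOM_DICTIONARY_0 and CUSTOM_REGEX_0.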
diff --git a/dlp/inspect_content_test.py b/dlp/inspect_content_test.py
index 3fd7874478d..db1a0074142 100644
--- a/dlp/inspect_content_test.py
+++ b/dlp/inspect_content_test.py
@@ -170,6 +170,24 @@ def test_inspect_string(capsys):
     assert 'Info type: EMAIL_ADDRESS' in out


+def test_inspect_string_with_custom_info_types(capsys):
+    test_string = 'My name is Gary Smith and my email is gary@example.com'
+    dictionaries = ['Gary Smith']
+    regexes = ['\\w+@\\w+\\.com']
+
+    inspect_content.inspect_string(
+        GCLOUD_PROJECT,
+        test_string,
+        [],
+        custom_dictionaries=dictionaries,
+        custom_regexes=regexes,
+        include_quote=True)
+
+    out, _ = capsys.readouterr()
+    assert 'Info type: CUSTOM_DICTIONARY_0' in out
+    assert 'Info type: CUSTOM_REGEX_0' in out
+
+
 def test_inspect_string_no_results(capsys):
     test_string = 'Nothing to see here'

@@ -196,6 +214,24 @@ def test_inspect_file(capsys):
     assert 'Info type: EMAIL_ADDRESS' in out


+def test_inspect_file_with_custom_info_types(capsys):
+    test_filepath = os.path.join(RESOURCE_DIRECTORY, 'test.txt')
+    dictionaries = ['gary@somedomain.com']
+    regexes = ['\\(\\d{3}\\) \\d{3}-\\d{4}']
+
+    inspect_content.inspect_file(
+        GCLOUD_PROJECT,
+        test_filepath,
+        [],
+        custom_dictionaries=dictionaries,
+        custom_regexes=regexes,
+        include_quote=True)
+
+    out, _ = capsys.readouterr()
+    assert 'Info type: CUSTOM_DICTIONARY_0' in out
+    assert 'Info type: CUSTOM_REGEX_0' in out
+
+
 def test_inspect_file_no_results(capsys):
     test_filepath = os.path.join(RESOURCE_DIRECTORY, 'harmless.txt')

@@ -236,6 +272,27 @@ def test_inspect_gcs_file(bucket, topic_id, subscription_id, capsys):
     assert 'Info type: EMAIL_ADDRESS' in out


+@flaky
+def test_inspect_gcs_file_with_custom_info_types(bucket, topic_id,
+                                                 subscription_id, capsys):
+    dictionaries = ['gary@somedomain.com']
+    regexes = ['\\(\\d{3}\\) \\d{3}-\\d{4}']
+
+    inspect_content.inspect_gcs_file(
+        GCLOUD_PROJECT,
+        bucket.name,
+        'test.txt',
+        topic_id,
+        subscription_id,
+        [],
+        custom_dictionaries=dictionaries,
+        custom_regexes=regexes)
+
+    out, _ = capsys.readouterr()
+    assert 'Info type: CUSTOM_DICTIONARY_0' in out
+    assert 'Info type: CUSTOM_REGEX_0' in out
+
+
 @flaky
 def test_inspect_gcs_file_no_results(
         bucket, topic_id, subscription_id, capsys):
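Note: a quick local sanity check for the patterns used in these tests; it assumes, as the tests do, that test.txt in the resource directory contains a US-formatted phone number and the address gary@somedomain.com:

    import re

    phone_pattern = '\\(\\d{3}\\) \\d{3}-\\d{4}'
    email_pattern = '\\w+@\\w+\\.com'

    assert re.search(phone_pattern, 'Call me at (206) 555-0123')
    assert re.search(email_pattern, 'Mail gary@somedomain.com')
    # The DLP service evaluates these patterns server-side (RE2), but both
    # are also valid for Python's re module, so they can be vetted locally.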