diff --git a/dlp/snippets/deid.py b/dlp/snippets/deid.py index 7fe220b5859..da1cd727ce7 100644 --- a/dlp/snippets/deid.py +++ b/dlp/snippets/deid.py @@ -2588,6 +2588,7 @@ def deidentify_table_with_multiple_crypto_hash( "deid_fields_2", help="List of column names in table to de-identify using transient_key_name_2.", ) + args = parser.parse_args() if args.content == "deid_mask": diff --git a/dlp/snippets/deid_table.py b/dlp/snippets/deid_table.py new file mode 100644 index 00000000000..d9cdc7b4649 --- /dev/null +++ b/dlp/snippets/deid_table.py @@ -0,0 +1,319 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Uses of the Data Loss Prevention API for de-identifying sensitive data +contained in table.""" + +from __future__ import annotations + +import argparse +import base64 + +# [START dlp_deidentify_table_fpe] +from typing import List # noqa: F811, E402, I100 + +import google.cloud.dlp # noqa: F811, E402 + + +def deidentify_table_with_fpe( + project: str, + table_header: List[str], + table_rows: List[List[str]], + deid_field_names: List[str], + key_name: str = None, + wrapped_key: bytes = None, + alphabet: str = None, +) -> None: + """Uses the Data Loss Prevention API to de-identify sensitive data in a + table while maintaining format. + + Args: + project: The Google Cloud project id to use as a parent resource. + table_header: List of strings representing table field names. + table_rows: List of rows representing table data. + deid_field_names: A list of fields in table to de-identify. + key_name: The name of the Cloud KMS key used to encrypt ('wrap') the + AES-256 key. Example: + key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/ + keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME' + wrapped_key: The decrypted ('wrapped', in bytes) AES-256 key to use. This key + should be encrypted using the Cloud KMS key specified by key_name. + alphabet: The set of characters to replace sensitive ones with. For + more information, see https://cloud.google.com/dlp/docs/reference/ + rest/v2/projects.deidentifyTemplates#ffxcommonnativealphabet + """ + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Construct the `table`. For more details on the table schema, please see + # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table + headers = [{"name": val} for val in table_header] + rows = [] + for row in table_rows: + rows.append({"values": [{"string_value": cell_val} for cell_val in row]}) + + table = {"headers": headers, "rows": rows} + + # Construct the `item` for table. + item = {"table": table} + + # Specify fields to be de-identified. + deid_field_names = [{"name": _i} for _i in deid_field_names] + + # Construct FPE configuration dictionary + crypto_replace_ffx_fpe_config = { + "crypto_key": { + "kms_wrapped": {"wrapped_key": wrapped_key, "crypto_key_name": key_name}, + }, + "common_alphabet": alphabet, + } + + # Construct deidentify configuration dictionary + deidentify_config = { + "record_transformations": { + "field_transformations": [ + { + "primitive_transformation": { + "crypto_replace_ffx_fpe_config": crypto_replace_ffx_fpe_config + }, + "fields": deid_field_names, + } + ] + } + } + + # Convert the project id into a full resource id. + parent = f"projects/{project}" + + # Call the API. + response = dlp.deidentify_content( + request={ + "parent": parent, + "deidentify_config": deidentify_config, + "item": item + }) + + # Print out results. + print(f"Table after de-identification: {response.item.table}") + + +# [END dlp_deidentify_table_fpe] + + +# [START dlp_reidentify_table_fpe] +from typing import List # noqa: F811, E402, I100 + +import google.cloud.dlp # noqa: F811, E402 + + +def reidentify_table_with_fpe( + project: str, + table_header: List[str], + table_rows: List[List[str]], + reid_field_names: List[str], + key_name: str = None, + wrapped_key: bytes = None, + alphabet: str = None, +) -> None: + """Uses the Data Loss Prevention API to re-identify sensitive data in a + table that was encrypted by Format Preserving Encryption (FPE). + + Args: + project: The Google Cloud project id to use as a parent resource. + table_header: List of strings representing table field names. + table_rows: List of rows representing table data. + reid_field_names: A list of fields in table to re-identify. + key_name: The name of the Cloud KMS key used to encrypt ('wrap') the + AES-256 key. Example: + key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/ + keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME' + wrapped_key: The decrypted ('wrapped', in bytes) AES-256 key to use. This key + should be encrypted using the Cloud KMS key specified by key_name. + alphabet: The set of characters to replace sensitive ones with. For + more information, see https://cloud.google.com/dlp/docs/reference/ + rest/v2/projects.deidentifyTemplates#ffxcommonnativealphabet + """ + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Construct the `table`. For more details on the table schema, please see + # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table + headers = [{"name": val} for val in table_header] + rows = [] + for row in table_rows: + rows.append({"values": [{"string_value": cell_val} for cell_val in row]}) + table = {"headers": headers, "rows": rows} + + # Convert table to `item` + item = {"table": table} + + # Specify fields to be re-identified/decrypted. + reid_field_names = [{"name": _i} for _i in reid_field_names] + + # Construct FPE configuration dictionary + crypto_replace_ffx_fpe_config = { + "crypto_key": { + "kms_wrapped": {"wrapped_key": wrapped_key, "crypto_key_name": key_name} + }, + "common_alphabet": alphabet, + } + + # Construct reidentify configuration dictionary + reidentify_config = { + "record_transformations": { + "field_transformations": [ + { + "primitive_transformation": { + "crypto_replace_ffx_fpe_config": crypto_replace_ffx_fpe_config, + }, + "fields": reid_field_names, + } + ] + } + } + + # Convert the project id into a full resource id. + parent = f"projects/{project}" + + # Call the API. + response = dlp.reidentify_content( + request={ + "parent": parent, + "reidentify_config": reidentify_config, + "item": item, + }) + + # Print out results. + print("Table after re-identification: {}".format(response.item.table)) + + +# [END dlp_reidentify_table_fpe] + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + subparsers = parser.add_subparsers( + dest="content", help="Select how to submit content to the API." + ) + subparsers.required = True + + table_fpe_parser = subparsers.add_parser( + "deid_table_fpe", + help="Deidentify sensitive data in a string using Format Preserving " + "Encryption (FPE).", + ) + table_fpe_parser.add_argument( + "project", + help="The Google Cloud project id to use as a parent resource.", + ) + table_fpe_parser.add_argument( + "table_header", + help="List of strings representing table field names.", + ) + table_fpe_parser.add_argument( + "table_rows", + help="List of rows representing table data", + ) + table_fpe_parser.add_argument( + "deid_field_names", + help="A list of fields in table to de-identify.", + ) + table_fpe_parser.add_argument( + "key_name", + help="The name of the Cloud KMS key used to encrypt ('wrap') the " + "AES-256 key. Example: " + "key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/" + "keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'", + ) + table_fpe_parser.add_argument( + "wrapped_key", + help="The encrypted ('wrapped') AES-256 key to use. This key should " + "be encrypted using the Cloud KMS key specified by key_name.", + ) + table_fpe_parser.add_argument( + "-a", + "--alphabet", + default="ALPHA_NUMERIC", + help="The set of characters to replace sensitive ones with. Commonly " + 'used subsets of the alphabet include "NUMERIC", "HEXADECIMAL", ' + '"UPPER_CASE_ALPHA_NUMERIC", "ALPHA_NUMERIC", ' + '"FFX_COMMON_NATIVE_ALPHABET_UNSPECIFIED"', + ) + + reid_table_fpe_parser = subparsers.add_parser( + "reid_table_fpe", + help="Re-identify sensitive data in a table using Format Preserving " + "Encryption (FPE).", + ) + reid_table_fpe_parser.add_argument( + "project", + help="The Google Cloud project id to use as a parent resource.", + ) + reid_table_fpe_parser.add_argument( + "table_header", + help="List of strings representing table field names.", + ) + reid_table_fpe_parser.add_argument( + "table_rows", + help="List of rows representing table data", + ) + reid_table_fpe_parser.add_argument( + "reid_field_names", + help="A list of fields in table to re-identify.", + ) + reid_table_fpe_parser.add_argument( + "key_name", + help="The name of the Cloud KMS key used to encrypt ('wrap') the " + "AES-256 key. Example: " + "key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/" + "keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'", + ) + reid_table_fpe_parser.add_argument( + "wrapped_key", + help="The encrypted ('wrapped') AES-256 key to use. This key should " + "be encrypted using the Cloud KMS key specified by key_name.", + ) + reid_table_fpe_parser.add_argument( + "-a", + "--alphabet", + default="ALPHA_NUMERIC", + help="The set of characters to replace sensitive ones with. Commonly " + 'used subsets of the alphabet include "NUMERIC", "HEXADECIMAL", ' + '"UPPER_CASE_ALPHA_NUMERIC", "ALPHA_NUMERIC", ' + '"FFX_COMMON_NATIVE_ALPHABET_UNSPECIFIED"', + ) + + args = parser.parse_args() + + if args.content == "deid_table_fpe": + deidentify_table_with_fpe( + args.project, + args.table_header, + args.table_rows, + args.deid_field_names, + wrapped_key=base64.b64decode(args.wrapped_key), + key_name=args.key_name, + alphabet=args.alphabet, + ) + elif args.content == "reid_table_fpe": + reidentify_table_with_fpe( + args.project, + args.table_header, + args.table_rows, + args.reid_field_names, + wrapped_key=base64.b64decode(args.wrapped_key), + key_name=args.key_name, + alphabet=args.alphabet, + ) diff --git a/dlp/snippets/deid_table_test.py b/dlp/snippets/deid_table_test.py new file mode 100644 index 00000000000..3b708eaece6 --- /dev/null +++ b/dlp/snippets/deid_table_test.py @@ -0,0 +1,82 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import base64 +import os + +import pytest + +import deid_table + +GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") +UNWRAPPED_KEY = "YWJjZGVmZ2hpamtsbW5vcA==" +WRAPPED_KEY = ( + "CiQAz0hX4+go8fJwn80Fr8pVImwx+tmZdqU7JL+7TN/S5JxBU9gSSQDhFHpFVy" + "uzJps0YH9ls480mU+JLG7jI/0lL04i6XJRWqmI6gUSZRUtECYcLH5gXK4SXHlL" + "rotx7Chxz/4z7SIpXFOBY61z0/U=" +) +KEY_NAME = ( + f"projects/{GCLOUD_PROJECT}/locations/global/keyRings/" + "dlp-test/cryptoKeys/dlp-test" +) + + +def test_deidentify_and_reidentify_table_with_fpe(capsys: pytest.CaptureFixture) -> None: + table_data = { + "header": ["employee_id", "date", "compensation"], + "rows": [ + ["11111", "2015", "$10"], + ["22222", "2016", "$20"], + ["33333", "2016", "$15"], + ] + } + + deid_table.deidentify_table_with_fpe( + GCLOUD_PROJECT, + table_data["header"], + table_data["rows"], + ["employee_id"], + alphabet='NUMERIC', + wrapped_key=base64.b64decode(WRAPPED_KEY), + key_name=KEY_NAME, + ) + + out, _ = capsys.readouterr() + assert "11111" not in out + assert "22222" not in out + + response = out.split(":")[1:] + + deid_col_id = response.index(' "employee_id"\n}\nheaders {\n name') + total_columns = len(table_data['header']) + total_rows = len(table_data['rows'][0]) + + deid_emp_ids = [response[i].split("\n")[0][2:-1] for i in + range(deid_col_id + total_columns, len(response), total_columns)] + + for i in range(total_rows): + table_data['rows'][i][deid_col_id - 1] = deid_emp_ids[i] + + deid_table.reidentify_table_with_fpe( + GCLOUD_PROJECT, + table_data["header"], + table_data["rows"], + ["employee_id"], + alphabet='NUMERIC', + wrapped_key=base64.b64decode(WRAPPED_KEY), + key_name=KEY_NAME, + ) + + out, _ = capsys.readouterr() + assert "11111" in out + assert "22222" in out diff --git a/dlp/snippets/deid_test.py b/dlp/snippets/deid_test.py index fa2541cbf73..436a4450a41 100644 --- a/dlp/snippets/deid_test.py +++ b/dlp/snippets/deid_test.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import os import shutil import tempfile