diff --git a/samples/snippets/README.rst b/samples/snippets/README.rst
new file mode 100644
index 00000000..a996c69d
--- /dev/null
+++ b/samples/snippets/README.rst
@@ -0,0 +1,2 @@
+The DLP samples have moved to a new repository (https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/dlp/snippets) in PR: https://github.com/GoogleCloudPlatform/python-docs-samples/pull/9091
+Moving forward, all DLP samples will be added/ updated in the python-docs-samples repository.
\ No newline at end of file
diff --git a/samples/snippets/README.rst.in b/samples/snippets/README.rst.in
deleted file mode 100644
index 708e870f..00000000
--- a/samples/snippets/README.rst.in
+++ /dev/null
@@ -1,52 +0,0 @@
-# This file is used to generate README.rst
-
-product:
-  name: Google Data Loss Prevention
-  short_name: Data Loss Prevention
-  url: https://cloud.google.com/dlp/docs/
-  description: >
-    `Google Data Loss Prevention`_ provides programmatic access to a powerful
-    detection engine for personally identifiable information and other
-    privacy-sensitive data in unstructured data streams.
-
-setup:
-- auth
-- install_deps
-
-required_api_url: https://console.cloud.google.com/apis/library/dlp.googleapis.com
-
-required_roles:
-- DLP Administrator
-- DLP API Service Agent
-
-samples:
-- name: Quickstart
-  file: quickstart.py
-- name: Inspect Content
-  file: inspect_content.py
-  show_help: true
-- name: Redact Content
-  file: redact.py
-  show_help: true
-- name: Metadata
-  file: metadata.py
-  show_help: true
-- name: Jobs
-  file: jobs.py
-  show_help: true
-- name: Templates
-  file: templates.py
-  show_help: true
-- name: Triggers
-  file: triggers.py
-  show_help: true
-- name: Risk Analysis
-  file: risk.py
-  show_help: true
-- name: DeID
-  file: deid.py
-  show_help: true
-
-cloud_client_library: true
-
-folder: dlp
diff --git a/samples/snippets/custom_infotype.py b/samples/snippets/custom_infotype.py
deleted file mode 100644
index 9ecc993f..00000000
--- a/samples/snippets/custom_infotype.py
+++ /dev/null
@@ -1,873 +0,0 @@
-# Copyright 2020 Google Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Custom infoType snippets.
-
-This file contains sample code that uses the Data Loss Prevention API to create
-custom infoType detectors to refine scan results.
-"""
-
-
-# [START dlp_inspect_string_with_exclusion_dict]
-def inspect_string_with_exclusion_dict(
-    project, content_string, exclusion_list=["example@example.com"]
-):
-    """Inspects the provided text, avoiding matches specified in the exclusion list
-
-    Uses the Data Loss Prevention API to omit matches on EMAIL_ADDRESS if they are
-    in the specified exclusion list.
-
-    Args:
-        project: The Google Cloud project id to use as a parent resource.
-        content_string: The string to inspect.
-        exclusion_list: The list of strings to ignore matches on
-
-    Returns:
-        None; the response from the API is printed to the terminal.
-    """
-
-    # Import the client library.
-    import google.cloud.dlp
-
-    # Instantiate a client.
- dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Construct a list of infoTypes for DLP to locate in `content_string`. See - # https://cloud.google.com/dlp/docs/concepts-infotypes for more information - # about supported infoTypes. - info_types_to_locate = [{"name": "EMAIL_ADDRESS"}] - - # Construct a rule set that will only match on EMAIL_ADDRESS - # if the match text is not in the exclusion list. - rule_set = [ - { - "info_types": info_types_to_locate, - "rules": [ - { - "exclusion_rule": { - "dictionary": {"word_list": {"words": exclusion_list}}, - "matching_type": google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_FULL_MATCH, - } - } - ], - } - ] - - # Construct the configuration dictionary - inspect_config = { - "info_types": info_types_to_locate, - "rule_set": rule_set, - "include_quote": True, - } - - # Construct the `item`. - item = {"value": content_string} - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Call the API. - response = dlp.inspect_content( - request={"parent": parent, "inspect_config": inspect_config, "item": item} - ) - - # Print out the results. - if response.result.findings: - for finding in response.result.findings: - print(f"Quote: {finding.quote}") - print(f"Info type: {finding.info_type.name}") - print(f"Likelihood: {finding.likelihood}") - else: - print("No findings.") - - -# [END dlp_inspect_string_with_exclusion_dict] - - -# [START dlp_inspect_string_with_exclusion_regex] -def inspect_string_with_exclusion_regex( - project, content_string, exclusion_regex=".+@example.com" -): - """Inspects the provided text, avoiding matches specified in the exclusion regex - - Uses the Data Loss Prevention API to omit matches on EMAIL_ADDRESS if they match - the specified exclusion regex. - - Args: - project: The Google Cloud project id to use as a parent resource. - content_string: The string to inspect. - exclusion_regex: The regular expression to exclude matches on - - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library. - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Construct a list of infoTypes for DLP to locate in `content_string`. See - # https://cloud.google.com/dlp/docs/concepts-infotypes for more information - # about supported infoTypes. - info_types_to_locate = [{"name": "EMAIL_ADDRESS"}] - - # Construct a rule set that will only match on EMAIL_ADDRESS - # if the specified regex doesn't also match. - rule_set = [ - { - "info_types": info_types_to_locate, - "rules": [ - { - "exclusion_rule": { - "regex": {"pattern": exclusion_regex}, - "matching_type": google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_FULL_MATCH, - } - } - ], - } - ] - - # Construct the configuration dictionary - inspect_config = { - "info_types": info_types_to_locate, - "rule_set": rule_set, - "include_quote": True, - } - - # Construct the `item`. - item = {"value": content_string} - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Call the API. - response = dlp.inspect_content( - request={"parent": parent, "inspect_config": inspect_config, "item": item} - ) - - # Print out the results. 
- if response.result.findings: - for finding in response.result.findings: - print(f"Quote: {finding.quote}") - print(f"Info type: {finding.info_type.name}") - print(f"Likelihood: {finding.likelihood}") - else: - print("No findings.") - - -# [END dlp_inspect_string_with_exclusion_regex] - - -# [START dlp_inspect_string_with_exclusion_dict_substring] -def inspect_string_with_exclusion_dict_substring( - project, content_string, exclusion_list=["TEST"] -): - """Inspects the provided text, avoiding matches that contain excluded tokens - - Uses the Data Loss Prevention API to omit matches if they include tokens - in the specified exclusion list. - - Args: - project: The Google Cloud project id to use as a parent resource. - content_string: The string to inspect. - exclusion_list: The list of strings to ignore partial matches on - - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library. - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Construct a list of infoTypes for DLP to locate in `content_string`. See - # https://cloud.google.com/dlp/docs/concepts-infotypes for more information - # about supported infoTypes. - info_types_to_locate = [{"name": "EMAIL_ADDRESS"}, {"name": "DOMAIN_NAME"}] - - # Construct a rule set that will only match if the match text does not - # contains tokens from the exclusion list. - rule_set = [ - { - "info_types": info_types_to_locate, - "rules": [ - { - "exclusion_rule": { - "dictionary": {"word_list": {"words": exclusion_list}}, - "matching_type": google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_PARTIAL_MATCH, - } - } - ], - } - ] - - # Construct the configuration dictionary - inspect_config = { - "info_types": info_types_to_locate, - "rule_set": rule_set, - "include_quote": True, - } - - # Construct the `item`. - item = {"value": content_string} - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Call the API. - response = dlp.inspect_content( - request={"parent": parent, "inspect_config": inspect_config, "item": item} - ) - - # Print out the results. - if response.result.findings: - for finding in response.result.findings: - print(f"Quote: {finding.quote}") - print(f"Info type: {finding.info_type.name}") - print(f"Likelihood: {finding.likelihood}") - else: - print("No findings.") - - -# [END dlp_inspect_string_with_exclusion_dict_substring] - - -# [START dlp_inspect_string_custom_excluding_substring] -def inspect_string_custom_excluding_substring( - project, content_string, exclusion_list=["jimmy"] -): - """Inspects the provided text with a custom detector, avoiding matches on specific tokens - - Uses the Data Loss Prevention API to omit matches on a custom detector - if they include tokens in the specified exclusion list. - - Args: - project: The Google Cloud project id to use as a parent resource. - content_string: The string to inspect. - exclusion_list: The list of strings to ignore matches on - - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library. - import google.cloud.dlp - - # Instantiate a client. 
- dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Construct a custom regex detector for names - custom_info_types = [ - { - "info_type": {"name": "CUSTOM_NAME_DETECTOR"}, - "regex": {"pattern": "[A-Z][a-z]{1,15}, [A-Z][a-z]{1,15}"}, - } - ] - - # Construct a rule set that will only match if the match text does not - # contains tokens from the exclusion list. - rule_set = [ - { - "info_types": [{"name": "CUSTOM_NAME_DETECTOR"}], - "rules": [ - { - "exclusion_rule": { - "dictionary": {"word_list": {"words": exclusion_list}}, - "matching_type": google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_PARTIAL_MATCH, - } - } - ], - } - ] - - # Construct the configuration dictionary - inspect_config = { - "custom_info_types": custom_info_types, - "rule_set": rule_set, - "include_quote": True, - } - - # Construct the `item`. - item = {"value": content_string} - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Call the API. - response = dlp.inspect_content( - request={"parent": parent, "inspect_config": inspect_config, "item": item} - ) - - # Print out the results. - if response.result.findings: - for finding in response.result.findings: - print(f"Quote: {finding.quote}") - print(f"Info type: {finding.info_type.name}") - print(f"Likelihood: {finding.likelihood}") - else: - print("No findings.") - - -# [END dlp_inspect_string_custom_excluding_substring] - - -# [START dlp_inspect_string_custom_omit_overlap] -def inspect_string_custom_omit_overlap(project, content_string): - """Matches PERSON_NAME and a custom detector, - but if they overlap only matches the custom detector - - Uses the Data Loss Prevention API to omit matches on a built-in detector - if they overlap with matches from a custom detector - - Args: - project: The Google Cloud project id to use as a parent resource. - content_string: The string to inspect. - - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library. - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Construct a custom regex detector for names - custom_info_types = [ - { - "info_type": {"name": "VIP_DETECTOR"}, - "regex": {"pattern": "Larry Page|Sergey Brin"}, - "exclusion_type": google.cloud.dlp_v2.CustomInfoType.ExclusionType.EXCLUSION_TYPE_EXCLUDE, - } - ] - - # Construct a rule set that will exclude PERSON_NAME matches - # that overlap with VIP_DETECTOR matches - rule_set = [ - { - "info_types": [{"name": "PERSON_NAME"}], - "rules": [ - { - "exclusion_rule": { - "exclude_info_types": { - "info_types": [{"name": "VIP_DETECTOR"}] - }, - "matching_type": google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_FULL_MATCH, - } - } - ], - } - ] - - # Construct the configuration dictionary - inspect_config = { - "info_types": [{"name": "PERSON_NAME"}], - "custom_info_types": custom_info_types, - "rule_set": rule_set, - "include_quote": True, - } - - # Construct the `item`. - item = {"value": content_string} - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Call the API. - response = dlp.inspect_content( - request={"parent": parent, "inspect_config": inspect_config, "item": item} - ) - - # Print out the results. 
- if response.result.findings: - for finding in response.result.findings: - print(f"Quote: {finding.quote}") - print(f"Info type: {finding.info_type.name}") - print(f"Likelihood: {finding.likelihood}") - else: - print("No findings.") - - -# [END dlp_inspect_string_custom_omit_overlap] - - -# [START dlp_omit_name_if_also_email] -def omit_name_if_also_email( - project, - content_string, -): - """Matches PERSON_NAME and EMAIL_ADDRESS, but not both. - - Uses the Data Loss Prevention API omit matches on PERSON_NAME if the - EMAIL_ADDRESS detector also matches. - - Args: - project: The Google Cloud project id to use as a parent resource. - content_string: The string to inspect. - - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library. - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Construct a list of infoTypes for DLP to locate in `content_string`. See - # https://cloud.google.com/dlp/docs/concepts-infotypes for more information - # about supported infoTypes. - info_types_to_locate = [{"name": "PERSON_NAME"}, {"name": "EMAIL_ADDRESS"}] - - # Construct the configuration dictionary that will only match on PERSON_NAME - # if the EMAIL_ADDRESS doesn't also match. This configuration helps reduce - # the total number of findings when there is a large overlap between different - # infoTypes. - inspect_config = { - "info_types": info_types_to_locate, - "rule_set": [ - { - "info_types": [{"name": "PERSON_NAME"}], - "rules": [ - { - "exclusion_rule": { - "exclude_info_types": { - "info_types": [{"name": "EMAIL_ADDRESS"}] - }, - "matching_type": google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_PARTIAL_MATCH, - } - } - ], - } - ], - "include_quote": True, - } - - # Construct the `item`. - item = {"value": content_string} - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Call the API. - response = dlp.inspect_content( - request={"parent": parent, "inspect_config": inspect_config, "item": item} - ) - - # Print out the results. - if response.result.findings: - for finding in response.result.findings: - print(f"Quote: {finding.quote}") - print(f"Info type: {finding.info_type.name}") - print(f"Likelihood: {finding.likelihood}") - else: - print("No findings.") - - -# [END dlp_omit_name_if_also_email] - - -# [START dlp_inspect_string_without_overlap] -def inspect_string_without_overlap(project, content_string): - """Matches EMAIL_ADDRESS and DOMAIN_NAME, but DOMAIN_NAME is omitted - if it overlaps with EMAIL_ADDRESS - - Uses the Data Loss Prevention API to omit matches of one infotype - that overlap with another. - - Args: - project: The Google Cloud project id to use as a parent resource. - content_string: The string to inspect. - - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library. - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Construct a list of infoTypes for DLP to locate in `content_string`. See - # https://cloud.google.com/dlp/docs/concepts-infotypes for more information - # about supported infoTypes. 
- info_types_to_locate = [{"name": "DOMAIN_NAME"}, {"name": "EMAIL_ADDRESS"}] - - # Define a custom info type to exclude email addresses - custom_info_types = [ - { - "info_type": {"name": "EMAIL_ADDRESS"}, - "exclusion_type": google.cloud.dlp_v2.CustomInfoType.ExclusionType.EXCLUSION_TYPE_EXCLUDE, - } - ] - - # Construct a rule set that will exclude DOMAIN_NAME matches - # that overlap with EMAIL_ADDRESS matches - rule_set = [ - { - "info_types": [{"name": "DOMAIN_NAME"}], - "rules": [ - { - "exclusion_rule": { - "exclude_info_types": { - "info_types": [{"name": "EMAIL_ADDRESS"}] - }, - "matching_type": google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_PARTIAL_MATCH, - } - } - ], - } - ] - - # Construct the configuration dictionary - inspect_config = { - "info_types": info_types_to_locate, - "custom_info_types": custom_info_types, - "rule_set": rule_set, - "include_quote": True, - } - - # Construct the `item`. - item = {"value": content_string} - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Call the API. - response = dlp.inspect_content( - request={"parent": parent, "inspect_config": inspect_config, "item": item} - ) - - # Print out the results. - if response.result.findings: - for finding in response.result.findings: - print(f"Quote: {finding.quote}") - print(f"Info type: {finding.info_type.name}") - print(f"Likelihood: {finding.likelihood}") - else: - print("No findings.") - - -# [END dlp_inspect_string_without_overlap] - - -# [START inspect_with_person_name_w_custom_hotword] -def inspect_with_person_name_w_custom_hotword( - project, content_string, custom_hotword="patient" -): - """Uses the Data Loss Prevention API increase likelihood for matches on - PERSON_NAME if the user specified custom hotword is present. Only - includes findings with the increased likelihood by setting a minimum - likelihood threshold of VERY_LIKELY. - - Args: - project: The Google Cloud project id to use as a parent resource. - content_string: The string to inspect. - custom_hotword: The custom hotword used for likelihood boosting. - - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library. - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Construct a rule set with caller provided hotword, with a likelihood - # boost to VERY_LIKELY when the hotword are present within the 50 character- - # window preceding the PII finding. - hotword_rule = { - "hotword_regex": {"pattern": custom_hotword}, - "likelihood_adjustment": { - "fixed_likelihood": google.cloud.dlp_v2.Likelihood.VERY_LIKELY - }, - "proximity": {"window_before": 50}, - } - - rule_set = [ - { - "info_types": [{"name": "PERSON_NAME"}], - "rules": [{"hotword_rule": hotword_rule}], - } - ] - - # Construct the configuration dictionary with the custom regex info type. - inspect_config = { - "rule_set": rule_set, - "min_likelihood": google.cloud.dlp_v2.Likelihood.VERY_LIKELY, - "include_quote": True, - } - - # Construct the `item`. - item = {"value": content_string} - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Call the API. - response = dlp.inspect_content( - request={"parent": parent, "inspect_config": inspect_config, "item": item} - ) - - # Print out the results. 
- if response.result.findings: - for finding in response.result.findings: - print(f"Quote: {finding.quote}") - print(f"Info type: {finding.info_type.name}") - print(f"Likelihood: {finding.likelihood}") - else: - print("No findings.") - - -# [END inspect_with_person_name_w_custom_hotword] - - -# [START dlp_inspect_string_multiple_rules] -def inspect_string_multiple_rules(project, content_string): - """Uses the Data Loss Prevention API to modify likelihood for matches on - PERSON_NAME combining multiple hotword and exclusion rules. - - Args: - project: The Google Cloud project id to use as a parent resource. - content_string: The string to inspect. - - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library. - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Construct hotword rules - patient_rule = { - "hotword_regex": {"pattern": "patient"}, - "proximity": {"window_before": 10}, - "likelihood_adjustment": { - "fixed_likelihood": google.cloud.dlp_v2.Likelihood.VERY_LIKELY - }, - } - doctor_rule = { - "hotword_regex": {"pattern": "doctor"}, - "proximity": {"window_before": 10}, - "likelihood_adjustment": { - "fixed_likelihood": google.cloud.dlp_v2.Likelihood.UNLIKELY - }, - } - - # Construct exclusion rules - quasimodo_rule = { - "dictionary": {"word_list": {"words": ["quasimodo"]}}, - "matching_type": google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_PARTIAL_MATCH, - } - redacted_rule = { - "regex": {"pattern": "REDACTED"}, - "matching_type": google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_PARTIAL_MATCH, - } - - # Construct the rule set, combining the above rules - rule_set = [ - { - "info_types": [{"name": "PERSON_NAME"}], - "rules": [ - {"hotword_rule": patient_rule}, - {"hotword_rule": doctor_rule}, - {"exclusion_rule": quasimodo_rule}, - {"exclusion_rule": redacted_rule}, - ], - } - ] - - # Construct the configuration dictionary - inspect_config = { - "info_types": [{"name": "PERSON_NAME"}], - "rule_set": rule_set, - "include_quote": True, - } - - # Construct the `item`. - item = {"value": content_string} - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Call the API. - response = dlp.inspect_content( - request={"parent": parent, "inspect_config": inspect_config, "item": item} - ) - - # Print out the results. - if response.result.findings: - for finding in response.result.findings: - print(f"Quote: {finding.quote}") - print(f"Info type: {finding.info_type.name}") - print(f"Likelihood: {finding.likelihood}") - else: - print("No findings.") - - -# [END dlp_inspect_string_multiple_rules] - - -# [START dlp_inspect_with_medical_record_number_custom_regex_detector] -def inspect_with_medical_record_number_custom_regex_detector( - project, - content_string, -): - """Uses the Data Loss Prevention API to analyze string with medical record - number custom regex detector - - Args: - project: The Google Cloud project id to use as a parent resource. - content_string: The string to inspect. - - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library. - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Construct a custom regex detector info type called "C_MRN", - # with ###-#-##### pattern, where each # represents a digit from 1 to 9. - # The detector has a detection likelihood of POSSIBLE. 
- custom_info_types = [ - { - "info_type": {"name": "C_MRN"}, - "regex": {"pattern": "[1-9]{3}-[1-9]{1}-[1-9]{5}"}, - "likelihood": google.cloud.dlp_v2.Likelihood.POSSIBLE, - } - ] - - # Construct the configuration dictionary with the custom regex info type. - inspect_config = { - "custom_info_types": custom_info_types, - "include_quote": True, - } - - # Construct the `item`. - item = {"value": content_string} - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Call the API. - response = dlp.inspect_content( - request={"parent": parent, "inspect_config": inspect_config, "item": item} - ) - - # Print out the results. - if response.result.findings: - for finding in response.result.findings: - print(f"Quote: {finding.quote}") - print(f"Info type: {finding.info_type.name}") - print(f"Likelihood: {finding.likelihood}") - else: - print("No findings.") - - -# [END dlp_inspect_with_medical_record_number_custom_regex_detector] - - -# [START dlp_inspect_with_medical_record_number_w_custom_hotwords] -def inspect_with_medical_record_number_w_custom_hotwords( - project, - content_string, -): - """Uses the Data Loss Prevention API to analyze string with medical record - number custom regex detector, with custom hotwords rules to boost finding - certainty under some circumstances. - - Args: - project: The Google Cloud project id to use as a parent resource. - content_string: The string to inspect. - - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library. - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Construct a custom regex detector info type called "C_MRN", - # with ###-#-##### pattern, where each # represents a digit from 1 to 9. - # The detector has a detection likelihood of POSSIBLE. - custom_info_types = [ - { - "info_type": {"name": "C_MRN"}, - "regex": {"pattern": "[1-9]{3}-[1-9]{1}-[1-9]{5}"}, - "likelihood": google.cloud.dlp_v2.Likelihood.POSSIBLE, - } - ] - - # Construct a rule set with hotwords "mrn" and "medical", with a likelohood - # boost to VERY_LIKELY when hotwords are present within the 10 character- - # window preceding the PII finding. - hotword_rule = { - "hotword_regex": {"pattern": "(?i)(mrn|medical)(?-i)"}, - "likelihood_adjustment": { - "fixed_likelihood": google.cloud.dlp_v2.Likelihood.VERY_LIKELY - }, - "proximity": {"window_before": 10}, - } - - rule_set = [ - {"info_types": [{"name": "C_MRN"}], "rules": [{"hotword_rule": hotword_rule}]} - ] - - # Construct the configuration dictionary with the custom regex info type. - inspect_config = { - "custom_info_types": custom_info_types, - "rule_set": rule_set, - "include_quote": True, - } - - # Construct the `item`. - item = {"value": content_string} - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Call the API. - response = dlp.inspect_content( - request={"parent": parent, "inspect_config": inspect_config, "item": item} - ) - - # Print out the results. 
- if response.result.findings: - for finding in response.result.findings: - print(f"Quote: {finding.quote}") - print(f"Info type: {finding.info_type.name}") - print(f"Likelihood: {finding.likelihood}") - else: - print("No findings.") - - -# [END dlp_inspect_with_medical_record_number_w_custom_hotwords] diff --git a/samples/snippets/custom_infotype_test.py b/samples/snippets/custom_infotype_test.py deleted file mode 100644 index b6d12245..00000000 --- a/samples/snippets/custom_infotype_test.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright 2020 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the 'License'); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import custom_infotype - -GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") - - -def test_inspect_string_with_exclusion_dict(capsys): - custom_infotype.inspect_string_with_exclusion_dict( - GCLOUD_PROJECT, "gary@example.com, example@example.com", ["example@example.com"] - ) - - out, _ = capsys.readouterr() - assert "example@example.com" not in out - assert "gary@example.com" in out - - -def test_inspect_string_with_exclusion_regex(capsys): - custom_infotype.inspect_string_with_exclusion_regex( - GCLOUD_PROJECT, "alice@example.com, ironman@avengers.net", ".+@example.com" - ) - - out, _ = capsys.readouterr() - assert "alice" not in out - assert "ironman" in out - - -def test_inspect_string_with_exclusion_dict_substring(capsys): - custom_infotype.inspect_string_with_exclusion_dict_substring( - GCLOUD_PROJECT, "bob@example.com TEST@example.com TEST.com", ["TEST"] - ) - - out, _ = capsys.readouterr() - assert "TEST@example.com" not in out - assert "TEST.com" not in out - assert "bob@example.com" in out - - -def test_inspect_string_custom_excluding_substring(capsys): - custom_infotype.inspect_string_custom_excluding_substring( - GCLOUD_PROJECT, "Danger, Jimmy | Wayne, Bruce", ["Jimmy"] - ) - - out, _ = capsys.readouterr() - assert "Wayne, Bruce" in out - assert "Danger, Jimmy" not in out - - -def test_inspect_string_custom_omit_overlap(capsys): - custom_infotype.inspect_string_custom_omit_overlap( - GCLOUD_PROJECT, "Larry Page and John Doe" - ) - - out, _ = capsys.readouterr() - assert "Larry Page" not in out - assert "John Doe" in out - - -def test_omit_name_if_also_email(capsys): - custom_infotype.omit_name_if_also_email(GCLOUD_PROJECT, "alice@example.com") - - # Ensure we found only EMAIL_ADDRESS, and not PERSON_NAME. - out, _ = capsys.readouterr() - assert "Info type: EMAIL_ADDRESS" in out - assert "Info type: PERSON_NAME" not in out - - -def test_inspect_string_without_overlap(capsys): - custom_infotype.inspect_string_without_overlap( - GCLOUD_PROJECT, "example.com is a domain, james@example.org is an email." 
- ) - - out, _ = capsys.readouterr() - assert "example.com" in out - assert "example.org" not in out - - -def test_inspect_with_person_name_w_custom_hotword(capsys): - custom_infotype.inspect_with_person_name_w_custom_hotword( - GCLOUD_PROJECT, "patient's name is John Doe.", "patient" - ) - - out, _ = capsys.readouterr() - assert "Info type: PERSON_NAME" in out - assert "Likelihood: 5" in out - - -def test_inspect_string_multiple_rules_patient(capsys): - custom_infotype.inspect_string_multiple_rules( - GCLOUD_PROJECT, "patient name: Jane Doe" - ) - - out, _ = capsys.readouterr() - assert "Likelihood: 4" in out - - -def test_inspect_string_multiple_rules_doctor(capsys): - custom_infotype.inspect_string_multiple_rules(GCLOUD_PROJECT, "doctor: Jane Doe") - - out, _ = capsys.readouterr() - assert "No findings" in out - - -def test_inspect_string_multiple_rules_quasimodo(capsys): - custom_infotype.inspect_string_multiple_rules( - GCLOUD_PROJECT, "patient name: quasimodo" - ) - - out, _ = capsys.readouterr() - assert "No findings" in out - - -def test_inspect_string_multiple_rules_redacted(capsys): - custom_infotype.inspect_string_multiple_rules( - GCLOUD_PROJECT, "name of patient: REDACTED" - ) - - out, _ = capsys.readouterr() - assert "No findings" in out - - -def test_inspect_with_medical_record_number_custom_regex_detector(capsys): - custom_infotype.inspect_with_medical_record_number_custom_regex_detector( - GCLOUD_PROJECT, "Patients MRN 444-5-22222" - ) - - out, _ = capsys.readouterr() - assert "Info type: C_MRN" in out - - -def test_inspect_with_medical_record_number_w_custom_hotwords_no_hotwords(capsys): - custom_infotype.inspect_with_medical_record_number_w_custom_hotwords( - GCLOUD_PROJECT, "just a number 444-5-22222" - ) - - out, _ = capsys.readouterr() - assert "Info type: C_MRN" in out - assert "Likelihood: 3" in out - - -def test_inspect_with_medical_record_number_w_custom_hotwords_has_hotwords(capsys): - custom_infotype.inspect_with_medical_record_number_w_custom_hotwords( - GCLOUD_PROJECT, "Patients MRN 444-5-22222" - ) - - out, _ = capsys.readouterr() - assert "Info type: C_MRN" in out - assert "Likelihood: 5" in out diff --git a/samples/snippets/deid.py b/samples/snippets/deid.py deleted file mode 100644 index 0f745e0a..00000000 --- a/samples/snippets/deid.py +++ /dev/null @@ -1,1228 +0,0 @@ -# Copyright 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Uses of the Data Loss Prevention API for deidentifying sensitive data.""" - -from __future__ import print_function - -import argparse - - -# [START dlp_deidentify_masking] -def deidentify_with_mask( - project, input_str, info_types, masking_character=None, number_to_mask=0 -): - """Uses the Data Loss Prevention API to deidentify sensitive data in a - string by masking it with a character. - Args: - project: The Google Cloud project id to use as a parent resource. - input_str: The string to deidentify (will be treated as text). - masking_character: The character to mask matching sensitive data with. 
- number_to_mask: The maximum number of sensitive characters to mask in - a match. If omitted or set to zero, the API will default to no - maximum. - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library - import google.cloud.dlp - - # Instantiate a client - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Construct inspect configuration dictionary - inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]} - - # Construct deidentify configuration dictionary - deidentify_config = { - "info_type_transformations": { - "transformations": [ - { - "primitive_transformation": { - "character_mask_config": { - "masking_character": masking_character, - "number_to_mask": number_to_mask, - } - } - } - ] - } - } - - # Construct item - item = {"value": input_str} - - # Call the API - response = dlp.deidentify_content( - request={ - "parent": parent, - "deidentify_config": deidentify_config, - "inspect_config": inspect_config, - "item": item, - } - ) - - # Print out the results. - print(response.item.value) - - -# [END dlp_deidentify_masking] - -# [START dlp_deidentify_redact] -def deidentify_with_redact( - project, - input_str, - info_types, -): - """Uses the Data Loss Prevention API to deidentify sensitive data in a - string by redacting matched input values. - Args: - project: The Google Cloud project id to use as a parent resource. - input_str: The string to deidentify (will be treated as text). - info_types: A list of strings representing info types to look for. - Returns: - None; the response from the API is printed to the terminal. - """ - import google.cloud.dlp - - # Instantiate a client - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Construct inspect configuration dictionary - inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]} - - # Construct deidentify configuration dictionary - deidentify_config = { - "info_type_transformations": { - "transformations": [{"primitive_transformation": {"redact_config": {}}}] - } - } - - # Construct item - item = {"value": input_str} - - # Call the API - response = dlp.deidentify_content( - request={ - "parent": parent, - "deidentify_config": deidentify_config, - "inspect_config": inspect_config, - "item": item, - } - ) - - # Print out the results. - print(response.item.value) - - -# [END dlp_deidentify_redact] - -# [START dlp_deidentify_replace] -def deidentify_with_replace( - project, - input_str, - info_types, - replacement_str="REPLACEMENT_STR", -): - """Uses the Data Loss Prevention API to deidentify sensitive data in a - string by replacing matched input values with a value you specify. - Args: - project: The Google Cloud project id to use as a parent resource. - input_str: The string to deidentify (will be treated as text). - info_types: A list of strings representing info types to look for. - replacement_str: The string to replace all values that match given - info types. - Returns: - None; the response from the API is printed to the terminal. - """ - import google.cloud.dlp - - # Instantiate a client - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Convert the project id into a full resource id. 
- parent = f"projects/{project}" - - # Construct inspect configuration dictionary - inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]} - - # Construct deidentify configuration dictionary - deidentify_config = { - "info_type_transformations": { - "transformations": [ - { - "primitive_transformation": { - "replace_config": { - "new_value": {"string_value": replacement_str} - } - } - } - ] - } - } - - # Construct item - item = {"value": input_str} - - # Call the API - response = dlp.deidentify_content( - request={ - "parent": parent, - "deidentify_config": deidentify_config, - "inspect_config": inspect_config, - "item": item, - } - ) - - # Print out the results. - print(response.item.value) - - -# [END dlp_deidentify_replace] - -# [START dlp_deidentify_fpe] - - -def deidentify_with_fpe( - project, - input_str, - info_types, - alphabet=None, - surrogate_type=None, - key_name=None, - wrapped_key=None, -): - """Uses the Data Loss Prevention API to deidentify sensitive data in a - string using Format Preserving Encryption (FPE). - Args: - project: The Google Cloud project id to use as a parent resource. - input_str: The string to deidentify (will be treated as text). - alphabet: The set of characters to replace sensitive ones with. For - more information, see https://cloud.google.com/dlp/docs/reference/ - rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet - surrogate_type: The name of the surrogate custom info type to use. Only - necessary if you want to reverse the deidentification process. Can - be essentially any arbitrary string, as long as it doesn't appear - in your dataset otherwise. - key_name: The name of the Cloud KMS key used to encrypt ('wrap') the - AES-256 key. Example: - key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/ - keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME' - wrapped_key: The encrypted ('wrapped') AES-256 key to use. This key - should be encrypted using the Cloud KMS key specified by key_name. - Returns: - None; the response from the API is printed to the terminal. - """ - # Import the client library - import google.cloud.dlp - - # Instantiate a client - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # The wrapped key is base64-encoded, but the library expects a binary - # string, so decode it here. 
- import base64 - - wrapped_key = base64.b64decode(wrapped_key) - - # Construct FPE configuration dictionary - crypto_replace_ffx_fpe_config = { - "crypto_key": { - "kms_wrapped": {"wrapped_key": wrapped_key, "crypto_key_name": key_name} - }, - "common_alphabet": alphabet, - } - - # Add surrogate type - if surrogate_type: - crypto_replace_ffx_fpe_config["surrogate_info_type"] = {"name": surrogate_type} - - # Construct inspect configuration dictionary - inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]} - - # Construct deidentify configuration dictionary - deidentify_config = { - "info_type_transformations": { - "transformations": [ - { - "primitive_transformation": { - "crypto_replace_ffx_fpe_config": crypto_replace_ffx_fpe_config - } - } - ] - } - } - - # Convert string to item - item = {"value": input_str} - - # Call the API - response = dlp.deidentify_content( - request={ - "parent": parent, - "deidentify_config": deidentify_config, - "inspect_config": inspect_config, - "item": item, - } - ) - - # Print results - print(response.item.value) - - -# [END dlp_deidentify_fpe] - -# [START dlp_deidentify_deterministic] -def deidentify_with_deterministic( - project, - input_str, - info_types, - surrogate_type=None, - key_name=None, - wrapped_key=None, -): - """Deidentifies sensitive data in a string using deterministic encryption. - Args: - project: The Google Cloud project id to use as a parent resource. - input_str: The string to deidentify (will be treated as text). - surrogate_type: The name of the surrogate custom info type to use. Only - necessary if you want to reverse the deidentification process. Can - be essentially any arbitrary string, as long as it doesn't appear - in your dataset otherwise. - key_name: The name of the Cloud KMS key used to encrypt ('wrap') the - AES-256 key. Example: - key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/ - keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME' - wrapped_key: The encrypted ('wrapped') AES-256 key to use. This key - should be encrypted using the Cloud KMS key specified by key_name. - Returns: - None; the response from the API is printed to the terminal. - """ - import base64 - - # Import the client library - import google.cloud.dlp - - # Instantiate a client - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # The wrapped key is base64-encoded, but the library expects a binary - # string, so decode it here. 
- wrapped_key = base64.b64decode(wrapped_key) - - # Construct Deterministic encryption configuration dictionary - crypto_replace_deterministic_config = { - "crypto_key": { - "kms_wrapped": {"wrapped_key": wrapped_key, "crypto_key_name": key_name} - }, - } - - # Add surrogate type - if surrogate_type: - crypto_replace_deterministic_config["surrogate_info_type"] = { - "name": surrogate_type - } - - # Construct inspect configuration dictionary - inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]} - - # Construct deidentify configuration dictionary - deidentify_config = { - "info_type_transformations": { - "transformations": [ - { - "primitive_transformation": { - "crypto_deterministic_config": crypto_replace_deterministic_config - } - } - ] - } - } - - # Convert string to item - item = {"value": input_str} - - # Call the API - response = dlp.deidentify_content( - request={ - "parent": parent, - "deidentify_config": deidentify_config, - "inspect_config": inspect_config, - "item": item, - } - ) - - # Print results - print(response.item.value) - - -# [END dlp_deidentify_deterministic] - - -# [START dlp_reidentify_fpe] -def reidentify_with_fpe( - project, - input_str, - alphabet=None, - surrogate_type=None, - key_name=None, - wrapped_key=None, -): - """Uses the Data Loss Prevention API to reidentify sensitive data in a - string that was encrypted by Format Preserving Encryption (FPE). - Args: - project: The Google Cloud project id to use as a parent resource. - input_str: The string to deidentify (will be treated as text). - alphabet: The set of characters to replace sensitive ones with. For - more information, see https://cloud.google.com/dlp/docs/reference/ - rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet - surrogate_type: The name of the surrogate custom info type to used - during the encryption process. - key_name: The name of the Cloud KMS key used to encrypt ('wrap') the - AES-256 key. Example: - keyName = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/ - keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME' - wrapped_key: The encrypted ('wrapped') AES-256 key to use. This key - should be encrypted using the Cloud KMS key specified by key_name. - Returns: - None; the response from the API is printed to the terminal. - """ - # Import the client library - import google.cloud.dlp - - # Instantiate a client - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # The wrapped key is base64-encoded, but the library expects a binary - # string, so decode it here. 
- import base64 - - wrapped_key = base64.b64decode(wrapped_key) - - # Construct Deidentify Config - reidentify_config = { - "info_type_transformations": { - "transformations": [ - { - "primitive_transformation": { - "crypto_replace_ffx_fpe_config": { - "crypto_key": { - "kms_wrapped": { - "wrapped_key": wrapped_key, - "crypto_key_name": key_name, - } - }, - "common_alphabet": alphabet, - "surrogate_info_type": {"name": surrogate_type}, - } - } - } - ] - } - } - - inspect_config = { - "custom_info_types": [ - {"info_type": {"name": surrogate_type}, "surrogate_type": {}} - ] - } - - # Convert string to item - item = {"value": input_str} - - # Call the API - response = dlp.reidentify_content( - request={ - "parent": parent, - "reidentify_config": reidentify_config, - "inspect_config": inspect_config, - "item": item, - } - ) - - # Print results - print(response.item.value) - - -# [END dlp_reidentify_fpe] - - -# [START dlp_reidentify_deterministic] -def reidentify_with_deterministic( - project, - input_str, - surrogate_type=None, - key_name=None, - wrapped_key=None, -): - """Re-identifies content that was previously de-identified through deterministic encryption. - Args: - project: The Google Cloud project ID to use as a parent resource. - input_str: The string to be re-identified. Provide the entire token. Example: - EMAIL_ADDRESS_TOKEN(52):AVAx2eIEnIQP5jbNEr2j9wLOAd5m4kpSBR/0jjjGdAOmryzZbE/q - surrogate_type: The name of the surrogate custom infoType used - during the encryption process. - key_name: The name of the Cloud KMS key used to encrypt ("wrap") the - AES-256 key. Example: - keyName = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/ - keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME' - wrapped_key: The encrypted ("wrapped") AES-256 key previously used to encrypt the content. - This key must have been encrypted using the Cloud KMS key specified by key_name. - Returns: - None; the response from the API is printed to the terminal. - """ - import base64 - - # Import the client library - import google.cloud.dlp - - # Instantiate a client - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # The wrapped key is base64-encoded, but the library expects a binary - # string, so decode it here. 
- wrapped_key = base64.b64decode(wrapped_key) - - # Construct reidentify Configuration - reidentify_config = { - "info_type_transformations": { - "transformations": [ - { - "primitive_transformation": { - "crypto_deterministic_config": { - "crypto_key": { - "kms_wrapped": { - "wrapped_key": wrapped_key, - "crypto_key_name": key_name, - } - }, - "surrogate_info_type": {"name": surrogate_type}, - } - } - } - ] - } - } - - inspect_config = { - "custom_info_types": [ - {"info_type": {"name": surrogate_type}, "surrogate_type": {}} - ] - } - - # Convert string to item - item = {"value": input_str} - - # Call the API - response = dlp.reidentify_content( - request={ - "parent": parent, - "reidentify_config": reidentify_config, - "inspect_config": inspect_config, - "item": item, - } - ) - - # Print results - print(response.item.value) - - -# [END dlp_reidentify_deterministic] - - -# [START dlp_deidentify_free_text_with_fpe_using_surrogate] -def deidentify_free_text_with_fpe_using_surrogate( - project, - input_str, - alphabet="NUMERIC", - info_type="PHONE_NUMBER", - surrogate_type="PHONE_TOKEN", - unwrapped_key="YWJjZGVmZ2hpamtsbW5vcA==", -): - """Uses the Data Loss Prevention API to deidentify sensitive data in a - string using Format Preserving Encryption (FPE). - The encryption is performed with an unwrapped key. - Args: - project: The Google Cloud project id to use as a parent resource. - input_str: The string to deidentify (will be treated as text). - alphabet: The set of characters to replace sensitive ones with. For - more information, see https://cloud.google.com/dlp/docs/reference/ - rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet - info_type: The name of the info type to de-identify - surrogate_type: The name of the surrogate custom info type to use. Can - be essentially any arbitrary string, as long as it doesn't appear - in your dataset otherwise. - unwrapped_key: The base64-encoded AES-256 key to use. - Returns: - None; the response from the API is printed to the terminal. - """ - # Import the client library - import google.cloud.dlp - - # Instantiate a client - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # The unwrapped key is base64-encoded, but the library expects a binary - # string, so decode it here. 
- import base64 - - unwrapped_key = base64.b64decode(unwrapped_key) - - # Construct de-identify config - transformation = { - "info_types": [{"name": info_type}], - "primitive_transformation": { - "crypto_replace_ffx_fpe_config": { - "crypto_key": {"unwrapped": {"key": unwrapped_key}}, - "common_alphabet": alphabet, - "surrogate_info_type": {"name": surrogate_type}, - } - }, - } - - deidentify_config = { - "info_type_transformations": {"transformations": [transformation]} - } - - # Construct the inspect config, trying to finding all PII with likelihood - # higher than UNLIKELY - inspect_config = { - "info_types": [{"name": info_type}], - "min_likelihood": google.cloud.dlp_v2.Likelihood.UNLIKELY, - } - - # Convert string to item - item = {"value": input_str} - - # Call the API - response = dlp.deidentify_content( - request={ - "parent": parent, - "deidentify_config": deidentify_config, - "inspect_config": inspect_config, - "item": item, - } - ) - - # Print results - print(response.item.value) - - -# [END dlp_deidentify_free_text_with_fpe_using_surrogate] - - -# [START dlp_reidentify_free_text_with_fpe_using_surrogate] -def reidentify_free_text_with_fpe_using_surrogate( - project, - input_str, - alphabet="NUMERIC", - surrogate_type="PHONE_TOKEN", - unwrapped_key="YWJjZGVmZ2hpamtsbW5vcA==", -): - """Uses the Data Loss Prevention API to reidentify sensitive data in a - string that was encrypted by Format Preserving Encryption (FPE) with - surrogate type. The encryption is performed with an unwrapped key. - Args: - project: The Google Cloud project id to use as a parent resource. - input_str: The string to deidentify (will be treated as text). - alphabet: The set of characters to replace sensitive ones with. For - more information, see https://cloud.google.com/dlp/docs/reference/ - rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet - surrogate_type: The name of the surrogate custom info type to used - during the encryption process. - unwrapped_key: The base64-encoded AES-256 key to use. - Returns: - None; the response from the API is printed to the terminal. - """ - # Import the client library - import google.cloud.dlp - - # Instantiate a client - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # The unwrapped key is base64-encoded, but the library expects a binary - # string, so decode it here. 
- import base64 - - unwrapped_key = base64.b64decode(unwrapped_key) - - # Construct Deidentify Config - transformation = { - "primitive_transformation": { - "crypto_replace_ffx_fpe_config": { - "crypto_key": {"unwrapped": {"key": unwrapped_key}}, - "common_alphabet": alphabet, - "surrogate_info_type": {"name": surrogate_type}, - } - } - } - - reidentify_config = { - "info_type_transformations": {"transformations": [transformation]} - } - - inspect_config = { - "custom_info_types": [ - {"info_type": {"name": surrogate_type}, "surrogate_type": {}} - ] - } - - # Convert string to item - item = {"value": input_str} - - # Call the API - response = dlp.reidentify_content( - request={ - "parent": parent, - "reidentify_config": reidentify_config, - "inspect_config": inspect_config, - "item": item, - } - ) - - # Print results - print(response.item.value) - - -# [END dlp_reidentify_free_text_with_fpe_using_surrogate] - - -# [START dlp_deidentify_date_shift] -def deidentify_with_date_shift( - project, - input_csv_file=None, - output_csv_file=None, - date_fields=None, - lower_bound_days=None, - upper_bound_days=None, - context_field_id=None, - wrapped_key=None, - key_name=None, -): - """Uses the Data Loss Prevention API to deidentify dates in a CSV file by - pseudorandomly shifting them. - Args: - project: The Google Cloud project id to use as a parent resource. - input_csv_file: The path to the CSV file to deidentify. The first row - of the file must specify column names, and all other rows must - contain valid values. - output_csv_file: The path to save the date-shifted CSV file. - date_fields: The list of (date) fields in the CSV file to date shift. - Example: ['birth_date', 'register_date'] - lower_bound_days: The maximum number of days to shift a date backward - upper_bound_days: The maximum number of days to shift a date forward - context_field_id: (Optional) The column to determine date shift amount - based on. If this is not specified, a random shift amount will be - used for every row. If this is specified, then 'wrappedKey' and - 'keyName' must also be set. Example: - contextFieldId = [{ 'name': 'user_id' }] - key_name: (Optional) The name of the Cloud KMS key used to encrypt - ('wrap') the AES-256 key. Example: - key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/ - keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME' - wrapped_key: (Optional) The encrypted ('wrapped') AES-256 key to use. - This key should be encrypted using the Cloud KMS key specified by - key_name. - Returns: - None; the response from the API is printed to the terminal. - """ - # Import the client library - import google.cloud.dlp - - # Instantiate a client - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Convert the project id into a full resource id. 
- parent = f"projects/{project}" - - # Convert date field list to Protobuf type - def map_fields(field): - return {"name": field} - - if date_fields: - date_fields = map(map_fields, date_fields) - else: - date_fields = [] - - # Read and parse the CSV file - import csv - from datetime import datetime - - f = [] - with open(input_csv_file, "r") as csvfile: - reader = csv.reader(csvfile) - for row in reader: - f.append(row) - - # Helper function for converting CSV rows to Protobuf types - def map_headers(header): - return {"name": header} - - def map_data(value): - try: - date = datetime.strptime(value, "%m/%d/%Y") - return { - "date_value": {"year": date.year, "month": date.month, "day": date.day} - } - except ValueError: - return {"string_value": value} - - def map_rows(row): - return {"values": map(map_data, row)} - - # Using the helper functions, convert CSV rows to protobuf-compatible - # dictionaries. - csv_headers = map(map_headers, f[0]) - csv_rows = map(map_rows, f[1:]) - - # Construct the table dict - table_item = {"table": {"headers": csv_headers, "rows": csv_rows}} - - # Construct date shift config - date_shift_config = { - "lower_bound_days": lower_bound_days, - "upper_bound_days": upper_bound_days, - } - - # If using a Cloud KMS key, add it to the date_shift_config. - # The wrapped key is base64-encoded, but the library expects a binary - # string, so decode it here. - if context_field_id and key_name and wrapped_key: - import base64 - - date_shift_config["context"] = {"name": context_field_id} - date_shift_config["crypto_key"] = { - "kms_wrapped": { - "wrapped_key": base64.b64decode(wrapped_key), - "crypto_key_name": key_name, - } - } - elif context_field_id or key_name or wrapped_key: - raise ValueError( - """You must set either ALL or NONE of - [context_field_id, key_name, wrapped_key]!""" - ) - - # Construct Deidentify Config - deidentify_config = { - "record_transformations": { - "field_transformations": [ - { - "fields": date_fields, - "primitive_transformation": { - "date_shift_config": date_shift_config - }, - } - ] - } - } - - # Write to CSV helper methods - def write_header(header): - return header.name - - def write_data(data): - return data.string_value or "%s/%s/%s" % ( - data.date_value.month, - data.date_value.day, - data.date_value.year, - ) - - # Call the API - response = dlp.deidentify_content( - request={ - "parent": parent, - "deidentify_config": deidentify_config, - "item": table_item, - } - ) - - # Write results to CSV file - with open(output_csv_file, "w") as csvfile: - write_file = csv.writer(csvfile, delimiter=",") - write_file.writerow(map(write_header, response.item.table.headers)) - for row in response.item.table.rows: - write_file.writerow(map(write_data, row.values)) - # Print status - print("Successfully saved date-shift output to {}".format(output_csv_file)) - - -# [END dlp_deidentify_date_shift] - - -# [START dlp_deidentify_replace_infotype] -def deidentify_with_replace_infotype(project, item, info_types): - """Uses the Data Loss Prevention API to deidentify sensitive data in a - string by replacing it with the info type. - Args: - project: The Google Cloud project id to use as a parent resource. - item: The string to deidentify (will be treated as text). - info_types: A list of strings representing info types to look for. - A full list of info type categories can be fetched from the API. - Returns: - None; the response from the API is printed to the terminal. 
- """ - - # Import the client library - import google.cloud.dlp - - # Instantiate a client - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Construct inspect configuration dictionary - inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]} - - # Construct deidentify configuration dictionary - deidentify_config = { - "info_type_transformations": { - "transformations": [ - {"primitive_transformation": {"replace_with_info_type_config": {}}} - ] - } - } - - # Call the API - response = dlp.deidentify_content( - request={ - "parent": parent, - "deidentify_config": deidentify_config, - "inspect_config": inspect_config, - "item": {"value": item}, - } - ) - - # Print out the results. - print(response.item.value) - - -# [END dlp_deidentify_replace_infotype] - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description=__doc__) - subparsers = parser.add_subparsers( - dest="content", help="Select how to submit content to the API." - ) - subparsers.required = True - - mask_parser = subparsers.add_parser( - "deid_mask", - help="Deidentify sensitive data in a string by masking it with a " "character.", - ) - mask_parser.add_argument( - "--info_types", - nargs="+", - help="Strings representing info types to look for. A full list of " - "info categories and types is available from the API. Examples " - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' - "If unspecified, the three above examples will be used.", - default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], - ) - mask_parser.add_argument( - "project", - help="The Google Cloud project id to use as a parent resource.", - ) - mask_parser.add_argument("item", help="The string to deidentify.") - mask_parser.add_argument( - "-n", - "--number_to_mask", - type=int, - default=0, - help="The maximum number of sensitive characters to mask in a match. " - "If omitted the request or set to 0, the API will mask any mathcing " - "characters.", - ) - mask_parser.add_argument( - "-m", - "--masking_character", - help="The character to mask matching sensitive data with.", - ) - - replace_parser = subparsers.add_parser( - "deid_replace", - help="Deidentify sensitive data in a string by replacing it with " - "another string.", - ) - replace_parser.add_argument( - "--info_types", - nargs="+", - help="Strings representing info types to look for. A full list of " - "info categories and types is available from the API. Examples " - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' - "If unspecified, the three above examples will be used.", - default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], - ) - replace_parser.add_argument( - "project", - help="The Google Cloud project id to use as a parent resource.", - ) - replace_parser.add_argument("item", help="The string to deidentify.") - replace_parser.add_argument( - "replacement_str", help="The string to " "replace all matched values with." - ) - - fpe_parser = subparsers.add_parser( - "deid_fpe", - help="Deidentify sensitive data in a string using Format Preserving " - "Encryption (FPE).", - ) - fpe_parser.add_argument( - "--info_types", - action="append", - help="Strings representing info types to look for. A full list of " - "info categories and types is available from the API. Examples " - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". 
' - "If unspecified, the three above examples will be used.", - default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], - ) - fpe_parser.add_argument( - "project", - help="The Google Cloud project id to use as a parent resource.", - ) - fpe_parser.add_argument( - "item", - help="The string to deidentify. " "Example: string = 'My SSN is 372819127'", - ) - fpe_parser.add_argument( - "key_name", - help="The name of the Cloud KMS key used to encrypt ('wrap') the " - "AES-256 key. Example: " - "key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/" - "keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'", - ) - fpe_parser.add_argument( - "wrapped_key", - help="The encrypted ('wrapped') AES-256 key to use. This key should " - "be encrypted using the Cloud KMS key specified by key_name.", - ) - fpe_parser.add_argument( - "-a", - "--alphabet", - default="ALPHA_NUMERIC", - help="The set of characters to replace sensitive ones with. Commonly " - 'used subsets of the alphabet include "NUMERIC", "HEXADECIMAL", ' - '"UPPER_CASE_ALPHA_NUMERIC", "ALPHA_NUMERIC", ' - '"FFX_COMMON_NATIVE_ALPHABET_UNSPECIFIED"', - ) - fpe_parser.add_argument( - "-s", - "--surrogate_type", - help="The name of the surrogate custom info type to use. Only " - "necessary if you want to reverse the deidentification process. Can " - "be essentially any arbitrary string, as long as it doesn't appear " - "in your dataset otherwise.", - ) - - reid_parser = subparsers.add_parser( - "reid_fpe", - help="Reidentify sensitive data in a string using Format Preserving " - "Encryption (FPE).", - ) - reid_parser.add_argument( - "project", - help="The Google Cloud project id to use as a parent resource.", - ) - reid_parser.add_argument( - "item", - help="The string to deidentify. " "Example: string = 'My SSN is 372819127'", - ) - reid_parser.add_argument( - "surrogate_type", - help="The name of the surrogate custom info type to use. Only " - "necessary if you want to reverse the deidentification process. Can " - "be essentially any arbitrary string, as long as it doesn't appear " - "in your dataset otherwise.", - ) - reid_parser.add_argument( - "key_name", - help="The name of the Cloud KMS key used to encrypt ('wrap') the " - "AES-256 key. Example: " - "key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/" - "keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'", - ) - reid_parser.add_argument( - "wrapped_key", - help="The encrypted ('wrapped') AES-256 key to use. This key should " - "be encrypted using the Cloud KMS key specified by key_name.", - ) - reid_parser.add_argument( - "-a", - "--alphabet", - default="ALPHA_NUMERIC", - help="The set of characters to replace sensitive ones with. Commonly " - 'used subsets of the alphabet include "NUMERIC", "HEXADECIMAL", ' - '"UPPER_CASE_ALPHA_NUMERIC", "ALPHA_NUMERIC", ' - '"FFX_COMMON_NATIVE_ALPHABET_UNSPECIFIED"', - ) - - date_shift_parser = subparsers.add_parser( - "deid_date_shift", - help="Deidentify dates in a CSV file by pseudorandomly shifting them.", - ) - date_shift_parser.add_argument( - "project", - help="The Google Cloud project id to use as a parent resource.", - ) - date_shift_parser.add_argument( - "input_csv_file", - help="The path to the CSV file to deidentify. The first row of the " - "file must specify column names, and all other rows must contain " - "valid values.", - ) - date_shift_parser.add_argument( - "output_csv_file", help="The path to save the date-shifted CSV file." 
- ) - date_shift_parser.add_argument( - "lower_bound_days", - type=int, - help="The maximum number of days to shift a date backward", - ) - date_shift_parser.add_argument( - "upper_bound_days", - type=int, - help="The maximum number of days to shift a date forward", - ) - date_shift_parser.add_argument( - "date_fields", - nargs="+", - help="The list of date fields in the CSV file to date shift. Example: " - "['birth_date', 'register_date']", - ) - date_shift_parser.add_argument( - "--context_field_id", - help="(Optional) The column to determine date shift amount based on. " - "If this is not specified, a random shift amount will be used for " - "every row. If this is specified, then '--wrapped_key' and '--key_name' " - "must also be set.", - ) - date_shift_parser.add_argument( - "--key_name", - help="(Optional) The name of the Cloud KMS key used to encrypt " - "('wrap') the AES-256 key. Example: " - "key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/" - "keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'", - ) - date_shift_parser.add_argument( - "--wrapped_key", - help="(Optional) The encrypted ('wrapped') AES-256 key to use. This " - "key should be encrypted using the Cloud KMS key specified by " - "key_name.", - ) - - replace_with_infotype_parser = subparsers.add_parser( - "replace_with_infotype", - help="Deidentify sensitive data in a string by replacing it with the " - "info type of the data.", - ) - replace_with_infotype_parser.add_argument( - "--info_types", - action="append", - help="Strings representing info types to look for. A full list of " - "info categories and types is available from the API. Examples " - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' - "If unspecified, the three above examples will be used.", - default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], - ) - replace_with_infotype_parser.add_argument( - "project", - help="The Google Cloud project id to use as a parent resource.", - ) - replace_with_infotype_parser.add_argument( - "item", - help="The string to deidentify." 
- "Example: 'My credit card is 4242 4242 4242 4242'", - ) - - args = parser.parse_args() - - if args.content == "deid_mask": - deidentify_with_mask( - args.project, - args.item, - args.info_types, - masking_character=args.masking_character, - number_to_mask=args.number_to_mask, - ) - elif args.content == "deid_replace": - deidentify_with_replace( - args.project, - args.item, - args.info_types, - replacement_str=args.replacement_str, - ) - elif args.content == "deid_fpe": - deidentify_with_fpe( - args.project, - args.item, - args.info_types, - alphabet=args.alphabet, - wrapped_key=args.wrapped_key, - key_name=args.key_name, - surrogate_type=args.surrogate_type, - ) - elif args.content == "reid_fpe": - reidentify_with_fpe( - args.project, - args.item, - surrogate_type=args.surrogate_type, - wrapped_key=args.wrapped_key, - key_name=args.key_name, - alphabet=args.alphabet, - ) - elif args.content == "deid_date_shift": - deidentify_with_date_shift( - args.project, - input_csv_file=args.input_csv_file, - output_csv_file=args.output_csv_file, - lower_bound_days=args.lower_bound_days, - upper_bound_days=args.upper_bound_days, - date_fields=args.date_fields, - context_field_id=args.context_field_id, - wrapped_key=args.wrapped_key, - key_name=args.key_name, - ) - elif args.content == "replace_with_infotype": - deidentify_with_replace_infotype( - args.project, - item=args.item, - info_types=args.info_types, - ) diff --git a/samples/snippets/deid_test.py b/samples/snippets/deid_test.py deleted file mode 100644 index 84217675..00000000 --- a/samples/snippets/deid_test.py +++ /dev/null @@ -1,291 +0,0 @@ -# Copyright 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the 'License'); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import shutil -import tempfile - -import google.cloud.dlp_v2 -import pytest - -import deid - -HARMFUL_STRING = "My SSN is 372819127" -HARMLESS_STRING = "My favorite color is blue" -GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") -UNWRAPPED_KEY = "YWJjZGVmZ2hpamtsbW5vcA==" -WRAPPED_KEY = ( - "CiQAz0hX4+go8fJwn80Fr8pVImwx+tmZdqU7JL+7TN/S5JxBU9gSSQDhFHpFVy" - "uzJps0YH9ls480mU+JLG7jI/0lL04i6XJRWqmI6gUSZRUtECYcLH5gXK4SXHlL" - "rotx7Chxz/4z7SIpXFOBY61z0/U=" -) -KEY_NAME = ( - "projects/python-docs-samples-tests/locations/global/keyRings/" - "dlp-test/cryptoKeys/dlp-test" -) -SURROGATE_TYPE = "SSN_TOKEN" -CSV_FILE = os.path.join(os.path.dirname(__file__), "resources/dates.csv") -DATE_SHIFTED_AMOUNT = 30 -DATE_FIELDS = ["birth_date", "register_date"] -CSV_CONTEXT_FIELD = "name" - - -@pytest.fixture(scope="module") -def tempdir(): - tempdir = tempfile.mkdtemp() - yield tempdir - shutil.rmtree(tempdir) - - -def test_deidentify_with_mask(capsys): - deid.deidentify_with_mask( - GCLOUD_PROJECT, HARMFUL_STRING, ["US_SOCIAL_SECURITY_NUMBER"] - ) - - out, _ = capsys.readouterr() - assert "My SSN is *********" in out - - -def test_deidentify_with_mask_ignore_insensitive_data(capsys): - deid.deidentify_with_mask( - GCLOUD_PROJECT, HARMLESS_STRING, ["US_SOCIAL_SECURITY_NUMBER"] - ) - - out, _ = capsys.readouterr() - assert HARMLESS_STRING in out - - -def test_deidentify_with_mask_masking_character_specified(capsys): - deid.deidentify_with_mask( - GCLOUD_PROJECT, - HARMFUL_STRING, - ["US_SOCIAL_SECURITY_NUMBER"], - masking_character="#", - ) - - out, _ = capsys.readouterr() - assert "My SSN is #########" in out - - -def test_deidentify_with_mask_masking_number_specified(capsys): - deid.deidentify_with_mask( - GCLOUD_PROJECT, - HARMFUL_STRING, - ["US_SOCIAL_SECURITY_NUMBER"], - number_to_mask=7, - ) - - out, _ = capsys.readouterr() - assert "My SSN is *******27" in out - - -def test_deidentify_with_redact(capsys): - deid.deidentify_with_redact( - GCLOUD_PROJECT, HARMFUL_STRING + "!", ["US_SOCIAL_SECURITY_NUMBER"] - ) - out, _ = capsys.readouterr() - assert "My SSN is !" 
in out - - -def test_deidentify_with_replace(capsys): - deid.deidentify_with_replace( - GCLOUD_PROJECT, - HARMFUL_STRING, - ["US_SOCIAL_SECURITY_NUMBER"], - replacement_str="REPLACEMENT_STR", - ) - - out, _ = capsys.readouterr() - assert "My SSN is REPLACEMENT_STR" in out - - -def test_deidentify_with_fpe(capsys): - deid.deidentify_with_fpe( - GCLOUD_PROJECT, - HARMFUL_STRING, - ["US_SOCIAL_SECURITY_NUMBER"], - alphabet=google.cloud.dlp_v2.CharsToIgnore.CommonCharsToIgnore.NUMERIC, - wrapped_key=WRAPPED_KEY, - key_name=KEY_NAME, - ) - - out, _ = capsys.readouterr() - assert "My SSN is" in out - assert "372819127" not in out - - -def test_deidentify_with_deterministic(capsys): - deid.deidentify_with_deterministic( - GCLOUD_PROJECT, - HARMFUL_STRING, - ["US_SOCIAL_SECURITY_NUMBER"], - surrogate_type=SURROGATE_TYPE, - key_name=KEY_NAME, - wrapped_key=WRAPPED_KEY, - ) - - out, _ = capsys.readouterr() - assert "My SSN is" in out - assert "372819127" not in out - - -def test_deidentify_with_fpe_uses_surrogate_info_types(capsys): - deid.deidentify_with_fpe( - GCLOUD_PROJECT, - HARMFUL_STRING, - ["US_SOCIAL_SECURITY_NUMBER"], - alphabet=google.cloud.dlp_v2.CharsToIgnore.CommonCharsToIgnore.NUMERIC, - wrapped_key=WRAPPED_KEY, - key_name=KEY_NAME, - surrogate_type=SURROGATE_TYPE, - ) - - out, _ = capsys.readouterr() - assert "My SSN is SSN_TOKEN" in out - assert "372819127" not in out - - -def test_deidentify_with_fpe_ignores_insensitive_data(capsys): - deid.deidentify_with_fpe( - GCLOUD_PROJECT, - HARMLESS_STRING, - ["US_SOCIAL_SECURITY_NUMBER"], - alphabet=google.cloud.dlp_v2.CharsToIgnore.CommonCharsToIgnore.NUMERIC, - wrapped_key=WRAPPED_KEY, - key_name=KEY_NAME, - ) - - out, _ = capsys.readouterr() - assert HARMLESS_STRING in out - - -def test_deidentify_with_date_shift(tempdir, capsys): - output_filepath = os.path.join(tempdir, "dates-shifted.csv") - - deid.deidentify_with_date_shift( - GCLOUD_PROJECT, - input_csv_file=CSV_FILE, - output_csv_file=output_filepath, - lower_bound_days=DATE_SHIFTED_AMOUNT, - upper_bound_days=DATE_SHIFTED_AMOUNT, - date_fields=DATE_FIELDS, - ) - - out, _ = capsys.readouterr() - - assert "Successful" in out - - -def test_deidentify_with_date_shift_using_context_field(tempdir, capsys): - output_filepath = os.path.join(tempdir, "dates-shifted.csv") - - deid.deidentify_with_date_shift( - GCLOUD_PROJECT, - input_csv_file=CSV_FILE, - output_csv_file=output_filepath, - lower_bound_days=DATE_SHIFTED_AMOUNT, - upper_bound_days=DATE_SHIFTED_AMOUNT, - date_fields=DATE_FIELDS, - context_field_id=CSV_CONTEXT_FIELD, - wrapped_key=WRAPPED_KEY, - key_name=KEY_NAME, - ) - - out, _ = capsys.readouterr() - - assert "Successful" in out - - -def test_reidentify_with_fpe(capsys): - labeled_fpe_string = "My SSN is SSN_TOKEN(9):731997681" - - deid.reidentify_with_fpe( - GCLOUD_PROJECT, - labeled_fpe_string, - surrogate_type=SURROGATE_TYPE, - wrapped_key=WRAPPED_KEY, - key_name=KEY_NAME, - alphabet=google.cloud.dlp_v2.CharsToIgnore.CommonCharsToIgnore.NUMERIC, - ) - - out, _ = capsys.readouterr() - - assert "731997681" not in out - - -def test_reidentify_with_deterministic(capsys): - labeled_fpe_string = "My SSN is SSN_TOKEN(36):ATeRUd3WWnAHHFtjtl1bv+CT09FZ7hyqNas=" - - deid.reidentify_with_deterministic( - GCLOUD_PROJECT, - labeled_fpe_string, - surrogate_type=SURROGATE_TYPE, - key_name=KEY_NAME, - wrapped_key=WRAPPED_KEY, - ) - - out, _ = capsys.readouterr() - - assert "SSN_TOKEN(" not in out - - -def test_deidentify_free_text_with_fpe_using_surrogate(capsys): - labeled_fpe_string = "My 
phone number is 4359916732" - - deid.deidentify_free_text_with_fpe_using_surrogate( - GCLOUD_PROJECT, - labeled_fpe_string, - info_type="PHONE_NUMBER", - surrogate_type="PHONE_TOKEN", - unwrapped_key=UNWRAPPED_KEY, - alphabet=google.cloud.dlp_v2.CharsToIgnore.CommonCharsToIgnore.NUMERIC, - ) - - out, _ = capsys.readouterr() - - assert "PHONE_TOKEN" in out - assert "My phone number is" in out - assert "4359916732" not in out - - -def test_reidentify_free_text_with_fpe_using_surrogate(capsys): - labeled_fpe_string = "My phone number is PHONE_TOKEN(10):9617256398" - - deid.reidentify_free_text_with_fpe_using_surrogate( - GCLOUD_PROJECT, - labeled_fpe_string, - surrogate_type="PHONE_TOKEN", - unwrapped_key=UNWRAPPED_KEY, - alphabet=google.cloud.dlp_v2.CharsToIgnore.CommonCharsToIgnore.NUMERIC, - ) - - out, _ = capsys.readouterr() - - assert "PHONE_TOKEN" not in out - assert "9617256398" not in out - assert "My phone number is" in out - - -def test_deidentify_with_replace_infotype(capsys): - url_to_redact = "https://cloud.google.com" - deid.deidentify_with_replace_infotype( - GCLOUD_PROJECT, - "My favorite site is " + url_to_redact, - ["URL"], - ) - - out, _ = capsys.readouterr() - - assert url_to_redact not in out - assert "My favorite site is [URL]" in out diff --git a/samples/snippets/inspect_content.py b/samples/snippets/inspect_content.py deleted file mode 100644 index 55e85507..00000000 --- a/samples/snippets/inspect_content.py +++ /dev/null @@ -1,1435 +0,0 @@ -# Copyright 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Sample app that uses the Data Loss Prevention API to inspect a string, a -local file or a file on Google Cloud Storage.""" - -from __future__ import print_function - -import argparse -import json -import os - - -# [START dlp_inspect_string_basic] -def inspect_string_basic( - project, - content_string, - info_types=["PHONE_NUMBER"], -): - """Uses the Data Loss Prevention API to analyze strings for protected data. - Args: - project: The Google Cloud project id to use as a parent resource. - content_string: The string to inspect. - info_types: A list of strings representing info types to look for. - A full list of info type categories can be fetched from the API. - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library. - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Prepare info_types by converting the list of strings into a list of - # dictionaries (protos are also accepted). - info_types = [{"name": info_type} for info_type in info_types] - - # Construct the configuration dictionary. - inspect_config = { - "info_types": info_types, - "include_quote": True, - } - - # Construct the `item`. - item = {"value": content_string} - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Call the API. 
- response = dlp.inspect_content( - request={"parent": parent, "inspect_config": inspect_config, "item": item} - ) - - # Print out the results. - if response.result.findings: - for finding in response.result.findings: - print("Quote: {}".format(finding.quote)) - print("Info type: {}".format(finding.info_type.name)) - print("Likelihood: {}".format(finding.likelihood)) - else: - print("No findings.") - - -# [END dlp_inspect_string_basic] - - -# [START dlp_inspect_string] -def inspect_string( - project, - content_string, - info_types, - custom_dictionaries=None, - custom_regexes=None, - min_likelihood=None, - max_findings=None, - include_quote=True, -): - """Uses the Data Loss Prevention API to analyze strings for protected data. - Args: - project: The Google Cloud project id to use as a parent resource. - content_string: The string to inspect. - info_types: A list of strings representing info types to look for. - A full list of info type categories can be fetched from the API. - min_likelihood: A string representing the minimum likelihood threshold - that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', - 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. - max_findings: The maximum number of findings to report; 0 = no maximum. - include_quote: Boolean for whether to display a quote of the detected - information in the results. - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library. - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Prepare info_types by converting the list of strings into a list of - # dictionaries (protos are also accepted). - info_types = [{"name": info_type} for info_type in info_types] - - # Prepare custom_info_types by parsing the dictionary word lists and - # regex patterns. - if custom_dictionaries is None: - custom_dictionaries = [] - dictionaries = [ - { - "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, - "dictionary": {"word_list": {"words": custom_dict.split(",")}}, - } - for i, custom_dict in enumerate(custom_dictionaries) - ] - if custom_regexes is None: - custom_regexes = [] - regexes = [ - { - "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, - "regex": {"pattern": custom_regex}, - } - for i, custom_regex in enumerate(custom_regexes) - ] - custom_info_types = dictionaries + regexes - - # Construct the configuration dictionary. Keys which are None may - # optionally be omitted entirely. - inspect_config = { - "info_types": info_types, - "custom_info_types": custom_info_types, - "min_likelihood": min_likelihood, - "include_quote": include_quote, - "limits": {"max_findings_per_request": max_findings}, - } - - # Construct the `item`. - item = {"value": content_string} - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Call the API. - response = dlp.inspect_content( - request={"parent": parent, "inspect_config": inspect_config, "item": item} - ) - - # Print out the results. 
- if response.result.findings: - for finding in response.result.findings: - try: - if finding.quote: - print("Quote: {}".format(finding.quote)) - except AttributeError: - pass - print("Info type: {}".format(finding.info_type.name)) - print("Likelihood: {}".format(finding.likelihood)) - else: - print("No findings.") - - -# [END dlp_inspect_string] - -# [START dlp_inspect_table] - - -def inspect_table( - project, - data, - info_types, - custom_dictionaries=None, - custom_regexes=None, - min_likelihood=None, - max_findings=None, - include_quote=True, -): - """Uses the Data Loss Prevention API to analyze strings for protected data. - Args: - project: The Google Cloud project id to use as a parent resource. - data: Json string representing table data. - info_types: A list of strings representing info types to look for. - A full list of info type categories can be fetched from the API. - min_likelihood: A string representing the minimum likelihood threshold - that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', - 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. - max_findings: The maximum number of findings to report; 0 = no maximum. - include_quote: Boolean for whether to display a quote of the detected - information in the results. - Returns: - None; the response from the API is printed to the terminal. - Example: - data = { - "header":[ - "email", - "phone number" - ], - "rows":[ - [ - "robertfrost@xyz.com", - "4232342345" - ], - [ - "johndoe@pqr.com", - "4253458383" - ] - ] - } - - >> $ python inspect_content.py table \ - '{"header": ["email", "phone number"], - "rows": [["robertfrost@xyz.com", "4232342345"], - ["johndoe@pqr.com", "4253458383"]]}' - >> Quote: robertfrost@xyz.com - Info type: EMAIL_ADDRESS - Likelihood: 4 - Quote: johndoe@pqr.com - Info type: EMAIL_ADDRESS - Likelihood: 4 - """ - - # Import the client library. - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Prepare info_types by converting the list of strings into a list of - # dictionaries (protos are also accepted). - info_types = [{"name": info_type} for info_type in info_types] - - # Prepare custom_info_types by parsing the dictionary word lists and - # regex patterns. - if custom_dictionaries is None: - custom_dictionaries = [] - dictionaries = [ - { - "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, - "dictionary": {"word_list": {"words": custom_dict.split(",")}}, - } - for i, custom_dict in enumerate(custom_dictionaries) - ] - if custom_regexes is None: - custom_regexes = [] - regexes = [ - { - "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, - "regex": {"pattern": custom_regex}, - } - for i, custom_regex in enumerate(custom_regexes) - ] - custom_info_types = dictionaries + regexes - - # Construct the configuration dictionary. Keys which are None may - # optionally be omitted entirely. - inspect_config = { - "info_types": info_types, - "custom_info_types": custom_info_types, - "min_likelihood": min_likelihood, - "include_quote": include_quote, - "limits": {"max_findings_per_request": max_findings}, - } - - # Construct the `table`. 
For more details on the table schema, please see - # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table - headers = [{"name": val} for val in data["header"]] - rows = [] - for row in data["rows"]: - rows.append({"values": [{"string_value": cell_val} for cell_val in row]}) - - table = {} - table["headers"] = headers - table["rows"] = rows - item = {"table": table} - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Call the API. - response = dlp.inspect_content( - request={"parent": parent, "inspect_config": inspect_config, "item": item} - ) - - # Print out the results. - if response.result.findings: - for finding in response.result.findings: - try: - if finding.quote: - print("Quote: {}".format(finding.quote)) - except AttributeError: - pass - print("Info type: {}".format(finding.info_type.name)) - print("Likelihood: {}".format(finding.likelihood)) - else: - print("No findings.") - - -# [END dlp_inspect_table] - -# [START dlp_inspect_file] - - -def inspect_file( - project, - filename, - info_types, - min_likelihood=None, - custom_dictionaries=None, - custom_regexes=None, - max_findings=None, - include_quote=True, - mime_type=None, -): - """Uses the Data Loss Prevention API to analyze a file for protected data. - Args: - project: The Google Cloud project id to use as a parent resource. - filename: The path to the file to inspect. - info_types: A list of strings representing info types to look for. - A full list of info type categories can be fetched from the API. - min_likelihood: A string representing the minimum likelihood threshold - that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', - 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. - max_findings: The maximum number of findings to report; 0 = no maximum. - include_quote: Boolean for whether to display a quote of the detected - information in the results. - mime_type: The MIME type of the file. If not specified, the type is - inferred via the Python standard library's mimetypes module. - Returns: - None; the response from the API is printed to the terminal. - """ - - import mimetypes - - # Import the client library. - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Prepare info_types by converting the list of strings into a list of - # dictionaries (protos are also accepted). - if not info_types: - info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"] - info_types = [{"name": info_type} for info_type in info_types] - - # Prepare custom_info_types by parsing the dictionary word lists and - # regex patterns. - if custom_dictionaries is None: - custom_dictionaries = [] - dictionaries = [ - { - "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, - "dictionary": {"word_list": {"words": custom_dict.split(",")}}, - } - for i, custom_dict in enumerate(custom_dictionaries) - ] - if custom_regexes is None: - custom_regexes = [] - regexes = [ - { - "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, - "regex": {"pattern": custom_regex}, - } - for i, custom_regex in enumerate(custom_regexes) - ] - custom_info_types = dictionaries + regexes - - # Construct the configuration dictionary. Keys which are None may - # optionally be omitted entirely. 
- inspect_config = { - "info_types": info_types, - "custom_info_types": custom_info_types, - "min_likelihood": min_likelihood, - "include_quote": include_quote, - "limits": {"max_findings_per_request": max_findings}, - } - - # If mime_type is not specified, guess it from the filename. - if mime_type is None: - mime_guess = mimetypes.MimeTypes().guess_type(filename) - mime_type = mime_guess[0] - - # Select the content type index from the list of supported types. - supported_content_types = { - None: 0, # "Unspecified" - "image/jpeg": 1, - "image/bmp": 2, - "image/png": 3, - "image/svg": 4, - "text/plain": 5, - } - content_type_index = supported_content_types.get(mime_type, 0) - - # Construct the item, containing the file's byte data. - with open(filename, mode="rb") as f: - item = {"byte_item": {"type_": content_type_index, "data": f.read()}} - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Call the API. - response = dlp.inspect_content( - request={"parent": parent, "inspect_config": inspect_config, "item": item} - ) - - # Print out the results. - if response.result.findings: - for finding in response.result.findings: - try: - print("Quote: {}".format(finding.quote)) - except AttributeError: - pass - print("Info type: {}".format(finding.info_type.name)) - print("Likelihood: {}".format(finding.likelihood)) - else: - print("No findings.") - - -# [END dlp_inspect_file] - - -# [START dlp_inspect_gcs] -def inspect_gcs_file( - project, - bucket, - filename, - topic_id, - subscription_id, - info_types, - custom_dictionaries=None, - custom_regexes=None, - min_likelihood=None, - max_findings=None, - timeout=300, -): - """Uses the Data Loss Prevention API to analyze a file on GCS. - Args: - project: The Google Cloud project id to use as a parent resource. - bucket: The name of the GCS bucket containing the file, as a string. - filename: The name of the file in the bucket, including the path, as a - string; e.g. 'images/myfile.png'. - topic_id: The id of the Cloud Pub/Sub topic to which the API will - broadcast job completion. The topic must already exist. - subscription_id: The id of the Cloud Pub/Sub subscription to listen on - while waiting for job completion. The subscription must already - exist and be subscribed to the topic. - info_types: A list of strings representing info types to look for. - A full list of info type categories can be fetched from the API. - min_likelihood: A string representing the minimum likelihood threshold - that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', - 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. - max_findings: The maximum number of findings to report; 0 = no maximum. - timeout: The number of seconds to wait for a response from the API. - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library. - # This sample also uses threading.Event() to wait for the job to finish. - import threading - - import google.cloud.dlp - - # This sample additionally uses Cloud Pub/Sub to receive results from - # potentially long-running operations. - import google.cloud.pubsub - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Prepare info_types by converting the list of strings into a list of - # dictionaries (protos are also accepted). 
- if not info_types: - info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"] - info_types = [{"name": info_type} for info_type in info_types] - - # Prepare custom_info_types by parsing the dictionary word lists and - # regex patterns. - if custom_dictionaries is None: - custom_dictionaries = [] - dictionaries = [ - { - "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, - "dictionary": {"word_list": {"words": custom_dict.split(",")}}, - } - for i, custom_dict in enumerate(custom_dictionaries) - ] - if custom_regexes is None: - custom_regexes = [] - regexes = [ - { - "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, - "regex": {"pattern": custom_regex}, - } - for i, custom_regex in enumerate(custom_regexes) - ] - custom_info_types = dictionaries + regexes - - # Construct the configuration dictionary. Keys which are None may - # optionally be omitted entirely. - inspect_config = { - "info_types": info_types, - "custom_info_types": custom_info_types, - "min_likelihood": min_likelihood, - "limits": {"max_findings_per_request": max_findings}, - } - - # Construct a storage_config containing the file's URL. - url = "gs://{}/{}".format(bucket, filename) - storage_config = {"cloud_storage_options": {"file_set": {"url": url}}} - - # Convert the project id into full resource ids. - topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) - parent = f"projects/{project}/locations/global" - - # Tell the API where to send a notification when the job is complete. - actions = [{"pub_sub": {"topic": topic}}] - - # Construct the inspect_job, which defines the entire inspect content task. - inspect_job = { - "inspect_config": inspect_config, - "storage_config": storage_config, - "actions": actions, - } - - operation = dlp.create_dlp_job( - request={"parent": parent, "inspect_job": inspect_job} - ) - print("Inspection operation started: {}".format(operation.name)) - - # Create a Pub/Sub client and find the subscription. The subscription is - # expected to already be listening to the topic. - subscriber = google.cloud.pubsub.SubscriberClient() - subscription_path = subscriber.subscription_path(project, subscription_id) - - # Set up a callback to acknowledge a message. This closes around an event - # so that it can signal that it is done and the main thread can continue. - job_done = threading.Event() - - def callback(message): - try: - if message.attributes["DlpJobName"] == operation.name: - # This is the message we're looking for, so acknowledge it. - message.ack() - - # Now that the job is done, fetch the results and print them. - job = dlp.get_dlp_job(request={"name": operation.name}) - if job.inspect_details.result.info_type_stats: - for finding in job.inspect_details.result.info_type_stats: - print( - "Info type: {}; Count: {}".format( - finding.info_type.name, finding.count - ) - ) - else: - print("No findings.") - - # Signal to the main thread that we can exit. - job_done.set() - else: - # This is not the message we're looking for. - message.drop() - except Exception as e: - # Because this is executing in a thread, an exception won't be - # noted unless we print it manually. - print(e) - raise - - subscriber.subscribe(subscription_path, callback=callback) - finished = job_done.wait(timeout=timeout) - if not finished: - print( - "No event received before the timeout. Please verify that the " - "subscription provided is subscribed to the topic provided." 
- ) - - -# [END dlp_inspect_gcs] - - -# [START dlp_inspect_datastore] -def inspect_datastore( - project, - datastore_project, - kind, - topic_id, - subscription_id, - info_types, - custom_dictionaries=None, - custom_regexes=None, - namespace_id=None, - min_likelihood=None, - max_findings=None, - timeout=300, -): - """Uses the Data Loss Prevention API to analyze Datastore data. - Args: - project: The Google Cloud project id to use as a parent resource. - datastore_project: The Google Cloud project id of the target Datastore. - kind: The kind of the Datastore entity to inspect, e.g. 'Person'. - topic_id: The id of the Cloud Pub/Sub topic to which the API will - broadcast job completion. The topic must already exist. - subscription_id: The id of the Cloud Pub/Sub subscription to listen on - while waiting for job completion. The subscription must already - exist and be subscribed to the topic. - info_types: A list of strings representing info types to look for. - A full list of info type categories can be fetched from the API. - namespace_id: The namespace of the Datastore document, if applicable. - min_likelihood: A string representing the minimum likelihood threshold - that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', - 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. - max_findings: The maximum number of findings to report; 0 = no maximum. - timeout: The number of seconds to wait for a response from the API. - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library. - # This sample also uses threading.Event() to wait for the job to finish. - import threading - - import google.cloud.dlp - - # This sample additionally uses Cloud Pub/Sub to receive results from - # potentially long-running operations. - import google.cloud.pubsub - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Prepare info_types by converting the list of strings into a list of - # dictionaries (protos are also accepted). - if not info_types: - info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"] - info_types = [{"name": info_type} for info_type in info_types] - - # Prepare custom_info_types by parsing the dictionary word lists and - # regex patterns. - if custom_dictionaries is None: - custom_dictionaries = [] - dictionaries = [ - { - "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, - "dictionary": {"word_list": {"words": custom_dict.split(",")}}, - } - for i, custom_dict in enumerate(custom_dictionaries) - ] - if custom_regexes is None: - custom_regexes = [] - regexes = [ - { - "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, - "regex": {"pattern": custom_regex}, - } - for i, custom_regex in enumerate(custom_regexes) - ] - custom_info_types = dictionaries + regexes - - # Construct the configuration dictionary. Keys which are None may - # optionally be omitted entirely. - inspect_config = { - "info_types": info_types, - "custom_info_types": custom_info_types, - "min_likelihood": min_likelihood, - "limits": {"max_findings_per_request": max_findings}, - } - - # Construct a storage_config containing the target Datastore info. - storage_config = { - "datastore_options": { - "partition_id": { - "project_id": datastore_project, - "namespace_id": namespace_id, - }, - "kind": {"name": kind}, - } - } - - # Convert the project id into full resource ids. 
- topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) - parent = f"projects/{project}/locations/global" - - # Tell the API where to send a notification when the job is complete. - actions = [{"pub_sub": {"topic": topic}}] - - # Construct the inspect_job, which defines the entire inspect content task. - inspect_job = { - "inspect_config": inspect_config, - "storage_config": storage_config, - "actions": actions, - } - - operation = dlp.create_dlp_job( - request={"parent": parent, "inspect_job": inspect_job} - ) - print("Inspection operation started: {}".format(operation.name)) - - # Create a Pub/Sub client and find the subscription. The subscription is - # expected to already be listening to the topic. - subscriber = google.cloud.pubsub.SubscriberClient() - subscription_path = subscriber.subscription_path(project, subscription_id) - - # Set up a callback to acknowledge a message. This closes around an event - # so that it can signal that it is done and the main thread can continue. - job_done = threading.Event() - - def callback(message): - try: - if message.attributes["DlpJobName"] == operation.name: - # This is the message we're looking for, so acknowledge it. - message.ack() - - # Now that the job is done, fetch the results and print them. - job = dlp.get_dlp_job(request={"name": operation.name}) - if job.inspect_details.result.info_type_stats: - for finding in job.inspect_details.result.info_type_stats: - print( - "Info type: {}; Count: {}".format( - finding.info_type.name, finding.count - ) - ) - else: - print("No findings.") - - # Signal to the main thread that we can exit. - job_done.set() - else: - # This is not the message we're looking for. - message.drop() - except Exception as e: - # Because this is executing in a thread, an exception won't be - # noted unless we print it manually. - print(e) - raise - - # Register the callback and wait on the event. - subscriber.subscribe(subscription_path, callback=callback) - - finished = job_done.wait(timeout=timeout) - if not finished: - print( - "No event received before the timeout. Please verify that the " - "subscription provided is subscribed to the topic provided." - ) - - -# [END dlp_inspect_datastore] - - -# [START dlp_inspect_bigquery] -def inspect_bigquery( - project, - bigquery_project, - dataset_id, - table_id, - topic_id, - subscription_id, - info_types, - custom_dictionaries=None, - custom_regexes=None, - min_likelihood=None, - max_findings=None, - timeout=300, -): - """Uses the Data Loss Prevention API to analyze BigQuery data. - Args: - project: The Google Cloud project id to use as a parent resource. - bigquery_project: The Google Cloud project id of the target table. - dataset_id: The id of the target BigQuery dataset. - table_id: The id of the target BigQuery table. - topic_id: The id of the Cloud Pub/Sub topic to which the API will - broadcast job completion. The topic must already exist. - subscription_id: The id of the Cloud Pub/Sub subscription to listen on - while waiting for job completion. The subscription must already - exist and be subscribed to the topic. - info_types: A list of strings representing info types to look for. - A full list of info type categories can be fetched from the API. - min_likelihood: A string representing the minimum likelihood threshold - that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', - 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. 
- max_findings: The maximum number of findings to report; 0 = no maximum. - timeout: The number of seconds to wait for a response from the API. - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library. - # This sample also uses threading.Event() to wait for the job to finish. - import threading - - import google.cloud.dlp - - # This sample additionally uses Cloud Pub/Sub to receive results from - # potentially long-running operations. - import google.cloud.pubsub - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Prepare info_types by converting the list of strings into a list of - # dictionaries (protos are also accepted). - if not info_types: - info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"] - info_types = [{"name": info_type} for info_type in info_types] - - # Prepare custom_info_types by parsing the dictionary word lists and - # regex patterns. - if custom_dictionaries is None: - custom_dictionaries = [] - dictionaries = [ - { - "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, - "dictionary": {"word_list": {"words": custom_dict.split(",")}}, - } - for i, custom_dict in enumerate(custom_dictionaries) - ] - if custom_regexes is None: - custom_regexes = [] - regexes = [ - { - "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, - "regex": {"pattern": custom_regex}, - } - for i, custom_regex in enumerate(custom_regexes) - ] - custom_info_types = dictionaries + regexes - - # Construct the configuration dictionary. Keys which are None may - # optionally be omitted entirely. - inspect_config = { - "info_types": info_types, - "custom_info_types": custom_info_types, - "min_likelihood": min_likelihood, - "limits": {"max_findings_per_request": max_findings}, - } - - # Construct a storage_config containing the target Bigquery info. - storage_config = { - "big_query_options": { - "table_reference": { - "project_id": bigquery_project, - "dataset_id": dataset_id, - "table_id": table_id, - } - } - } - - # Convert the project id into full resource ids. - topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) - parent = f"projects/{project}/locations/global" - - # Tell the API where to send a notification when the job is complete. - actions = [{"pub_sub": {"topic": topic}}] - - # Construct the inspect_job, which defines the entire inspect content task. - inspect_job = { - "inspect_config": inspect_config, - "storage_config": storage_config, - "actions": actions, - } - - operation = dlp.create_dlp_job( - request={"parent": parent, "inspect_job": inspect_job} - ) - print("Inspection operation started: {}".format(operation.name)) - - # Create a Pub/Sub client and find the subscription. The subscription is - # expected to already be listening to the topic. - subscriber = google.cloud.pubsub.SubscriberClient() - subscription_path = subscriber.subscription_path(project, subscription_id) - - # Set up a callback to acknowledge a message. This closes around an event - # so that it can signal that it is done and the main thread can continue. - job_done = threading.Event() - - def callback(message): - try: - if message.attributes["DlpJobName"] == operation.name: - # This is the message we're looking for, so acknowledge it. - message.ack() - - # Now that the job is done, fetch the results and print them. 
- job = dlp.get_dlp_job(request={"name": operation.name}) - if job.inspect_details.result.info_type_stats: - for finding in job.inspect_details.result.info_type_stats: - print( - "Info type: {}; Count: {}".format( - finding.info_type.name, finding.count - ) - ) - else: - print("No findings.") - - # Signal to the main thread that we can exit. - job_done.set() - else: - # This is not the message we're looking for. - message.drop() - except Exception as e: - # Because this is executing in a thread, an exception won't be - # noted unless we print it manually. - print(e) - raise - - # Register the callback and wait on the event. - subscriber.subscribe(subscription_path, callback=callback) - finished = job_done.wait(timeout=timeout) - if not finished: - print( - "No event received before the timeout. Please verify that the " - "subscription provided is subscribed to the topic provided." - ) - - -# [END dlp_inspect_bigquery] - - -if __name__ == "__main__": - default_project = os.environ.get("GOOGLE_CLOUD_PROJECT") - - parser = argparse.ArgumentParser(description=__doc__) - subparsers = parser.add_subparsers( - dest="content", help="Select how to submit content to the API." - ) - subparsers.required = True - - parser_string = subparsers.add_parser("string", help="Inspect a string.") - parser_string.add_argument("item", help="The string to inspect.") - parser_string.add_argument( - "--project", - help="The Google Cloud project id to use as a parent resource.", - default=default_project, - ) - parser_string.add_argument( - "--info_types", - nargs="+", - help="Strings representing info types to look for. A full list of " - "info categories and types is available from the API. Examples " - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' - "If unspecified, the three above examples will be used.", - default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], - ) - parser_string.add_argument( - "--custom_dictionaries", - action="append", - help="Strings representing comma-delimited lists of dictionary words" - " to search for as custom info types. Each string is a comma " - "delimited list of words representing a distinct dictionary.", - default=None, - ) - parser_string.add_argument( - "--custom_regexes", - action="append", - help="Strings representing regex patterns to search for as custom " - " info types.", - default=None, - ) - parser_string.add_argument( - "--min_likelihood", - choices=[ - "LIKELIHOOD_UNSPECIFIED", - "VERY_UNLIKELY", - "UNLIKELY", - "POSSIBLE", - "LIKELY", - "VERY_LIKELY", - ], - help="A string representing the minimum likelihood threshold that " - "constitutes a match.", - ) - parser_string.add_argument( - "--max_findings", - type=int, - help="The maximum number of findings to report; 0 = no maximum.", - ) - parser_string.add_argument( - "--include_quote", - type=bool, - help="A boolean for whether to display a quote of the detected " - "information in the results.", - default=True, - ) - - parser_table = subparsers.add_parser("table", help="Inspect a table.") - parser_table.add_argument( - "data", help="Json string representing a table.", type=json.loads - ) - parser_table.add_argument( - "--project", - help="The Google Cloud project id to use as a parent resource.", - default=default_project, - ) - parser_table.add_argument( - "--info_types", - action="append", - help="Strings representing info types to look for. A full list of " - "info categories and types is available from the API. Examples " - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". 
' - "If unspecified, the three above examples will be used.", - default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], - ) - parser_table.add_argument( - "--custom_dictionaries", - action="append", - help="Strings representing comma-delimited lists of dictionary words" - " to search for as custom info types. Each string is a comma " - "delimited list of words representing a distinct dictionary.", - default=None, - ) - parser_table.add_argument( - "--custom_regexes", - action="append", - help="Strings representing regex patterns to search for as custom " - " info types.", - default=None, - ) - parser_table.add_argument( - "--min_likelihood", - choices=[ - "LIKELIHOOD_UNSPECIFIED", - "VERY_UNLIKELY", - "UNLIKELY", - "POSSIBLE", - "LIKELY", - "VERY_LIKELY", - ], - help="A string representing the minimum likelihood threshold that " - "constitutes a match.", - ) - parser_table.add_argument( - "--max_findings", - type=int, - help="The maximum number of findings to report; 0 = no maximum.", - ) - parser_table.add_argument( - "--include_quote", - type=bool, - help="A boolean for whether to display a quote of the detected " - "information in the results.", - default=True, - ) - - parser_file = subparsers.add_parser("file", help="Inspect a local file.") - parser_file.add_argument("filename", help="The path to the file to inspect.") - parser_file.add_argument( - "--project", - help="The Google Cloud project id to use as a parent resource.", - default=default_project, - ) - parser_file.add_argument( - "--info_types", - action="append", - help="Strings representing info types to look for. A full list of " - "info categories and types is available from the API. Examples " - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' - "If unspecified, the three above examples will be used.", - default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], - ) - parser_file.add_argument( - "--custom_dictionaries", - action="append", - help="Strings representing comma-delimited lists of dictionary words" - " to search for as custom info types. Each string is a comma " - "delimited list of words representing a distinct dictionary.", - default=None, - ) - parser_file.add_argument( - "--custom_regexes", - action="append", - help="Strings representing regex patterns to search for as custom " - " info types.", - default=None, - ) - parser_file.add_argument( - "--min_likelihood", - choices=[ - "LIKELIHOOD_UNSPECIFIED", - "VERY_UNLIKELY", - "UNLIKELY", - "POSSIBLE", - "LIKELY", - "VERY_LIKELY", - ], - help="A string representing the minimum likelihood threshold that " - "constitutes a match.", - ) - parser_file.add_argument( - "--max_findings", - type=int, - help="The maximum number of findings to report; 0 = no maximum.", - ) - parser_file.add_argument( - "--include_quote", - type=bool, - help="A boolean for whether to display a quote of the detected " - "information in the results.", - default=True, - ) - parser_file.add_argument( - "--mime_type", - help="The MIME type of the file. If not specified, the type is " - "inferred via the Python standard library's mimetypes module.", - ) - - parser_gcs = subparsers.add_parser( - "gcs", help="Inspect files on Google Cloud Storage." - ) - parser_gcs.add_argument( - "bucket", help="The name of the GCS bucket containing the file." - ) - parser_gcs.add_argument( - "filename", - help="The name of the file in the bucket, including the path, e.g. " - '"images/myfile.png". 
Wildcards are permitted.', - ) - parser_gcs.add_argument( - "topic_id", - help="The id of the Cloud Pub/Sub topic to use to report that the job " - 'is complete, e.g. "dlp-sample-topic".', - ) - parser_gcs.add_argument( - "subscription_id", - help="The id of the Cloud Pub/Sub subscription to monitor for job " - 'completion, e.g. "dlp-sample-subscription". The subscription must ' - "already be subscribed to the topic. See the test files or the Cloud " - "Pub/Sub sample files for examples on how to create the subscription.", - ) - parser_gcs.add_argument( - "--project", - help="The Google Cloud project id to use as a parent resource.", - default=default_project, - ) - parser_gcs.add_argument( - "--info_types", - action="append", - help="Strings representing info types to look for. A full list of " - "info categories and types is available from the API. Examples " - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' - "If unspecified, the three above examples will be used.", - default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], - ) - parser_gcs.add_argument( - "--custom_dictionaries", - action="append", - help="Strings representing comma-delimited lists of dictionary words" - " to search for as custom info types. Each string is a comma " - "delimited list of words representing a distinct dictionary.", - default=None, - ) - parser_gcs.add_argument( - "--custom_regexes", - action="append", - help="Strings representing regex patterns to search for as custom " - " info types.", - default=None, - ) - parser_gcs.add_argument( - "--min_likelihood", - choices=[ - "LIKELIHOOD_UNSPECIFIED", - "VERY_UNLIKELY", - "UNLIKELY", - "POSSIBLE", - "LIKELY", - "VERY_LIKELY", - ], - help="A string representing the minimum likelihood threshold that " - "constitutes a match.", - ) - parser_gcs.add_argument( - "--max_findings", - type=int, - help="The maximum number of findings to report; 0 = no maximum.", - ) - parser_gcs.add_argument( - "--timeout", - type=int, - help="The maximum number of seconds to wait for a response from the " - "API. The default is 300 seconds.", - default=300, - ) - - parser_datastore = subparsers.add_parser( - "datastore", help="Inspect files on Google Datastore." - ) - parser_datastore.add_argument( - "datastore_project", - help="The Google Cloud project id of the target Datastore.", - ) - parser_datastore.add_argument( - "kind", - help='The kind of the Datastore entity to inspect, e.g. "Person".', - ) - parser_datastore.add_argument( - "topic_id", - help="The id of the Cloud Pub/Sub topic to use to report that the job " - 'is complete, e.g. "dlp-sample-topic".', - ) - parser_datastore.add_argument( - "subscription_id", - help="The id of the Cloud Pub/Sub subscription to monitor for job " - 'completion, e.g. "dlp-sample-subscription". The subscription must ' - "already be subscribed to the topic. See the test files or the Cloud " - "Pub/Sub sample files for examples on how to create the subscription.", - ) - parser_datastore.add_argument( - "--project", - help="The Google Cloud project id to use as a parent resource.", - default=default_project, - ) - parser_datastore.add_argument( - "--info_types", - action="append", - help="Strings representing info types to look for. A full list of " - "info categories and types is available from the API. Examples " - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". 
' - "If unspecified, the three above examples will be used.", - default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], - ) - parser_datastore.add_argument( - "--custom_dictionaries", - action="append", - help="Strings representing comma-delimited lists of dictionary words" - " to search for as custom info types. Each string is a comma " - "delimited list of words representing a distinct dictionary.", - default=None, - ) - parser_datastore.add_argument( - "--custom_regexes", - action="append", - help="Strings representing regex patterns to search for as custom " - " info types.", - default=None, - ) - parser_datastore.add_argument( - "--namespace_id", help="The Datastore namespace to use, if applicable." - ) - parser_datastore.add_argument( - "--min_likelihood", - choices=[ - "LIKELIHOOD_UNSPECIFIED", - "VERY_UNLIKELY", - "UNLIKELY", - "POSSIBLE", - "LIKELY", - "VERY_LIKELY", - ], - help="A string representing the minimum likelihood threshold that " - "constitutes a match.", - ) - parser_datastore.add_argument( - "--max_findings", - type=int, - help="The maximum number of findings to report; 0 = no maximum.", - ) - parser_datastore.add_argument( - "--timeout", - type=int, - help="The maximum number of seconds to wait for a response from the " - "API. The default is 300 seconds.", - default=300, - ) - - parser_bigquery = subparsers.add_parser( - "bigquery", help="Inspect files on Google BigQuery." - ) - parser_bigquery.add_argument( - "bigquery_project", - help="The Google Cloud project id of the target table.", - ) - parser_bigquery.add_argument( - "dataset_id", help="The ID of the target BigQuery dataset." - ) - parser_bigquery.add_argument( - "table_id", help="The ID of the target BigQuery table." - ) - parser_bigquery.add_argument( - "topic_id", - help="The id of the Cloud Pub/Sub topic to use to report that the job " - 'is complete, e.g. "dlp-sample-topic".', - ) - parser_bigquery.add_argument( - "subscription_id", - help="The id of the Cloud Pub/Sub subscription to monitor for job " - 'completion, e.g. "dlp-sample-subscription". The subscription must ' - "already be subscribed to the topic. See the test files or the Cloud " - "Pub/Sub sample files for examples on how to create the subscription.", - ) - parser_bigquery.add_argument( - "--project", - help="The Google Cloud project id to use as a parent resource.", - default=default_project, - ) - parser_bigquery.add_argument( - "--info_types", - nargs="+", - help="Strings representing info types to look for. A full list of " - "info categories and types is available from the API. Examples " - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' - "If unspecified, the three above examples will be used.", - default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], - ) - parser_bigquery.add_argument( - "--custom_dictionaries", - action="append", - help="Strings representing comma-delimited lists of dictionary words" - " to search for as custom info types. 
Each string is a comma " - "delimited list of words representing a distinct dictionary.", - default=None, - ) - parser_bigquery.add_argument( - "--custom_regexes", - action="append", - help="Strings representing regex patterns to search for as custom " - " info types.", - default=None, - ) - parser_bigquery.add_argument( - "--min_likelihood", - choices=[ - "LIKELIHOOD_UNSPECIFIED", - "VERY_UNLIKELY", - "UNLIKELY", - "POSSIBLE", - "LIKELY", - "VERY_LIKELY", - ], - help="A string representing the minimum likelihood threshold that " - "constitutes a match.", - ) - parser_bigquery.add_argument( - "--max_findings", - type=int, - help="The maximum number of findings to report; 0 = no maximum.", - ) - parser_bigquery.add_argument( - "--timeout", - type=int, - help="The maximum number of seconds to wait for a response from the " - "API. The default is 300 seconds.", - default=300, - ) - - args = parser.parse_args() - - if args.content == "string": - inspect_string( - args.project, - args.item, - args.info_types, - custom_dictionaries=args.custom_dictionaries, - custom_regexes=args.custom_regexes, - min_likelihood=args.min_likelihood, - max_findings=args.max_findings, - include_quote=args.include_quote, - ) - elif args.content == "table": - inspect_table( - args.project, - args.data, - args.info_types, - custom_dictionaries=args.custom_dictionaries, - custom_regexes=args.custom_regexes, - min_likelihood=args.min_likelihood, - max_findings=args.max_findings, - include_quote=args.include_quote, - ) - elif args.content == "file": - inspect_file( - args.project, - args.filename, - args.info_types, - custom_dictionaries=args.custom_dictionaries, - custom_regexes=args.custom_regexes, - min_likelihood=args.min_likelihood, - max_findings=args.max_findings, - include_quote=args.include_quote, - mime_type=args.mime_type, - ) - elif args.content == "gcs": - inspect_gcs_file( - args.project, - args.bucket, - args.filename, - args.topic_id, - args.subscription_id, - args.info_types, - custom_dictionaries=args.custom_dictionaries, - custom_regexes=args.custom_regexes, - min_likelihood=args.min_likelihood, - max_findings=args.max_findings, - timeout=args.timeout, - ) - elif args.content == "datastore": - inspect_datastore( - args.project, - args.datastore_project, - args.kind, - args.topic_id, - args.subscription_id, - args.info_types, - custom_dictionaries=args.custom_dictionaries, - custom_regexes=args.custom_regexes, - namespace_id=args.namespace_id, - min_likelihood=args.min_likelihood, - max_findings=args.max_findings, - timeout=args.timeout, - ) - elif args.content == "bigquery": - inspect_bigquery( - args.project, - args.bigquery_project, - args.dataset_id, - args.table_id, - args.topic_id, - args.subscription_id, - args.info_types, - custom_dictionaries=args.custom_dictionaries, - custom_regexes=args.custom_regexes, - min_likelihood=args.min_likelihood, - max_findings=args.max_findings, - timeout=args.timeout, - ) diff --git a/samples/snippets/inspect_content_test.py b/samples/snippets/inspect_content_test.py deleted file mode 100644 index 564f5b9f..00000000 --- a/samples/snippets/inspect_content_test.py +++ /dev/null @@ -1,483 +0,0 @@ -# Copyright 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the 'License'); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import uuid - -import backoff -import google.api_core.exceptions -from google.api_core.exceptions import ServiceUnavailable -import google.cloud.bigquery -import google.cloud.datastore -import google.cloud.dlp_v2 -import google.cloud.exceptions -import google.cloud.pubsub -import google.cloud.storage -import pytest - -import inspect_content - -UNIQUE_STRING = str(uuid.uuid4()).split("-")[0] - -GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") -TEST_BUCKET_NAME = GCLOUD_PROJECT + "-dlp-python-client-test" + UNIQUE_STRING -RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), "resources") -RESOURCE_FILE_NAMES = ["test.txt", "test.png", "harmless.txt", "accounts.txt"] -TOPIC_ID = "dlp-test" + UNIQUE_STRING -SUBSCRIPTION_ID = "dlp-test-subscription" + UNIQUE_STRING -DATASTORE_KIND = "DLP test kind" -DATASTORE_NAME = "DLP test object" + UNIQUE_STRING -BIGQUERY_DATASET_ID = "dlp_test_dataset" + UNIQUE_STRING -BIGQUERY_TABLE_ID = "dlp_test_table" + UNIQUE_STRING - -TIMEOUT = 900 # 15 minutes - - -@pytest.fixture(scope="module") -def bucket(): - # Creates a GCS bucket, uploads files required for the test, and tears down - # the entire bucket afterwards. - - client = google.cloud.storage.Client() - try: - bucket = client.get_bucket(TEST_BUCKET_NAME) - except google.cloud.exceptions.NotFound: - bucket = client.create_bucket(TEST_BUCKET_NAME) - - # Upoad the blobs and keep track of them in a list. - blobs = [] - for name in RESOURCE_FILE_NAMES: - path = os.path.join(RESOURCE_DIRECTORY, name) - blob = bucket.blob(name) - blob.upload_from_filename(path) - blobs.append(blob) - - # Yield the object to the test; lines after this execute as a teardown. - yield bucket - - # Delete the files. - for blob in blobs: - try: - blob.delete() - except google.cloud.exceptions.NotFound: - print("Issue during teardown, missing blob") - - # Attempt to delete the bucket; this will only work if it is empty. - bucket.delete() - - -@pytest.fixture(scope="module") -def topic_id(): - # Creates a pubsub topic, and tears it down. - publisher = google.cloud.pubsub.PublisherClient() - topic_path = publisher.topic_path(GCLOUD_PROJECT, TOPIC_ID) - try: - publisher.create_topic(request={"name": topic_path}) - except google.api_core.exceptions.AlreadyExists: - pass - - yield TOPIC_ID - - publisher.delete_topic(request={"topic": topic_path}) - - -@pytest.fixture(scope="module") -def subscription_id(topic_id): - # Subscribes to a topic. - subscriber = google.cloud.pubsub.SubscriberClient() - topic_path = subscriber.topic_path(GCLOUD_PROJECT, topic_id) - subscription_path = subscriber.subscription_path(GCLOUD_PROJECT, SUBSCRIPTION_ID) - try: - subscriber.create_subscription( - request={"name": subscription_path, "topic": topic_path} - ) - except google.api_core.exceptions.AlreadyExists: - pass - - yield SUBSCRIPTION_ID - - subscriber.delete_subscription(request={"subscription": subscription_path}) - - -@pytest.fixture(scope="module") -def datastore_project(): - # Adds test Datastore data, yields the project ID and then tears down. 
- datastore_client = google.cloud.datastore.Client() - - kind = DATASTORE_KIND - name = DATASTORE_NAME - key = datastore_client.key(kind, name) - item = google.cloud.datastore.Entity(key=key) - item["payload"] = "My name is Gary Smith and my email is gary@example.com" - - datastore_client.put(item) - - yield GCLOUD_PROJECT - - @backoff.on_exception(backoff.expo, ServiceUnavailable, max_time=120) - def cleanup(): - datastore_client.delete(key) - - cleanup() - - -@pytest.fixture(scope="module") -def bigquery_project(): - # Adds test Bigquery data, yields the project ID and then tears down. - bigquery_client = google.cloud.bigquery.Client() - - dataset_ref = bigquery_client.dataset(BIGQUERY_DATASET_ID) - dataset = google.cloud.bigquery.Dataset(dataset_ref) - try: - dataset = bigquery_client.create_dataset(dataset) - except google.api_core.exceptions.Conflict: - dataset = bigquery_client.get_dataset(dataset) - - table_ref = dataset_ref.table(BIGQUERY_TABLE_ID) - table = google.cloud.bigquery.Table(table_ref) - - # DO NOT SUBMIT: trim this down once we find out what works - table.schema = ( - google.cloud.bigquery.SchemaField("Name", "STRING"), - google.cloud.bigquery.SchemaField("Comment", "STRING"), - ) - - try: - table = bigquery_client.create_table(table) - except google.api_core.exceptions.Conflict: - table = bigquery_client.get_table(table) - - rows_to_insert = [("Gary Smith", "My email is gary@example.com")] - - bigquery_client.insert_rows(table, rows_to_insert) - - yield GCLOUD_PROJECT - - @backoff.on_exception(backoff.expo, ServiceUnavailable, max_time=120) - def cleanup(): - bigquery_client.delete_dataset(dataset_ref, delete_contents=True) - - cleanup() - - -def test_inspect_string_basic(capsys): - test_string = "String with a phone number: 234-555-6789" - - inspect_content.inspect_string_basic(GCLOUD_PROJECT, test_string) - - out, _ = capsys.readouterr() - assert "Info type: PHONE_NUMBER" in out - assert "Quote: 234-555-6789" in out - - -def test_inspect_string(capsys): - test_string = "My name is Gary Smith and my email is gary@example.com" - - inspect_content.inspect_string( - GCLOUD_PROJECT, - test_string, - ["FIRST_NAME", "EMAIL_ADDRESS"], - include_quote=True, - ) - - out, _ = capsys.readouterr() - assert "Info type: FIRST_NAME" in out - assert "Info type: EMAIL_ADDRESS" in out - - -def test_inspect_table(capsys): - test_tabular_data = { - "header": ["email", "phone number"], - "rows": [ - ["robertfrost@xyz.com", "4232342345"], - ["johndoe@pqr.com", "4253458383"], - ], - } - - inspect_content.inspect_table( - GCLOUD_PROJECT, - test_tabular_data, - ["PHONE_NUMBER", "EMAIL_ADDRESS"], - include_quote=True, - ) - - out, _ = capsys.readouterr() - assert "Info type: PHONE_NUMBER" in out - assert "Info type: EMAIL_ADDRESS" in out - - -def test_inspect_string_with_custom_info_types(capsys): - test_string = "My name is Gary Smith and my email is gary@example.com" - dictionaries = ["Gary Smith"] - regexes = ["\\w+@\\w+.com"] - - inspect_content.inspect_string( - GCLOUD_PROJECT, - test_string, - [], - custom_dictionaries=dictionaries, - custom_regexes=regexes, - include_quote=True, - ) - - out, _ = capsys.readouterr() - assert "Info type: CUSTOM_DICTIONARY_0" in out - assert "Info type: CUSTOM_REGEX_0" in out - - -def test_inspect_string_no_results(capsys): - test_string = "Nothing to see here" - - inspect_content.inspect_string( - GCLOUD_PROJECT, - test_string, - ["FIRST_NAME", "EMAIL_ADDRESS"], - include_quote=True, - ) - - out, _ = capsys.readouterr() - assert "No findings" in out - - 
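The CUSTOM_DICTIONARY_0 and CUSTOM_REGEX_0 names asserted above come from the positional naming convention the samples use when custom infoType detectors are built from the supplied lists. A minimal sketch of the request shape involved is below; inspect_content.py itself is not part of this hunk, so the helper name and argument handling are illustrative assumptions rather than the sample's actual code:

    import google.cloud.dlp_v2


    def inspect_string_sketch(project, content, dictionaries=None, regexes=None):
        """Illustrative only: inspect a string using positional custom infoTypes."""
        dlp = google.cloud.dlp_v2.DlpServiceClient()

        # Custom detectors are named by their position in the supplied lists,
        # which is why the tests above expect CUSTOM_DICTIONARY_0 / CUSTOM_REGEX_0.
        custom_info_types = []
        for i, words in enumerate(dictionaries or []):
            custom_info_types.append(
                {
                    "info_type": {"name": f"CUSTOM_DICTIONARY_{i}"},
                    "dictionary": {"word_list": {"words": words.split(",")}},
                }
            )
        for i, pattern in enumerate(regexes or []):
            custom_info_types.append(
                {"info_type": {"name": f"CUSTOM_REGEX_{i}"}, "regex": {"pattern": pattern}}
            )

        inspect_config = {
            "custom_info_types": custom_info_types,
            "include_quote": True,
        }
        response = dlp.inspect_content(
            request={
                "parent": f"projects/{project}",
                "inspect_config": inspect_config,
                "item": {"value": content},
            }
        )
        for finding in response.result.findings:
            print(f"Quote: {finding.quote}")
            print(f"Info type: {finding.info_type.name}")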
-def test_inspect_file(capsys): - test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.txt") - - inspect_content.inspect_file( - GCLOUD_PROJECT, - test_filepath, - ["FIRST_NAME", "EMAIL_ADDRESS"], - include_quote=True, - ) - - out, _ = capsys.readouterr() - assert "Info type: EMAIL_ADDRESS" in out - - -def test_inspect_file_with_custom_info_types(capsys): - test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.txt") - dictionaries = ["gary@somedomain.com"] - regexes = ["\\(\\d{3}\\) \\d{3}-\\d{4}"] - - inspect_content.inspect_file( - GCLOUD_PROJECT, - test_filepath, - [], - custom_dictionaries=dictionaries, - custom_regexes=regexes, - include_quote=True, - ) - - out, _ = capsys.readouterr() - assert "Info type: CUSTOM_DICTIONARY_0" in out - assert "Info type: CUSTOM_REGEX_0" in out - - -def test_inspect_file_no_results(capsys): - test_filepath = os.path.join(RESOURCE_DIRECTORY, "harmless.txt") - - inspect_content.inspect_file( - GCLOUD_PROJECT, - test_filepath, - ["FIRST_NAME", "EMAIL_ADDRESS"], - include_quote=True, - ) - - out, _ = capsys.readouterr() - assert "No findings" in out - - -def test_inspect_image_file(capsys): - test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.png") - - inspect_content.inspect_file( - GCLOUD_PROJECT, - test_filepath, - ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], - include_quote=True, - ) - - out, _ = capsys.readouterr() - assert "Info type: PHONE_NUMBER" in out - - -def cancel_operation(out): - if "Inspection operation started" in out: - # Cancel the operation - operation_id = out.split("Inspection operation started: ")[1].split("\n")[0] - client = google.cloud.dlp_v2.DlpServiceClient() - client.cancel_dlp_job(request={"name": operation_id}) - - -@pytest.mark.flaky(max_runs=2, min_passes=1) -def test_inspect_gcs_file(bucket, topic_id, subscription_id, capsys): - try: - inspect_content.inspect_gcs_file( - GCLOUD_PROJECT, - bucket.name, - "test.txt", - topic_id, - subscription_id, - ["EMAIL_ADDRESS", "PHONE_NUMBER"], - timeout=TIMEOUT, - ) - - out, _ = capsys.readouterr() - assert "Info type: EMAIL_ADDRESS" in out - finally: - cancel_operation(out) - - -@pytest.mark.flaky(max_runs=2, min_passes=1) -def test_inspect_gcs_file_with_custom_info_types( - bucket, topic_id, subscription_id, capsys -): - try: - dictionaries = ["gary@somedomain.com"] - regexes = ["\\(\\d{3}\\) \\d{3}-\\d{4}"] - - inspect_content.inspect_gcs_file( - GCLOUD_PROJECT, - bucket.name, - "test.txt", - topic_id, - subscription_id, - [], - custom_dictionaries=dictionaries, - custom_regexes=regexes, - timeout=TIMEOUT, - ) - - out, _ = capsys.readouterr() - - assert "Info type: EMAIL_ADDRESS" in out - finally: - cancel_operation(out) - - -@pytest.mark.flaky(max_runs=2, min_passes=1) -def test_inspect_gcs_file_no_results(bucket, topic_id, subscription_id, capsys): - try: - inspect_content.inspect_gcs_file( - GCLOUD_PROJECT, - bucket.name, - "harmless.txt", - topic_id, - subscription_id, - ["EMAIL_ADDRESS", "PHONE_NUMBER"], - timeout=TIMEOUT, - ) - - out, _ = capsys.readouterr() - - assert "No findings" in out - finally: - cancel_operation(out) - - -@pytest.mark.flaky(max_runs=2, min_passes=1) -def test_inspect_gcs_image_file(bucket, topic_id, subscription_id, capsys): - try: - inspect_content.inspect_gcs_file( - GCLOUD_PROJECT, - bucket.name, - "test.png", - topic_id, - subscription_id, - ["EMAIL_ADDRESS", "PHONE_NUMBER"], - timeout=TIMEOUT, - ) - - out, _ = capsys.readouterr() - assert "Info type: EMAIL_ADDRESS" in out - finally: - cancel_operation(out) - - 
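The cancel_operation helper above recovers the job resource name from the captured output so that long-running Cloud Storage inspection jobs are not left running after an assertion fails. The same request works anywhere a job name is already known; the wrapper below is a hypothetical convenience, but the cancel_dlp_job call matches the helper above:

    import google.cloud.dlp_v2


    def cancel_job_by_name(job_resource_name):
        """Cancel a running DLP job given a name like projects/<project>/dlpJobs/<job id>."""
        dlp = google.cloud.dlp_v2.DlpServiceClient()
        dlp.cancel_dlp_job(request={"name": job_resource_name})
        print(f"Requested cancellation of {job_resource_name}")

Cancellation is asynchronous; once a job has stopped, it can be removed entirely with delete_dlp_job, as jobs.py (deleted further down in this change) shows.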
-@pytest.mark.flaky(max_runs=2, min_passes=1) -def test_inspect_gcs_multiple_files(bucket, topic_id, subscription_id, capsys): - try: - inspect_content.inspect_gcs_file( - GCLOUD_PROJECT, - bucket.name, - "*", - topic_id, - subscription_id, - ["EMAIL_ADDRESS", "PHONE_NUMBER"], - timeout=TIMEOUT, - ) - - out, _ = capsys.readouterr() - - assert "Info type: EMAIL_ADDRESS" in out - finally: - cancel_operation(out) - - -@pytest.mark.flaky(max_runs=2, min_passes=1) -def test_inspect_datastore(datastore_project, topic_id, subscription_id, capsys): - try: - inspect_content.inspect_datastore( - GCLOUD_PROJECT, - datastore_project, - DATASTORE_KIND, - topic_id, - subscription_id, - ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], - timeout=TIMEOUT, - ) - - out, _ = capsys.readouterr() - assert "Info type: EMAIL_ADDRESS" in out - finally: - cancel_operation(out) - - -@pytest.mark.flaky(max_runs=2, min_passes=1) -def test_inspect_datastore_no_results( - datastore_project, topic_id, subscription_id, capsys -): - try: - inspect_content.inspect_datastore( - GCLOUD_PROJECT, - datastore_project, - DATASTORE_KIND, - topic_id, - subscription_id, - ["PHONE_NUMBER"], - timeout=TIMEOUT, - ) - - out, _ = capsys.readouterr() - assert "No findings" in out - finally: - cancel_operation(out) - - -def test_inspect_bigquery(bigquery_project, topic_id, subscription_id, capsys): - try: - inspect_content.inspect_bigquery( - GCLOUD_PROJECT, - bigquery_project, - BIGQUERY_DATASET_ID, - BIGQUERY_TABLE_ID, - topic_id, - subscription_id, - ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], - timeout=1, - ) - - out, _ = capsys.readouterr() - assert "Inspection operation started" in out - finally: - cancel_operation(out) diff --git a/samples/snippets/jobs.py b/samples/snippets/jobs.py deleted file mode 100644 index 0bf77104..00000000 --- a/samples/snippets/jobs.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Sample app to list and delete DLP jobs using the Data Loss Prevent API. """ - -from __future__ import print_function - -import argparse - - -# [START dlp_list_jobs] -def list_dlp_jobs(project, filter_string=None, job_type=None): - """Uses the Data Loss Prevention API to lists DLP jobs that match the - specified filter in the request. - Args: - project: The project id to use as a parent resource. - filter: (Optional) Allows filtering. - Supported syntax: - * Filter expressions are made up of one or more restrictions. - * Restrictions can be combined by 'AND' or 'OR' logical operators. - A sequence of restrictions implicitly uses 'AND'. - * A restriction has the form of ' '. - * Supported fields/values for inspect jobs: - - `state` - PENDING|RUNNING|CANCELED|FINISHED|FAILED - - `inspected_storage` - DATASTORE|CLOUD_STORAGE|BIGQUERY - - `trigger_name` - The resource name of the trigger that - created job. - * Supported fields for risk analysis jobs: - - `state` - RUNNING|CANCELED|FINISHED|FAILED - * The operator must be '=' or '!='. 
- Examples: - * inspected_storage = cloud_storage AND state = done - * inspected_storage = cloud_storage OR inspected_storage = bigquery - * inspected_storage = cloud_storage AND - (state = done OR state = canceled) - type: (Optional) The type of job. Defaults to 'INSPECT'. - Choices: - DLP_JOB_TYPE_UNSPECIFIED - INSPECT_JOB: The job inspected content for sensitive data. - RISK_ANALYSIS_JOB: The job executed a Risk Analysis computation. - - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library. - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Job type dictionary - job_type_to_int = { - "DLP_JOB_TYPE_UNSPECIFIED": google.cloud.dlp.DlpJobType.DLP_JOB_TYPE_UNSPECIFIED, - "INSPECT_JOB": google.cloud.dlp.DlpJobType.INSPECT_JOB, - "RISK_ANALYSIS_JOB": google.cloud.dlp.DlpJobType.RISK_ANALYSIS_JOB, - } - # If job type is specified, convert job type to number through enums. - if job_type: - job_type = job_type_to_int[job_type] - - # Call the API to get a list of jobs. - response = dlp.list_dlp_jobs( - request={"parent": parent, "filter": filter_string, "type_": job_type} - ) - - # Iterate over results. - for job in response: - print("Job: %s; status: %s" % (job.name, job.state.name)) - - -# [END dlp_list_jobs] - - -# [START dlp_delete_job] -def delete_dlp_job(project, job_name): - """Uses the Data Loss Prevention API to delete a long-running DLP job. - Args: - project: The project id to use as a parent resource. - job_name: The name of the DlpJob resource to be deleted. - - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library. - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Convert the project id and job name into a full resource id. - name = f"projects/{project}/dlpJobs/{job_name}" - - # Call the API to delete job. - dlp.delete_dlp_job(request={"name": name}) - - print("Successfully deleted %s" % job_name) - - -# [END dlp_delete_job] - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description=__doc__) - subparsers = parser.add_subparsers( - dest="content", help="Select how to submit content to the API." - ) - subparsers.required = True - - list_parser = subparsers.add_parser( - "list", - help="List Data Loss Prevention API jobs corresponding to a given " "filter.", - ) - list_parser.add_argument( - "project", help="The project id to use as a parent resource." - ) - list_parser.add_argument( - "-f", - "--filter", - help="Filter expressions are made up of one or more restrictions.", - ) - list_parser.add_argument( - "-t", - "--type", - choices=["DLP_JOB_TYPE_UNSPECIFIED", "INSPECT_JOB", "RISK_ANALYSIS_JOB"], - help='The type of job. API defaults to "INSPECT"', - ) - - delete_parser = subparsers.add_parser( - "delete", help="Delete results of a Data Loss Prevention API job." - ) - delete_parser.add_argument( - "project", help="The project id to use as a parent resource." - ) - delete_parser.add_argument( - "job_name", - help="The name of the DlpJob resource to be deleted. 
" "Example: X-#####", - ) - - args = parser.parse_args() - - if args.content == "list": - list_dlp_jobs(args.project, filter_string=args.filter, job_type=args.type) - elif args.content == "delete": - delete_dlp_job(args.project, args.job_name) diff --git a/samples/snippets/jobs_test.py b/samples/snippets/jobs_test.py deleted file mode 100644 index 361118d4..00000000 --- a/samples/snippets/jobs_test.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the 'License'); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import uuid - -import pytest - -import jobs - -GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") -TEST_COLUMN_NAME = "zip_code" -TEST_TABLE_PROJECT_ID = "bigquery-public-data" -TEST_DATASET_ID = "san_francisco" -TEST_TABLE_ID = "bikeshare_trips" -test_job_id = "test-job-{}".format(uuid.uuid4()) - - -@pytest.fixture(scope="module") -def test_job_name(): - import google.cloud.dlp - - dlp = google.cloud.dlp_v2.DlpServiceClient() - - parent = f"projects/{GCLOUD_PROJECT}" - - # Construct job request - risk_job = { - "privacy_metric": { - "categorical_stats_config": {"field": {"name": TEST_COLUMN_NAME}} - }, - "source_table": { - "project_id": TEST_TABLE_PROJECT_ID, - "dataset_id": TEST_DATASET_ID, - "table_id": TEST_TABLE_ID, - }, - } - - response = dlp.create_dlp_job( - request={"parent": parent, "risk_job": risk_job, "job_id": test_job_id} - ) - full_path = response.name - # API expects only job name, not full project path - job_name = full_path[full_path.rfind("/") + 1 :] - yield job_name - - # clean up job if not deleted - try: - dlp.delete_dlp_job(request={"name": full_path}) - except google.api_core.exceptions.NotFound: - print("Issue during teardown, missing job") - - -def test_list_dlp_jobs(test_job_name, capsys): - jobs.list_dlp_jobs(GCLOUD_PROJECT) - - out, _ = capsys.readouterr() - assert test_job_name not in out - - -def test_list_dlp_jobs_with_filter(test_job_name, capsys): - jobs.list_dlp_jobs( - GCLOUD_PROJECT, - filter_string="state=RUNNING OR state=DONE", - job_type="RISK_ANALYSIS_JOB", - ) - - out, _ = capsys.readouterr() - assert test_job_name in out - - -def test_list_dlp_jobs_with_job_type(test_job_name, capsys): - jobs.list_dlp_jobs(GCLOUD_PROJECT, job_type="INSPECT_JOB") - - out, _ = capsys.readouterr() - assert test_job_name not in out # job created is a risk analysis job - - -def test_delete_dlp_job(test_job_name, capsys): - jobs.delete_dlp_job(GCLOUD_PROJECT, test_job_name) diff --git a/samples/snippets/metadata.py b/samples/snippets/metadata.py deleted file mode 100644 index 0782a6a0..00000000 --- a/samples/snippets/metadata.py +++ /dev/null @@ -1,72 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Sample app that queries the Data Loss Prevention API for supported -categories and info types.""" - -from __future__ import print_function - -import argparse - - -# [START dlp_list_info_types] -def list_info_types(language_code=None, result_filter=None): - """List types of sensitive information within a category. - Args: - language_code: The BCP-47 language code to use, e.g. 'en-US'. - filter: An optional filter to only return info types supported by - certain parts of the API. Defaults to "supported_by=INSPECT". - Returns: - None; the response from the API is printed to the terminal. - """ - # Import the client library - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Make the API call. - response = dlp.list_info_types( - request={"parent": language_code, "filter": result_filter} - ) - - # Print the results to the console. - print("Info types:") - for info_type in response.info_types: - print( - "{name}: {display_name}".format( - name=info_type.name, display_name=info_type.display_name - ) - ) - - -# [END dlp_list_info_types] - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--language_code", - help="The BCP-47 language code to use, e.g. 'en-US'.", - ) - parser.add_argument( - "--filter", - help="An optional filter to only return info types supported by " - 'certain parts of the API. Defaults to "supported_by=INSPECT".', - ) - - args = parser.parse_args() - - list_info_types(language_code=args.language_code, result_filter=args.filter) diff --git a/samples/snippets/metadata_test.py b/samples/snippets/metadata_test.py deleted file mode 100644 index bde63fd3..00000000 --- a/samples/snippets/metadata_test.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the 'License'); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import metadata - - -def test_fetch_info_types(capsys): - metadata.list_info_types() - - out, _ = capsys.readouterr() - assert "EMAIL_ADDRESS" in out diff --git a/samples/snippets/noxfile.py b/samples/snippets/noxfile.py deleted file mode 100644 index de104dbc..00000000 --- a/samples/snippets/noxfile.py +++ /dev/null @@ -1,292 +0,0 @@ -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import glob -import os -from pathlib import Path -import sys -from typing import Callable, Dict, Optional - -import nox - -# WARNING - WARNING - WARNING - WARNING - WARNING -# WARNING - WARNING - WARNING - WARNING - WARNING -# DO NOT EDIT THIS FILE EVER! -# WARNING - WARNING - WARNING - WARNING - WARNING -# WARNING - WARNING - WARNING - WARNING - WARNING - -BLACK_VERSION = "black==22.3.0" -ISORT_VERSION = "isort==5.10.1" - -# Copy `noxfile_config.py` to your directory and modify it instead. - -# `TEST_CONFIG` dict is a configuration hook that allows users to -# modify the test configurations. The values here should be in sync -# with `noxfile_config.py`. Users will copy `noxfile_config.py` into -# their directory and modify it. - -TEST_CONFIG = { - # You can opt out from the test for specific Python versions. - "ignored_versions": [], - # Old samples are opted out of enforcing Python type hints - # All new samples should feature them - "enforce_type_hints": False, - # An envvar key for determining the project id to use. Change it - # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a - # build specific Cloud project. You can also use your own string - # to use your own Cloud project. - "gcloud_project_env": "GOOGLE_CLOUD_PROJECT", - # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT', - # If you need to use a specific version of pip, - # change pip_version_override to the string representation - # of the version number, for example, "20.2.4" - "pip_version_override": None, - # A dictionary you want to inject into your test. Don't put any - # secrets here. These values will override predefined values. - "envs": {}, -} - - -try: - # Ensure we can import noxfile_config in the project's directory. - sys.path.append(".") - from noxfile_config import TEST_CONFIG_OVERRIDE -except ImportError as e: - print("No user noxfile_config found: detail: {}".format(e)) - TEST_CONFIG_OVERRIDE = {} - -# Update the TEST_CONFIG with the user supplied values. -TEST_CONFIG.update(TEST_CONFIG_OVERRIDE) - - -def get_pytest_env_vars() -> Dict[str, str]: - """Returns a dict for pytest invocation.""" - ret = {} - - # Override the GCLOUD_PROJECT and the alias. - env_key = TEST_CONFIG["gcloud_project_env"] - # This should error out if not set. - ret["GOOGLE_CLOUD_PROJECT"] = os.environ[env_key] - - # Apply user supplied envs. - ret.update(TEST_CONFIG["envs"]) - return ret - - -# DO NOT EDIT - automatically generated. -# All versions used to test samples. -ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11"] - -# Any default versions that should be ignored. -IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"] - -TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS]) - -INSTALL_LIBRARY_FROM_SOURCE = os.environ.get("INSTALL_LIBRARY_FROM_SOURCE", False) in ( - "True", - "true", -) - -# Error if a python version is missing -nox.options.error_on_missing_interpreters = True - -# -# Style Checks -# - - -# Linting with flake8. 
-# -# We ignore the following rules: -# E203: whitespace before ‘:’ -# E266: too many leading ‘#’ for block comment -# E501: line too long -# I202: Additional newline in a section of imports -# -# We also need to specify the rules which are ignored by default: -# ['E226', 'W504', 'E126', 'E123', 'W503', 'E24', 'E704', 'E121'] -FLAKE8_COMMON_ARGS = [ - "--show-source", - "--builtin=gettext", - "--max-complexity=20", - "--exclude=.nox,.cache,env,lib,generated_pb2,*_pb2.py,*_pb2_grpc.py", - "--ignore=E121,E123,E126,E203,E226,E24,E266,E501,E704,W503,W504,I202", - "--max-line-length=88", -] - - -@nox.session -def lint(session: nox.sessions.Session) -> None: - if not TEST_CONFIG["enforce_type_hints"]: - session.install("flake8") - else: - session.install("flake8", "flake8-annotations") - - args = FLAKE8_COMMON_ARGS + [ - ".", - ] - session.run("flake8", *args) - - -# -# Black -# - - -@nox.session -def blacken(session: nox.sessions.Session) -> None: - """Run black. Format code to uniform standard.""" - session.install(BLACK_VERSION) - python_files = [path for path in os.listdir(".") if path.endswith(".py")] - - session.run("black", *python_files) - - -# -# format = isort + black -# - - -@nox.session -def format(session: nox.sessions.Session) -> None: - """ - Run isort to sort imports. Then run black - to format code to uniform standard. - """ - session.install(BLACK_VERSION, ISORT_VERSION) - python_files = [path for path in os.listdir(".") if path.endswith(".py")] - - # Use the --fss option to sort imports using strict alphabetical order. - # See https://pycqa.github.io/isort/docs/configuration/options.html#force-sort-within-sections - session.run("isort", "--fss", *python_files) - session.run("black", *python_files) - - -# -# Sample Tests -# - - -PYTEST_COMMON_ARGS = ["--junitxml=sponge_log.xml"] - - -def _session_tests( - session: nox.sessions.Session, post_install: Callable = None -) -> None: - # check for presence of tests - test_list = glob.glob("**/*_test.py", recursive=True) + glob.glob( - "**/test_*.py", recursive=True - ) - test_list.extend(glob.glob("**/tests", recursive=True)) - - if len(test_list) == 0: - print("No tests found, skipping directory.") - return - - if TEST_CONFIG["pip_version_override"]: - pip_version = TEST_CONFIG["pip_version_override"] - session.install(f"pip=={pip_version}") - """Runs py.test for a particular project.""" - concurrent_args = [] - if os.path.exists("requirements.txt"): - if os.path.exists("constraints.txt"): - session.install("-r", "requirements.txt", "-c", "constraints.txt") - else: - session.install("-r", "requirements.txt") - with open("requirements.txt") as rfile: - packages = rfile.read() - - if os.path.exists("requirements-test.txt"): - if os.path.exists("constraints-test.txt"): - session.install("-r", "requirements-test.txt", "-c", "constraints-test.txt") - else: - session.install("-r", "requirements-test.txt") - with open("requirements-test.txt") as rtfile: - packages += rtfile.read() - - if INSTALL_LIBRARY_FROM_SOURCE: - session.install("-e", _get_repo_root()) - - if post_install: - post_install(session) - - if "pytest-parallel" in packages: - concurrent_args.extend(["--workers", "auto", "--tests-per-worker", "auto"]) - elif "pytest-xdist" in packages: - concurrent_args.extend(["-n", "auto"]) - - session.run( - "pytest", - *(PYTEST_COMMON_ARGS + session.posargs + concurrent_args), - # Pytest will return 5 when no tests are collected. This can happen - # on travis where slow and flaky tests are excluded. 
- # See http://doc.pytest.org/en/latest/_modules/_pytest/main.html - success_codes=[0, 5], - env=get_pytest_env_vars(), - ) - - -@nox.session(python=ALL_VERSIONS) -def py(session: nox.sessions.Session) -> None: - """Runs py.test for a sample using the specified version of Python.""" - if session.python in TESTED_VERSIONS: - _session_tests(session) - else: - session.skip( - "SKIPPED: {} tests are disabled for this sample.".format(session.python) - ) - - -# -# Readmegen -# - - -def _get_repo_root() -> Optional[str]: - """Returns the root folder of the project.""" - # Get root of this repository. Assume we don't have directories nested deeper than 10 items. - p = Path(os.getcwd()) - for i in range(10): - if p is None: - break - if Path(p / ".git").exists(): - return str(p) - # .git is not available in repos cloned via Cloud Build - # setup.py is always in the library's root, so use that instead - # https://github.com/googleapis/synthtool/issues/792 - if Path(p / "setup.py").exists(): - return str(p) - p = p.parent - raise Exception("Unable to detect repository root.") - - -GENERATED_READMES = sorted([x for x in Path(".").rglob("*.rst.in")]) - - -@nox.session -@nox.parametrize("path", GENERATED_READMES) -def readmegen(session: nox.sessions.Session, path: str) -> None: - """(Re-)generates the readme for a sample.""" - session.install("jinja2", "pyyaml") - dir_ = os.path.dirname(path) - - if os.path.exists(os.path.join(dir_, "requirements.txt")): - session.install("-r", os.path.join(dir_, "requirements.txt")) - - in_file = os.path.join(dir_, "README.rst.in") - session.run( - "python", _get_repo_root() + "/scripts/readme-gen/readme_gen.py", in_file - ) diff --git a/samples/snippets/quickstart.py b/samples/snippets/quickstart.py deleted file mode 100644 index d40fac4f..00000000 --- a/samples/snippets/quickstart.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Sample app that queries the Data Loss Prevention API for supported -categories and info types.""" - -from __future__ import print_function - -import argparse -import sys - - -def quickstart(project_id): - """Demonstrates use of the Data Loss Prevention API client library.""" - - # [START dlp_quickstart] - # Import the client library - import google.cloud.dlp - - # Instantiate a client. - dlp_client = google.cloud.dlp_v2.DlpServiceClient() - - # The string to inspect - content = "Robert Frost" - - # Construct the item to inspect. - item = {"value": content} - - # The info types to search for in the content. Required. - info_types = [{"name": "FIRST_NAME"}, {"name": "LAST_NAME"}] - - # The minimum likelihood to constitute a match. Optional. - min_likelihood = google.cloud.dlp_v2.Likelihood.LIKELIHOOD_UNSPECIFIED - - # The maximum number of findings to report (0 = server maximum). Optional. - max_findings = 0 - - # Whether to include the matching string in the results. Optional. - include_quote = True - - # Construct the configuration dictionary. 
Keys which are None may - # optionally be omitted entirely. - inspect_config = { - "info_types": info_types, - "min_likelihood": min_likelihood, - "include_quote": include_quote, - "limits": {"max_findings_per_request": max_findings}, - } - - # Convert the project id into a full resource id. - parent = f"projects/{project_id}" - - # Call the API. - response = dlp_client.inspect_content( - request={"parent": parent, "inspect_config": inspect_config, "item": item} - ) - - # Print out the results. - if response.result.findings: - for finding in response.result.findings: - try: - print("Quote: {}".format(finding.quote)) - except AttributeError: - pass - print("Info type: {}".format(finding.info_type.name)) - # Convert likelihood value to string respresentation. - likelihood = finding.likelihood.name - print("Likelihood: {}".format(likelihood)) - else: - print("No findings.") - # [END dlp_quickstart] - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("project_id", help="Enter your GCP project id.", type=str) - args = parser.parse_args() - if len(sys.argv) == 1: - parser.print_usage() - sys.exit(1) - quickstart(args.project_id) diff --git a/samples/snippets/quickstart_test.py b/samples/snippets/quickstart_test.py deleted file mode 100644 index 2b113779..00000000 --- a/samples/snippets/quickstart_test.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the 'License'); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import quickstart - -GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") - - -def test_quickstart(capsys): - quickstart.quickstart(GCLOUD_PROJECT) - - out, _ = capsys.readouterr() - assert "FIRST_NAME" in out - assert "LAST_NAME" in out diff --git a/samples/snippets/redact.py b/samples/snippets/redact.py deleted file mode 100644 index e49b85f7..00000000 --- a/samples/snippets/redact.py +++ /dev/null @@ -1,259 +0,0 @@ -# Copyright 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Sample app that uses the Data Loss Prevent API to redact the contents of -an image file.""" - -from __future__ import print_function - -import argparse - -# [START dlp_redact_image] -import mimetypes - -# [END dlp_redact_image] -import os - -# [START dlp_redact_image] - - -def redact_image( - project, - filename, - output_filename, - info_types, - min_likelihood=None, - mime_type=None, -): - """Uses the Data Loss Prevention API to redact protected data in an image. - Args: - project: The Google Cloud project id to use as a parent resource. 
- filename: The path to the file to inspect. - output_filename: The path to which the redacted image will be written. - info_types: A list of strings representing info types to look for. - A full list of info type categories can be fetched from the API. - min_likelihood: A string representing the minimum likelihood threshold - that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', - 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. - mime_type: The MIME type of the file. If not specified, the type is - inferred via the Python standard library's mimetypes module. - Returns: - None; the response from the API is printed to the terminal. - """ - # Import the client library - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Prepare info_types by converting the list of strings into a list of - # dictionaries (protos are also accepted). - info_types = [{"name": info_type} for info_type in info_types] - - # Prepare image_redaction_configs, a list of dictionaries. Each dictionary - # contains an info_type and optionally the color used for the replacement. - # The color is omitted in this sample, so the default (black) will be used. - image_redaction_configs = [] - - if info_types is not None: - for info_type in info_types: - image_redaction_configs.append({"info_type": info_type}) - - # Construct the configuration dictionary. Keys which are None may - # optionally be omitted entirely. - inspect_config = { - "min_likelihood": min_likelihood, - "info_types": info_types, - } - - # If mime_type is not specified, guess it from the filename. - if mime_type is None: - mime_guess = mimetypes.MimeTypes().guess_type(filename) - mime_type = mime_guess[0] or "application/octet-stream" - - # Select the content type index from the list of supported types. - supported_content_types = { - None: 0, # "Unspecified" - "image/jpeg": 1, - "image/bmp": 2, - "image/png": 3, - "image/svg": 4, - "text/plain": 5, - } - content_type_index = supported_content_types.get(mime_type, 0) - - # Construct the byte_item, containing the file's byte data. - with open(filename, mode="rb") as f: - byte_item = {"type_": content_type_index, "data": f.read()} - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Call the API. - response = dlp.redact_image( - request={ - "parent": parent, - "inspect_config": inspect_config, - "image_redaction_configs": image_redaction_configs, - "byte_item": byte_item, - } - ) - - # Write out the results. - with open(output_filename, mode="wb") as f: - f.write(response.redacted_image) - print( - "Wrote {byte_count} to {filename}".format( - byte_count=len(response.redacted_image), filename=output_filename - ) - ) - - -# [END dlp_redact_image] - -# [START dlp_redact_image_all_text] - - -def redact_image_all_text( - project, - filename, - output_filename, -): - """Uses the Data Loss Prevention API to redact all text in an image. - - Args: - project: The Google Cloud project id to use as a parent resource. - filename: The path to the file to inspect. - output_filename: The path to which the redacted image will be written. - - Returns: - None; the response from the API is printed to the terminal. - """ - # Import the client library - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Construct the image_redaction_configs, indicating to DLP that all text in - # the input image should be redacted. 
- image_redaction_configs = [{"redact_all_text": True}] - - # Construct the byte_item, containing the file's byte data. - with open(filename, mode="rb") as f: - byte_item = {"type_": google.cloud.dlp_v2.FileType.IMAGE, "data": f.read()} - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Call the API. - response = dlp.redact_image( - request={ - "parent": parent, - "image_redaction_configs": image_redaction_configs, - "byte_item": byte_item, - } - ) - - # Write out the results. - with open(output_filename, mode="wb") as f: - f.write(response.redacted_image) - - print( - "Wrote {byte_count} to {filename}".format( - byte_count=len(response.redacted_image), filename=output_filename - ) - ) - - -# [END dlp_redact_image_all_text] - -if __name__ == "__main__": - default_project = os.environ.get("GOOGLE_CLOUD_PROJECT") - - common_args_parser = argparse.ArgumentParser(add_help=False) - common_args_parser.add_argument( - "--project", - help="The Google Cloud project id to use as a parent resource.", - default=default_project, - ) - common_args_parser.add_argument("filename", help="The path to the file to inspect.") - common_args_parser.add_argument( - "output_filename", - help="The path to which the redacted image will be written.", - ) - - parser = argparse.ArgumentParser(description=__doc__) - subparsers = parser.add_subparsers( - dest="content", help="Select which content should be redacted." - ) - subparsers.required = True - - info_types_parser = subparsers.add_parser( - "info_types", - help="Redact specific infoTypes from an image.", - parents=[common_args_parser], - ) - info_types_parser.add_argument( - "--info_types", - nargs="+", - help="Strings representing info types to look for. A full list of " - "info categories and types is available from the API. Examples " - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' - "If unspecified, the three above examples will be used.", - default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], - ) - info_types_parser.add_argument( - "--min_likelihood", - choices=[ - "LIKELIHOOD_UNSPECIFIED", - "VERY_UNLIKELY", - "UNLIKELY", - "POSSIBLE", - "LIKELY", - "VERY_LIKELY", - ], - help="A string representing the minimum likelihood threshold that " - "constitutes a match.", - ) - info_types_parser.add_argument( - "--mime_type", - help="The MIME type of the file. If not specified, the type is " - "inferred via the Python standard library's mimetypes module.", - ) - - all_text_parser = subparsers.add_parser( - "all_text", - help="Redact all text from an image. The MIME type of the file is " - "inferred via the Python standard library's mimetypes module.", - parents=[common_args_parser], - ) - - args = parser.parse_args() - - if args.content == "info_types": - redact_image( - args.project, - args.filename, - args.output_filename, - args.info_types, - min_likelihood=args.min_likelihood, - mime_type=args.mime_type, - ) - elif args.content == "all_text": - redact_image_all_text( - args.project, - args.filename, - args.output_filename, - ) diff --git a/samples/snippets/redact_test.py b/samples/snippets/redact_test.py deleted file mode 100644 index 0cce514e..00000000 --- a/samples/snippets/redact_test.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the 'License'); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import shutil -import tempfile - -import pytest - -import redact - -GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") -RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), "resources") - - -@pytest.fixture(scope="module") -def tempdir(): - tempdir = tempfile.mkdtemp() - yield tempdir - shutil.rmtree(tempdir) - - -def test_redact_image_file(tempdir, capsys): - test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.png") - output_filepath = os.path.join(tempdir, "redacted.png") - - redact.redact_image( - GCLOUD_PROJECT, - test_filepath, - output_filepath, - ["FIRST_NAME", "EMAIL_ADDRESS"], - ) - - out, _ = capsys.readouterr() - assert output_filepath in out - - -def test_redact_image_all_text(tempdir, capsys): - test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.png") - output_filepath = os.path.join(tempdir, "redacted.png") - - redact.redact_image_all_text( - GCLOUD_PROJECT, - test_filepath, - output_filepath, - ) - - out, _ = capsys.readouterr() - assert output_filepath in out diff --git a/samples/snippets/requirements-test.txt b/samples/snippets/requirements-test.txt deleted file mode 100644 index 3275b420..00000000 --- a/samples/snippets/requirements-test.txt +++ /dev/null @@ -1,4 +0,0 @@ -backoff==2.2.1 -pytest==7.2.1 -flaky==3.7.0 -mock==5.0.1 diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt deleted file mode 100644 index 7ef4f742..00000000 --- a/samples/snippets/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -google-cloud-dlp==3.12.0 -google-cloud-storage==2.7.0 -google-cloud-pubsub==2.15.0 -google-cloud-datastore==2.14.0 -google-cloud-bigquery==3.6.0 diff --git a/samples/snippets/resources/accounts.txt b/samples/snippets/resources/accounts.txt deleted file mode 100644 index 2763cd0a..00000000 --- a/samples/snippets/resources/accounts.txt +++ /dev/null @@ -1 +0,0 @@ -My credit card number is 1234 5678 9012 3456, and my CVV is 789. \ No newline at end of file diff --git a/samples/snippets/resources/dates.csv b/samples/snippets/resources/dates.csv deleted file mode 100644 index 056fccb3..00000000 --- a/samples/snippets/resources/dates.csv +++ /dev/null @@ -1,5 +0,0 @@ -name,birth_date,register_date,credit_card -Ann,01/01/1970,07/21/1996,4532908762519852 -James,03/06/1988,04/09/2001,4301261899725540 -Dan,08/14/1945,11/15/2011,4620761856015295 -Laura,11/03/1992,01/04/2017,4564981067258901 \ No newline at end of file diff --git a/samples/snippets/resources/harmless.txt b/samples/snippets/resources/harmless.txt deleted file mode 100644 index 5666de37..00000000 --- a/samples/snippets/resources/harmless.txt +++ /dev/null @@ -1 +0,0 @@ -This file is mostly harmless. 
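redact_test.py above drives both redaction entry points by writing into a throwaway temporary directory and asserting that the output path is echoed. A short usage sketch against the removed redact.redact_image sample follows; the wrapper name and the PHONE_NUMBER-only infoType list are illustrative choices, not part of the test suite:

    import os
    import tempfile

    import redact  # the sample module removed in this change


    def redact_phone_numbers(project, image_path):
        """Redact phone numbers from an image and return the redacted copy's path."""
        output_path = os.path.join(tempfile.mkdtemp(), "redacted.png")
        redact.redact_image(project, image_path, output_path, ["PHONE_NUMBER"])
        return output_path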
diff --git a/samples/snippets/resources/test.png b/samples/snippets/resources/test.png deleted file mode 100644 index 8f32c825..00000000 Binary files a/samples/snippets/resources/test.png and /dev/null differ diff --git a/samples/snippets/resources/test.txt b/samples/snippets/resources/test.txt deleted file mode 100644 index c2ee3815..00000000 --- a/samples/snippets/resources/test.txt +++ /dev/null @@ -1 +0,0 @@ -My phone number is (223) 456-7890 and my email address is gary@somedomain.com. \ No newline at end of file diff --git a/samples/snippets/risk.py b/samples/snippets/risk.py deleted file mode 100644 index 7070d06e..00000000 --- a/samples/snippets/risk.py +++ /dev/null @@ -1,934 +0,0 @@ -# Copyright 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Sample app that uses the Data Loss Prevent API to perform risk anaylsis.""" - -from __future__ import print_function - -import argparse - - -# [START dlp_numerical_stats] -def numerical_risk_analysis( - project, - table_project_id, - dataset_id, - table_id, - column_name, - topic_id, - subscription_id, - timeout=300, -): - """Uses the Data Loss Prevention API to compute risk metrics of a column - of numerical data in a Google BigQuery table. - Args: - project: The Google Cloud project id to use as a parent resource. - table_project_id: The Google Cloud project id where the BigQuery table - is stored. - dataset_id: The id of the dataset to inspect. - table_id: The id of the table to inspect. - column_name: The name of the column to compute risk metrics for. - topic_id: The name of the Pub/Sub topic to notify once the job - completes. - subscription_id: The name of the Pub/Sub subscription to use when - listening for job completion notifications. - timeout: The number of seconds to wait for a response from the API. - - Returns: - None; the response from the API is printed to the terminal. - """ - import concurrent.futures - - # Import the client library. - import google.cloud.dlp - - # This sample additionally uses Cloud Pub/Sub to receive results from - # potentially long-running operations. - import google.cloud.pubsub - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Convert the project id into full resource ids. - topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) - parent = f"projects/{project}/locations/global" - - # Location info of the BigQuery table. - source_table = { - "project_id": table_project_id, - "dataset_id": dataset_id, - "table_id": table_id, - } - - # Tell the API where to send a notification when the job is complete. 
- actions = [{"pub_sub": {"topic": topic}}] - - # Configure risk analysis job - # Give the name of the numeric column to compute risk metrics for - risk_job = { - "privacy_metric": {"numerical_stats_config": {"field": {"name": column_name}}}, - "source_table": source_table, - "actions": actions, - } - - # Call API to start risk analysis job - operation = dlp.create_dlp_job(request={"parent": parent, "risk_job": risk_job}) - - def callback(message): - if message.attributes["DlpJobName"] == operation.name: - # This is the message we're looking for, so acknowledge it. - message.ack() - - # Now that the job is done, fetch the results and print them. - job = dlp.get_dlp_job(request={"name": operation.name}) - results = job.risk_details.numerical_stats_result - print( - "Value Range: [{}, {}]".format( - results.min_value.integer_value, - results.max_value.integer_value, - ) - ) - prev_value = None - for percent, result in enumerate(results.quantile_values): - value = result.integer_value - if prev_value != value: - print("Value at {}% quantile: {}".format(percent, value)) - prev_value = value - subscription.set_result(None) - else: - # This is not the message we're looking for. - message.drop() - - # Create a Pub/Sub client and find the subscription. The subscription is - # expected to already be listening to the topic. - subscriber = google.cloud.pubsub.SubscriberClient() - subscription_path = subscriber.subscription_path(project, subscription_id) - subscription = subscriber.subscribe(subscription_path, callback) - - try: - subscription.result(timeout=timeout) - except concurrent.futures.TimeoutError: - print( - "No event received before the timeout. Please verify that the " - "subscription provided is subscribed to the topic provided." - ) - subscription.close() - - -# [END dlp_numerical_stats] - - -# [START dlp_categorical_stats] -def categorical_risk_analysis( - project, - table_project_id, - dataset_id, - table_id, - column_name, - topic_id, - subscription_id, - timeout=300, -): - """Uses the Data Loss Prevention API to compute risk metrics of a column - of categorical data in a Google BigQuery table. - Args: - project: The Google Cloud project id to use as a parent resource. - table_project_id: The Google Cloud project id where the BigQuery table - is stored. - dataset_id: The id of the dataset to inspect. - table_id: The id of the table to inspect. - column_name: The name of the column to compute risk metrics for. - topic_id: The name of the Pub/Sub topic to notify once the job - completes. - subscription_id: The name of the Pub/Sub subscription to use when - listening for job completion notifications. - timeout: The number of seconds to wait for a response from the API. - - Returns: - None; the response from the API is printed to the terminal. - """ - import concurrent.futures - - # Import the client library. - import google.cloud.dlp - - # This sample additionally uses Cloud Pub/Sub to receive results from - # potentially long-running operations. - import google.cloud.pubsub - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Convert the project id into full resource ids. - topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) - parent = f"projects/{project}/locations/global" - - # Location info of the BigQuery table. - source_table = { - "project_id": table_project_id, - "dataset_id": dataset_id, - "table_id": table_id, - } - - # Tell the API where to send a notification when the job is complete. 
- actions = [{"pub_sub": {"topic": topic}}] - - # Configure risk analysis job - # Give the name of the numeric column to compute risk metrics for - risk_job = { - "privacy_metric": { - "categorical_stats_config": {"field": {"name": column_name}} - }, - "source_table": source_table, - "actions": actions, - } - - # Call API to start risk analysis job - operation = dlp.create_dlp_job(request={"parent": parent, "risk_job": risk_job}) - - def callback(message): - if message.attributes["DlpJobName"] == operation.name: - # This is the message we're looking for, so acknowledge it. - message.ack() - - # Now that the job is done, fetch the results and print them. - job = dlp.get_dlp_job(request={"name": operation.name}) - histogram_buckets = ( - job.risk_details.categorical_stats_result.value_frequency_histogram_buckets # noqa: E501 - ) - # Print bucket stats - for i, bucket in enumerate(histogram_buckets): - print("Bucket {}:".format(i)) - print( - " Most common value occurs {} time(s)".format( - bucket.value_frequency_upper_bound - ) - ) - print( - " Least common value occurs {} time(s)".format( - bucket.value_frequency_lower_bound - ) - ) - print(" {} unique values total.".format(bucket.bucket_size)) - for value in bucket.bucket_values: - print( - " Value {} occurs {} time(s)".format( - value.value.integer_value, value.count - ) - ) - subscription.set_result(None) - else: - # This is not the message we're looking for. - message.drop() - - # Create a Pub/Sub client and find the subscription. The subscription is - # expected to already be listening to the topic. - subscriber = google.cloud.pubsub.SubscriberClient() - subscription_path = subscriber.subscription_path(project, subscription_id) - subscription = subscriber.subscribe(subscription_path, callback) - - try: - subscription.result(timeout=timeout) - except concurrent.futures.TimeoutError: - print( - "No event received before the timeout. Please verify that the " - "subscription provided is subscribed to the topic provided." - ) - subscription.close() - - -# [END dlp_categorical_stats] - - -# [START dlp_k_anonymity] -def k_anonymity_analysis( - project, - table_project_id, - dataset_id, - table_id, - topic_id, - subscription_id, - quasi_ids, - timeout=300, -): - """Uses the Data Loss Prevention API to compute the k-anonymity of a - column set in a Google BigQuery table. - Args: - project: The Google Cloud project id to use as a parent resource. - table_project_id: The Google Cloud project id where the BigQuery table - is stored. - dataset_id: The id of the dataset to inspect. - table_id: The id of the table to inspect. - topic_id: The name of the Pub/Sub topic to notify once the job - completes. - subscription_id: The name of the Pub/Sub subscription to use when - listening for job completion notifications. - quasi_ids: A set of columns that form a composite key. - timeout: The number of seconds to wait for a response from the API. - - Returns: - None; the response from the API is printed to the terminal. - """ - import concurrent.futures - - # Import the client library. - import google.cloud.dlp - - # This sample additionally uses Cloud Pub/Sub to receive results from - # potentially long-running operations. - import google.cloud.pubsub - - # Create helper function for unpacking values - def get_values(obj): - return int(obj.integer_value) - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Convert the project id into a full resource id. 
- topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) - parent = f"projects/{project}/locations/global" - - # Location info of the BigQuery table. - source_table = { - "project_id": table_project_id, - "dataset_id": dataset_id, - "table_id": table_id, - } - - # Convert quasi id list to Protobuf type - def map_fields(field): - return {"name": field} - - quasi_ids = map(map_fields, quasi_ids) - - # Tell the API where to send a notification when the job is complete. - actions = [{"pub_sub": {"topic": topic}}] - - # Configure risk analysis job - # Give the name of the numeric column to compute risk metrics for - risk_job = { - "privacy_metric": {"k_anonymity_config": {"quasi_ids": quasi_ids}}, - "source_table": source_table, - "actions": actions, - } - - # Call API to start risk analysis job - operation = dlp.create_dlp_job(request={"parent": parent, "risk_job": risk_job}) - - def callback(message): - if message.attributes["DlpJobName"] == operation.name: - # This is the message we're looking for, so acknowledge it. - message.ack() - - # Now that the job is done, fetch the results and print them. - job = dlp.get_dlp_job(request={"name": operation.name}) - histogram_buckets = ( - job.risk_details.k_anonymity_result.equivalence_class_histogram_buckets - ) - # Print bucket stats - for i, bucket in enumerate(histogram_buckets): - print("Bucket {}:".format(i)) - if bucket.equivalence_class_size_lower_bound: - print( - " Bucket size range: [{}, {}]".format( - bucket.equivalence_class_size_lower_bound, - bucket.equivalence_class_size_upper_bound, - ) - ) - for value_bucket in bucket.bucket_values: - print( - " Quasi-ID values: {}".format( - map(get_values, value_bucket.quasi_ids_values) - ) - ) - print( - " Class size: {}".format( - value_bucket.equivalence_class_size - ) - ) - subscription.set_result(None) - else: - # This is not the message we're looking for. - message.drop() - - # Create a Pub/Sub client and find the subscription. The subscription is - # expected to already be listening to the topic. - subscriber = google.cloud.pubsub.SubscriberClient() - subscription_path = subscriber.subscription_path(project, subscription_id) - subscription = subscriber.subscribe(subscription_path, callback) - - try: - subscription.result(timeout=timeout) - except concurrent.futures.TimeoutError: - print( - "No event received before the timeout. Please verify that the " - "subscription provided is subscribed to the topic provided." - ) - subscription.close() - - -# [END dlp_k_anonymity] - - -# [START dlp_l_diversity] -def l_diversity_analysis( - project, - table_project_id, - dataset_id, - table_id, - topic_id, - subscription_id, - sensitive_attribute, - quasi_ids, - timeout=300, -): - """Uses the Data Loss Prevention API to compute the l-diversity of a - column set in a Google BigQuery table. - Args: - project: The Google Cloud project id to use as a parent resource. - table_project_id: The Google Cloud project id where the BigQuery table - is stored. - dataset_id: The id of the dataset to inspect. - table_id: The id of the table to inspect. - topic_id: The name of the Pub/Sub topic to notify once the job - completes. - subscription_id: The name of the Pub/Sub subscription to use when - listening for job completion notifications. - sensitive_attribute: The column to measure l-diversity relative to. - quasi_ids: A set of columns that form a composite key. - timeout: The number of seconds to wait for a response from the API. 
- - Returns: - None; the response from the API is printed to the terminal. - """ - import concurrent.futures - - # Import the client library. - import google.cloud.dlp - - # This sample additionally uses Cloud Pub/Sub to receive results from - # potentially long-running operations. - import google.cloud.pubsub - - # Create helper function for unpacking values - def get_values(obj): - return int(obj.integer_value) - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Convert the project id into a full resource id. - topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) - parent = f"projects/{project}/locations/global" - - # Location info of the BigQuery table. - source_table = { - "project_id": table_project_id, - "dataset_id": dataset_id, - "table_id": table_id, - } - - # Convert quasi id list to Protobuf type - def map_fields(field): - return {"name": field} - - quasi_ids = map(map_fields, quasi_ids) - - # Tell the API where to send a notification when the job is complete. - actions = [{"pub_sub": {"topic": topic}}] - - # Configure risk analysis job - # Give the name of the numeric column to compute risk metrics for - risk_job = { - "privacy_metric": { - "l_diversity_config": { - "quasi_ids": quasi_ids, - "sensitive_attribute": {"name": sensitive_attribute}, - } - }, - "source_table": source_table, - "actions": actions, - } - - # Call API to start risk analysis job - operation = dlp.create_dlp_job(request={"parent": parent, "risk_job": risk_job}) - - def callback(message): - if message.attributes["DlpJobName"] == operation.name: - # This is the message we're looking for, so acknowledge it. - message.ack() - - # Now that the job is done, fetch the results and print them. - job = dlp.get_dlp_job(request={"name": operation.name}) - histogram_buckets = ( - job.risk_details.l_diversity_result.sensitive_value_frequency_histogram_buckets # noqa: E501 - ) - # Print bucket stats - for i, bucket in enumerate(histogram_buckets): - print("Bucket {}:".format(i)) - print( - " Bucket size range: [{}, {}]".format( - bucket.sensitive_value_frequency_lower_bound, - bucket.sensitive_value_frequency_upper_bound, - ) - ) - for value_bucket in bucket.bucket_values: - print( - " Quasi-ID values: {}".format( - map(get_values, value_bucket.quasi_ids_values) - ) - ) - print( - " Class size: {}".format(value_bucket.equivalence_class_size) - ) - for value in value_bucket.top_sensitive_values: - print( - ( - " Sensitive value {} occurs {} time(s)".format( - value.value, value.count - ) - ) - ) - subscription.set_result(None) - else: - # This is not the message we're looking for. - message.drop() - - # Create a Pub/Sub client and find the subscription. The subscription is - # expected to already be listening to the topic. - subscriber = google.cloud.pubsub.SubscriberClient() - subscription_path = subscriber.subscription_path(project, subscription_id) - subscription = subscriber.subscribe(subscription_path, callback) - - try: - subscription.result(timeout=timeout) - except concurrent.futures.TimeoutError: - print( - "No event received before the timeout. Please verify that the " - "subscription provided is subscribed to the topic provided." 
- ) - subscription.close() - - -# [END dlp_l_diversity] - - -# [START dlp_k_map] -def k_map_estimate_analysis( - project, - table_project_id, - dataset_id, - table_id, - topic_id, - subscription_id, - quasi_ids, - info_types, - region_code="US", - timeout=300, -): - """Uses the Data Loss Prevention API to compute the k-map risk estimation - of a column set in a Google BigQuery table. - Args: - project: The Google Cloud project id to use as a parent resource. - table_project_id: The Google Cloud project id where the BigQuery table - is stored. - dataset_id: The id of the dataset to inspect. - table_id: The id of the table to inspect. - column_name: The name of the column to compute risk metrics for. - topic_id: The name of the Pub/Sub topic to notify once the job - completes. - subscription_id: The name of the Pub/Sub subscription to use when - listening for job completion notifications. - quasi_ids: A set of columns that form a composite key and optionally - their reidentification distributions. - info_types: Type of information of the quasi_id in order to provide a - statistical model of population. - region_code: The ISO 3166-1 region code that the data is representative - of. Can be omitted if using a region-specific infoType (such as - US_ZIP_5) - timeout: The number of seconds to wait for a response from the API. - - Returns: - None; the response from the API is printed to the terminal. - """ - import concurrent.futures - - # Import the client library. - import google.cloud.dlp - - # This sample additionally uses Cloud Pub/Sub to receive results from - # potentially long-running operations. - import google.cloud.pubsub - - # Create helper function for unpacking values - def get_values(obj): - return int(obj.integer_value) - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Convert the project id into full resource ids. - topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) - parent = f"projects/{project}/locations/global" - - # Location info of the BigQuery table. - source_table = { - "project_id": table_project_id, - "dataset_id": dataset_id, - "table_id": table_id, - } - - # Check that numbers of quasi-ids and info types are equal - if len(quasi_ids) != len(info_types): - raise ValueError( - """Number of infoTypes and number of quasi-identifiers - must be equal!""" - ) - - # Convert quasi id list to Protobuf type - def map_fields(quasi_id, info_type): - return {"field": {"name": quasi_id}, "info_type": {"name": info_type}} - - quasi_ids = map(map_fields, quasi_ids, info_types) - - # Tell the API where to send a notification when the job is complete. - actions = [{"pub_sub": {"topic": topic}}] - - # Configure risk analysis job - # Give the name of the numeric column to compute risk metrics for - risk_job = { - "privacy_metric": { - "k_map_estimation_config": { - "quasi_ids": quasi_ids, - "region_code": region_code, - } - }, - "source_table": source_table, - "actions": actions, - } - - # Call API to start risk analysis job - operation = dlp.create_dlp_job(request={"parent": parent, "risk_job": risk_job}) - - def callback(message): - if message.attributes["DlpJobName"] == operation.name: - # This is the message we're looking for, so acknowledge it. - message.ack() - - # Now that the job is done, fetch the results and print them. 
- job = dlp.get_dlp_job(request={"name": operation.name}) - histogram_buckets = ( - job.risk_details.k_map_estimation_result.k_map_estimation_histogram - ) - # Print bucket stats - for i, bucket in enumerate(histogram_buckets): - print("Bucket {}:".format(i)) - print( - " Anonymity range: [{}, {}]".format( - bucket.min_anonymity, bucket.max_anonymity - ) - ) - print(" Size: {}".format(bucket.bucket_size)) - for value_bucket in bucket.bucket_values: - print( - " Values: {}".format( - map(get_values, value_bucket.quasi_ids_values) - ) - ) - print( - " Estimated k-map anonymity: {}".format( - value_bucket.estimated_anonymity - ) - ) - subscription.set_result(None) - else: - # This is not the message we're looking for. - message.drop() - - # Create a Pub/Sub client and find the subscription. The subscription is - # expected to already be listening to the topic. - subscriber = google.cloud.pubsub.SubscriberClient() - subscription_path = subscriber.subscription_path(project, subscription_id) - subscription = subscriber.subscribe(subscription_path, callback) - - try: - subscription.result(timeout=timeout) - except concurrent.futures.TimeoutError: - print( - "No event received before the timeout. Please verify that the " - "subscription provided is subscribed to the topic provided." - ) - subscription.close() - - -# [END dlp_k_map] - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description=__doc__) - subparsers = parser.add_subparsers( - dest="content", help="Select how to submit content to the API." - ) - subparsers.required = True - - numerical_parser = subparsers.add_parser("numerical", help="") - numerical_parser.add_argument( - "project", - help="The Google Cloud project id to use as a parent resource.", - ) - numerical_parser.add_argument( - "table_project_id", - help="The Google Cloud project id where the BigQuery table is stored.", - ) - numerical_parser.add_argument( - "dataset_id", help="The id of the dataset to inspect." - ) - numerical_parser.add_argument("table_id", help="The id of the table to inspect.") - numerical_parser.add_argument( - "column_name", - help="The name of the column to compute risk metrics for.", - ) - numerical_parser.add_argument( - "topic_id", - help="The name of the Pub/Sub topic to notify once the job completes.", - ) - numerical_parser.add_argument( - "subscription_id", - help="The name of the Pub/Sub subscription to use when listening for" - "job completion notifications.", - ) - numerical_parser.add_argument( - "--timeout", - type=int, - help="The number of seconds to wait for a response from the API.", - ) - - categorical_parser = subparsers.add_parser("categorical", help="") - categorical_parser.add_argument( - "project", - help="The Google Cloud project id to use as a parent resource.", - ) - categorical_parser.add_argument( - "table_project_id", - help="The Google Cloud project id where the BigQuery table is stored.", - ) - categorical_parser.add_argument( - "dataset_id", help="The id of the dataset to inspect." 
- ) - categorical_parser.add_argument("table_id", help="The id of the table to inspect.") - categorical_parser.add_argument( - "column_name", - help="The name of the column to compute risk metrics for.", - ) - categorical_parser.add_argument( - "topic_id", - help="The name of the Pub/Sub topic to notify once the job completes.", - ) - categorical_parser.add_argument( - "subscription_id", - help="The name of the Pub/Sub subscription to use when listening for" - "job completion notifications.", - ) - categorical_parser.add_argument( - "--timeout", - type=int, - help="The number of seconds to wait for a response from the API.", - ) - - k_anonymity_parser = subparsers.add_parser( - "k_anonymity", - help="Computes the k-anonymity of a column set in a Google BigQuery" "table.", - ) - k_anonymity_parser.add_argument( - "project", - help="The Google Cloud project id to use as a parent resource.", - ) - k_anonymity_parser.add_argument( - "table_project_id", - help="The Google Cloud project id where the BigQuery table is stored.", - ) - k_anonymity_parser.add_argument( - "dataset_id", help="The id of the dataset to inspect." - ) - k_anonymity_parser.add_argument("table_id", help="The id of the table to inspect.") - k_anonymity_parser.add_argument( - "topic_id", - help="The name of the Pub/Sub topic to notify once the job completes.", - ) - k_anonymity_parser.add_argument( - "subscription_id", - help="The name of the Pub/Sub subscription to use when listening for" - "job completion notifications.", - ) - k_anonymity_parser.add_argument( - "quasi_ids", - nargs="+", - help="A set of columns that form a composite key.", - ) - k_anonymity_parser.add_argument( - "--timeout", - type=int, - help="The number of seconds to wait for a response from the API.", - ) - - l_diversity_parser = subparsers.add_parser( - "l_diversity", - help="Computes the l-diversity of a column set in a Google BigQuery" "table.", - ) - l_diversity_parser.add_argument( - "project", - help="The Google Cloud project id to use as a parent resource.", - ) - l_diversity_parser.add_argument( - "table_project_id", - help="The Google Cloud project id where the BigQuery table is stored.", - ) - l_diversity_parser.add_argument( - "dataset_id", help="The id of the dataset to inspect." 
- ) - l_diversity_parser.add_argument("table_id", help="The id of the table to inspect.") - l_diversity_parser.add_argument( - "topic_id", - help="The name of the Pub/Sub topic to notify once the job completes.", - ) - l_diversity_parser.add_argument( - "subscription_id", - help="The name of the Pub/Sub subscription to use when listening for" - "job completion notifications.", - ) - l_diversity_parser.add_argument( - "sensitive_attribute", - help="The column to measure l-diversity relative to.", - ) - l_diversity_parser.add_argument( - "quasi_ids", - nargs="+", - help="A set of columns that form a composite key.", - ) - l_diversity_parser.add_argument( - "--timeout", - type=int, - help="The number of seconds to wait for a response from the API.", - ) - - k_map_parser = subparsers.add_parser( - "k_map", - help="Computes the k-map risk estimation of a column set in a Google" - "BigQuery table.", - ) - k_map_parser.add_argument( - "project", - help="The Google Cloud project id to use as a parent resource.", - ) - k_map_parser.add_argument( - "table_project_id", - help="The Google Cloud project id where the BigQuery table is stored.", - ) - k_map_parser.add_argument("dataset_id", help="The id of the dataset to inspect.") - k_map_parser.add_argument("table_id", help="The id of the table to inspect.") - k_map_parser.add_argument( - "topic_id", - help="The name of the Pub/Sub topic to notify once the job completes.", - ) - k_map_parser.add_argument( - "subscription_id", - help="The name of the Pub/Sub subscription to use when listening for" - "job completion notifications.", - ) - k_map_parser.add_argument( - "quasi_ids", - nargs="+", - help="A set of columns that form a composite key.", - ) - k_map_parser.add_argument( - "-t", - "--info-types", - nargs="+", - help="Type of information of the quasi_id in order to provide a" - "statistical model of population.", - required=True, - ) - k_map_parser.add_argument( - "-r", - "--region-code", - default="US", - help="The ISO 3166-1 region code that the data is representative of.", - ) - k_map_parser.add_argument( - "--timeout", - type=int, - help="The number of seconds to wait for a response from the API.", - ) - - args = parser.parse_args() - - if args.content == "numerical": - numerical_risk_analysis( - args.project, - args.table_project_id, - args.dataset_id, - args.table_id, - args.column_name, - args.topic_id, - args.subscription_id, - timeout=args.timeout, - ) - elif args.content == "categorical": - categorical_risk_analysis( - args.project, - args.table_project_id, - args.dataset_id, - args.table_id, - args.column_name, - args.topic_id, - args.subscription_id, - timeout=args.timeout, - ) - elif args.content == "k_anonymity": - k_anonymity_analysis( - args.project, - args.table_project_id, - args.dataset_id, - args.table_id, - args.topic_id, - args.subscription_id, - args.quasi_ids, - timeout=args.timeout, - ) - elif args.content == "l_diversity": - l_diversity_analysis( - args.project, - args.table_project_id, - args.dataset_id, - args.table_id, - args.topic_id, - args.subscription_id, - args.sensitive_attribute, - args.quasi_ids, - timeout=args.timeout, - ) - elif args.content == "k_map": - k_map_estimate_analysis( - args.project, - args.table_project_id, - args.dataset_id, - args.table_id, - args.topic_id, - args.subscription_id, - args.quasi_ids, - args.info_types, - region_code=args.region_code, - timeout=args.timeout, - ) diff --git a/samples/snippets/risk_test.py b/samples/snippets/risk_test.py deleted file mode 100644 index 
a8defbd3..00000000 --- a/samples/snippets/risk_test.py +++ /dev/null @@ -1,351 +0,0 @@ -# Copyright 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the 'License'); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import uuid - -import google.cloud.bigquery -import google.cloud.pubsub -import pytest - -import risk - -UNIQUE_STRING = str(uuid.uuid4()).split("-")[0] -GCLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT") -TABLE_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT") -TOPIC_ID = "dlp-test" + UNIQUE_STRING -SUBSCRIPTION_ID = "dlp-test-subscription" + UNIQUE_STRING -UNIQUE_FIELD = "Name" -REPEATED_FIELD = "Mystery" -NUMERIC_FIELD = "Age" -STRING_BOOLEAN_FIELD = "Gender" - -BIGQUERY_DATASET_ID = "dlp_test_dataset" + UNIQUE_STRING -BIGQUERY_TABLE_ID = "dlp_test_table" + UNIQUE_STRING -BIGQUERY_HARMFUL_TABLE_ID = "harmful" + UNIQUE_STRING - - -# Create new custom topic/subscription -# We observe sometimes all the tests in this file fail. In a -# hypothesis where DLP service somehow loses the connection to the -# topic, now we use function scope for Pub/Sub fixtures. -@pytest.fixture(scope="module") -def topic_id(): - # Creates a pubsub topic, and tears it down. - publisher = google.cloud.pubsub.PublisherClient() - topic_path = publisher.topic_path(GCLOUD_PROJECT, TOPIC_ID) - try: - publisher.create_topic(request={"name": topic_path}) - except google.api_core.exceptions.AlreadyExists: - pass - - yield TOPIC_ID - - publisher.delete_topic(request={"topic": topic_path}) - - -@pytest.fixture(scope="module") -def subscription_id(topic_id): - # Subscribes to a topic. - subscriber = google.cloud.pubsub.SubscriberClient() - topic_path = subscriber.topic_path(GCLOUD_PROJECT, topic_id) - subscription_path = subscriber.subscription_path(GCLOUD_PROJECT, SUBSCRIPTION_ID) - try: - subscriber.create_subscription( - request={"name": subscription_path, "topic": topic_path} - ) - except google.api_core.exceptions.AlreadyExists: - pass - - yield SUBSCRIPTION_ID - - subscriber.delete_subscription(request={"subscription": subscription_path}) - - -@pytest.fixture(scope="module") -def bigquery_project(): - # Adds test Bigquery data, yields the project ID and then tears down. 
- - bigquery_client = google.cloud.bigquery.Client() - - dataset_ref = bigquery_client.dataset(BIGQUERY_DATASET_ID) - dataset = google.cloud.bigquery.Dataset(dataset_ref) - try: - dataset = bigquery_client.create_dataset(dataset) - except google.api_core.exceptions.Conflict: - dataset = bigquery_client.get_dataset(dataset) - table_ref = dataset_ref.table(BIGQUERY_TABLE_ID) - table = google.cloud.bigquery.Table(table_ref) - - harmful_table_ref = dataset_ref.table(BIGQUERY_HARMFUL_TABLE_ID) - harmful_table = google.cloud.bigquery.Table(harmful_table_ref) - - table.schema = ( - google.cloud.bigquery.SchemaField("Name", "STRING"), - google.cloud.bigquery.SchemaField("Comment", "STRING"), - ) - - harmful_table.schema = ( - google.cloud.bigquery.SchemaField("Name", "STRING", "REQUIRED"), - google.cloud.bigquery.SchemaField("TelephoneNumber", "STRING", "REQUIRED"), - google.cloud.bigquery.SchemaField("Mystery", "STRING", "REQUIRED"), - google.cloud.bigquery.SchemaField("Age", "INTEGER", "REQUIRED"), - google.cloud.bigquery.SchemaField("Gender", "STRING"), - google.cloud.bigquery.SchemaField("RegionCode", "STRING"), - ) - - try: - table = bigquery_client.create_table(table) - except google.api_core.exceptions.Conflict: - table = bigquery_client.get_table(table) - - try: - harmful_table = bigquery_client.create_table(harmful_table) - except google.api_core.exceptions.Conflict: - harmful_table = bigquery_client.get_table(harmful_table) - - rows_to_insert = [("Gary Smith", "My email is gary@example.com")] - harmful_rows_to_insert = [ - ( - "Gandalf", - "(123) 456-7890", - "4231 5555 6781 9876", - 27, - "Male", - "US", - ), - ( - "Dumbledore", - "(313) 337-1337", - "6291 8765 1095 7629", - 27, - "Male", - "US", - ), - ("Joe", "(452) 123-1234", "3782 2288 1166 3030", 35, "Male", "US"), - ("James", "(567) 890-1234", "8291 3627 8250 1234", 19, "Male", "US"), - ( - "Marie", - "(452) 123-1234", - "8291 3627 8250 1234", - 35, - "Female", - "US", - ), - ( - "Carrie", - "(567) 890-1234", - "2253 5218 4251 4526", - 35, - "Female", - "US", - ), - ] - - bigquery_client.insert_rows(table, rows_to_insert) - bigquery_client.insert_rows(harmful_table, harmful_rows_to_insert) - yield GCLOUD_PROJECT - - bigquery_client.delete_dataset(dataset_ref, delete_contents=True) - - -@pytest.mark.flaky(max_runs=3, min_passes=1) -def test_numerical_risk_analysis(topic_id, subscription_id, bigquery_project, capsys): - risk.numerical_risk_analysis( - GCLOUD_PROJECT, - TABLE_PROJECT, - BIGQUERY_DATASET_ID, - BIGQUERY_HARMFUL_TABLE_ID, - NUMERIC_FIELD, - topic_id, - subscription_id, - ) - - out, _ = capsys.readouterr() - assert "Value Range:" in out - - -@pytest.mark.flaky(max_runs=3, min_passes=1) -def test_categorical_risk_analysis_on_string_field( - topic_id, subscription_id, bigquery_project, capsys -): - risk.categorical_risk_analysis( - GCLOUD_PROJECT, - TABLE_PROJECT, - BIGQUERY_DATASET_ID, - BIGQUERY_HARMFUL_TABLE_ID, - UNIQUE_FIELD, - topic_id, - subscription_id, - ) - - out, _ = capsys.readouterr() - assert "Most common value occurs" in out - - -@pytest.mark.flaky(max_runs=3, min_passes=1) -def test_categorical_risk_analysis_on_number_field( - topic_id, subscription_id, bigquery_project, capsys -): - risk.categorical_risk_analysis( - GCLOUD_PROJECT, - TABLE_PROJECT, - BIGQUERY_DATASET_ID, - BIGQUERY_HARMFUL_TABLE_ID, - NUMERIC_FIELD, - topic_id, - subscription_id, - ) - - out, _ = capsys.readouterr() - assert "Most common value occurs" in out - - -@pytest.mark.flaky(max_runs=3, min_passes=1) -def 
test_k_anonymity_analysis_single_field( - topic_id, subscription_id, bigquery_project, capsys -): - risk.k_anonymity_analysis( - GCLOUD_PROJECT, - TABLE_PROJECT, - BIGQUERY_DATASET_ID, - BIGQUERY_HARMFUL_TABLE_ID, - topic_id, - subscription_id, - [NUMERIC_FIELD], - ) - - out, _ = capsys.readouterr() - assert "Quasi-ID values:" in out - assert "Class size:" in out - - -@pytest.mark.flaky(max_runs=3, min_passes=1) -def test_k_anonymity_analysis_multiple_fields( - topic_id, subscription_id, bigquery_project, capsys -): - risk.k_anonymity_analysis( - GCLOUD_PROJECT, - TABLE_PROJECT, - BIGQUERY_DATASET_ID, - BIGQUERY_HARMFUL_TABLE_ID, - topic_id, - subscription_id, - [NUMERIC_FIELD, REPEATED_FIELD], - ) - - out, _ = capsys.readouterr() - assert "Quasi-ID values:" in out - assert "Class size:" in out - - -@pytest.mark.flaky(max_runs=3, min_passes=1) -def test_l_diversity_analysis_single_field( - topic_id, subscription_id, bigquery_project, capsys -): - risk.l_diversity_analysis( - GCLOUD_PROJECT, - TABLE_PROJECT, - BIGQUERY_DATASET_ID, - BIGQUERY_HARMFUL_TABLE_ID, - topic_id, - subscription_id, - UNIQUE_FIELD, - [NUMERIC_FIELD], - ) - - out, _ = capsys.readouterr() - assert "Quasi-ID values:" in out - assert "Class size:" in out - assert "Sensitive value" in out - - -@pytest.mark.flaky(max_runs=3, min_passes=1) -def test_l_diversity_analysis_multiple_field( - topic_id, subscription_id, bigquery_project, capsys -): - risk.l_diversity_analysis( - GCLOUD_PROJECT, - TABLE_PROJECT, - BIGQUERY_DATASET_ID, - BIGQUERY_HARMFUL_TABLE_ID, - topic_id, - subscription_id, - UNIQUE_FIELD, - [NUMERIC_FIELD, REPEATED_FIELD], - ) - - out, _ = capsys.readouterr() - assert "Quasi-ID values:" in out - assert "Class size:" in out - assert "Sensitive value" in out - - -@pytest.mark.flaky(max_runs=3, min_passes=1) -def test_k_map_estimate_analysis_single_field( - topic_id, subscription_id, bigquery_project, capsys -): - risk.k_map_estimate_analysis( - GCLOUD_PROJECT, - TABLE_PROJECT, - BIGQUERY_DATASET_ID, - BIGQUERY_HARMFUL_TABLE_ID, - topic_id, - subscription_id, - [NUMERIC_FIELD], - ["AGE"], - ) - - out, _ = capsys.readouterr() - assert "Anonymity range:" in out - assert "Size:" in out - assert "Values" in out - - -@pytest.mark.flaky(max_runs=5, min_passes=1) -def test_k_map_estimate_analysis_multiple_field( - topic_id, subscription_id, bigquery_project, capsys -): - risk.k_map_estimate_analysis( - GCLOUD_PROJECT, - TABLE_PROJECT, - BIGQUERY_DATASET_ID, - BIGQUERY_HARMFUL_TABLE_ID, - topic_id, - subscription_id, - [NUMERIC_FIELD, STRING_BOOLEAN_FIELD], - ["AGE", "GENDER"], - ) - - out, _ = capsys.readouterr() - assert "Anonymity range:" in out - assert "Size:" in out - assert "Values" in out - - -@pytest.mark.flaky(max_runs=3, min_passes=1) -def test_k_map_estimate_analysis_quasi_ids_info_types_equal( - topic_id, subscription_id, bigquery_project -): - with pytest.raises(ValueError): - risk.k_map_estimate_analysis( - GCLOUD_PROJECT, - TABLE_PROJECT, - BIGQUERY_DATASET_ID, - BIGQUERY_HARMFUL_TABLE_ID, - topic_id, - subscription_id, - [NUMERIC_FIELD, STRING_BOOLEAN_FIELD], - ["AGE"], - ) diff --git a/samples/snippets/templates.py b/samples/snippets/templates.py deleted file mode 100644 index 009dc08f..00000000 --- a/samples/snippets/templates.py +++ /dev/null @@ -1,255 +0,0 @@ -# Copyright 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Sample app that sets up Data Loss Prevention API inspect templates.""" - -from __future__ import print_function - -import argparse -import os - - -# [START dlp_create_inspect_template] -def create_inspect_template( - project, - info_types, - template_id=None, - display_name=None, - min_likelihood=None, - max_findings=None, - include_quote=None, -): - """Creates a Data Loss Prevention API inspect template. - Args: - project: The Google Cloud project id to use as a parent resource. - info_types: A list of strings representing info types to look for. - A full list of info type categories can be fetched from the API. - template_id: The id of the template. If omitted, an id will be randomly - generated. - display_name: The optional display name of the template. - min_likelihood: A string representing the minimum likelihood threshold - that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', - 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. - max_findings: The maximum number of findings to report; 0 = no maximum. - include_quote: Boolean for whether to display a quote of the detected - information in the results. - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Prepare info_types by converting the list of strings into a list of - # dictionaries (protos are also accepted). - info_types = [{"name": info_type} for info_type in info_types] - - # Construct the configuration dictionary. Keys which are None may - # optionally be omitted entirely. - inspect_config = { - "info_types": info_types, - "min_likelihood": min_likelihood, - "include_quote": include_quote, - "limits": {"max_findings_per_request": max_findings}, - } - - inspect_template = { - "inspect_config": inspect_config, - "display_name": display_name, - } - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Call the API. - response = dlp.create_inspect_template( - request={ - "parent": parent, - "inspect_template": inspect_template, - "template_id": template_id, - } - ) - - print("Successfully created template {}".format(response.name)) - - -# [END dlp_create_inspect_template] - - -# [START dlp_list_templates] -def list_inspect_templates(project): - """Lists all Data Loss Prevention API inspect templates. - Args: - project: The Google Cloud project id to use as a parent resource. - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Call the API. 
- response = dlp.list_inspect_templates(request={"parent": parent}) - - for template in response: - print("Template {}:".format(template.name)) - if template.display_name: - print(" Display Name: {}".format(template.display_name)) - print(" Created: {}".format(template.create_time)) - print(" Updated: {}".format(template.update_time)) - - config = template.inspect_config - print( - " InfoTypes: {}".format(", ".join([it.name for it in config.info_types])) - ) - print(" Minimum likelihood: {}".format(config.min_likelihood)) - print(" Include quotes: {}".format(config.include_quote)) - print( - " Max findings per request: {}".format( - config.limits.max_findings_per_request - ) - ) - - -# [END dlp_list_templates] - - -# [START dlp_delete_inspect_template] -def delete_inspect_template(project, template_id): - """Deletes a Data Loss Prevention API template. - Args: - project: The id of the Google Cloud project which owns the template. - template_id: The id of the template to delete. - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Combine the template id with the parent id. - template_resource = "{}/inspectTemplates/{}".format(parent, template_id) - - # Call the API. - dlp.delete_inspect_template(request={"name": template_resource}) - - print("Template {} successfully deleted.".format(template_resource)) - - -# [END dlp_delete_inspect_template] - - -if __name__ == "__main__": - default_project = os.environ.get("GOOGLE_CLOUD_PROJECT") - - parser = argparse.ArgumentParser(description=__doc__) - subparsers = parser.add_subparsers( - dest="action", help="Select which action to perform." - ) - subparsers.required = True - - parser_create = subparsers.add_parser("create", help="Create a template.") - parser_create.add_argument( - "--template_id", - help="The id of the template. If omitted, an id will be randomly " "generated", - ) - parser_create.add_argument( - "--display_name", help="The optional display name of the template." - ) - parser_create.add_argument( - "--project", - help="The Google Cloud project id to use as a parent resource.", - default=default_project, - ) - parser_create.add_argument( - "--info_types", - nargs="+", - help="Strings representing info types to look for. A full list of " - "info categories and types is available from the API. Examples " - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". 
' - "If unspecified, the three above examples will be used.", - default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], - ) - parser_create.add_argument( - "--min_likelihood", - choices=[ - "LIKELIHOOD_UNSPECIFIED", - "VERY_UNLIKELY", - "UNLIKELY", - "POSSIBLE", - "LIKELY", - "VERY_LIKELY", - ], - help="A string representing the minimum likelihood threshold that " - "constitutes a match.", - ) - parser_create.add_argument( - "--max_findings", - type=int, - help="The maximum number of findings to report; 0 = no maximum.", - ) - parser_create.add_argument( - "--include_quote", - type=bool, - help="A boolean for whether to display a quote of the detected " - "information in the results.", - default=True, - ) - - parser_list = subparsers.add_parser("list", help="List all templates.") - parser_list.add_argument( - "--project", - help="The Google Cloud project id to use as a parent resource.", - default=default_project, - ) - - parser_delete = subparsers.add_parser("delete", help="Delete a template.") - parser_delete.add_argument("template_id", help="The id of the template to delete.") - parser_delete.add_argument( - "--project", - help="The Google Cloud project id to use as a parent resource.", - default=default_project, - ) - - args = parser.parse_args() - - if args.action == "create": - create_inspect_template( - args.project, - args.info_types, - template_id=args.template_id, - display_name=args.display_name, - min_likelihood=args.min_likelihood, - max_findings=args.max_findings, - include_quote=args.include_quote, - ) - elif args.action == "list": - list_inspect_templates(args.project) - elif args.action == "delete": - delete_inspect_template(args.project, args.template_id) diff --git a/samples/snippets/templates_test.py b/samples/snippets/templates_test.py deleted file mode 100644 index f8d22118..00000000 --- a/samples/snippets/templates_test.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the 'License'); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import uuid - -import google.api_core.exceptions -import google.cloud.storage - -import templates - -UNIQUE_STRING = str(uuid.uuid4()).split("-")[0] -GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") -TEST_TEMPLATE_ID = "test-template" + UNIQUE_STRING - - -def test_create_list_and_delete_template(capsys): - try: - templates.create_inspect_template( - GCLOUD_PROJECT, - ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], - template_id=TEST_TEMPLATE_ID, - ) - except google.api_core.exceptions.InvalidArgument: - # Template already exists, perhaps due to a previous interrupted test. - templates.delete_inspect_template(GCLOUD_PROJECT, TEST_TEMPLATE_ID) - - out, _ = capsys.readouterr() - assert TEST_TEMPLATE_ID in out - - # Try again and move on. 
- templates.create_inspect_template( - GCLOUD_PROJECT, - ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], - template_id=TEST_TEMPLATE_ID, - ) - - out, _ = capsys.readouterr() - assert TEST_TEMPLATE_ID in out - - templates.list_inspect_templates(GCLOUD_PROJECT) - - out, _ = capsys.readouterr() - assert TEST_TEMPLATE_ID in out - - templates.delete_inspect_template(GCLOUD_PROJECT, TEST_TEMPLATE_ID) - - out, _ = capsys.readouterr() - assert TEST_TEMPLATE_ID in out diff --git a/samples/snippets/triggers.py b/samples/snippets/triggers.py deleted file mode 100644 index dae75c2d..00000000 --- a/samples/snippets/triggers.py +++ /dev/null @@ -1,286 +0,0 @@ -# Copyright 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Sample app that sets up Data Loss Prevention API automation triggers.""" - -from __future__ import print_function - -import argparse -import os - - -# [START dlp_create_trigger] -def create_trigger( - project, - bucket, - scan_period_days, - info_types, - trigger_id=None, - display_name=None, - description=None, - min_likelihood=None, - max_findings=None, - auto_populate_timespan=False, -): - """Creates a scheduled Data Loss Prevention API inspect_content trigger. - Args: - project: The Google Cloud project id to use as a parent resource. - bucket: The name of the GCS bucket to scan. This sample scans all - files in the bucket using a wildcard. - scan_period_days: How often to repeat the scan, in days. - The minimum is 1 day. - info_types: A list of strings representing info types to look for. - A full list of info type categories can be fetched from the API. - trigger_id: The id of the trigger. If omitted, an id will be randomly - generated. - display_name: The optional display name of the trigger. - description: The optional description of the trigger. - min_likelihood: A string representing the minimum likelihood threshold - that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', - 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. - max_findings: The maximum number of findings to report; 0 = no maximum. - auto_populate_timespan: Automatically populates time span config start - and end times in order to scan new content only. - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Prepare info_types by converting the list of strings into a list of - # dictionaries (protos are also accepted). - info_types = [{"name": info_type} for info_type in info_types] - - # Construct the configuration dictionary. Keys which are None may - # optionally be omitted entirely. - inspect_config = { - "info_types": info_types, - "min_likelihood": min_likelihood, - "limits": {"max_findings_per_request": max_findings}, - } - - # Construct a cloud_storage_options dictionary with the bucket's URL. 
- url = "gs://{}/*".format(bucket) - storage_config = { - "cloud_storage_options": {"file_set": {"url": url}}, - # Time-based configuration for each storage object. - "timespan_config": { - # Auto-populate start and end times in order to scan new objects - # only. - "enable_auto_population_of_timespan_config": auto_populate_timespan - }, - } - - # Construct the job definition. - job = {"inspect_config": inspect_config, "storage_config": storage_config} - - # Construct the schedule definition: - schedule = { - "recurrence_period_duration": {"seconds": scan_period_days * 60 * 60 * 24} - } - - # Construct the trigger definition. - job_trigger = { - "inspect_job": job, - "display_name": display_name, - "description": description, - "triggers": [{"schedule": schedule}], - "status": google.cloud.dlp_v2.JobTrigger.Status.HEALTHY, - } - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Call the API. - response = dlp.create_job_trigger( - request={"parent": parent, "job_trigger": job_trigger, "trigger_id": trigger_id} - ) - - print("Successfully created trigger {}".format(response.name)) - - -# [END dlp_create_trigger] - - -# [START dlp_list_triggers] -def list_triggers(project): - """Lists all Data Loss Prevention API triggers. - Args: - project: The Google Cloud project id to use as a parent resource. - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Call the API. - response = dlp.list_job_triggers(request={"parent": parent}) - - for trigger in response: - print("Trigger {}:".format(trigger.name)) - print(" Created: {}".format(trigger.create_time)) - print(" Updated: {}".format(trigger.update_time)) - if trigger.display_name: - print(" Display Name: {}".format(trigger.display_name)) - if trigger.description: - print(" Description: {}".format(trigger.discription)) - print(" Status: {}".format(trigger.status)) - print(" Error count: {}".format(len(trigger.errors))) - - -# [END dlp_list_triggers] - - -# [START dlp_delete_trigger] -def delete_trigger(project, trigger_id): - """Deletes a Data Loss Prevention API trigger. - Args: - project: The id of the Google Cloud project which owns the trigger. - trigger_id: The id of the trigger to delete. - Returns: - None; the response from the API is printed to the terminal. - """ - - # Import the client library - import google.cloud.dlp - - # Instantiate a client. - dlp = google.cloud.dlp_v2.DlpServiceClient() - - # Convert the project id into a full resource id. - parent = f"projects/{project}" - - # Combine the trigger id with the parent id. - trigger_resource = "{}/jobTriggers/{}".format(parent, trigger_id) - - # Call the API. - dlp.delete_job_trigger(request={"name": trigger_resource}) - - print("Trigger {} successfully deleted.".format(trigger_resource)) - - -# [END dlp_delete_triggers] - - -if __name__ == "__main__": - default_project = os.environ.get("GOOGLE_CLOUD_PROJECT") - - parser = argparse.ArgumentParser(description=__doc__) - subparsers = parser.add_subparsers( - dest="action", help="Select which action to perform." - ) - subparsers.required = True - - parser_create = subparsers.add_parser("create", help="Create a trigger.") - parser_create.add_argument( - "bucket", help="The name of the GCS bucket containing the file." 
- ) - parser_create.add_argument( - "scan_period_days", - type=int, - help="How often to repeat the scan, in days. The minimum is 1 day.", - ) - parser_create.add_argument( - "--trigger_id", - help="The id of the trigger. If omitted, an id will be randomly " "generated", - ) - parser_create.add_argument( - "--display_name", help="The optional display name of the trigger." - ) - parser_create.add_argument( - "--description", help="The optional description of the trigger." - ) - parser_create.add_argument( - "--project", - help="The Google Cloud project id to use as a parent resource.", - default=default_project, - ) - parser_create.add_argument( - "--info_types", - nargs="+", - help="Strings representing info types to look for. A full list of " - "info categories and types is available from the API. Examples " - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' - "If unspecified, the three above examples will be used.", - default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], - ) - parser_create.add_argument( - "--min_likelihood", - choices=[ - "LIKELIHOOD_UNSPECIFIED", - "VERY_UNLIKELY", - "UNLIKELY", - "POSSIBLE", - "LIKELY", - "VERY_LIKELY", - ], - help="A string representing the minimum likelihood threshold that " - "constitutes a match.", - ) - parser_create.add_argument( - "--max_findings", - type=int, - help="The maximum number of findings to report; 0 = no maximum.", - ) - parser_create.add_argument( - "--auto_populate_timespan", - type=bool, - help="Limit scan to new content only.", - ) - - parser_list = subparsers.add_parser("list", help="List all triggers.") - parser_list.add_argument( - "--project", - help="The Google Cloud project id to use as a parent resource.", - default=default_project, - ) - - parser_delete = subparsers.add_parser("delete", help="Delete a trigger.") - parser_delete.add_argument("trigger_id", help="The id of the trigger to delete.") - parser_delete.add_argument( - "--project", - help="The Google Cloud project id to use as a parent resource.", - default=default_project, - ) - - args = parser.parse_args() - - if args.action == "create": - create_trigger( - args.project, - args.bucket, - args.scan_period_days, - args.info_types, - trigger_id=args.trigger_id, - display_name=args.display_name, - description=args.description, - min_likelihood=args.min_likelihood, - max_findings=args.max_findings, - auto_populate_timespan=args.auto_populate_timespan, - ) - elif args.action == "list": - list_triggers(args.project) - elif args.action == "delete": - delete_trigger(args.project, args.trigger_id) diff --git a/samples/snippets/triggers_test.py b/samples/snippets/triggers_test.py deleted file mode 100644 index ca862e6e..00000000 --- a/samples/snippets/triggers_test.py +++ /dev/null @@ -1,102 +0,0 @@ -# Copyright 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the 'License'); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import uuid - -import google.api_core.exceptions -import google.cloud.storage -import pytest - -import triggers - -UNIQUE_STRING = str(uuid.uuid4()).split("-")[0] -GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") -TEST_BUCKET_NAME = GCLOUD_PROJECT + "-dlp-python-client-test" + UNIQUE_STRING -RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), "resources") -RESOURCE_FILE_NAMES = ["test.txt", "test.png", "harmless.txt", "accounts.txt"] -TEST_TRIGGER_ID = "test-trigger" + UNIQUE_STRING - - -@pytest.fixture(scope="module") -def bucket(): - # Creates a GCS bucket, uploads files required for the test, and tears down - # the entire bucket afterwards. - - client = google.cloud.storage.Client() - try: - bucket = client.get_bucket(TEST_BUCKET_NAME) - except google.cloud.exceptions.NotFound: - bucket = client.create_bucket(TEST_BUCKET_NAME) - - # Upoad the blobs and keep track of them in a list. - blobs = [] - for name in RESOURCE_FILE_NAMES: - path = os.path.join(RESOURCE_DIRECTORY, name) - blob = bucket.blob(name) - blob.upload_from_filename(path) - blobs.append(blob) - - # Yield the object to the test; lines after this execute as a teardown. - yield bucket - - # Delete the files. - for blob in blobs: - try: - blob.delete() - except google.cloud.exceptions.NotFound: - print("Issue during teardown, missing blob") - - # Attempt to delete the bucket; this will only work if it is empty. - bucket.delete() - - -def test_create_list_and_delete_trigger(bucket, capsys): - try: - triggers.create_trigger( - GCLOUD_PROJECT, - bucket.name, - 7, - ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], - trigger_id=TEST_TRIGGER_ID, - ) - except google.api_core.exceptions.InvalidArgument: - # Trigger already exists, perhaps due to a previous interrupted test. - triggers.delete_trigger(GCLOUD_PROJECT, TEST_TRIGGER_ID) - - out, _ = capsys.readouterr() - assert TEST_TRIGGER_ID in out - - # Try again and move on. - triggers.create_trigger( - GCLOUD_PROJECT, - bucket.name, - 7, - ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], - trigger_id=TEST_TRIGGER_ID, - auto_populate_timespan=True, - ) - - out, _ = capsys.readouterr() - assert TEST_TRIGGER_ID in out - - triggers.list_triggers(GCLOUD_PROJECT) - - out, _ = capsys.readouterr() - assert TEST_TRIGGER_ID in out - - triggers.delete_trigger(GCLOUD_PROJECT, TEST_TRIGGER_ID) - - out, _ = capsys.readouterr() - assert TEST_TRIGGER_ID in out
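The trigger test above drives the full create/list/delete lifecycle defined in the deleted triggers.py. Roughly the same lifecycle can be exercised directly against that module; a minimal sketch, assuming GOOGLE_CLOUD_PROJECT is set, an existing GCS bucket (the name below is a placeholder), and that the sample is importable as triggers:

import os
import uuid

import triggers  # the sample module shown above

project = os.environ["GOOGLE_CLOUD_PROJECT"]
bucket_name = "your-bucket-name"  # assumption: an existing GCS bucket you own
trigger_id = "demo-trigger-" + str(uuid.uuid4()).split("-")[0]

# Create a weekly scan trigger for the bucket, scoped to a few infoTypes.
triggers.create_trigger(
    project,
    bucket_name,
    7,
    ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"],
    trigger_id=trigger_id,
    auto_populate_timespan=True,
)

# List triggers in the project; the new trigger's name is printed.
triggers.list_triggers(project)

# Clean up.
triggers.delete_trigger(project, trigger_id)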