1919
2020import argparse
2121import os
22+ import json
2223
2324
2425# [START dlp_inspect_string]
@@ -77,7 +78,7 @@ def inspect_string(project, content_string, info_types,
7778 'min_likelihood' : min_likelihood ,
7879 'include_quote' : include_quote ,
7980 'limits' : {'max_findings_per_request' : max_findings },
80- }
81+ }
8182
8283 # Construct the `item`.
8384 item = {'value' : content_string }
@@ -102,8 +103,130 @@ def inspect_string(project, content_string, info_types,
102103 print ('No findings.' )
103104# [END dlp_inspect_string]
104105
106+ # [START dlp_inspect_table]
107+
108+
def inspect_table(project, data, info_types,
                  custom_dictionaries=None, custom_regexes=None,
                  min_likelihood=None, max_findings=None, include_quote=True):
    """Uses the Data Loss Prevention API to analyze a table for protected data.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        data: Table data, indexed by key. ``data["header"]`` is iterated to
            produce the column names and ``data["rows"]`` is iterated to
            produce the rows, each row being an iterable of cell values.
            The command-line entry point obtains this by parsing a JSON
            string.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        custom_dictionaries: Optional list of strings; each string is split
            on ',' and becomes the word list of one custom dictionary info
            type named CUSTOM_DICTIONARY_<index>.
        custom_regexes: Optional list of regex pattern strings; each becomes
            one custom info type named CUSTOM_REGEX_<index>.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.

    Returns:
        None; the response from the API is printed to the terminal.

    Example:
        The addresses below are placeholders.

        data = {
            "header": ["email", "phone number"],
            "rows": [
                ["user.one@example.com", "4232342345"],
                ["user.two@example.com", "4253458383"]
            ]
        }

        >> $ python inspect_content.py table '<the JSON above>'

        Info type: EMAIL_ADDRESS
        Likelihood: 4

        Info type: EMAIL_ADDRESS
        Likelihood: 4
    """

    # The client library is imported inside the function body.
    import google.cloud.dlp

    # Create the DLP service client used for every call below.
    client = google.cloud.dlp.DlpServiceClient()

    # The request takes info types as dictionaries, not bare strings
    # (protos are also accepted).
    requested_info_types = [{'name': type_name} for type_name in info_types]

    # Collect custom info types: dictionary word lists first, then regex
    # patterns, each numbered by its position in its own input list.
    custom_info_types = []
    dictionary_sources = (
        [] if custom_dictionaries is None else custom_dictionaries)
    for index, word_csv in enumerate(dictionary_sources):
        custom_info_types.append({
            'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(index)},
            'dictionary': {
                'word_list': {'words': word_csv.split(',')}
            }
        })
    regex_sources = [] if custom_regexes is None else custom_regexes
    for index, pattern in enumerate(regex_sources):
        custom_info_types.append({
            'info_type': {'name': 'CUSTOM_REGEX_{}'.format(index)},
            'regex': {'pattern': pattern}
        })

    # Assemble the inspection configuration. Keys whose value is None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': requested_info_types,
        'custom_info_types': custom_info_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {'max_findings_per_request': max_findings},
    }

    # Build the `table` content item. For more details on the table schema,
    # please see
    # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table
    headers = []
    for column_name in data["header"]:
        headers.append({"name": column_name})
    # Every cell is passed through unchanged as a `string_value`.
    # NOTE(review): presumably cells must already be strings - confirm.
    rows = [
        {"values": [{"string_value": cell} for cell in row]}
        for row in data["rows"]
    ]
    item = {"table": {"headers": headers, "rows": rows}}

    # Convert the project id into a full resource id.
    parent = client.project_path(project)

    # Call the API.
    response = client.inspect_content(parent, inspect_config, item)

    # Report the outcome on the terminal.
    findings = response.result.findings
    if not findings:
        print('No findings.')
        return

    for finding in findings:
        # A finding without a `quote` attribute is tolerated; the quote
        # line is simply not printed for it.
        try:
            if finding.quote:
                print('Quote: {}'.format(finding.quote))
        except AttributeError:
            pass
        print('Info type: {}'.format(finding.info_type.name))
        print('Likelihood: {}'.format(finding.likelihood))
225+ # [END dlp_inspect_table]
105226
106227# [START dlp_inspect_file]
228+
229+
107230def inspect_file (project , filename , info_types , min_likelihood = None ,
108231 custom_dictionaries = None , custom_regexes = None ,
109232 max_findings = None , include_quote = True , mime_type = None ):
@@ -284,8 +407,8 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id,
284407 storage_config = {
285408 'cloud_storage_options' : {
286409 'file_set' : {'url' : url }
287- }
288410 }
411+ }
289412
290413 # Convert the project id into a full resource id.
291414 parent = dlp .project_path (project )
@@ -309,7 +432,6 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id,
309432 subscriber = google .cloud .pubsub .SubscriberClient ()
310433 subscription_path = subscriber .subscription_path (
311434 project , subscription_id )
312- subscription = subscriber .subscribe (subscription_path )
313435
314436 # Set up a callback to acknowledge a message. This closes around an event
315437 # so that it can signal that it is done and the main thread can continue.
@@ -341,8 +463,7 @@ def callback(message):
341463 print (e )
342464 raise
343465
344- # Register the callback and wait on the event.
345- subscription .open (callback )
466+ subscriber .subscribe (subscription_path , callback = callback )
346467 finished = job_done .wait (timeout = timeout )
347468 if not finished :
348469 print ('No event received before the timeout. Please verify that the '
@@ -460,7 +581,6 @@ def inspect_datastore(project, datastore_project, kind,
460581 subscriber = google .cloud .pubsub .SubscriberClient ()
461582 subscription_path = subscriber .subscription_path (
462583 project , subscription_id )
463- subscription = subscriber .subscribe (subscription_path )
464584
465585 # Set up a callback to acknowledge a message. This closes around an event
466586 # so that it can signal that it is done and the main thread can continue.
@@ -493,7 +613,8 @@ def callback(message):
493613 raise
494614
495615 # Register the callback and wait on the event.
496- subscription .open (callback )
616+ subscriber .subscribe (subscription_path , callback = callback )
617+
497618 finished = job_done .wait (timeout = timeout )
498619 if not finished :
499620 print ('No event received before the timeout. Please verify that the '
@@ -609,7 +730,6 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id,
609730 subscriber = google .cloud .pubsub .SubscriberClient ()
610731 subscription_path = subscriber .subscription_path (
611732 project , subscription_id )
612- subscription = subscriber .subscribe (subscription_path )
613733
614734 # Set up a callback to acknowledge a message. This closes around an event
615735 # so that it can signal that it is done and the main thread can continue.
@@ -642,7 +762,7 @@ def callback(message):
642762 raise
643763
644764 # Register the callback and wait on the event.
645- subscription . open ( callback )
765+ subscriber . subscribe ( subscription_path , callback = callback )
646766 finished = job_done .wait (timeout = timeout )
647767 if not finished :
648768 print ('No event received before the timeout. Please verify that the '
@@ -698,6 +818,46 @@ def callback(message):
698818 'information in the results.' ,
699819 default = True )
700820
821+ parser_table = subparsers .add_parser ('table' , help = 'Inspect a table.' )
822+ parser_table .add_argument (
823+ 'data' , help = 'Json string representing a table.' , type = json .loads )
824+ parser_table .add_argument (
825+ '--project' ,
826+ help = 'The Google Cloud project id to use as a parent resource.' ,
827+ default = default_project )
828+ parser_table .add_argument (
829+ '--info_types' , action = 'append' ,
830+ help = 'Strings representing info types to look for. A full list of '
831+ 'info categories and types is available from the API. Examples '
832+ 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
833+ 'If unspecified, the three above examples will be used.' ,
834+ default = ['FIRST_NAME' , 'LAST_NAME' , 'EMAIL_ADDRESS' ])
835+ parser_table .add_argument (
836+ '--custom_dictionaries' , action = 'append' ,
837+ help = 'Strings representing comma-delimited lists of dictionary words'
838+ ' to search for as custom info types. Each string is a comma '
839+ 'delimited list of words representing a distinct dictionary.' ,
840+ default = None )
841+ parser_table .add_argument (
842+ '--custom_regexes' , action = 'append' ,
843+ help = 'Strings representing regex patterns to search for as custom '
844+ ' info types.' ,
845+ default = None )
846+ parser_table .add_argument (
847+ '--min_likelihood' ,
848+ choices = ['LIKELIHOOD_UNSPECIFIED' , 'VERY_UNLIKELY' , 'UNLIKELY' ,
849+ 'POSSIBLE' , 'LIKELY' , 'VERY_LIKELY' ],
850+ help = 'A string representing the minimum likelihood threshold that '
851+ 'constitutes a match.' )
852+ parser_table .add_argument (
853+ '--max_findings' , type = int ,
854+ help = 'The maximum number of findings to report; 0 = no maximum.' )
855+ parser_table .add_argument (
856+ '--include_quote' , type = bool ,
857+ help = 'A boolean for whether to display a quote of the detected '
858+ 'information in the results.' ,
859+ default = True )
860+
701861 parser_file = subparsers .add_parser ('file' , help = 'Inspect a local file.' )
702862 parser_file .add_argument (
703863 'filename' , help = 'The path to the file to inspect.' )
@@ -923,6 +1083,14 @@ def callback(message):
9231083 min_likelihood = args .min_likelihood ,
9241084 max_findings = args .max_findings ,
9251085 include_quote = args .include_quote )
1086+ elif args .content == 'table' :
1087+ inspect_table (
1088+ args .project , args .data , args .info_types ,
1089+ custom_dictionaries = args .custom_dictionaries ,
1090+ custom_regexes = args .custom_regexes ,
1091+ min_likelihood = args .min_likelihood ,
1092+ max_findings = args .max_findings ,
1093+ include_quote = args .include_quote )
9261094 elif args .content == 'file' :
9271095 inspect_file (
9281096 args .project , args .filename , args .info_types ,
0 commit comments