From 84156559b8a1fa06ead8169954c769ab6e428642 Mon Sep 17 00:00:00 2001
From: Tres Seaver
Date: Thu, 23 Jul 2015 10:59:45 -0400
Subject: [PATCH 1/5] Add usage docs for jobs: query and load.

---
 docs/bigquery-usage.rst | 186 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 186 insertions(+)

diff --git a/docs/bigquery-usage.rst b/docs/bigquery-usage.rst
index 87b948f7eba9..86170b91efea 100644
--- a/docs/bigquery-usage.rst
+++ b/docs/bigquery-usage.rst
@@ -236,3 +236,189 @@ Delete a table:
     >>> dataset = client.dataset('dataset_name')
     >>> table = dataset.table(name='person_ages')
     >>> table.delete()  # API request
+
+Jobs
+----
+
+Jobs describe actions performed on data in BigQuery tables:
+
+- Load data into a table
+- Run a query against data in one or more tables
+- Extrat data from a table
+- Copy a table
+
+List jobs for a project:
+
+.. doctest::
+
+    >>> from gcloud import bigquery
+    >>> client = bigquery.Client()
+    >>> jobs = client.jobs()  # API request
+    >>> [(job.job_id, job.type, job.created, job.state) for job in jobs]
+    [('e3344fba-09df-4ae0-8337-fddee34b3840', 'insert', datetime.datetime(2015, 7, 23, 9, 30, 20, 268260, tzinfo=<UTC>), 'done')]
+
+Querying data (synchronous)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Run a query which can be expected to complete within bounded time:
+
+.. doctest::
+
+    >>> import time
+    >>> from gcloud import bigquery
+    >>> client = bigquery.Client()
+    >>> query = """\
+        SELECT count(*) AS age_count FROM dataset_name.person_ages
+        """
+    >>> results = client.query(query, timeout_ms=1000)
+    >>> while not results.job_complete:
+    ...     time.sleep(10)
+    ...     results.reload()  # API request
+    >>> results.schema
+    [{'name': 'age_count', 'type': 'integer', 'mode': 'nullable'}]
+    >>> results.rows
+    [(15,)]
+
+.. note::
+
+   If the query takse longer than the timeout allowd, ``results.job_complete``
+   will be False: we therefore poll until it is completed.
+
+Querying data (asynchronous)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Background a query, loading the results into a table:
+
+.. doctest::
+
+    >>> from gcloud import bigquery
+    >>> client = bigquery.Client()
+    >>> query = """\
+        SELECT first_name + ' ' + last_name AS full_name,
+               FLOOR(DATEDIFF(CURRENT_DATE(), birth_date) / 365) AS age
+          FROM dataset_name.persons
+        """
+    >>> dataset = client.dataset('dataset_name')
+    >>> table = dataset.table(name='person_ages')
+    >>> job = client.query_async(query,
+    ...                          destination=table,
+    ...                          write_disposition='truncate')
+    >>> job.job_id
+    'e3344fba-09df-4ae0-8337-fddee34b3840'
+    >>> job.type
+    'query'
+    >>> job.created
+    None
+    >>> job.state
+    None
+
+.. note::
+
+   - ``gcloud.bigquery`` generates a UUID for each job.
+   - The ``created`` and ``state`` fields are not set until the job
+     is submitted to the BigQuery back-end.
+
+Then, begin executing the job on the server:
+
+.. doctest::
+
+    >>> job.submit()  # API call
+    >>> job.created
+    datetime.datetime(2015, 7, 23, 9, 30, 20, 268260, tzinfo=<UTC>)
+    >>> job.state
+    'running'
+
+Poll until the job is complete:
+
+.. doctest::
+
+    >>> import time
+    >>> while job.state == 'running':
+    ...     time.sleep(10)
+    ...     job.reload()  # API call
+    >>> job.state
+    'done'
+    >>> job.ended
+    datetime.datetime(2015, 7, 23, 9, 30, 21, 334792, tzinfo=<UTC>)
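+
+The submit-and-poll pattern above can be factored into a small helper.  The
+following is only an illustrative sketch (such a helper is not part of the
+proposed ``gcloud.bigquery`` API); it relies solely on the ``submit()``,
+``reload()``, ``state``, and ``job_id`` behavior shown in these examples:
+
+.. code-block:: python
+
+    import time
+
+    def wait_for_job(job, max_polls=100, poll_interval=10):
+        """Submit ``job``, then poll until it leaves the 'running' state."""
+        job.submit()  # API call
+        for _ in range(max_polls):
+            if job.state != 'running':
+                return job.state  # 'done' on success
+            time.sleep(poll_interval)
+            job.reload()  # API call: refresh job status from the back-end
+        raise RuntimeError('Job still running: %s' % job.job_id)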
+
+Inserting data (synchronous)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Start a job loading data asynchronously from a local CSV files.
+into a new table.  First, create the job locally:
+
+.. doctest::
+
+    >>> from gcloud import bigquery
+    >>> client = bigquery.Client()
+    >>> table = dataset.table(name='person_ages')
+    >>> with open('/path/to/person_ages.csv') as f:
+    ...     job = table.load_from_file(f,
+    ...                                source_format='CSV',
+    ...                                skip_leading_rows=1
+    ...                                write_disposition='truncate',
+    ...                                )  # API request
+    >>> job.job_id
+    'e3344fba-09df-4ae0-8337-fddee34b3840'
+    >>> job.type
+    'load'
+    >>> job.created
+    datetime.datetime(2015, 7, 23, 9, 30, 20, 268260, tzinfo=<UTC>)
+    >>> job.state
+    'done'
+    >>> job.ended
+    datetime.datetime(2015, 7, 23, 9, 30, 21, 334792, tzinfo=<UTC>)
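+
+Here ``skip_leading_rows=1`` tells the loader to ignore the header row of the
+CSV file.  As a purely illustrative sketch (the file contents below are
+invented for the example, and the error handling is not part of the proposed
+API), the same call can be combined with a check on the finished job's state:
+
+.. code-block:: python
+
+    # /path/to/person_ages.csv -- the first row is a header, which is
+    # why skip_leading_rows=1 is passed to load_from_file():
+    #
+    #     full_name,age
+    #     Alice Aardvark,27
+    #     Bob Badger,33
+
+    with open('/path/to/person_ages.csv', 'rb') as file_obj:
+        job = table.load_from_file(file_obj,
+                                   source_format='CSV',
+                                   skip_leading_rows=1,
+                                   write_disposition='truncate')  # API request
+
+    if job.state != 'done':  # the load runs synchronously
+        raise RuntimeError('Load failed: %s' % job.job_id)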
+
+Inserting data (asynchronous)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Start a job loading data asynchronously from a set of CSV files, located on
+Google Cloud Storage, appending rows into an existing table.  First, create
+the job locally:
+
+.. doctest::
+
+    >>> from gcloud import bigquery
+    >>> client = bigquery.Client()
+    >>> table = dataset.table(name='person_ages')
+    >>> job = table.load_from_storage(bucket_name='bucket-name',
+    ...                               object_name='object-prefix*',
+    ...                               source_format='CSV',
+    ...                               skip_leading_rows=1
+    ...                               write_disposition='append')
+    >>> job.job_id
+    'e3344fba-09df-4ae0-8337-fddee34b3840'
+    >>> job.type
+    'load'
+    >>> job.created
+    None
+    >>> job.state
+    None
+
+.. note::
+
+   - ``gcloud.bigquery`` generates a UUID for each job.
+   - The ``created`` and ``state`` fields are not set until the job
+     is submitted to the BigQuery back-end.
+
+Then, begin executing the job on the server:
+
+.. doctest::
+
+    >>> job.submit()  # API call
+    >>> job.created
+    datetime.datetime(2015, 7, 23, 9, 30, 20, 268260, tzinfo=<UTC>)
+    >>> job.state
+    'running'
+
+Poll until the job is complete:
+
+.. doctest::
+
+    >>> import time
+    >>> while job.state == 'running':
+    ...     time.sleep(10)
+    ...     job.reload()  # API call
+    >>> job.state
+    'done'
+    >>> job.ended
+    datetime.datetime(2015, 7, 23, 9, 30, 21, 334792, tzinfo=<UTC>)

From 02f4a921902f7591f403f06a79e8c972c64567fd Mon Sep 17 00:00:00 2001
From: Tres Seaver
Date: Thu, 23 Jul 2015 13:44:01 -0400
Subject: [PATCH 2/5] Typo fixes.

Addresses feedback from @dhermes on #1014.
---
 docs/bigquery-usage.rst | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/bigquery-usage.rst b/docs/bigquery-usage.rst
index 86170b91efea..030fb076bc8f 100644
--- a/docs/bigquery-usage.rst
+++ b/docs/bigquery-usage.rst
@@ -244,7 +244,7 @@ Jobs describe actions performed on data in BigQuery tables:
 
 - Load data into a table
 - Run a query against data in one or more tables
-- Extrat data from a table
+- Extract data from a table
 - Copy a table
 
 List jobs for a project:
@@ -280,8 +280,8 @@ Run a query which can be expected to complete within bounded time:
 
 .. note::
 
-   If the query takse longer than the timeout allowd, ``results.job_complete``
-   will be False: we therefore poll until it is completed.
+   If the query takes longer than the timeout allowed, ``results.job_complete``
+   will be ``False``: we therefore poll until it is completed.
 
 Querying data (asynchronous)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -343,8 +343,8 @@ Poll until the job is complete:
 Inserting data (synchronous)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Start a job loading data asynchronously from a local CSV files.
-into a new table.  First, create the job locally:
+Load data synchronously from a local CSV file into a new table.  First,
+create the job locally:
 
 .. doctest::
 
@@ -383,7 +383,7 @@ locally:
     >>> job = table.load_from_storage(bucket_name='bucket-name',
     ...                               object_name='object-prefix*',
     ...                               source_format='CSV',
-    ...                               skip_leading_rows=1
+    ...                               skip_leading_rows=1,
     ...                               write_disposition='append')
     >>> job.job_id
     'e3344fba-09df-4ae0-8337-fddee34b3840'

From 31f083c35acb5edd1dd080fa2191a5d5882d5a0c Mon Sep 17 00:00:00 2001
From: Tres Seaver
Date: Thu, 23 Jul 2015 13:49:25 -0400
Subject: [PATCH 3/5] Bound polling loops.

Addresses feedback from @dhermes in #1014.
---
 docs/bigquery-usage.rst | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/docs/bigquery-usage.rst b/docs/bigquery-usage.rst
index 030fb076bc8f..cd7d42b38338 100644
--- a/docs/bigquery-usage.rst
+++ b/docs/bigquery-usage.rst
@@ -270,9 +270,11 @@ Run a query which can be expected to complete within bounded time:
         SELECT count(*) AS age_count FROM dataset_name.person_ages
         """
     >>> results = client.query(query, timeout_ms=1000)
-    >>> while not results.job_complete:
-    ...     time.sleep(10)
-    ...     results.reload()  # API request
+    >>> retry_count = 100
+    >>> while retry_count > 0 and not results.job_complete:
+    ...     retry_count -= 1
+    ...     time.sleep(10)
+    ...     results.reload()  # API request
     >>> results.schema
     [{'name': 'age_count', 'type': 'integer', 'mode': 'nullable'}]
     >>> results.rows
@@ -332,7 +334,9 @@ Poll until the job is complete:
 .. doctest::
 
     >>> import time
-    >>> while job.state == 'running':
+    >>> retry_count = 100
+    >>> while retry_count > 0 and job.state == 'running':
+    ...     retry_count -= 1
     ...     time.sleep(10)
     ...     job.reload()  # API call
     >>> job.state
@@ -415,7 +419,9 @@ Poll until the job is complete:
 .. doctest::
 
     >>> import time
-    >>> while job.state == 'running':
+    >>> retry_count = 100
+    >>> while retry_count > 0 and job.state == 'running':
+    ...     retry_count -= 1
     ...     time.sleep(10)
     ...     job.reload()  # API call
     >>> job.state

From 9f07efd43bab2c707b7bfeddc7acd43f76a3492e Mon Sep 17 00:00:00 2001
From: Tres Seaver
Date: Thu, 23 Jul 2015 13:54:47 -0400
Subject: [PATCH 4/5] Avoid 'f' as variable name.  pass mode to 'open'.

Addresses feedback from @dhermes in #1014.
---
 docs/bigquery-usage.rst | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/docs/bigquery-usage.rst b/docs/bigquery-usage.rst
index cd7d42b38338..1edf28e86d5f 100644
--- a/docs/bigquery-usage.rst
+++ b/docs/bigquery-usage.rst
@@ -355,12 +355,13 @@ create the job locally:
     >>> from gcloud import bigquery
     >>> client = bigquery.Client()
     >>> table = dataset.table(name='person_ages')
-    >>> with open('/path/to/person_ages.csv') as f:
-    ...     job = table.load_from_file(f,
-    ...                                source_format='CSV',
-    ...                                skip_leading_rows=1
-    ...                                write_disposition='truncate',
-    ...                                )  # API request
+    >>> with open('/path/to/person_ages.csv', 'rb') as file_obj:
+    ...     job = table.load_from_file(
+    ...         file_obj,
+    ...         source_format='CSV',
+    ...         skip_leading_rows=1,
+    ...         write_disposition='truncate',
+    ...         )  # API request
     >>> job.job_id
     'e3344fba-09df-4ae0-8337-fddee34b3840'
     >>> job.type

From 48eef9253d3c57f8e1cdb88dcbcb9c5a2a5ae5c4 Mon Sep 17 00:00:00 2001
From: Tres Seaver
Date: Fri, 24 Jul 2015 10:49:20 -0400
Subject: [PATCH 5/5] Give argument a more descriptive name.

Addresses feedback from @dhermes on #1014.
---
 docs/bigquery-usage.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/bigquery-usage.rst b/docs/bigquery-usage.rst
index 1edf28e86d5f..9bf03710f8ea 100644
--- a/docs/bigquery-usage.rst
+++ b/docs/bigquery-usage.rst
@@ -386,7 +386,7 @@ locally:
     >>> client = bigquery.Client()
     >>> table = dataset.table(name='person_ages')
    >>> job = table.load_from_storage(bucket_name='bucket-name',
-    ...                               object_name='object-prefix*',
+    ...                               object_name_glob='object-prefix*',
     ...                               source_format='CSV',
     ...                               skip_leading_rows=1,
     ...                               write_disposition='append')