Fix uploading Unicode data on Python 2 (issue 93).

load_data now builds the newline-delimited payload as UTF-8 encoded bytes and
wraps it in BytesIO instead of StringIO.  Unicode upload tests are added to
two test classes; every new test skips its final value comparison on Python 2
by calling pytest.skip() under an explicit version check (pytest has no
top-level ``skipif`` function, only the ``pytest.mark.skipif`` decorator).

diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index 1cd36a41..5d1bb98b 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -1,6 +1,13 @@
 Changelog
 =========
 
+
+0.3.1 / [TBD]
+------------------
+
+- Fix an issue where Unicode couldn't be uploaded in Python 2 (:issue:`93`)
+
+
 0.3.0 / 2018-01-03
 ------------------
 
diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index 77efe100..67d5ea51 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -559,7 +559,7 @@ def run_query(self, query, **kwargs):
 
     def load_data(self, dataframe, dataset_id, table_id, chunksize):
         from google.cloud.bigquery import LoadJobConfig
-        from six import StringIO
+        from six import BytesIO
 
         destination_table = self.client.dataset(dataset_id).table(table_id)
         job_config = LoadJobConfig()
@@ -581,7 +581,11 @@ def load_data(self, dataframe, dataset_id, table_id, chunksize):
             self._print("\rLoad is {0}% Complete".format(
                 ((total_rows - remaining_rows) * 100) / total_rows))
 
-            body = StringIO('{}\n'.format('\n'.join(rows)))
+            body = '{}\n'.format('\n'.join(rows))
+            if isinstance(body, bytes):
+                body = body.decode('utf-8')
+            body = body.encode('utf-8')
+            body = BytesIO(body)
 
             try:
                 self.client.load_table_from_file(
diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py
index 75274d97..27f991d7 100644
--- a/pandas_gbq/tests/test_gbq.py
+++ b/pandas_gbq/tests/test_gbq.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+
 import pytest
 
 import re
@@ -7,6 +9,7 @@
 import os
 from random import randint
 import logging
+import sys
 
 import numpy as np
 
@@ -1154,6 +1157,61 @@ def test_google_upload_errors_should_raise_exception(self):
             gbq.to_gbq(bad_df, self.destination_table + test_id,
                        _get_project_id(), private_key=_get_private_key_path())
 
+    def test_upload_chinese_unicode_data(self):
+        test_id = "2"
+        test_size = 6
+        df = DataFrame(np.random.randn(6, 4), index=range(6),
+                       columns=list('ABCD'))
+        df['s'] = u'信用卡'
+
+        gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(),
+                   chunksize=10000)
+
+        result_df = gbq.read_gbq("SELECT * FROM {0}".format(
+            self.destination_table + test_id),
+            project_id=_get_project_id())
+
+        assert len(result_df) == test_size
+
+        if sys.version_info.major < 3:
+            pytest.skip(
+                msg='Unicode comparison in Py2 not working')
+
+        result = result_df['s'].sort_values()
+        expected = df['s'].sort_values()
+
+        tm.assert_numpy_array_equal(expected.values, result.values)
+
+    def test_upload_other_unicode_data(self):
+        test_id = "3"
+        test_size = 3
+        df = DataFrame({
+            's': ['Skywalker™', 'lego', 'hülle'],
+            'i': [200, 300, 400],
+            'd': [
+                '2017-12-13 17:40:39', '2017-12-13 17:40:39',
+                '2017-12-13 17:40:39'
+            ]
+        })
+
+        gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(),
+                   chunksize=10000)
+
+        result_df = gbq.read_gbq("SELECT * FROM {0}".format(
+            self.destination_table + test_id),
+            project_id=_get_project_id())
+
+        assert len(result_df) == test_size
+
+        if sys.version_info.major < 3:
+            pytest.skip(
+                msg='Unicode comparison in Py2 not working')
+
+        result = result_df['s'].sort_values()
+        expected = df['s'].sort_values()
+
+        tm.assert_numpy_array_equal(expected.values, result.values)
+
     def test_generate_schema(self):
         df = tm.makeMixedDataFrame()
         schema = gbq._generate_bq_schema(df)
@@ -1467,6 +1525,59 @@ def test_upload_data(self):
 
         assert result['num_rows'][0] == test_size
 
+    def test_upload_chinese_unicode_data(self):
+        test_id = "2"
+        test_size = 6
+        df = DataFrame(np.random.randn(6, 4), index=range(6),
+                       columns=list('ABCD'))
+        df['s'] = u'信用卡'
+
+        gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(),
+                   chunksize=10000)
+
+        result_df = gbq.read_gbq("SELECT * FROM {0}".format(
+            self.destination_table + test_id),
+            project_id=_get_project_id())
+
+        assert len(result_df) == test_size
+
+        if sys.version_info.major < 3:
+            pytest.skip(msg='Unicode comparison in Py2 not working')
+
+        result = result_df['s'].sort_values()
+        expected = df['s'].sort_values()
+
+        tm.assert_numpy_array_equal(expected.values, result.values)
+
+    def test_upload_other_unicode_data(self):
+        test_id = "3"
+        test_size = 3
+        df = DataFrame({
+            's': ['Skywalker™', 'lego', 'hülle'],
+            'i': [200, 300, 400],
+            'd': [
+                '2017-12-13 17:40:39', '2017-12-13 17:40:39',
+                '2017-12-13 17:40:39'
+            ]
+        })
+
+        gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(),
+                   chunksize=10000)
+
+        result_df = gbq.read_gbq("SELECT * FROM {0}".format(
+            self.destination_table + test_id),
+            project_id=_get_project_id())
+
+        assert len(result_df) == test_size
+
+        if sys.version_info.major < 3:
+            pytest.skip(msg='Unicode comparison in Py2 not working')
+
+        result = result_df['s'].sort_values()
+        expected = df['s'].sort_values()
+
+        tm.assert_numpy_array_equal(expected.values, result.values)
+
 
 class TestToGBQIntegrationWithServiceAccountKeyContents(object):
     # Changes to BigQuery table schema may take up to 2 minutes as of May 2015