Skip to content

Commit 3b112bf

Browse files
max-sixty authored and
tswast committed
Encode before uploading (#108)
* encode before uploading
* set py file coding for py2
* lint
* move test to travis test class
* try forcing utf-8 encoding
* add test
* correct expected sizes
* test data matches
* test unicode locally
* Py2/Py3 compat
* typo
* what's new
1 parent 61bc28f commit 3b112bf

File tree

3 files changed

+124
-2
lines changed

3 files changed

+124
-2
lines changed

docs/source/changelog.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
11
Changelog
22
=========
33

4+
5+
0.3.1 / [TBD]
6+
------------------
7+
8+
- Fix an issue where Unicode couldn't be uploaded in Python 2 (:issue:`93`)
9+
10+
411
0.3.0 / 2018-01-03
512
------------------
613

pandas_gbq/gbq.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -559,7 +559,7 @@ def run_query(self, query, **kwargs):
559559

560560
def load_data(self, dataframe, dataset_id, table_id, chunksize):
561561
from google.cloud.bigquery import LoadJobConfig
562-
from six import StringIO
562+
from six import BytesIO
563563

564564
destination_table = self.client.dataset(dataset_id).table(table_id)
565565
job_config = LoadJobConfig()
@@ -581,7 +581,11 @@ def load_data(self, dataframe, dataset_id, table_id, chunksize):
581581
self._print("\rLoad is {0}% Complete".format(
582582
((total_rows - remaining_rows) * 100) / total_rows))
583583

584-
body = StringIO('{}\n'.format('\n'.join(rows)))
584+
body = '{}\n'.format('\n'.join(rows))
585+
if isinstance(body, bytes):
586+
body = body.decode('utf-8')
587+
body = body.encode('utf-8')
588+
body = BytesIO(body)
585589

586590
try:
587591
self.client.load_table_from_file(

pandas_gbq/tests/test_gbq.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# -*- coding: utf-8 -*-
2+
13
import pytest
24

35
import re
@@ -7,6 +9,7 @@
79
import os
810
from random import randint
911
import logging
12+
import sys
1013

1114
import numpy as np
1215

@@ -1154,6 +1157,61 @@ def test_google_upload_errors_should_raise_exception(self):
11541157
gbq.to_gbq(bad_df, self.destination_table + test_id,
11551158
_get_project_id(), private_key=_get_private_key_path())
11561159

1160+
def test_upload_chinese_unicode_data(self):
1161+
test_id = "2"
1162+
test_size = 6
1163+
df = DataFrame(np.random.randn(6, 4), index=range(6),
1164+
columns=list('ABCD'))
1165+
df['s'] = u'信用卡'
1166+
1167+
gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(),
1168+
chunksize=10000)
1169+
1170+
result_df = gbq.read_gbq("SELECT * FROM {0}".format(
1171+
self.destination_table + test_id),
1172+
project_id=_get_project_id())
1173+
1174+
assert len(result_df) == test_size
1175+
1176+
pytest.skipif(
1177+
sys.version_info.major < 3,
1178+
reason='Unicode comparison in Py2 not working')
1179+
1180+
result = result_df['s'].sort_values()
1181+
expected = df['s'].sort_values()
1182+
1183+
tm.assert_numpy_array_equal(expected.values, result.values)
1184+
1185+
def test_upload_other_unicode_data(self):
1186+
test_id = "3"
1187+
test_size = 3
1188+
df = DataFrame({
1189+
's': ['Skywalker™', 'lego', 'hülle'],
1190+
'i': [200, 300, 400],
1191+
'd': [
1192+
'2017-12-13 17:40:39', '2017-12-13 17:40:39',
1193+
'2017-12-13 17:40:39'
1194+
]
1195+
})
1196+
1197+
gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(),
1198+
chunksize=10000)
1199+
1200+
result_df = gbq.read_gbq("SELECT * FROM {0}".format(
1201+
self.destination_table + test_id),
1202+
project_id=_get_project_id())
1203+
1204+
assert len(result_df) == test_size
1205+
1206+
pytest.skipif(
1207+
sys.version_info.major < 3,
1208+
reason='Unicode comparison in Py2 not working')
1209+
1210+
result = result_df['s'].sort_values()
1211+
expected = df['s'].sort_values()
1212+
1213+
tm.assert_numpy_array_equal(expected.values, result.values)
1214+
11571215
def test_generate_schema(self):
11581216
df = tm.makeMixedDataFrame()
11591217
schema = gbq._generate_bq_schema(df)
@@ -1467,6 +1525,59 @@ def test_upload_data(self):
14671525

14681526
assert result['num_rows'][0] == test_size
14691527

1528+
def test_upload_chinese_unicode_data(self):
1529+
test_id = "2"
1530+
test_size = 6
1531+
df = DataFrame(np.random.randn(6, 4), index=range(6),
1532+
columns=list('ABCD'))
1533+
df['s'] = u'信用卡'
1534+
1535+
gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(),
1536+
chunksize=10000)
1537+
1538+
result_df = gbq.read_gbq("SELECT * FROM {0}".format(
1539+
self.destination_table + test_id),
1540+
project_id=_get_project_id())
1541+
1542+
assert len(result_df) == test_size
1543+
1544+
if sys.version_info.major < 3:
1545+
pytest.skip(msg='Unicode comparison in Py2 not working')
1546+
1547+
result = result_df['s'].sort_values()
1548+
expected = df['s'].sort_values()
1549+
1550+
tm.assert_numpy_array_equal(expected.values, result.values)
1551+
1552+
def test_upload_other_unicode_data(self):
1553+
test_id = "3"
1554+
test_size = 3
1555+
df = DataFrame({
1556+
's': ['Skywalker™', 'lego', 'hülle'],
1557+
'i': [200, 300, 400],
1558+
'd': [
1559+
'2017-12-13 17:40:39', '2017-12-13 17:40:39',
1560+
'2017-12-13 17:40:39'
1561+
]
1562+
})
1563+
1564+
gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(),
1565+
chunksize=10000)
1566+
1567+
result_df = gbq.read_gbq("SELECT * FROM {0}".format(
1568+
self.destination_table + test_id),
1569+
project_id=_get_project_id())
1570+
1571+
assert len(result_df) == test_size
1572+
1573+
if sys.version_info.major < 3:
1574+
pytest.skip(msg='Unicode comparison in Py2 not working')
1575+
1576+
result = result_df['s'].sort_values()
1577+
expected = df['s'].sort_values()
1578+
1579+
tm.assert_numpy_array_equal(expected.values, result.values)
1580+
14701581

14711582
class TestToGBQIntegrationWithServiceAccountKeyContents(object):
14721583
# Changes to BigQuery table schema may take up to 2 minutes as of May 2015

0 commit comments

Comments
 (0)