From 5c60f5c11af936a93dbad1959dce64bea61771ea Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 10 Mar 2014 21:27:15 +0100 Subject: [PATCH 1/2] BUG/TST: replace iterrows with itertuples in sql insert (GH6509) --- doc/source/release.rst | 2 ++ pandas/io/sql.py | 13 +++++++------ pandas/io/tests/test_sql.py | 14 +++++++++++++- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 0e6924e4b0122..c42c9920efef1 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -227,6 +227,8 @@ Bug Fixes - Series.quantile raising on an ``object`` dtype (:issue:`6555`) - Bug in ``.xs`` with a ``nan`` in level when dropped (:issue:`6574`) - Bug in fillna with method = 'bfill/ffill' and ``datetime64[ns]`` dtype (:issue:`6587`) +- Bug in sql writing with mixed dtypes possibly leading to data loss (:issue:`6509`) + pandas 0.13.1 ------------- diff --git a/pandas/io/sql.py b/pandas/io/sql.py index cddcb4d72373b..4c0c18a0e7bd0 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -423,16 +423,17 @@ def insert(self): ins = self.insert_statement() data_list = [] # to avoid if check for every row + keys = self.frame.columns if self.index is not None: - for t in self.frame.iterrows(): + for t in self.frame.itertuples(): data = dict((k, self.maybe_asscalar(v)) - for k, v in t[1].iteritems()) + for k, v in zip(keys, t[1:])) data[self.index] = self.maybe_asscalar(t[0]) data_list.append(data) else: - for t in self.frame.iterrows(): + for t in self.frame.itertuples(): data = dict((k, self.maybe_asscalar(v)) - for k, v in t[1].iteritems()) + for k, v in zip(keys, t[1:])) data_list.append(data) self.pd_sql.execute(ins, data_list) @@ -758,8 +759,8 @@ def insert_statement(self): def insert(self): ins = self.insert_statement() cur = self.pd_sql.con.cursor() - for r in self.frame.iterrows(): - data = [self.maybe_asscalar(v) for v in r[1].values] + for r in self.frame.itertuples(): + data = [self.maybe_asscalar(v) for v in r[1:]] if self.index is not None: data.insert(0, self.maybe_asscalar(r[0])) cur.execute(ins, tuple(data)) diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index 2be086cddf7c4..89c4bd48576e9 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -7,7 +7,7 @@ import nose import numpy as np -from pandas import DataFrame +from pandas import DataFrame, Series from pandas.compat import range, lrange, iteritems #from pandas.core.datetools import format as date_format @@ -554,6 +554,18 @@ def test_date_parsing(self): self.assertTrue(issubclass(df.IntDateCol.dtype.type, np.datetime64), "IntDateCol loaded with incorrect type") + def test_mixed_dtype_insert(self): + # see GH6509 + s1 = Series(2**25 + 1,dtype=np.int32) + s2 = Series(0.0,dtype=np.float32) + df = DataFrame({'s1': s1, 's2': s2}) + + # write and read again + df.to_sql("test_read_write", self.conn) + df2 = sql.read_table("test_read_write", self.conn) + + tm.assert_equal(df['s1'].values, df2['s1'].values) + class TestSQLAlchemy(_TestSQLAlchemy): """ From 1127d70342b3ff68a1709db1a8ea6463ab946777 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 11 Mar 2014 10:08:54 +0100 Subject: [PATCH 2/2] TST: add check_exact arg to assert_frame/series_equal --- pandas/io/tests/test_sql.py | 6 +++--- pandas/util/testing.py | 16 ++++++++++++---- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index 89c4bd48576e9..0e26a66921df4 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -560,11 +560,11 @@ def test_mixed_dtype_insert(self): s2 = Series(0.0,dtype=np.float32) df = DataFrame({'s1': s1, 's2': s2}) - # write and read again - df.to_sql("test_read_write", self.conn) + # write and read again + df.to_sql("test_read_write", self.conn, index=False) df2 = sql.read_table("test_read_write", self.conn) - tm.assert_equal(df['s1'].values, df2['s1'].values) + tm.assert_frame_equal(df, df2, check_dtype=False, check_exact=True) class TestSQLAlchemy(_TestSQLAlchemy): diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 007dc8af5ed12..a0876179ee4af 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -499,12 +499,18 @@ def is_sorted(seq): def assert_series_equal(left, right, check_dtype=True, check_index_type=False, check_series_type=False, - check_less_precise=False): + check_less_precise=False, + check_exact=False): if check_series_type: assert_isinstance(left, type(right)) if check_dtype: assert_attr_equal('dtype', left, right) - assert_almost_equal(left.values, right.values, check_less_precise) + if check_exact: + if not np.array_equal(left.values, right.values): + raise AssertionError('{0} is not equal to {1}.'.format(left.values, + right.values)) + else: + assert_almost_equal(left.values, right.values, check_less_precise) if check_less_precise: assert_almost_equal( left.index.values, right.index.values, check_less_precise) @@ -522,7 +528,8 @@ def assert_frame_equal(left, right, check_dtype=True, check_frame_type=False, check_less_precise=False, check_names=True, - by_blocks=False): + by_blocks=False, + check_exact=False): if check_frame_type: assert_isinstance(left, type(right)) assert_isinstance(left, DataFrame) @@ -555,7 +562,8 @@ def assert_frame_equal(left, right, check_dtype=True, assert_series_equal(lcol, rcol, check_dtype=check_dtype, check_index_type=check_index_type, - check_less_precise=check_less_precise) + check_less_precise=check_less_precise, + check_exact=check_exact) if check_index_type: assert_isinstance(left.index, type(right.index))