From 224486925e5fd278963164064cc7b6d559f586e1 Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Fri, 25 Oct 2024 00:35:38 -0400 Subject: [PATCH 01/18] Add fix for #59242 --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/core/internals/construction.py | 12 +++++++++++- pandas/tests/io/test_sql.py | 14 ++++++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 01c2ed3821d7a..2880ecec81f66 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -130,7 +130,7 @@ MultiIndex I/O ^^^ -- +- Bug in :func:`read_sql` causing an unintended exception when byte data was being converted to string when using the pyarrow dtype_backend (:issue:`59242`) - Period diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 959e572b2b35b..1900ed282e876 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -970,7 +970,17 @@ def convert(arr): if dtype_backend != "numpy" and arr.dtype == np.dtype("O"): new_dtype = StringDtype() arr_cls = new_dtype.construct_array_type() - arr = arr_cls._from_sequence(arr, dtype=new_dtype) + try: + # Addressing (#59242) + # Byte data that could not be decoded into + # a string would throw a UnicodeDecodeError exception + + # Try and greedily convert to string + # Will fail if the object is bytes + arr = arr_cls._from_sequence(arr, dtype=new_dtype) + except UnicodeDecodeError: + pass + elif dtype_backend != "numpy" and isinstance(arr, np.ndarray): if arr.dtype.kind in "iufb": arr = pd_array(arr, copy=False) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index c28a33069d23f..69ad44d1a5e73 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -4352,3 +4352,17 @@ def test_xsqlite_if_exists(sqlite_buildin): (5, "E"), ] drop_table(table_name, sqlite_buildin) + + +def test_bytes_column(sqlite_buildin): + """ + Regression test for (#59242) + Bytes being returned in a column that could not be converted + to a string would raise a UnicodeDecodeError + when using dtype_backend='pyarrow' + """ + query = """ + select cast(x'0123456789abcdef0123456789abcdef' as blob) a + """ + df = pd.read_sql(query, sqlite_buildin, dtype_backend="pyarrow") + assert df.a.values[0] == b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef" From bd00fc545e25a611d97edecb9aac8c0324d17e90 Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Fri, 25 Oct 2024 18:09:53 -0400 Subject: [PATCH 02/18] add skip import --- pandas/tests/io/test_sql.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 69ad44d1a5e73..73f9ff42287fc 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -4355,6 +4355,7 @@ def test_xsqlite_if_exists(sqlite_buildin): def test_bytes_column(sqlite_buildin): + pytest.importorskip("pyarrow") """ Regression test for (#59242) Bytes being returned in a column that could not be converted From 6a23f05c282f5fb07f507e09ac2667026d258527 Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Tue, 12 Nov 2024 17:40:39 -0500 Subject: [PATCH 03/18] address comment --- pandas/core/internals/construction.py | 29 +++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 1900ed282e876..3f293281a4b53 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -17,6 +17,7 @@ from pandas._config import using_string_dtype from pandas._libs import lib +from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.astype import astype_is_view from pandas.core.dtypes.cast import ( @@ -34,7 +35,10 @@ is_object_dtype, is_scalar, ) -from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + ExtensionDtype, +) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -968,18 +972,27 @@ def convert(arr): # i.e. maybe_convert_objects didn't convert arr = maybe_infer_to_datetimelike(arr) if dtype_backend != "numpy" and arr.dtype == np.dtype("O"): - new_dtype = StringDtype() - arr_cls = new_dtype.construct_array_type() - try: + if dtype_backend == "pyarrow": + pa = import_optional_dependency("pyarrow") # Addressing (#59242) # Byte data that could not be decoded into # a string would throw a UnicodeDecodeError exception - # Try and greedily convert to string - # Will fail if the object is bytes + # Try and greedily convert to pyarrow string + # Will fail if the object is bytes: + # in this case convert to pyarrow binary + try: + str_dtype = ArrowDtype(pa.string()) + str_cls = str_dtype.construct_array_type() + arr = str_cls._from_sequence(arr, dtype=str_dtype) + except pa.lib.ArrowInvalid: + bin_dtype = ArrowDtype(pa.binary()) + bin_cls = bin_dtype.construct_array_type() + arr = bin_cls._from_sequence(arr, dtype=bin_dtype) + else: + new_dtype = StringDtype() + arr_cls = new_dtype.construct_array_type() arr = arr_cls._from_sequence(arr, dtype=new_dtype) - except UnicodeDecodeError: - pass elif dtype_backend != "numpy" and isinstance(arr, np.ndarray): if arr.dtype.kind in "iufb": From 8f900b833cd3a495a9246e888e1b2572faf583aa Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Wed, 13 Nov 2024 17:32:10 -0500 Subject: [PATCH 04/18] also fix for dtype_backend=numpy_nullable --- pandas/core/internals/construction.py | 23 +++++++++++++---------- pandas/tests/io/test_sql.py | 7 ++++--- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 1828913da65ab..245aa3291fe2f 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -972,27 +972,30 @@ def convert(arr): # i.e. maybe_convert_objects didn't convert arr = maybe_infer_to_datetimelike(arr) if dtype_backend != "numpy" and arr.dtype == np.dtype("O"): + # Addressing (#59242) + # Byte data that could not be decoded into + # a string would throw a UnicodeDecodeError exception + + # Try and greedily convert to string if dtype_backend == "pyarrow": pa = import_optional_dependency("pyarrow") - # Addressing (#59242) - # Byte data that could not be decoded into - # a string would throw a UnicodeDecodeError exception - - # Try and greedily convert to pyarrow string - # Will fail if the object is bytes: - # in this case convert to pyarrow binary try: str_dtype = ArrowDtype(pa.string()) str_cls = str_dtype.construct_array_type() arr = str_cls._from_sequence(arr, dtype=str_dtype) except pa.lib.ArrowInvalid: + # in this case convert to pyarrow binary bin_dtype = ArrowDtype(pa.binary()) bin_cls = bin_dtype.construct_array_type() arr = bin_cls._from_sequence(arr, dtype=bin_dtype) else: - new_dtype = StringDtype() - arr_cls = new_dtype.construct_array_type() - arr = arr_cls._from_sequence(arr, dtype=new_dtype) + try: + new_dtype = StringDtype() + arr_cls = new_dtype.construct_array_type() + arr = arr_cls._from_sequence(arr, dtype=new_dtype) + except UnicodeDecodeError: + # in this case do nothing + pass elif dtype_backend != "numpy" and isinstance(arr, np.ndarray): if arr.dtype.kind in "iufb": diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 1976973b28f74..f94ac59f6f7b6 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -4357,16 +4357,17 @@ def test_xsqlite_if_exists(sqlite_buildin): drop_table(table_name, sqlite_buildin) -def test_bytes_column(sqlite_buildin): +@pytest.mark.parametrize("dtype_backend", ["pyarrow", "numpy_nullable", lib.no_default]) +def test_bytes_column(sqlite_buildin, dtype_backend): pytest.importorskip("pyarrow") """ Regression test for (#59242) Bytes being returned in a column that could not be converted to a string would raise a UnicodeDecodeError - when using dtype_backend='pyarrow' + when using dtype_backend='pyarrow' or dtype_backend='numpy_nullable' """ query = """ select cast(x'0123456789abcdef0123456789abcdef' as blob) a """ - df = pd.read_sql(query, sqlite_buildin, dtype_backend="pyarrow") + df = pd.read_sql(query, sqlite_buildin, dtype_backend=dtype_backend) assert df.a.values[0] == b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef" From a32b4a6c9f0f412dbc669ba04e23a34880862446 Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Sun, 17 Nov 2024 09:15:30 -0500 Subject: [PATCH 05/18] fix --- pandas/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index 08bd1117c456a..6c97baa890777 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -191,7 +191,6 @@ # module level doc-string -__version__ = "3.0.0" __doc__ = """ pandas - a powerful data analysis and manipulation library for Python ===================================================================== From a0200d0411d34fb0599b4bd7788e5e8b937b47fc Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Wed, 20 Nov 2024 18:48:32 -0500 Subject: [PATCH 06/18] address comment --- pandas/tests/io/test_sql.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 49f4dab138843..1292cf52f42ff 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -4373,6 +4373,12 @@ def test_bytes_column(sqlite_buildin, dtype_backend): select cast(x'0123456789abcdef0123456789abcdef' as blob) a """ df = pd.read_sql(query, sqlite_buildin, dtype_backend=dtype_backend) - assert df.a.values[0] == b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef" - if dtype_backend == "pyarrow": - assert df.a.dtype == pd.ArrowDtype(pa.binary()) + expected = DataFrame( + [ + { + "a": b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef", + } + ], + dtype=(pd.ArrowDtype(pa.binary()) if dtype_backend == "pyarrow" else "O"), + ) + tm.assert_frame_equal(df, expected) From ba2e82dfa440ea3ce90259e14b43504a7a40a65f Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Wed, 4 Dec 2024 00:13:26 -0500 Subject: [PATCH 07/18] address comment --- pandas/tests/io/test_sql.py | 39 +++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 1292cf52f42ff..c07bdd54735e5 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -4360,25 +4360,22 @@ def test_xsqlite_if_exists(sqlite_buildin): drop_table(table_name, sqlite_buildin) -@pytest.mark.parametrize("dtype_backend", ["pyarrow", "numpy_nullable", lib.no_default]) -def test_bytes_column(sqlite_buildin, dtype_backend): +@pytest.mark.parametrize("conn", all_connectable) +def test_bytes_column(conn, request): + # GitHub Issue #59242 + conn = request.getfixturevalue(conn) pa = pytest.importorskip("pyarrow") - """ - Regression test for (#59242) - Bytes being returned in a column that could not be converted - to a string would raise a UnicodeDecodeError - when using dtype_backend='pyarrow' or dtype_backend='numpy_nullable' - """ - query = """ - select cast(x'0123456789abcdef0123456789abcdef' as blob) a - """ - df = pd.read_sql(query, sqlite_buildin, dtype_backend=dtype_backend) - expected = DataFrame( - [ - { - "a": b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef", - } - ], - dtype=(pd.ArrowDtype(pa.binary()) if dtype_backend == "pyarrow" else "O"), - ) - tm.assert_frame_equal(df, expected) + for dtype_backend in ["pyarrow", "numpy_nullable", lib.no_default]: + query = """ + select cast(x'0123456789abcdef0123456789abcdef' as blob) a + """ + df = pd.read_sql(query, conn, dtype_backend=dtype_backend) + expected = DataFrame( + [ + { + "a": b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef", + } + ], + dtype=(pd.ArrowDtype(pa.binary()) if dtype_backend == "pyarrow" else "O"), + ) + tm.assert_frame_equal(df, expected) From 602194a331428d2658e59d3adee04547a2cd48b9 Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Wed, 4 Dec 2024 06:51:43 -0500 Subject: [PATCH 08/18] remove cast --- pandas/tests/io/test_sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index c07bdd54735e5..020fe2e26a5c2 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -4367,7 +4367,7 @@ def test_bytes_column(conn, request): pa = pytest.importorskip("pyarrow") for dtype_backend in ["pyarrow", "numpy_nullable", lib.no_default]: query = """ - select cast(x'0123456789abcdef0123456789abcdef' as blob) a + select x'0123456789abcdef0123456789abcdef' a """ df = pd.read_sql(query, conn, dtype_backend=dtype_backend) expected = DataFrame( From f8ae2856acbeff844f90a1c90aab53ddc5fe1906 Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Wed, 4 Dec 2024 08:06:01 -0500 Subject: [PATCH 09/18] fix test --- pandas/tests/io/test_sql.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 020fe2e26a5c2..40c5d26bc3e8c 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -4360,22 +4360,33 @@ def test_xsqlite_if_exists(sqlite_buildin): drop_table(table_name, sqlite_buildin) -@pytest.mark.parametrize("conn", all_connectable) -def test_bytes_column(conn, request): +@pytest.mark.parametrize("con", all_connectable) +def test_bytes_column(con, request): # GitHub Issue #59242 - conn = request.getfixturevalue(conn) + conn = request.getfixturevalue(con) pa = pytest.importorskip("pyarrow") for dtype_backend in ["pyarrow", "numpy_nullable", lib.no_default]: query = """ select x'0123456789abcdef0123456789abcdef' a """ df = pd.read_sql(query, conn, dtype_backend=dtype_backend) + + dtype = "O" + if dtype_backend == "pyarrow": + # sqlite3 + mysql both return a binary type + # for the binary literal + dtype = pd.ArrowDtype(pa.binary()) + elif dtype_backend == "pyarrow" and "postgres" in con: + # postgres column is a bit type + # but converts to a string when returned + dtype = pd.ArrowDtype(pa.string()) + expected = DataFrame( [ { "a": b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef", } ], - dtype=(pd.ArrowDtype(pa.binary()) if dtype_backend == "pyarrow" else "O"), + dtype=dtype, ) tm.assert_frame_equal(df, expected) From 02514e0098063daa0e161473bed37f7056ace2c6 Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Wed, 4 Dec 2024 17:39:07 -0500 Subject: [PATCH 10/18] fix logic --- pandas/tests/io/test_sql.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 40c5d26bc3e8c..d3b9eef22d7ba 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -4372,14 +4372,14 @@ def test_bytes_column(con, request): df = pd.read_sql(query, conn, dtype_backend=dtype_backend) dtype = "O" - if dtype_backend == "pyarrow": - # sqlite3 + mysql both return a binary type - # for the binary literal - dtype = pd.ArrowDtype(pa.binary()) - elif dtype_backend == "pyarrow" and "postgres" in con: + if dtype_backend == "pyarrow" and "postgres" in con: # postgres column is a bit type # but converts to a string when returned dtype = pd.ArrowDtype(pa.string()) + elif dtype_backend == "pyarrow": + # sqlite3 + mysql both return a binary type + # for the binary literal + dtype = pd.ArrowDtype(pa.binary()) expected = DataFrame( [ From ef5c2ed52b9fee115460cc009382dd64f9e5eaba Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Fri, 6 Dec 2024 19:26:04 -0500 Subject: [PATCH 11/18] fix --- pandas/tests/io/test_sql.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index d3b9eef22d7ba..9d2831b9c25bb 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -4366,27 +4366,27 @@ def test_bytes_column(con, request): conn = request.getfixturevalue(con) pa = pytest.importorskip("pyarrow") for dtype_backend in ["pyarrow", "numpy_nullable", lib.no_default]: - query = """ - select x'0123456789abcdef0123456789abcdef' a - """ - df = pd.read_sql(query, conn, dtype_backend=dtype_backend) + df = pd.read_sql( + "select x'0123456789abcdef0123456789abcdef' a", + conn, + dtype_backend=dtype_backend, + ) dtype = "O" + val = b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef" if dtype_backend == "pyarrow" and "postgres" in con: # postgres column is a bit type # but converts to a string when returned dtype = pd.ArrowDtype(pa.string()) + val = ( + "0000000100100011010001010110011110001001101010" + "11110011011110111100000001001000110100010101100" + "11110001001101010111100110111101111" + ) elif dtype_backend == "pyarrow": # sqlite3 + mysql both return a binary type # for the binary literal dtype = pd.ArrowDtype(pa.binary()) - expected = DataFrame( - [ - { - "a": b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef", - } - ], - dtype=dtype, - ) + expected = DataFrame([{"a": val}], dtype=dtype) tm.assert_frame_equal(df, expected) From da7817a68ce1dfdc2fd9bbe9e9ac7489973ab744 Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Sun, 8 Dec 2024 19:28:29 -0500 Subject: [PATCH 12/18] fix --- pandas/tests/io/test_sql.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 9d2831b9c25bb..454e0198fef55 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -4374,19 +4374,25 @@ def test_bytes_column(con, request): dtype = "O" val = b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef" - if dtype_backend == "pyarrow" and "postgres" in con: - # postgres column is a bit type - # but converts to a string when returned - dtype = pd.ArrowDtype(pa.string()) + + if "postgres" in con: val = ( - "0000000100100011010001010110011110001001101010" + b"\x00\x00\x00\x80\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef" + if "adbc" in con + else "0000000100100011010001010110011110001001101010" "11110011011110111100000001001000110100010101100" "11110001001101010111100110111101111" ) - elif dtype_backend == "pyarrow": - # sqlite3 + mysql both return a binary type - # for the binary literal - dtype = pd.ArrowDtype(pa.binary()) + + if "psycopg2" in con and dtype_backend == "numpy_nullable": + dtype = pd.StringDtype() + + if dtype_backend == "pyarrow": + dtype = ( + pd.ArrowDtype(pa.string()) + if "postgres" in con and "adbc" not in con + else pd.ArrowDtype(pa.binary()) + ) expected = DataFrame([{"a": val}], dtype=dtype) tm.assert_frame_equal(df, expected) From de5c604b19f25ee94510fa29146f3e2dc50523cc Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Sun, 8 Dec 2024 21:08:31 -0500 Subject: [PATCH 13/18] also include pa.OpaqueType --- pandas/core/dtypes/dtypes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 96b0aa16940a6..60eff6e10f0be 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2263,7 +2263,9 @@ def type(self): elif pa.types.is_null(pa_type): # TODO: None? pd.NA? pa.null? return type(pa_type) - elif isinstance(pa_type, pa.ExtensionType): + elif isinstance(pa_type, pa.ExtensionType) or isinstance( + pa_type, pa.OpaqueType + ): return type(self)(pa_type.storage_type).type raise NotImplementedError(pa_type) From e025d849e0ae9585119d17af440c41ede6111ec1 Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Sun, 8 Dec 2024 22:09:52 -0500 Subject: [PATCH 14/18] fix --- pandas/tests/io/test_sql.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 454e0198fef55..5ddba38cb657c 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -4391,6 +4391,8 @@ def test_bytes_column(con, request): dtype = ( pd.ArrowDtype(pa.string()) if "postgres" in con and "adbc" not in con + else pd.ArrowDtype(pa.opaque(pa.binary(), "bit", "PostgreSQL")) + if "postgres" in con else pd.ArrowDtype(pa.binary()) ) From c066ebf80fe1d4425d0c325536b7445f59c0316f Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Fri, 27 Dec 2024 15:20:48 -0500 Subject: [PATCH 15/18] fix failing test --- pandas/tests/io/test_sql.py | 55 +++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 5ddba38cb657c..51c5235996907 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -4361,40 +4361,43 @@ def test_xsqlite_if_exists(sqlite_buildin): @pytest.mark.parametrize("con", all_connectable) -def test_bytes_column(con, request): +@pytest.mark.parametrize("dtype_backend", ["pyarrow", "numpy_nullable", lib.no_default]) +def test_bytes_column(con, dtype_backend, request): # GitHub Issue #59242 conn = request.getfixturevalue(con) pa = pytest.importorskip("pyarrow") - for dtype_backend in ["pyarrow", "numpy_nullable", lib.no_default]: - df = pd.read_sql( - "select x'0123456789abcdef0123456789abcdef' a", - conn, - dtype_backend=dtype_backend, - ) - - dtype = "O" - val = b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef" - if "postgres" in con: - val = ( - b"\x00\x00\x00\x80\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef" - if "adbc" in con - else "0000000100100011010001010110011110001001101010" - "11110011011110111100000001001000110100010101100" - "11110001001101010111100110111101111" - ) - - if "psycopg2" in con and dtype_backend == "numpy_nullable": - dtype = pd.StringDtype() + dtype = "O" + val = b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef" + if "postgres" in con: + val = ( + b"\x00\x00\x00\x80\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef" + if "adbc" in con + else "0000000100100011010001010110011110001001101010" + "11110011011110111100000001001000110100010101100" + "11110001001101010111100110111101111" + ) if dtype_backend == "pyarrow": dtype = ( pd.ArrowDtype(pa.string()) - if "postgres" in con and "adbc" not in con + if "adbc" not in con else pd.ArrowDtype(pa.opaque(pa.binary(), "bit", "PostgreSQL")) - if "postgres" in con - else pd.ArrowDtype(pa.binary()) ) - expected = DataFrame([{"a": val}], dtype=dtype) - tm.assert_frame_equal(df, expected) + if "psycopg2" in con: + if dtype_backend == "numpy_nullable": + dtype = pd.StringDtype() + elif dtype_backend == lib.no_default and pd.options.future.infer_string: + dtype = pd.StringDtype(storage="pyarrow", na_value=np.nan) + + if "postgres" not in con and dtype_backend == "pyarrow": + dtype = pd.ArrowDtype(pa.binary()) + + expected = DataFrame([{"a": val}], dtype=dtype) + df = pd.read_sql( + "select x'0123456789abcdef0123456789abcdef' a", + conn, + dtype_backend=dtype_backend, + ) + tm.assert_frame_equal(df, expected) From a8f69e525133eeeacafe8606d59d0a327dbb2632 Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Mon, 20 Jan 2025 21:54:27 -0500 Subject: [PATCH 16/18] addressing comments --- pandas/tests/io/test_sql.py | 38 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 51c5235996907..6f986bdb0c9ed 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -4361,38 +4361,34 @@ def test_xsqlite_if_exists(sqlite_buildin): @pytest.mark.parametrize("con", all_connectable) -@pytest.mark.parametrize("dtype_backend", ["pyarrow", "numpy_nullable", lib.no_default]) def test_bytes_column(con, dtype_backend, request): # GitHub Issue #59242 conn = request.getfixturevalue(con) pa = pytest.importorskip("pyarrow") - dtype = "O" val = b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef" - if "postgres" in con: - val = ( - b"\x00\x00\x00\x80\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef" - if "adbc" in con - else "0000000100100011010001010110011110001001101010" - "11110011011110111100000001001000110100010101100" - "11110001001101010111100110111101111" - ) - if dtype_backend == "pyarrow": - dtype = ( - pd.ArrowDtype(pa.string()) - if "adbc" not in con - else pd.ArrowDtype(pa.opaque(pa.binary(), "bit", "PostgreSQL")) + if "adbc" in con: + val = b"\x00\x00\x00\x80\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef" + else: + val = ( + "0000000100100011010001010110011110001001101010" + "11110011011110111100000001001000110100010101100" + "11110001001101010111100110111101111" ) - if "psycopg2" in con: + if dtype_backend == "pyarrow": + dtype = pd.ArrowDtype(pa.binary()) + if "postgres" in con: + if "adbc" in con: + dtype = pd.ArrowDtype(pa.opaque(pa.binary(), "bit", "PostgreSQL")) + else: + dtype = pd.ArrowDtype(pa.string()) + else: + dtype = "O" + if "postgres" in con and "psycopg2" in con: if dtype_backend == "numpy_nullable": dtype = pd.StringDtype() - elif dtype_backend == lib.no_default and pd.options.future.infer_string: - dtype = pd.StringDtype(storage="pyarrow", na_value=np.nan) - - if "postgres" not in con and dtype_backend == "pyarrow": - dtype = pd.ArrowDtype(pa.binary()) expected = DataFrame([{"a": val}], dtype=dtype) df = pd.read_sql( From 1a0b2cfab26686b168f4781678841fa8a4e7e0db Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Mon, 20 Jan 2025 22:31:11 -0500 Subject: [PATCH 17/18] fix --- pandas/tests/io/test_sql.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 6f986bdb0c9ed..e01e73188c38c 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -4389,6 +4389,8 @@ def test_bytes_column(con, dtype_backend, request): if "postgres" in con and "psycopg2" in con: if dtype_backend == "numpy_nullable": dtype = pd.StringDtype() + elif dtype_backend == lib.no_default and pd.options.future.infer_string: + dtype = pd.StringDtype(storage="pyarrow", na_value=np.nan) expected = DataFrame([{"a": val}], dtype=dtype) df = pd.read_sql( From 4c1c282f58825e016c4fc262843c6dbdefa1372c Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Tue, 28 Jan 2025 19:07:44 -0500 Subject: [PATCH 18/18] clarify --- pandas/tests/io/test_sql.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index e01e73188c38c..95aebb2907d2c 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -4366,7 +4366,8 @@ def test_bytes_column(con, dtype_backend, request): conn = request.getfixturevalue(con) pa = pytest.importorskip("pyarrow") - val = b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef" + hex_str = "0123456789abcdef0123456789abcdef" + val = bytes.fromhex(hex_str) if "postgres" in con: if "adbc" in con: val = b"\x00\x00\x00\x80\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef" @@ -4394,7 +4395,7 @@ def test_bytes_column(con, dtype_backend, request): expected = DataFrame([{"a": val}], dtype=dtype) df = pd.read_sql( - "select x'0123456789abcdef0123456789abcdef' a", + f"select x'{hex_str}' a", conn, dtype_backend=dtype_backend, )