
ENH: Add dtype argument to read_sql_query (GH10285) #37546


Merged
merged 12 commits on Dec 23, 2020
Changes from 7 commits
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
@@ -307,6 +307,7 @@ Other enhancements
- Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`)
- Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`)
- Implement method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`)
- :func:`pandas.read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`)
Contributor

move to 1.3

Contributor Author

Done

- When :func:`read_csv/sas/json` are called with ``chunksize``/``iterator`` they can be used in a ``with`` statement as they return context-managers (:issue:`38225`)

.. ---------------------------------------------------------------------------
41 changes: 37 additions & 4 deletions pandas/io/sql.py
@@ -7,12 +7,13 @@
from datetime import date, datetime, time
from functools import partial
import re
from typing import Iterator, List, Optional, Union, overload
from typing import Dict, Iterator, List, Optional, Union, overload
import warnings

import numpy as np

import pandas._libs.lib as lib
from pandas._typing import Dtype

from pandas.core.dtypes.common import is_datetime64tz_dtype, is_dict_like, is_list_like
from pandas.core.dtypes.dtypes import DatetimeTZDtype
@@ -119,10 +120,15 @@ def _parse_date_columns(data_frame, parse_dates):
return data_frame


def _wrap_result(data, columns, index_col=None, coerce_float=True, parse_dates=None):
def _wrap_result(
data, columns, index_col=None, coerce_float=True, parse_dates=None, dtype=None
):
"""Wrap result set of query in a DataFrame."""
frame = DataFrame.from_records(data, columns=columns, coerce_float=coerce_float)

if dtype:
frame = frame.astype(dtype)

frame = _parse_date_columns(frame, parse_dates)

if index_col is not None:
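The new branch above defers entirely to DataFrame.astype, which accepts either a single dtype for every column or a per-column mapping; a minimal standalone sketch of that behavior (data values are illustrative only, not from this diff):

import numpy as np
import pandas as pd

# A frame as DataFrame.from_records would hand it back: plain float64 columns.
frame = pd.DataFrame({"SepalLength": [5.1, 4.9], "SepalWidth": [3.0, 2.0]})

# One dtype for every column ...
print(frame.astype(np.float32).dtypes)
# ... or a per-column mapping, which is exactly what the new ``dtype`` argument forwards.
print(frame.astype({"SepalLength": np.float32, "SepalWidth": "Int64"}).dtypes)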
@@ -295,6 +301,7 @@ def read_sql_query(
params=None,
parse_dates=None,
chunksize: None = None,
dtype: Optional[Union[Dtype, Dict[str, Dtype]]] = None,
Contributor

this type actually is pretty useful, can you define it in _typing, call it DtypeOrDictDtype / DtypeTable and add a comment about it. cc @simonjayhawkins @WillAyd @jorisvandenbossche for the name here.

Member

Suggested change
dtype: Optional[Union[Dtype, Dict[str, Dtype]]] = None,
dtype: Optional[Union[Dtype, Dict[Label, Dtype]]] = None,

Maybe DtypeArg?

Contributor Author

Done, added DtypeArg to _typing
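For reference, a sketch of what such an alias and the resulting signature could look like, assuming the Dtype and Label aliases already in pandas._typing; the exact definition merged into pandas may differ:

from typing import Dict, Union

from pandas._typing import Dtype, Label

# Hypothetical alias along the lines discussed in this thread:
# either a single dtype or a mapping of column label -> dtype.
DtypeArg = Union[Dtype, Dict[Label, Dtype]]

# Each overload above would then read: dtype: Optional[DtypeArg] = None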

) -> DataFrame:
...

@@ -308,6 +315,7 @@ def read_sql_query(
params=None,
parse_dates=None,
chunksize: int = 1,
dtype: Optional[Union[Dtype, Dict[str, Dtype]]] = None,
) -> Iterator[DataFrame]:
...

@@ -320,6 +328,7 @@ def read_sql_query(
params=None,
parse_dates=None,
chunksize: Optional[int] = None,
dtype: Optional[Union[Dtype, Dict[str, Dtype]]] = None,
) -> Union[DataFrame, Iterator[DataFrame]]:
"""
Read SQL query into a DataFrame.
@@ -358,6 +367,9 @@ def read_sql_query(
chunksize : int, default None
If specified, return an iterator where `chunksize` is the number of
rows to include in each chunk.
dtype : Type name or dict of columns
Data type for data or columns. E.g. np.float64 or
{'a': np.float64, 'b': np.int32, 'c': 'Int64'}
Contributor

need a versionadded 1.3 here. ok to add in next PR

Contributor Author

Sorry, I see I didn't commit that change. But I will indeed add it to the follow-on.


Returns
-------
@@ -381,6 +393,7 @@ def read_sql_query(
coerce_float=coerce_float,
parse_dates=parse_dates,
chunksize=chunksize,
dtype=dtype,
)
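End to end, the public signature wired up in this hunk can be exercised as below; a hedged sketch against an in-memory SQLite table (table name and values are illustrative, not part of this PR):

import sqlite3

import pandas as pd

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE iris (SepalLength REAL, SepalWidth REAL)")
conn.executemany("INSERT INTO iris VALUES (?, ?)", [(5.1, 3.0), (4.9, 2.0)])

# Without dtype both columns come back as float64; with the new argument they are
# cast on the way in, per column here, or with a single dtype for all columns.
df = pd.read_sql_query(
    "SELECT SepalLength, SepalWidth FROM iris",
    conn,
    dtype={"SepalLength": "float32", "SepalWidth": "Int64"},
)
print(df.dtypes)  # SepalLength -> float32, SepalWidth -> Int64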


@@ -1225,7 +1238,13 @@ def read_table(

@staticmethod
def _query_iterator(
result, chunksize, columns, index_col=None, coerce_float=True, parse_dates=None
result,
chunksize,
columns,
index_col=None,
coerce_float=True,
parse_dates=None,
dtype=None,
Contributor

can you add type annotations anywhere you are adding the new argument

Contributor Author

Done

):
"""Return generator through chunked result set"""
while True:
@@ -1239,6 +1258,7 @@ def _query_iterator(
index_col=index_col,
coerce_float=coerce_float,
parse_dates=parse_dates,
dtype=dtype,
)

def read_query(
@@ -1249,6 +1269,7 @@ def read_query(
parse_dates=None,
params=None,
chunksize=None,
dtype=None,
):
"""
Read SQL query into a DataFrame.
@@ -1304,6 +1325,7 @@
index_col=index_col,
coerce_float=coerce_float,
parse_dates=parse_dates,
dtype=dtype,
)
else:
data = result.fetchall()
@@ -1313,6 +1335,7 @@
index_col=index_col,
coerce_float=coerce_float,
parse_dates=parse_dates,
dtype=dtype,
)
return frame
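Because the same dtype is also forwarded into _query_iterator, the cast is applied to every chunk when chunksize is given; a short sketch under the same illustrative SQLite setup as the earlier example:

import sqlite3

import pandas as pd

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE iris (SepalLength REAL, SepalWidth REAL)")
conn.executemany("INSERT INTO iris VALUES (?, ?)", [(5.1, 3.0), (4.9, 2.0)])

# Each yielded chunk has already been cast inside _wrap_result.
for chunk in pd.read_sql_query(
    "SELECT SepalLength, SepalWidth FROM iris",
    conn,
    chunksize=1,
    dtype={"SepalLength": "float32", "SepalWidth": "Int64"},
):
    assert str(chunk["SepalLength"].dtype) == "float32"
    assert str(chunk["SepalWidth"].dtype) == "Int64"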

@@ -1712,7 +1735,13 @@ def execute(self, *args, **kwargs):

@staticmethod
def _query_iterator(
cursor, chunksize, columns, index_col=None, coerce_float=True, parse_dates=None
cursor,
chunksize,
columns,
index_col=None,
coerce_float=True,
parse_dates=None,
dtype=None,
):
"""Return generator through chunked result set"""
while True:
@@ -1729,6 +1758,7 @@ def _query_iterator(
index_col=index_col,
coerce_float=coerce_float,
parse_dates=parse_dates,
dtype=dtype,
)

def read_query(
@@ -1739,6 +1769,7 @@ def read_query(
params=None,
parse_dates=None,
chunksize=None,
dtype=None,
):

args = _convert_params(sql, params)
@@ -1753,6 +1784,7 @@
index_col=index_col,
coerce_float=coerce_float,
parse_dates=parse_dates,
dtype=dtype,
)
else:
data = self._fetchall_as_list(cursor)
@@ -1764,6 +1796,7 @@
index_col=index_col,
coerce_float=coerce_float,
parse_dates=parse_dates,
dtype=dtype,
)
return frame

19 changes: 19 additions & 0 deletions pandas/tests/io/test_sql.py
@@ -857,6 +857,25 @@ def test_multiindex_roundtrip(self):
)
tm.assert_frame_equal(df, result, check_index_type=True)

@pytest.mark.parametrize(
"dtype, expected",
[
(None, [float, float]),
(int, [int, int]),
(float, [float, float]),
({"SepalLength": int, "SepalWidth": float}, [int, float]),
],
)
def test_dtype_argument(self, dtype, expected):
# GH10285 Add dtype argument to read_sql_query
result = sql.read_sql_query(
"SELECT SepalLength, SepalWidth FROM iris", self.conn, dtype=dtype
)
assert result.dtypes.to_dict() == {
Contributor

can you construct an expected frame and use tm.assert_frame_equal

"SepalLength": expected[0],
"SepalWidth": expected[1],
}
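One way the assertion could be reworked along the lines of the review comment above, as a rough sketch only (it reuses the names from the test and builds the expected frame by casting an un-typed read):

# Hypothetical restructuring of the check above, not part of this diff.
expected_frame = sql.read_sql_query(
    "SELECT SepalLength, SepalWidth FROM iris", self.conn
)
if dtype is not None:
    expected_frame = expected_frame.astype(dtype)
tm.assert_frame_equal(result, expected_frame)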

def test_integer_col_names(self):
df = DataFrame([[1, 2], [3, 4]], columns=[0, 1])
sql.to_sql(df, "test_frame_integer_col_names", self.conn, if_exists="replace")