-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
feat: Implement option 'delete_rows' of argument 'if_exists' in 'DataFrame.to_sql' API. #60376
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -76,6 +76,7 @@ | |
|
||
from sqlalchemy import Table | ||
from sqlalchemy.sql.expression import ( | ||
Delete, | ||
Select, | ||
TextClause, | ||
) | ||
|
@@ -738,7 +739,7 @@ def to_sql( | |
name: str, | ||
con, | ||
schema: str | None = None, | ||
if_exists: Literal["fail", "replace", "append"] = "fail", | ||
if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail", | ||
index: bool = True, | ||
index_label: IndexLabel | None = None, | ||
chunksize: int | None = None, | ||
|
@@ -750,6 +751,12 @@ def to_sql( | |
""" | ||
Write records stored in a DataFrame to a SQL database. | ||
|
||
.. warning:: | ||
The pandas library does not attempt to sanitize inputs provided via a to_sql call. | ||
Please refer to the documentation for the underlying database driver to see if it | ||
will properly prevent injection, or alternatively be advised of a security risk when | ||
executing arbitrary commands in a to_sql call. | ||
|
||
Parameters | ||
---------- | ||
frame : DataFrame, Series | ||
|
@@ -764,10 +771,11 @@ def to_sql( | |
schema : str, optional | ||
Name of SQL schema in database to write to (if database flavor | ||
supports this). If None, use default schema (default). | ||
if_exists : {'fail', 'replace', 'append'}, default 'fail' | ||
if_exists : {'fail', 'replace', 'append', 'delete_rows'}, default 'fail' | ||
- fail: If table exists, do nothing. | ||
- replace: If table exists, drop it, recreate it, and insert data. | ||
- append: If table exists, insert data. Create if does not exist. | ||
- delete_rows: If a table exists, delete all records and insert data. | ||
index : bool, default True | ||
Write DataFrame index as a column. | ||
index_label : str or sequence, optional | ||
|
@@ -818,7 +826,7 @@ def to_sql( | |
`sqlite3 <https://docs.python.org/3/library/sqlite3.html#sqlite3.Cursor.rowcount>`__ or | ||
`SQLAlchemy <https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.BaseCursorResult.rowcount>`__ | ||
""" # noqa: E501 | ||
if if_exists not in ("fail", "replace", "append"): | ||
if if_exists not in ("fail", "replace", "append", "delete_rows"): | ||
raise ValueError(f"'{if_exists}' is not valid for if_exists") | ||
|
||
if isinstance(frame, Series): | ||
|
@@ -926,7 +934,7 @@ def __init__( | |
pandas_sql_engine, | ||
frame=None, | ||
index: bool | str | list[str] | None = True, | ||
if_exists: Literal["fail", "replace", "append"] = "fail", | ||
if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail", | ||
prefix: str = "pandas", | ||
index_label=None, | ||
schema=None, | ||
|
@@ -974,11 +982,13 @@ def create(self) -> None: | |
if self.exists(): | ||
if self.if_exists == "fail": | ||
raise ValueError(f"Table '{self.name}' already exists.") | ||
if self.if_exists == "replace": | ||
elif self.if_exists == "replace": | ||
self.pd_sql.drop_table(self.name, self.schema) | ||
self._execute_create() | ||
elif self.if_exists == "append": | ||
pass | ||
elif self.if_exists == "delete_rows": | ||
self.pd_sql.delete_rows(self.name, self.schema) | ||
else: | ||
raise ValueError(f"'{self.if_exists}' is not valid for if_exists") | ||
else: | ||
|
@@ -997,7 +1007,7 @@ def _execute_insert(self, conn, keys: list[str], data_iter) -> int: | |
Each item contains a list of values to be inserted | ||
""" | ||
data = [dict(zip(keys, row)) for row in data_iter] | ||
result = conn.execute(self.table.insert(), data) | ||
result = self.pd_sql.execute(self.table.insert(), data) | ||
return result.rowcount | ||
|
||
def _execute_insert_multi(self, conn, keys: list[str], data_iter) -> int: | ||
|
@@ -1014,7 +1024,7 @@ def _execute_insert_multi(self, conn, keys: list[str], data_iter) -> int: | |
|
||
data = [dict(zip(keys, row)) for row in data_iter] | ||
stmt = insert(self.table).values(data) | ||
result = conn.execute(stmt) | ||
result = self.pd_sql.execute(stmt) | ||
return result.rowcount | ||
|
||
def insert_data(self) -> tuple[list[str], list[np.ndarray]]: | ||
|
@@ -1480,7 +1490,7 @@ def to_sql( | |
self, | ||
frame, | ||
name: str, | ||
if_exists: Literal["fail", "replace", "append"] = "fail", | ||
if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail", | ||
index: bool = True, | ||
index_label=None, | ||
schema=None, | ||
|
@@ -1649,7 +1659,7 @@ def run_transaction(self): | |
else: | ||
yield self.con | ||
|
||
def execute(self, sql: str | Select | TextClause, params=None): | ||
def execute(self, sql: str | Select | TextClause | Delete, params=None): | ||
"""Simple passthrough to SQLAlchemy connectable""" | ||
from sqlalchemy.exc import SQLAlchemyError | ||
|
||
|
@@ -1874,7 +1884,7 @@ def prep_table( | |
self, | ||
frame, | ||
name: str, | ||
if_exists: Literal["fail", "replace", "append"] = "fail", | ||
if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail", | ||
index: bool | str | list[str] | None = True, | ||
index_label=None, | ||
schema=None, | ||
|
@@ -1951,7 +1961,7 @@ def to_sql( | |
self, | ||
frame, | ||
name: str, | ||
if_exists: Literal["fail", "replace", "append"] = "fail", | ||
if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail", | ||
index: bool = True, | ||
index_label=None, | ||
schema: str | None = None, | ||
|
@@ -1969,10 +1979,11 @@ def to_sql( | |
frame : DataFrame | ||
name : string | ||
Name of SQL table. | ||
if_exists : {'fail', 'replace', 'append'}, default 'fail' | ||
if_exists : {'fail', 'replace', 'append', 'delete_rows'}, default 'fail' | ||
- fail: If table exists, do nothing. | ||
- replace: If table exists, drop it, recreate it, and insert data. | ||
- append: If table exists, insert data. Create if does not exist. | ||
- delete_rows: If a table exists, delete all records and insert data. | ||
index : boolean, default True | ||
Write DataFrame index as a column. | ||
index_label : string or sequence, default None | ||
|
@@ -2069,6 +2080,16 @@ def drop_table(self, table_name: str, schema: str | None = None) -> None: | |
self.get_table(table_name, schema).drop(bind=self.con) | ||
self.meta.clear() | ||
|
||
def delete_rows(self, table_name: str, schema: str | None = None) -> None: | ||
schema = schema or self.meta.schema | ||
if self.has_table(table_name, schema): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So in the case the table does not exist we are just ignoring any instruction to perform a DELETE? I'm somewhat wary of assuming a user may not want an error here There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, me neither. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On second thought I think this is OK. It follows the same pattern as |
||
self.meta.reflect( | ||
bind=self.con, only=[table_name], schema=schema, views=True | ||
) | ||
table = self.get_table(table_name, schema) | ||
self.execute(table.delete()).close() | ||
self.meta.clear() | ||
|
||
def _create_sql_schema( | ||
self, | ||
frame: DataFrame, | ||
|
@@ -2304,7 +2325,7 @@ def to_sql( | |
self, | ||
frame, | ||
name: str, | ||
if_exists: Literal["fail", "replace", "append"] = "fail", | ||
if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail", | ||
index: bool = True, | ||
index_label=None, | ||
schema: str | None = None, | ||
|
@@ -2326,6 +2347,7 @@ def to_sql( | |
- fail: If table exists, do nothing. | ||
- replace: If table exists, drop it, recreate it, and insert data. | ||
- append: If table exists, insert data. Create if does not exist. | ||
- delete_rows: If a table exists, delete all records and insert data. | ||
index : boolean, default True | ||
Write DataFrame index as a column. | ||
index_label : string or sequence, default None | ||
|
@@ -2379,6 +2401,9 @@ def to_sql( | |
self.execute(sql_statement).close() | ||
elif if_exists == "append": | ||
mode = "append" | ||
elif if_exists == "delete_rows": | ||
mode = "append" | ||
self.delete_rows(name, schema) | ||
|
||
try: | ||
tbl = pa.Table.from_pandas(frame, preserve_index=index) | ||
|
@@ -2416,6 +2441,11 @@ def has_table(self, name: str, schema: str | None = None) -> bool: | |
|
||
return False | ||
|
||
def delete_rows(self, name: str, schema: str | None = None) -> None: | ||
table_name = f"{schema}.{name}" if schema else name | ||
if self.has_table(name, schema): | ||
self.execute(f"DELETE FROM {table_name}").close() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm still a bit unclear why we are calling There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is a good / recommended practice to close a cursor after usage because you release resources. I believe that's a fact we agree ? Ok...but putting it aside for a moment the answer is the adbc driver raises an error in case a cursor is not explicitly closed, causing some tests to fail. There's no such check in sqlalchemy / sqlite3, that's why a missing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So do sqlalchemy and sqlite3 just leak the cursor then? Or should they be adding this? I am just confused as to why we are offering potentially different cursor lifetime management across the implementations There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In this case pandas is leaking the cursor because sqlalchemy and sqlite3 do not provide a cool and friendly message alerting the developer. On the other hand it is standard practice to always close it. I'm in favor of adding Please let me know if I can help with more context There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Oh OK - I thought that previously you tried to We shouldn't be leaking resources across any of these implementations, but the challenge is that we also need to stay consistent in how we are managing lifecycles. If it is as simple as calling .close (or pref using a context manager) for all implementations, then let's do that. If calling .close on everything but ADBC is causing an error, then we need to align the cursor lifecycle management of the ADBC subclass with the others There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There you go @WillAyd :). Sorry for not sending it as a separate commit. I'm keeping a close eye on CI. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. All good. CI passed. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK great. And none of these offer a context manager right? And we don't want to be using I'm still a little unclear as to the difference in usage of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
sqlite3 implementation for a cursor does not provide a close object. This is one of the reasons I decided to standardize to
No worries, I don't think you're bikeshedding. It is a bit confusing for sure. Let me try to break it down:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK that's helpful. Sounds like |
||
|
||
def _create_sql_schema( | ||
self, | ||
frame: DataFrame, | ||
|
@@ -2790,10 +2820,11 @@ def to_sql( | |
frame: DataFrame | ||
name: string | ||
Name of SQL table. | ||
if_exists: {'fail', 'replace', 'append'}, default 'fail' | ||
if_exists: {'fail', 'replace', 'append', 'delete_rows'}, default 'fail' | ||
fail: If table exists, do nothing. | ||
replace: If table exists, drop it, recreate it, and insert data. | ||
append: If table exists, insert data. Create if it does not exist. | ||
delete_rows: If a table exists, delete all records and insert data. | ||
gmcrocetti marked this conversation as resolved.
Show resolved
Hide resolved
|
||
index : bool, default True | ||
Write DataFrame index as a column | ||
index_label : string or sequence, default None | ||
|
@@ -2867,7 +2898,12 @@ def get_table(self, table_name: str, schema: str | None = None) -> None: | |
|
||
def drop_table(self, name: str, schema: str | None = None) -> None: | ||
drop_sql = f"DROP TABLE {_get_valid_sqlite_name(name)}" | ||
self.execute(drop_sql) | ||
self.execute(drop_sql).close() | ||
|
||
def delete_rows(self, name: str, schema: str | None = None) -> None: | ||
delete_sql = f"DELETE FROM {_get_valid_sqlite_name(name)}" | ||
if self.has_table(name, schema): | ||
self.execute(delete_sql).close() | ||
|
||
def _create_sql_schema( | ||
self, | ||
|
Uh oh!
There was an error while loading. Please reload this page.