From f6bdff665f6732d677aefe955b9c56fc4d3dd43a Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Sat, 14 Mar 2020 16:56:46 +0100 Subject: [PATCH 01/14] Add errors argument to to_csv() call to enable error handling for encoders --- pandas/core/generic.py | 6 ++++++ pandas/io/common.py | 5 +++-- pandas/io/formats/csvs.py | 4 ++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0260f30b9e7e2..4a556cd965df9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3039,6 +3039,7 @@ def to_csv( index_label: Optional[Union[bool_t, str, Sequence[Label]]] = None, mode: str = "w", encoding: Optional[str] = None, + errors: str = "strict", compression: Optional[Union[str, Mapping[str, str]]] = "infer", quoting: Optional[int] = None, quotechar: str = '"', @@ -3092,6 +3093,10 @@ def to_csv( for easier importing in R. mode : str Python write mode, default 'w'. + errors : str, default 'strict' + Specifies how encoding and decoding errors are to be handled. + See the errors argument for :func:`open` for a full list + of options. encoding : str, optional A string representing the encoding to use in the output file, defaults to 'utf-8'. @@ -3179,6 +3184,7 @@ def to_csv( line_terminator=line_terminator, sep=sep, encoding=encoding, + errors=errors, compression=compression, quoting=quoting, na_rep=na_rep, diff --git a/pandas/io/common.py b/pandas/io/common.py index 8349acafca1e3..318ac503289a3 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -349,6 +349,7 @@ def get_handle( path_or_buf, mode: str, encoding=None, + errors=None, compression: Optional[Union[str, Mapping[str, Any]]] = None, memory_map: bool = False, is_text: bool = True, @@ -475,7 +476,7 @@ def get_handle( elif is_path: if encoding: # Encoding - f = open(path_or_buf, mode, encoding=encoding, newline="") + f = open(path_or_buf, mode, encoding=encoding, errors=errors, newline="") elif is_text: # No explicit encoding f = open(path_or_buf, mode, errors="replace", newline="") @@ -488,7 +489,7 @@ def get_handle( if is_text and (compression or isinstance(f, need_text_wrapping)): from io import TextIOWrapper - g = TextIOWrapper(f, encoding=encoding, newline="") + g = TextIOWrapper(f, encoding=encoding, errors=errors, newline="") if not isinstance(f, (BufferedIOBase, RawIOBase)): handles.append(g) f = g diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index dcd764bec7426..5bd51dc8351f6 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -44,6 +44,7 @@ def __init__( index_label: Optional[Union[bool, Hashable, Sequence[Hashable]]] = None, mode: str = "w", encoding: Optional[str] = None, + errors: str = "strict", compression: Union[str, Mapping[str, str], None] = "infer", quoting: Optional[int] = None, line_terminator="\n", @@ -77,6 +78,7 @@ def __init__( if encoding is None: encoding = "utf-8" self.encoding = encoding + self.errors = errors self.compression = infer_compression(self.path_or_buf, compression) if quoting is None: @@ -184,6 +186,7 @@ def save(self) -> None: self.path_or_buf, self.mode, encoding=self.encoding, + errors=self.errors, compression=dict(self.compression_args, method=self.compression), ) close = True @@ -215,6 +218,7 @@ def save(self) -> None: self.path_or_buf, self.mode, encoding=self.encoding, + errors=self.errors, compression=compression, ) f.write(buf) From 33d7e1abd582489a3664a8fdb1a0aeeb4abe4340 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Sat, 14 Mar 2020 18:31:56 +0100 Subject: [PATCH 02/14] Fix doc string validation --- pandas/core/generic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4a556cd965df9..91f28cd2aa104 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3093,13 +3093,13 @@ def to_csv( for easier importing in R. mode : str Python write mode, default 'w'. + encoding : str, optional + A string representing the encoding to use in the output file, + defaults to 'utf-8'. errors : str, default 'strict' Specifies how encoding and decoding errors are to be handled. See the errors argument for :func:`open` for a full list of options. - encoding : str, optional - A string representing the encoding to use in the output file, - defaults to 'utf-8'. compression : str or dict, default 'infer' If str, represents compression mode. If dict, value at 'method' is the compression mode. Compression mode may be any of the following From e7b76067156be593116b220fdaea2c496d76dcd8 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Sun, 15 Mar 2020 15:04:17 +0100 Subject: [PATCH 03/14] Add docstring for errors argument --- pandas/io/common.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/io/common.py b/pandas/io/common.py index 318ac503289a3..9b2efb1632f6a 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -365,6 +365,10 @@ def get_handle( Mode to open path_or_buf with. encoding : str or None Encoding to use. + errors : str, default 'strict' + Specifies how encoding and decoding errors are to be handled. + See the errors argument for :func:`open` for a full list + of options. compression : str or dict, default None If string, specifies compression mode. If dict, value at key 'method' specifies compression mode. Compression mode must be one of {'infer', From 7c03df0d9c7b0f0317546ba14f994c3f5ea0f8ef Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Sun, 15 Mar 2020 15:19:21 +0100 Subject: [PATCH 04/14] Move errors keyword argument after encoding keyword argument (similar to open()) --- pandas/core/generic.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 91f28cd2aa104..ed82aba0e0cb6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2367,8 +2367,9 @@ def to_hdf( nan_rep=None, dropna: Optional[bool_t] = None, data_columns: Optional[Union[bool_t, List[str]]] = None, - errors: str = "strict", + data_columns: Optional[List[str]] = None, encoding: str = "UTF-8", + errors: str = "strict", ) -> None: """ Write the contained data to an HDF5 file using HDFStore. @@ -2420,11 +2421,11 @@ def to_hdf( like searching / selecting subsets of the data. - If None, pd.get_option('io.hdf.default_format') is checked, followed by fallback to "fixed" + encoding : str, default "UTF-8" errors : str, default 'strict' Specifies how encoding and decoding errors are to be handled. See the errors argument for :func:`open` for a full list of options. - encoding : str, default "UTF-8" min_itemsize : dict or int, optional Map column names to minimum string sizes for columns. nan_rep : Any, optional From 03105b50778b2b15b9612393c6ed711d2d806dd6 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Sun, 15 Mar 2020 15:53:13 +0100 Subject: [PATCH 05/14] Add a test for the errors argument in to_csv() --- pandas/tests/io/formats/test_to_csv.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index b3ee8da52dece..73c2e6de8b1c2 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -597,3 +597,13 @@ def test_na_rep_truncated(self): result = pd.Series([1.1, 2.2]).to_csv(na_rep=".") expected = tm.convert_rows_list_to_csv_str([",0", "0,1.1", "1,2.2"]) assert result == expected + + @pytest.mark.parametrize("errors", ["surrogatepass", "ignore", "replace"]) + def test_to_csv_errors(self, errors): + # GH 22610 + data = ['\ud800foo'] + ser = pd.Series(data, index=pd.Index(data)) + with tm.ensure_clean("test.csv") as path: + ser.to_csv(path, errors=errors) + # No use in reading back the data as it is not the same anymore + # due to the error handling From 0c71fa6410c5248c45ff739ac65209a2cd5790dc Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Sun, 15 Mar 2020 15:57:51 +0100 Subject: [PATCH 06/14] Add whatsnew entry --- doc/source/whatsnew/v1.1.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index bc0214a03ec55..b3e30b2493263 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -288,6 +288,8 @@ Other enhancements - :meth:`HDFStore.put` now accepts `track_times` parameter. Parameter is passed to ``create_table`` method of ``PyTables`` (:issue:`32682`). - Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`) - Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). +- :meth:`to_csv` now accepts an ``error`` argument (:issue:`22610`) +- .. --------------------------------------------------------------------------- From 8d943e4390a92a39be2ece1ca4cb6718236bfb01 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Sun, 15 Mar 2020 16:20:41 +0100 Subject: [PATCH 07/14] Reformatting by black --- pandas/tests/io/formats/test_to_csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 73c2e6de8b1c2..4c86e3a16b135 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -601,7 +601,7 @@ def test_na_rep_truncated(self): @pytest.mark.parametrize("errors", ["surrogatepass", "ignore", "replace"]) def test_to_csv_errors(self, errors): # GH 22610 - data = ['\ud800foo'] + data = ["\ud800foo"] ser = pd.Series(data, index=pd.Index(data)) with tm.ensure_clean("test.csv") as path: ser.to_csv(path, errors=errors) From a9eb4e0cae0890820f185ebe5ad4ae4d0d375736 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Mon, 16 Mar 2020 21:13:50 +0100 Subject: [PATCH 08/14] Fix whatsnew entry --- doc/source/whatsnew/v1.1.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index b3e30b2493263..ac794095df28b 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -289,7 +289,8 @@ Other enhancements - Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`) - Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). - :meth:`to_csv` now accepts an ``error`` argument (:issue:`22610`) -- +- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``error`` argument (:issue:`22610`) + .. --------------------------------------------------------------------------- From 0ba9d141284ff54b3c1991874ae6d1eaa8cdecbb Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Mon, 16 Mar 2020 21:21:40 +0100 Subject: [PATCH 09/14] Add versionadded tag to error argument --- pandas/core/generic.py | 3 +++ pandas/io/common.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ed82aba0e0cb6..db6be96ae78b8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2426,6 +2426,9 @@ def to_hdf( Specifies how encoding and decoding errors are to be handled. See the errors argument for :func:`open` for a full list of options. + + .. versionadded:: 1.1.0 + min_itemsize : dict or int, optional Map column names to minimum string sizes for columns. nan_rep : Any, optional diff --git a/pandas/io/common.py b/pandas/io/common.py index 9b2efb1632f6a..2b5f1578b6f8a 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -369,6 +369,9 @@ def get_handle( Specifies how encoding and decoding errors are to be handled. See the errors argument for :func:`open` for a full list of options. + + .. versionadded:: 1.1.0 + compression : str or dict, default None If string, specifies compression mode. If dict, value at key 'method' specifies compression mode. Compression mode must be one of {'infer', From 6fba37dd0b4d771f1bcd860967f8444bd8d53067 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Tue, 17 Mar 2020 22:13:52 +0100 Subject: [PATCH 10/14] Revert change --- pandas/core/generic.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index db6be96ae78b8..7b86f30ea7ee1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2368,8 +2368,8 @@ def to_hdf( dropna: Optional[bool_t] = None, data_columns: Optional[Union[bool_t, List[str]]] = None, data_columns: Optional[List[str]] = None, - encoding: str = "UTF-8", errors: str = "strict", + encoding: str = "UTF-8", ) -> None: """ Write the contained data to an HDF5 file using HDFStore. @@ -2421,14 +2421,11 @@ def to_hdf( like searching / selecting subsets of the data. - If None, pd.get_option('io.hdf.default_format') is checked, followed by fallback to "fixed" - encoding : str, default "UTF-8" errors : str, default 'strict' Specifies how encoding and decoding errors are to be handled. See the errors argument for :func:`open` for a full list of options. - - .. versionadded:: 1.1.0 - + encoding : str, default "UTF-8" min_itemsize : dict or int, optional Map column names to minimum string sizes for columns. nan_rep : Any, optional From 149ed352a5bb61655d8308af7eb72f6d357fd81c Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Wed, 18 Mar 2020 20:47:14 +0100 Subject: [PATCH 11/14] Fix versionadded tag 3rd try --- pandas/core/generic.py | 2 ++ pandas/io/common.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7b86f30ea7ee1..b3c087241d174 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3101,6 +3101,8 @@ def to_csv( Specifies how encoding and decoding errors are to be handled. See the errors argument for :func:`open` for a full list of options. + + .. versionadded:: 1.1.0 compression : str or dict, default 'infer' If str, represents compression mode. If dict, value at 'method' is the compression mode. Compression mode may be any of the following diff --git a/pandas/io/common.py b/pandas/io/common.py index 2b5f1578b6f8a..d94e140ba860e 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -371,7 +371,6 @@ def get_handle( of options. .. versionadded:: 1.1.0 - compression : str or dict, default None If string, specifies compression mode. If dict, value at key 'method' specifies compression mode. Compression mode must be one of {'infer', From 1030f6cdd11bf9540853056d94b3e67c1dcf5031 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Thu, 19 Mar 2020 20:11:33 +0100 Subject: [PATCH 12/14] Fix whatsnew entry: error -> errors --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index ac794095df28b..d89a3015c64c7 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -290,6 +290,7 @@ Other enhancements - Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). - :meth:`to_csv` now accepts an ``error`` argument (:issue:`22610`) - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``error`` argument (:issue:`22610`) +- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``errors`` argument (:issue:`22610`) .. --------------------------------------------------------------------------- From 55e6dbf62f2dcfc110d43fbae960508b0f0fc1b2 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Wed, 25 Mar 2020 21:53:23 +0100 Subject: [PATCH 13/14] Move new errors argument to the end --- pandas/core/generic.py | 14 +++++++------- pandas/io/common.py | 14 +++++++------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b3c087241d174..7347f4b54cef6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3040,7 +3040,6 @@ def to_csv( index_label: Optional[Union[bool_t, str, Sequence[Label]]] = None, mode: str = "w", encoding: Optional[str] = None, - errors: str = "strict", compression: Optional[Union[str, Mapping[str, str]]] = "infer", quoting: Optional[int] = None, quotechar: str = '"', @@ -3050,6 +3049,7 @@ def to_csv( doublequote: bool_t = True, escapechar: Optional[str] = None, decimal: Optional[str] = ".", + errors: str = "strict", ) -> Optional[str]: r""" Write object to a comma-separated values (csv) file. @@ -3097,12 +3097,6 @@ def to_csv( encoding : str, optional A string representing the encoding to use in the output file, defaults to 'utf-8'. - errors : str, default 'strict' - Specifies how encoding and decoding errors are to be handled. - See the errors argument for :func:`open` for a full list - of options. - - .. versionadded:: 1.1.0 compression : str or dict, default 'infer' If str, represents compression mode. If dict, value at 'method' is the compression mode. Compression mode may be any of the following @@ -3150,6 +3144,12 @@ def to_csv( decimal : str, default '.' Character recognized as decimal separator. E.g. use ',' for European data. + errors : str, default 'strict' + Specifies how encoding and decoding errors are to be handled. + See the errors argument for :func:`open` for a full list + of options. + + .. versionadded:: 1.1.0 Returns ------- diff --git a/pandas/io/common.py b/pandas/io/common.py index d94e140ba860e..055f84970e916 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -349,10 +349,10 @@ def get_handle( path_or_buf, mode: str, encoding=None, - errors=None, compression: Optional[Union[str, Mapping[str, Any]]] = None, memory_map: bool = False, is_text: bool = True, + errors=None, ): """ Get file handle for given path/buffer and mode. @@ -365,12 +365,6 @@ def get_handle( Mode to open path_or_buf with. encoding : str or None Encoding to use. - errors : str, default 'strict' - Specifies how encoding and decoding errors are to be handled. - See the errors argument for :func:`open` for a full list - of options. - - .. versionadded:: 1.1.0 compression : str or dict, default None If string, specifies compression mode. If dict, value at key 'method' specifies compression mode. Compression mode must be one of {'infer', @@ -397,6 +391,12 @@ def get_handle( is_text : boolean, default True whether file/buffer is in text format (csv, json, etc.), or in binary mode (pickle, etc.). + errors : str, default 'strict' + Specifies how encoding and decoding errors are to be handled. + See the errors argument for :func:`open` for a full list + of options. + + .. versionadded:: 1.1.0 Returns ------- From 6fa2290c2d72ccdb978299de3f8c24e40d80d7e5 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Thu, 28 May 2020 23:38:38 +0200 Subject: [PATCH 14/14] Fix rebase issues --- doc/source/whatsnew/v1.1.0.rst | 3 --- pandas/core/generic.py | 1 - 2 files changed, 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index d89a3015c64c7..eaed49220de14 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -288,11 +288,8 @@ Other enhancements - :meth:`HDFStore.put` now accepts `track_times` parameter. Parameter is passed to ``create_table`` method of ``PyTables`` (:issue:`32682`). - Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`) - Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). -- :meth:`to_csv` now accepts an ``error`` argument (:issue:`22610`) -- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``error`` argument (:issue:`22610`) - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``errors`` argument (:issue:`22610`) - .. --------------------------------------------------------------------------- Increased minimum versions for dependencies diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7347f4b54cef6..65b05e0ec71d5 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2367,7 +2367,6 @@ def to_hdf( nan_rep=None, dropna: Optional[bool_t] = None, data_columns: Optional[Union[bool_t, List[str]]] = None, - data_columns: Optional[List[str]] = None, errors: str = "strict", encoding: str = "UTF-8", ) -> None: