From 669adb9e28045812ba28986d3973db0c41e1e561 Mon Sep 17 00:00:00 2001 From: Keita ODA Date: Mon, 25 Sep 2023 20:52:22 +0900 Subject: [PATCH 1/4] Add remove_from_default_na options to read_csv, read_excel... --- pandas/io/parsers/readers.py | 17 ++++++++++++----- pandas/tests/io/parser/test_na_values.py | 20 ++++++++++++++++++++ 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index acf35ebd6afe5..9012d93e644c4 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1740,7 +1740,8 @@ def _clean_options( # Converting values to NA keep_default_na = options["keep_default_na"] - na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) + remove_from_default_na = options["remove_from_default_na"] + na_values, na_fvalues = _clean_na_values(na_values, remove_from_default_na, keep_default_na) # handle skiprows; this is internally handled by the # c-engine, so only need for python and pyarrow parsers @@ -1952,11 +1953,17 @@ def TextParser(*args, **kwds) -> TextFileReader: return TextFileReader(*args, **kwds) -def _clean_na_values(na_values, keep_default_na: bool = True): +def _clean_na_values(na_values, remove_from_default_na, keep_default_na: bool = True): na_fvalues: set | dict + if remove_from_default_na is None: + remove_from_default_na = set() + elif not is_list_like(remove_from_default_na): + remove_from_default_na = set([remove_from_default_na]) + else: + remove_from_default_na = set(remove_from_default_na) if na_values is None: if keep_default_na: - na_values = STR_NA_VALUES + na_values = STR_NA_VALUES - remove_from_default_na else: na_values = set() na_fvalues = set() @@ -1973,7 +1980,7 @@ def _clean_na_values(na_values, keep_default_na: bool = True): v = [v] if keep_default_na: - v = set(v) | STR_NA_VALUES + v = set(v) | (STR_NA_VALUES - remove_from_default_na) na_values[k] = v na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()} @@ 
-1982,7 +1989,7 @@ def _clean_na_values(na_values, keep_default_na: bool = True): na_values = [na_values] na_values = _stringify_na_values(na_values) if keep_default_na: - na_values = na_values | STR_NA_VALUES + na_values = na_values | (STR_NA_VALUES - remove_from_default_na) na_fvalues = _floatify_na_values(na_values) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 9a16ec5a50d36..71034d13c961d 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -277,6 +277,26 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): } ), ), + ( + {"remove_from_default_na": ["nan"]}, + DataFrame( + { + "A": ["a", "b", np.nan, "d", "e", "nan", "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["one", "two", "three", "nan", "five", np.nan, "seven"], + } + ), + ), + ( + {"na_values" ["nan"], "remove_from_default_na": ["nan"]}, + DataFrame( + { + "A": ["a", "b", np.nan, "d", "e", np.nan, "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"], + } + ), + ), ], ) def test_na_values_keep_default(all_parsers, kwargs, expected): From 8fe299c5bf52c2e747429d4e33c69cb82a3dbaeb Mon Sep 17 00:00:00 2001 From: Keita ODA Date: Mon, 25 Sep 2023 21:44:28 +0900 Subject: [PATCH 2/4] Nit: Fix typo and lint. 
--- pandas/io/parsers/readers.py | 4 +++- pandas/tests/io/parser/test_na_values.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 9012d93e644c4..953123c6592a1 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1741,7 +1741,9 @@ def _clean_options( # Converting values to NA keep_default_na = options["keep_default_na"] remove_from_default_na = options["remove_from_default_na"] - na_values, na_fvalues = _clean_na_values(na_values, remove_from_default_na, keep_default_na) + na_values, na_fvalues = _clean_na_values( + na_values, remove_from_default_na, keep_default_na + ) # handle skiprows; this is internally handled by the # c-engine, so only need for python and pyarrow parsers diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 71034d13c961d..9828e55c66520 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -288,7 +288,7 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): ), ), ( - {"na_values" ["nan"], "remove_from_default_na": ["nan"]}, + {"na_values": ["nan"], "remove_from_default_na": ["nan"]}, DataFrame( { "A": ["a", "b", np.nan, "d", "e", np.nan, "g"], From 1733f4bb8b0ccebe12b35f846e76183835f7290d Mon Sep 17 00:00:00 2001 From: Keita ODA Date: Mon, 25 Sep 2023 22:47:44 +0900 Subject: [PATCH 3/4] Fix docstring. --- pandas/io/parsers/readers.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 953123c6592a1..da69b38632e4a 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -211,6 +211,9 @@ + fill('", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") + """ ". 
+remove_from_default_na : : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, optional + Remove values from the default ``NaN`` values when parsing the data. + keep_default_na : bool, default True Whether or not to include the default ``NaN`` values when parsing the data. Depending on whether ``na_values`` is passed in, the behavior is as follows: @@ -718,6 +721,10 @@ def read_csv( | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None = ..., + remove_from_default_na: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -781,6 +788,10 @@ def read_csv( | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None = ..., + remove_from_default_na: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -844,6 +855,10 @@ def read_csv( | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None = ..., + remove_from_default_na: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -920,6 +935,10 @@ def read_csv( | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None = None, + remove_from_default_na: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = ..., keep_default_na: bool = True, na_filter: bool = True, verbose: bool = False, @@ -1013,6 +1032,7 @@ def read_table( skipfooter: int = ..., nrows: int | None = ..., na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + remove_from_default_na: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -1073,6 +1093,7 @@ def read_table( skipfooter: int = ..., nrows: int | None = ..., na_values: Sequence[str] | 
Mapping[str, Sequence[str]] | None = ..., + remove_from_default_na: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -1133,6 +1154,7 @@ def read_table( skipfooter: int = ..., nrows: int | None = ..., na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + remove_from_default_na: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -1193,6 +1215,7 @@ def read_table( skipfooter: int = ..., nrows: int | None = ..., na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + remove_from_default_na: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -1268,6 +1291,7 @@ def read_table( nrows: int | None = None, # NA and Missing Data Handling na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = None, + remove_from_default_na: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., keep_default_na: bool = True, na_filter: bool = True, verbose: bool = False, @@ -1919,6 +1943,8 @@ def TextParser(*args, **kwds) -> TextFileReader: not in the header. na_values : scalar, str, list-like, or dict, optional Additional strings to recognize as NA/NaN. + remove_from_default_na : scalar, str, list-like, or dict, optional + Strings not to recognize as NA/NaN. keep_default_na : bool, default True thousands : str, optional Thousands separator From 94a9a189784b82ddef13ccdd1f7d9208ba4f3c13 Mon Sep 17 00:00:00 2001 From: Keita ODA Date: Mon, 25 Sep 2023 22:55:57 +0900 Subject: [PATCH 4/4] Nit: Fix lint. 
--- pandas/io/parsers/readers.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index da69b38632e4a..27c1c840eb12f 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -211,7 +211,7 @@ + fill('", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") + """ ". -remove_from_default_na : : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, optional +remove_from_default_na : Hashable or Iterable of Hashable, optional Remove values from the default ``NaN`` values when parsing the data. keep_default_na : bool, default True @@ -723,7 +723,6 @@ def read_csv( | None = ..., remove_from_default_na: Hashable | Iterable[Hashable] - | Mapping[Hashable, Iterable[Hashable]] | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., @@ -790,7 +789,6 @@ def read_csv( | None = ..., remove_from_default_na: Hashable | Iterable[Hashable] - | Mapping[Hashable, Iterable[Hashable]] | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., @@ -857,7 +855,6 @@ def read_csv( | None = ..., remove_from_default_na: Hashable | Iterable[Hashable] - | Mapping[Hashable, Iterable[Hashable]] | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., @@ -937,7 +934,6 @@ def read_csv( | None = None, remove_from_default_na: Hashable | Iterable[Hashable] - | Mapping[Hashable, Iterable[Hashable]] | None = ..., keep_default_na: bool = True, na_filter: bool = True, @@ -1032,7 +1028,7 @@ def read_table( skipfooter: int = ..., nrows: int | None = ..., na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., - remove_from_default_na: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + remove_from_default_na: Sequence[str] | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -1093,7 +1089,7 @@ def read_table( skipfooter: int = ..., nrows: int | None = ..., na_values: Sequence[str] | 
Mapping[str, Sequence[str]] | None = ..., - remove_from_default_na: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + remove_from_default_na: Sequence[str] | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -1154,7 +1150,7 @@ def read_table( skipfooter: int = ..., nrows: int | None = ..., na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., - remove_from_default_na: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + remove_from_default_na: Sequence[str] | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -1215,7 +1211,7 @@ def read_table( skipfooter: int = ..., nrows: int | None = ..., na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., - remove_from_default_na: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + remove_from_default_na: Sequence[str] | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -1291,7 +1287,7 @@ def read_table( nrows: int | None = None, # NA and Missing Data Handling na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = None, - remove_from_default_na: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + remove_from_default_na: Sequence[str] | None = ..., keep_default_na: bool = True, na_filter: bool = True, verbose: bool = False,