diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index acf35ebd6afe5..27c1c840eb12f 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -211,6 +211,9 @@ + fill('", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") + """ ". +remove_from_default_na : Hashable or Iterable of Hashable, optional + Remove values from the default ``NaN`` values when parsing the data. + keep_default_na : bool, default True Whether or not to include the default ``NaN`` values when parsing the data. Depending on whether ``na_values`` is passed in, the behavior is as follows: @@ -718,6 +721,9 @@ def read_csv( | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None = ..., + remove_from_default_na: Hashable + | Iterable[Hashable] + | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -781,6 +787,9 @@ def read_csv( | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None = ..., + remove_from_default_na: Hashable + | Iterable[Hashable] + | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -844,6 +853,9 @@ def read_csv( | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None = ..., + remove_from_default_na: Hashable + | Iterable[Hashable] + | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -920,6 +932,9 @@ def read_csv( | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None = None, + remove_from_default_na: Hashable + | Iterable[Hashable] + | None = ..., keep_default_na: bool = True, na_filter: bool = True, verbose: bool = False, @@ -1013,6 +1028,7 @@ def read_table( skipfooter: int = ..., nrows: int | None = ..., na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + remove_from_default_na: Sequence[str] | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -1073,6 +1089,7 @@ def read_table( skipfooter: int = ..., nrows: int 
| None = ..., na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + remove_from_default_na: Sequence[str] | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -1133,6 +1150,7 @@ def read_table( skipfooter: int = ..., nrows: int | None = ..., na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + remove_from_default_na: Sequence[str] | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -1193,6 +1211,7 @@ def read_table( skipfooter: int = ..., nrows: int | None = ..., na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + remove_from_default_na: Sequence[str] | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -1268,6 +1287,7 @@ def read_table( nrows: int | None = None, # NA and Missing Data Handling na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = None, + remove_from_default_na: Sequence[str] | None = ..., keep_default_na: bool = True, na_filter: bool = True, verbose: bool = False, @@ -1740,7 +1760,10 @@ def _clean_options( # Converting values to NA keep_default_na = options["keep_default_na"] - na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) + remove_from_default_na = options["remove_from_default_na"] + na_values, na_fvalues = _clean_na_values( + na_values, remove_from_default_na, keep_default_na + ) # handle skiprows; this is internally handled by the # c-engine, so only need for python and pyarrow parsers @@ -1916,6 +1939,8 @@ def TextParser(*args, **kwds) -> TextFileReader: not in the header. na_values : scalar, str, list-like, or dict, optional Additional strings to recognize as NA/NaN. + remove_from_default_na : scalar, str, or list-like, optional + Strings to remove from the default NA/NaN values. 
keep_default_na : bool, default True thousands : str, optional Thousands separator @@ -1952,11 +1977,17 @@ def TextParser(*args, **kwds) -> TextFileReader: return TextFileReader(*args, **kwds) -def _clean_na_values(na_values, keep_default_na: bool = True): +def _clean_na_values(na_values, remove_from_default_na, keep_default_na: bool = True): na_fvalues: set | dict + if remove_from_default_na is None: + remove_from_default_na = set() + elif not is_list_like(remove_from_default_na): + remove_from_default_na = set([remove_from_default_na]) + else: + remove_from_default_na = set(remove_from_default_na) if na_values is None: if keep_default_na: - na_values = STR_NA_VALUES + na_values = STR_NA_VALUES - remove_from_default_na else: na_values = set() na_fvalues = set() @@ -1973,7 +2004,7 @@ def _clean_na_values(na_values, keep_default_na: bool = True): v = [v] if keep_default_na: - v = set(v) | STR_NA_VALUES + v = set(v) | (STR_NA_VALUES - remove_from_default_na) na_values[k] = v na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()} @@ -1982,7 +2013,7 @@ def _clean_na_values(na_values, keep_default_na: bool = True): na_values = [na_values] na_values = _stringify_na_values(na_values) if keep_default_na: - na_values = na_values | STR_NA_VALUES + na_values = na_values | (STR_NA_VALUES - remove_from_default_na) na_fvalues = _floatify_na_values(na_values) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 9a16ec5a50d36..9828e55c66520 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -277,6 +277,26 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): } ), ), + ( + {"remove_from_default_na": ["nan"]}, + DataFrame( + { + "A": ["a", "b", np.nan, "d", "e", "nan", "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["one", "two", "three", "nan", "five", np.nan, "seven"], + } + ), + ), + ( + {"na_values": ["nan"], "remove_from_default_na": ["nan"]}, + 
DataFrame( + { + "A": ["a", "b", np.nan, "d", "e", np.nan, "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"], + } + ), + ), ], ) def test_na_values_keep_default(all_parsers, kwargs, expected):