diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 133757b131312..089c4f59445e3 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -449,10 +449,7 @@ Other Enhancements - Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here <io.feather>`. - ``Series.str.replace()`` now accepts a callable, as replacement, which is passed to ``re.sub`` (:issue:`15055`) - ``Series.str.replace()`` now accepts a compiled regular expression as a pattern (:issue:`15446`) - - - ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`) - - ``DataFrame`` has gained a ``nunique()`` method to count the distinct values over an axis (:issue:`14336`). - ``DataFrame`` has gained a ``melt()`` method, equivalent to ``pd.melt()``, for unpivoting from a wide to long format (:issue:`12640`). - ``DataFrame.groupby()`` has gained a ``.nunique()`` method to count the distinct values for all columns within each group (:issue:`14336`, :issue:`15197`). @@ -1302,6 +1299,7 @@ Other API Changes - ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv()`` and will be removed in the future (:issue:`12665`) - ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`) - ``DataFrame.applymap()`` with an empty ``DataFrame`` will return a copy of the empty ``DataFrame`` instead of a ``Series`` (:issue:`8222`) +- ``Series.map()`` now respects default values of dictionary subclasses with a ``__missing__`` method, such as ``collections.Counter`` (:issue:`15999`) - ``.loc`` has compat with ``.ix`` for accepting iterators, and NamedTuples (:issue:`15120`) - ``interpolate()`` and ``fillna()`` will raise a ``ValueError`` if the ``limit`` keyword argument is not greater than 0. 
(:issue:`9217`) - ``pd.read_csv()`` will now issue a ``ParserWarning`` whenever there are conflicting values provided by the ``dialect`` parameter and the user (:issue:`14898`) diff --git a/pandas/core/series.py b/pandas/core/series.py index 3305f0b6c439e..7f8a97af99490 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2079,8 +2079,8 @@ def map(self, arg, na_action=None): two bar three baz - Mapping a dictionary keys on the index labels works similar as - with a `Series`: + If `arg` is a dictionary, return a new Series with values converted + according to the dictionary's mapping: >>> z = {1: 'A', 2: 'B', 3: 'C'} @@ -2094,16 +2094,14 @@ def map(self, arg, na_action=None): >>> s = pd.Series([1, 2, 3, np.nan]) - >>> s2 = s.map(lambda x: 'this is a string {}'.format(x), - na_action=None) + >>> s2 = s.map('this is a string {}'.format, na_action=None) 0 this is a string 1.0 1 this is a string 2.0 2 this is a string 3.0 3 this is a string nan dtype: object - >>> s3 = s.map(lambda x: 'this is a string {}'.format(x), - na_action='ignore') + >>> s3 = s.map('this is a string {}'.format, na_action='ignore') 0 this is a string 1.0 1 this is a string 2.0 2 this is a string 3.0 @@ -2115,6 +2113,23 @@ def map(self, arg, na_action=None): Series.apply: For applying more complex functions on a Series DataFrame.apply: Apply a function row-/column-wise DataFrame.applymap: Apply a function elementwise on a whole DataFrame + + Notes + ----- + When `arg` is a dictionary, values in Series that are not in the + dictionary (as keys) are converted to ``NaN``. However, if the + dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e. 
+ provides a method for default values), then this default is used + rather than ``NaN``: + + >>> from collections import Counter + >>> counter = Counter() + >>> counter['bar'] += 1 + >>> y.map(counter) + 1 0 + 2 1 + 3 0 + dtype: int64 """ if is_extension_type(self.dtype): @@ -2132,13 +2147,23 @@ def map_f(values, f): else: map_f = lib.map_infer - if isinstance(arg, (dict, Series)): - if isinstance(arg, dict): + if isinstance(arg, dict): + if hasattr(arg, '__missing__'): + # If a dictionary subclass defines a default value method, + # convert arg to a lookup function (GH #15999). + dict_with_default = arg + arg = lambda x: dict_with_default[x] + else: + # Dictionary does not have a default. Thus it's safe to + # convert to an indexed series for efficiency. arg = self._constructor(arg, index=arg.keys()) + if isinstance(arg, Series): + # arg is a Series indexer = arg.index.get_indexer(values) new_values = algorithms.take_1d(arg._values, indexer) else: + # arg is a function new_values = map_f(values, arg) return self._constructor(new_values, diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 524167602c249..a4a49e3aeb826 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -1,7 +1,7 @@ # coding=utf-8 # pylint: disable-msg=E1101,W0612 -from collections import OrderedDict +from collections import Counter, defaultdict, OrderedDict import numpy as np import pandas as pd @@ -411,6 +411,46 @@ def test_map_dict_with_tuple_keys(self): tm.assert_series_equal(df['labels'], df['expected_labels'], check_names=False) + def test_map_counter(self): + s = Series(['a', 'b', 'c'], index=[1, 2, 3]) + counter = Counter() + counter['b'] = 5 + counter['c'] += 1 + result = s.map(counter) + expected = Series([0, 5, 1], index=[1, 2, 3]) + assert_series_equal(result, expected) + + def test_map_defaultdict(self): + s = Series([1, 2, 3], index=['a', 'b', 'c']) + default_dict = defaultdict(lambda: 'blank') + default_dict[1] 
= 'stuff' + result = s.map(default_dict) + expected = Series(['stuff', 'blank', 'blank'], index=['a', 'b', 'c']) + assert_series_equal(result, expected) + + def test_map_dict_subclass_with_missing(self): + """ + Test Series.map with a dictionary subclass that defines __missing__, + i.e. sets a default value (GH #15999). + """ + class DictWithMissing(dict): + def __missing__(self, key): + return 'missing' + s = Series([1, 2, 3]) + dictionary = DictWithMissing({3: 'three'}) + result = s.map(dictionary) + expected = Series(['missing', 'missing', 'three']) + assert_series_equal(result, expected) + + def test_map_dict_subclass_without_missing(self): + class DictWithoutMissing(dict): + pass + s = Series([1, 2, 3]) + dictionary = DictWithoutMissing({3: 'three'}) + result = s.map(dictionary) + expected = Series([np.nan, np.nan, 'three']) + assert_series_equal(result, expected) + def test_map_box(self): vals = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')] s = pd.Series(vals)