Support dicts with default values in series.map (#16002)

dhimmel · jreback · commit 61d84dbf161f · 2017-04-15T13:46:43.000Z
* series.map: support dicts with defaults (closes #15999)
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -449,10 +449,7 @@ Other Enhancements
 - Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here <io.feather>`.
 - ``Series.str.replace()`` now accepts a callable, as replacement, which is passed to ``re.sub`` (:issue:`15055`)
 - ``Series.str.replace()`` now accepts a compiled regular expression as a pattern (:issue:`15446`)
-
-
 - ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`)
-
 - ``DataFrame`` has gained a ``nunique()`` method to count the distinct values over an axis (:issue:`14336`).
 - ``DataFrame`` has gained a ``melt()`` method, equivalent to ``pd.melt()``, for unpivoting from a wide to long format (:issue:`12640`).
 - ``DataFrame.groupby()`` has gained a ``.nunique()`` method to count the distinct values for all columns within each group (:issue:`14336`, :issue:`15197`).
@@ -1302,6 +1299,7 @@ Other API Changes
 - ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv()`` and will be removed in the future (:issue:`12665`)
 - ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`)
 - ``DataFrame.applymap()`` with an empty ``DataFrame`` will return a copy of the empty ``DataFrame`` instead of a ``Series`` (:issue:`8222`)
+- ``Series.map()`` now respects default values of dictionary subclasses with a ``__missing__`` method, such as ``collections.Counter`` (:issue:`15999`)
 - ``.loc`` has compat with ``.ix`` for accepting iterators, and NamedTuples (:issue:`15120`)
 - ``interpolate()`` and ``fillna()`` will raise a ``ValueError`` if the ``limit`` keyword argument is not greater than 0. (:issue:`9217`)
 - ``pd.read_csv()`` will now issue a ``ParserWarning`` whenever there are conflicting values provided by the ``dialect`` parameter and the user (:issue:`14898`)
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -2079,8 +2079,8 @@ def map(self, arg, na_action=None):
         two   bar
         three baz
 
-        Mapping a dictionary keys on the index labels works similar as
-        with a `Series`:
+        If `arg` is a dictionary, return a new Series with values converted
+        according to the dictionary's mapping:
 
         >>> z = {1: 'A', 2: 'B', 3: 'C'}
 
@@ -2094,16 +2094,14 @@ def map(self, arg, na_action=None):
 
         >>> s = pd.Series([1, 2, 3, np.nan])
 
-        >>> s2 = s.map(lambda x: 'this is a string {}'.format(x),
-                       na_action=None)
+        >>> s.map('this is a string {}'.format, na_action=None)
         0    this is a string 1.0
         1    this is a string 2.0
         2    this is a string 3.0
         3    this is a string nan
         dtype: object
 
-        >>> s3 = s.map(lambda x: 'this is a string {}'.format(x),
-                       na_action='ignore')
+        >>> s.map('this is a string {}'.format, na_action='ignore')
         0    this is a string 1.0
         1    this is a string 2.0
         2    this is a string 3.0
@@ -2115,6 +2113,23 @@ def map(self, arg, na_action=None):
         Series.apply: For applying more complex functions on a Series
         DataFrame.apply: Apply a function row-/column-wise
         DataFrame.applymap: Apply a function elementwise on a whole DataFrame
+
+        Notes
+        -----
+        When `arg` is a dictionary, values in the Series that are not keys of
+        the dictionary are converted to ``NaN``. However, if the dictionary
+        is a ``dict`` subclass that defines ``__missing__`` (i.e. provides a
+        method for default values), then that default is used rather than
+        ``NaN``:
+
+        >>> from collections import Counter
+        >>> counter = Counter()
+        >>> counter['bar'] += 1
+        >>> y.map(counter)
+        1    0
+        2    1
+        3    0
+        dtype: int64
         """
 
         if is_extension_type(self.dtype):
@@ -2132,13 +2147,23 @@ def map_f(values, f):
             else:
                 map_f = lib.map_infer
 
-        if isinstance(arg, (dict, Series)):
-            if isinstance(arg, dict):
+        if isinstance(arg, dict):
+            if hasattr(arg, '__missing__'):
+                # If a dictionary subclass defines a default value method,
+                # convert arg to a lookup function (GH #15999).
+                dict_with_default = arg
+                arg = lambda x: dict_with_default[x]
+            else:
+                # Dictionary does not have a default. Thus it's safe to
+                # convert to an indexed series for efficiency.
                 arg = self._constructor(arg, index=arg.keys())
 
+        if isinstance(arg, Series):
+            # arg is a Series
             indexer = arg.index.get_indexer(values)
             new_values = algorithms.take_1d(arg._values, indexer)
         else:
+            # arg is a function
             new_values = map_f(values, arg)
 
         return self._constructor(new_values,
diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py
@@ -1,7 +1,7 @@
 # coding=utf-8
 # pylint: disable-msg=E1101,W0612
 
-from collections import OrderedDict
+from collections import Counter, defaultdict, OrderedDict
 import numpy as np
 import pandas as pd
 
@@ -411,6 +411,46 @@ def test_map_dict_with_tuple_keys(self):
         tm.assert_series_equal(df['labels'], df['expected_labels'],
                                check_names=False)
 
+    def test_map_counter(self):
+        s = Series(['a', 'b', 'c'], index=[1, 2, 3])
+        counter = Counter()
+        counter['b'] = 5
+        counter['c'] += 1
+        result = s.map(counter)
+        expected = Series([0, 5, 1], index=[1, 2, 3])
+        assert_series_equal(result, expected)
+
+    def test_map_defaultdict(self):
+        s = Series([1, 2, 3], index=['a', 'b', 'c'])
+        default_dict = defaultdict(lambda: 'blank')
+        default_dict[1] = 'stuff'
+        result = s.map(default_dict)
+        expected = Series(['stuff', 'blank', 'blank'], index=['a', 'b', 'c'])
+        assert_series_equal(result, expected)
+
+    def test_map_dict_subclass_with_missing(self):
+        """
+        Test Series.map with a dictionary subclass that defines __missing__,
+        i.e. sets a default value (GH #15999).
+        """
+        class DictWithMissing(dict):
+            def __missing__(self, key):
+                return 'missing'
+        s = Series([1, 2, 3])
+        dictionary = DictWithMissing({3: 'three'})
+        result = s.map(dictionary)
+        expected = Series(['missing', 'missing', 'three'])
+        assert_series_equal(result, expected)
+
+    def test_map_dict_subclass_without_missing(self):
+        class DictWithoutMissing(dict):
+            pass
+        s = Series([1, 2, 3])
+        dictionary = DictWithoutMissing({3: 'three'})
+        result = s.map(dictionary)
+        expected = Series([np.nan, np.nan, 'three'])
+        assert_series_equal(result, expected)
+
     def test_map_box(self):
         vals = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')]
         s = pd.Series(vals)