Skip to content

Commit 61d84db

Browse files
dhimmel authored and jreback committed
Support dicts with default values in series.map (#16002)
* series.map: support dicts with defaults (closes #15999)
1 parent 9c56098 commit 61d84db

File tree

3 files changed

+75
-12
lines changed

3 files changed

+75
-12
lines changed

doc/source/whatsnew/v0.20.0.txt

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -449,10 +449,7 @@ Other Enhancements
449449
- Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here <io.feather>`.
450450
- ``Series.str.replace()`` now accepts a callable, as replacement, which is passed to ``re.sub`` (:issue:`15055`)
451451
- ``Series.str.replace()`` now accepts a compiled regular expression as a pattern (:issue:`15446`)
452-
453-
454452
- ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`)
455-
456453
- ``DataFrame`` has gained a ``nunique()`` method to count the distinct values over an axis (:issue:`14336`).
457454
- ``DataFrame`` has gained a ``melt()`` method, equivalent to ``pd.melt()``, for unpivoting from a wide to long format (:issue:`12640`).
458455
- ``DataFrame.groupby()`` has gained a ``.nunique()`` method to count the distinct values for all columns within each group (:issue:`14336`, :issue:`15197`).
@@ -1302,6 +1299,7 @@ Other API Changes
13021299
- ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv()`` and will be removed in the future (:issue:`12665`)
13031300
- ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`)
13041301
- ``DataFrame.applymap()`` with an empty ``DataFrame`` will return a copy of the empty ``DataFrame`` instead of a ``Series`` (:issue:`8222`)
1302+
- ``Series.map()`` now respects default values of dictionary subclasses with a ``__missing__`` method, such as ``collections.Counter`` (:issue:`15999`)
13051303
- ``.loc`` has compat with ``.ix`` for accepting iterators, and NamedTuples (:issue:`15120`)
13061304
- ``interpolate()`` and ``fillna()`` will raise a ``ValueError`` if the ``limit`` keyword argument is not greater than 0. (:issue:`9217`)
13071305
- ``pd.read_csv()`` will now issue a ``ParserWarning`` whenever there are conflicting values provided by the ``dialect`` parameter and the user (:issue:`14898`)

pandas/core/series.py

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2079,8 +2079,8 @@ def map(self, arg, na_action=None):
20792079
two bar
20802080
three baz
20812081
2082-
Mapping a dictionary keys on the index labels works similar as
2083-
with a `Series`:
2082+
If `arg` is a dictionary, return a new Series with values converted
2083+
according to the dictionary's mapping:
20842084
20852085
>>> z = {1: 'A', 2: 'B', 3: 'C'}
20862086
@@ -2094,16 +2094,14 @@ def map(self, arg, na_action=None):
20942094
20952095
>>> s = pd.Series([1, 2, 3, np.nan])
20962096
2097-
>>> s2 = s.map(lambda x: 'this is a string {}'.format(x),
2098-
na_action=None)
2097+
>>> s2 = s.map('this is a string {}'.format, na_action=None)
20992098
0 this is a string 1.0
21002099
1 this is a string 2.0
21012100
2 this is a string 3.0
21022101
3 this is a string nan
21032102
dtype: object
21042103
2105-
>>> s3 = s.map(lambda x: 'this is a string {}'.format(x),
2106-
na_action='ignore')
2104+
>>> s3 = s.map('this is a string {}'.format, na_action='ignore')
21072105
0 this is a string 1.0
21082106
1 this is a string 2.0
21092107
2 this is a string 3.0
@@ -2115,6 +2113,23 @@ def map(self, arg, na_action=None):
21152113
Series.apply: For applying more complex functions on a Series
21162114
DataFrame.apply: Apply a function row-/column-wise
21172115
DataFrame.applymap: Apply a function elementwise on a whole DataFrame
2116+
2117+
Notes
2118+
-----
2119+
When `arg` is a dictionary, values in Series that are not in the
2120+
dictionary (as keys) are converted to ``NaN``. However, if the
2121+
dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e.
2122+
provides a method for default values), then this default is used
2123+
rather than ``NaN``:
2124+
2125+
>>> from collections import Counter
2126+
>>> counter = Counter()
2127+
>>> counter['bar'] += 1
2128+
>>> y.map(counter)
2129+
1 0
2130+
2 1
2131+
3 0
2132+
dtype: int64
21182133
"""
21192134

21202135
if is_extension_type(self.dtype):
@@ -2132,13 +2147,23 @@ def map_f(values, f):
21322147
else:
21332148
map_f = lib.map_infer
21342149

2135-
if isinstance(arg, (dict, Series)):
2136-
if isinstance(arg, dict):
2150+
if isinstance(arg, dict):
2151+
if hasattr(arg, '__missing__'):
2152+
# If a dictionary subclass defines a default value method,
2153+
# convert arg to a lookup function (GH #15999).
2154+
dict_with_default = arg
2155+
arg = lambda x: dict_with_default[x]
2156+
else:
2157+
# Dictionary does not have a default. Thus it's safe to
2158+
# convert to an indexed series for efficiency.
21372159
arg = self._constructor(arg, index=arg.keys())
21382160

2161+
if isinstance(arg, Series):
2162+
# arg is a Series
21392163
indexer = arg.index.get_indexer(values)
21402164
new_values = algorithms.take_1d(arg._values, indexer)
21412165
else:
2166+
# arg is a function
21422167
new_values = map_f(values, arg)
21432168

21442169
return self._constructor(new_values,

pandas/tests/series/test_apply.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# coding=utf-8
22
# pylint: disable-msg=E1101,W0612
33

4-
from collections import OrderedDict
4+
from collections import Counter, defaultdict, OrderedDict
55
import numpy as np
66
import pandas as pd
77

@@ -411,6 +411,46 @@ def test_map_dict_with_tuple_keys(self):
411411
tm.assert_series_equal(df['labels'], df['expected_labels'],
412412
check_names=False)
413413

414+
def test_map_counter(self):
415+
s = Series(['a', 'b', 'c'], index=[1, 2, 3])
416+
counter = Counter()
417+
counter['b'] = 5
418+
counter['c'] += 1
419+
result = s.map(counter)
420+
expected = Series([0, 5, 1], index=[1, 2, 3])
421+
assert_series_equal(result, expected)
422+
423+
def test_map_defaultdict(self):
424+
s = Series([1, 2, 3], index=['a', 'b', 'c'])
425+
default_dict = defaultdict(lambda: 'blank')
426+
default_dict[1] = 'stuff'
427+
result = s.map(default_dict)
428+
expected = Series(['stuff', 'blank', 'blank'], index=['a', 'b', 'c'])
429+
assert_series_equal(result, expected)
430+
431+
def test_map_dict_subclass_with_missing(self):
432+
"""
433+
Test Series.map with a dictionary subclass that defines __missing__,
434+
i.e. sets a default value (GH #15999).
435+
"""
436+
class DictWithMissing(dict):
437+
def __missing__(self, key):
438+
return 'missing'
439+
s = Series([1, 2, 3])
440+
dictionary = DictWithMissing({3: 'three'})
441+
result = s.map(dictionary)
442+
expected = Series(['missing', 'missing', 'three'])
443+
assert_series_equal(result, expected)
444+
445+
def test_map_dict_subclass_without_missing(self):
446+
class DictWithoutMissing(dict):
447+
pass
448+
s = Series([1, 2, 3])
449+
dictionary = DictWithoutMissing({3: 'three'})
450+
result = s.map(dictionary)
451+
expected = Series([np.nan, np.nan, 'three'])
452+
assert_series_equal(result, expected)
453+
414454
def test_map_box(self):
415455
vals = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')]
416456
s = pd.Series(vals)

0 commit comments

Comments
 (0)