From e8f9d2709da1b3237cffb8e6b254bc136160e752 Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Fri, 14 Apr 2017 16:01:49 -0400 Subject: [PATCH 01/11] TST: test series.map on Counter & defaultdict --- pandas/tests/series/test_apply.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 524167602c249..87c8ff01a4f0e 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -1,7 +1,7 @@ # coding=utf-8 # pylint: disable-msg=E1101,W0612 -from collections import OrderedDict +from collections import Counter, defaultdict, OrderedDict import numpy as np import pandas as pd @@ -411,6 +411,23 @@ def test_map_dict_with_tuple_keys(self): tm.assert_series_equal(df['labels'], df['expected_labels'], check_names=False) + def test_map_counter(self): + s = Series(['a', 'b', 'c'], index=[1, 2, 3]) + counter = Counter() + counter['b'] = 5 + counter['c'] += 1 + result = s.map(counter) + expected = Series([0, 5, 1], index=[1, 2, 3]) + assert_series_equal(result, expected) + + def test_map_defaultdict(self): + s = Series([1, 2, 3], index=['a', 'b', 'c']) + default_dict = defaultdict(lambda: 'blank') + default_dict[1] = 'stuff' + result = s.map(default_dict) + expected = Series(['stuff', 'blank', 'blank'], index=['a', 'b', 'c']) + assert_series_equal(result, expected) + def test_map_box(self): vals = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')] s = pd.Series(vals) From 96d12a603b41aea44666c68e24d25eb0f6f7e31e Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Fri, 14 Apr 2017 15:42:22 -0400 Subject: [PATCH 02/11] series.map: support dicts with defaults --- pandas/core/series.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 3305f0b6c439e..9b7649543c0f1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -6,6 +6,7 @@ # pylint: disable=E1101,E1103 # pylint: disable=W0703,W0622,W0613,W0201 +import collections import types import warnings @@ -2132,10 +2133,15 @@ def map_f(values, f): else: map_f = lib.map_infer - if isinstance(arg, (dict, Series)): - if isinstance(arg, dict): - arg = self._constructor(arg, index=arg.keys()) + default_dict_types = collections.Counter, collections.defaultdict + if isinstance(arg, default_dict_types): + dict_with_default = arg + arg = lambda x: dict_with_default[x] + elif isinstance(arg, dict): + arg = self._constructor(arg, index=arg.keys()) + + if isinstance(arg, Series): indexer = arg.index.get_indexer(values) new_values = algorithms.take_1d(arg._values, indexer) else: From 961ea46ec21d5842b87471d57563336caa61f19f Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Fri, 14 Apr 2017 16:21:30 -0400 Subject: [PATCH 03/11] Detect default support using __missing__ https://docs.python.org/3.6/reference/datamodel.html#object.__missing__ --- pandas/core/series.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 9b7649543c0f1..c27aa2c5d7d1e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -6,7 +6,6 @@ # pylint: disable=E1101,E1103 # pylint: disable=W0703,W0622,W0613,W0201 -import collections import types import warnings @@ -2133,13 +2132,12 @@ def map_f(values, f): else: map_f = lib.map_infer - default_dict_types = collections.Counter, collections.defaultdict - if isinstance(arg, default_dict_types): - dict_with_default = arg - arg = lambda x: dict_with_default[x] - - elif isinstance(arg, dict): - arg = self._constructor(arg, index=arg.keys()) + if isinstance(arg, dict): + if hasattr(arg, '__missing__'): + dict_with_default = arg + arg = lambda x: dict_with_default[x] + else: + arg = self._constructor(arg, index=arg.keys()) if isinstance(arg, Series): indexer = arg.index.get_indexer(values) From d73cee82311a5abeb893be0ca38f2776e974007c Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Fri, 14 Apr 2017 16:57:49 -0400 Subject: [PATCH 04/11] TST: dict subclasses with/out __missing__ --- pandas/tests/series/test_apply.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 87c8ff01a4f0e..2c086ed7332e1 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -428,6 +428,25 @@ def test_map_defaultdict(self): expected = Series(['stuff', 'blank', 'blank'], index=['a', 'b', 'c']) assert_series_equal(result, expected) + def test_map_dict_subclass_with_missing(self): + class DictWithMissing(dict): + def __missing__(self, key): + return 'missing' + s = Series([1, 2, 3]) + dictionary = DictWithMissing({3: 'three'}) + result = s.map(dictionary) + expected = Series(['missing', 'missing', 'three']) + assert_series_equal(result, expected) + + def test_map_dict_subclass_without_missing(self): + class DictWithoutMissing(dict): + pass + s = Series([1, 2, 3]) + dictionary = DictWithoutMissing({3: 'three'}) + result = s.map(dictionary) + expected = Series([np.nan, np.nan, 'three']) + assert_series_equal(result, expected) + def test_map_box(self): vals = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')] s = pd.Series(vals) From 2a2bab79dcae2fe281e16104103e4ad8d7332f1d Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Fri, 14 Apr 2017 17:12:36 -0400 Subject: [PATCH 05/11] Add v0.20.0 what's new --- doc/source/whatsnew/v0.20.0.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 133757b131312..bc8f0b9b2c9a3 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -449,10 +449,8 @@ Other Enhancements - Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here `. - ``Series.str.replace()`` now accepts a callable, as replacement, which is passed to ``re.sub`` (:issue:`15055`) - ``Series.str.replace()`` now accepts a compiled regular expression as a pattern (:issue:`15446`) - - +- ``Series.map()`` now respects default values of dictionary subclasses with a ``__missing__`` method, such as ``collections.Counter`` (:issue:`15999`, :issue:`16002`) - ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`) - - ``DataFrame`` has gained a ``nunique()`` method to count the distinct values over an axis (:issue:`14336`). - ``DataFrame`` has gained a ``melt()`` method, equivalent to ``pd.melt()``, for unpivoting from a wide to long format (:issue:`12640`). - ``DataFrame.groupby()`` has gained a ``.nunique()`` method to count the distinct values for all columns within each group (:issue:`14336`, :issue:`15197`). From 4f3dc6bd6147b7ca4d5bdd443f09ecdfd355b0e8 Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Fri, 14 Apr 2017 17:32:35 -0400 Subject: [PATCH 06/11] DOC: comment series.map --- pandas/core/series.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/series.py b/pandas/core/series.py index c27aa2c5d7d1e..1ae7cc8d3a3c0 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2134,15 +2134,21 @@ def map_f(values, f): if isinstance(arg, dict): if hasattr(arg, '__missing__'): + # If a dictionary subclass defines a default value method, + # convert arg to a lookup function (https://git.io/vS7LK). dict_with_default = arg arg = lambda x: dict_with_default[x] else: + # Dictionary does not have a default. Thus it's safe to + # convert to an indexed series for efficiency. arg = self._constructor(arg, index=arg.keys()) if isinstance(arg, Series): + # arg is a Series indexer = arg.index.get_indexer(values) new_values = algorithms.take_1d(arg._values, indexer) else: + # arg is a function new_values = map_f(values, arg) return self._constructor(new_values, From 24e147846c058bc5f96595bbad5f998fa3feea50 Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Fri, 14 Apr 2017 17:36:19 -0400 Subject: [PATCH 07/11] DOC: remove unnecessary lambda in formatter --- pandas/core/series.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 1ae7cc8d3a3c0..33ef81f4671b5 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2094,16 +2094,14 @@ def map(self, arg, na_action=None): >>> s = pd.Series([1, 2, 3, np.nan]) - >>> s2 = s.map(lambda x: 'this is a string {}'.format(x), - na_action=None) + >>> s2 = s.map('this is a string {}'.format, na_action=None) 0 this is a string 1.0 1 this is a string 2.0 2 this is a string 3.0 3 this is a string nan dtype: object - >>> s3 = s.map(lambda x: 'this is a string {}'.format(x), - na_action='ignore') + >>> s3 = s.map('this is a string {}'.format, na_action='ignore') 0 this is a string 1.0 1 this is a string 2.0 2 this is a string 3.0 From 11f57698d9e40622c313540cf445451f402209ab Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Fri, 14 Apr 2017 17:55:56 -0400 Subject: [PATCH 08/11] DOC: add counter example to docstring --- pandas/core/series.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 33ef81f4671b5..482c26212bbbc 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2079,8 +2079,8 @@ def map(self, arg, na_action=None): two bar three baz - Mapping a dictionary keys on the index labels works similar as - with a `Series`: + If `arg` is a dictionary, return a new Series with values converted + according to the dictionary's mapping: >>> z = {1: 'A', 2: 'B', 3: 'C'} @@ -2089,6 +2089,20 @@ def map(self, arg, na_action=None): two B three C + Values in Series that are not in the dictionary (as keys) are converted + to ``NaN``. However, if the dictionary is a ``dict`` subclass that + defines ``__missing__`` (i.e. provides a method for default values), + then this default is used rather than ``NaN``: + + >>> from collections import Counter + >>> counter = Counter() + >>> counter['bar'] += 1 + >>> y.map(counter) + 1 0 + 2 1 + 3 0 + dtype: int64 + Use na_action to control whether NA values are affected by the mapping function. From ddb04804cc38eed4241b0a45d057f7b41956bacf Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Fri, 14 Apr 2017 18:02:16 -0400 Subject: [PATCH 09/11] DOC: Address review comments --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/core/series.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index bc8f0b9b2c9a3..089c4f59445e3 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -449,7 +449,6 @@ Other Enhancements - Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here `. - ``Series.str.replace()`` now accepts a callable, as replacement, which is passed to ``re.sub`` (:issue:`15055`) - ``Series.str.replace()`` now accepts a compiled regular expression as a pattern (:issue:`15446`) -- ``Series.map()`` now respects default values of dictionary subclasses with a ``__missing__`` method, such as ``collections.Counter`` (:issue:`15999`, :issue:`16002`) - ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`) - ``DataFrame`` has gained a ``nunique()`` method to count the distinct values over an axis (:issue:`14336`). - ``DataFrame`` has gained a ``melt()`` method, equivalent to ``pd.melt()``, for unpivoting from a wide to long format (:issue:`12640`). @@ -1300,6 +1299,7 @@ Other API Changes - ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv()`` and will be removed in the future (:issue:`12665`) - ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`) - ``DataFrame.applymap()`` with an empty ``DataFrame`` will return a copy of the empty ``DataFrame`` instead of a ``Series`` (:issue:`8222`) +- ``Series.map()`` now respects default values of dictionary subclasses with a ``__missing__`` method, such as ``collections.Counter`` (:issue:`15999`) - ``.loc`` has compat with ``.ix`` for accepting iterators, and NamedTuples (:issue:`15120`) - ``interpolate()`` and ``fillna()`` will raise a ``ValueError`` if the ``limit`` keyword argument is not greater than 0. (:issue:`9217`) - ``pd.read_csv()`` will now issue a ``ParserWarning`` whenever there are conflicting values provided by the ``dialect`` parameter and the user (:issue:`14898`) diff --git a/pandas/core/series.py b/pandas/core/series.py index 482c26212bbbc..ef9dc348a199a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2147,7 +2147,7 @@ def map_f(values, f): if isinstance(arg, dict): if hasattr(arg, '__missing__'): # If a dictionary subclass defines a default value method, - # convert arg to a lookup function (https://git.io/vS7LK). + # convert arg to a lookup function (GH #15999). dict_with_default = arg arg = lambda x: dict_with_default[x] else: From 1f56c81747ff00d0e71ad0d58ada02ce042d12ec Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Fri, 14 Apr 2017 20:22:20 -0400 Subject: [PATCH 10/11] DOC: mention GitHub issue in test docstring --- pandas/tests/series/test_apply.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 2c086ed7332e1..a4a49e3aeb826 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -429,6 +429,10 @@ def test_map_defaultdict(self): assert_series_equal(result, expected) def test_map_dict_subclass_with_missing(self): + """ + Test Series.map with a dictionary subclass that defines __missing__, + i.e. sets a default value (GH #15999). + """ class DictWithMissing(dict): def __missing__(self, key): return 'missing' From 79cfd11c807884d41e4b725dcef095a0f839d883 Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Fri, 14 Apr 2017 20:28:57 -0400 Subject: [PATCH 11/11] DOC: move missing dict key info to notes --- pandas/core/series.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index ef9dc348a199a..7f8a97af99490 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2089,20 +2089,6 @@ def map(self, arg, na_action=None): two B three C - Values in Series that are not in the dictionary (as keys) are converted - to ``NaN``. However, if the dictionary is a ``dict`` subclass that - defines ``__missing__`` (i.e. provides a method for default values), - then this default is used rather than ``NaN``: - - >>> from collections import Counter - >>> counter = Counter() - >>> counter['bar'] += 1 - >>> y.map(counter) - 1 0 - 2 1 - 3 0 - dtype: int64 - Use na_action to control whether NA values are affected by the mapping function. @@ -2127,6 +2113,23 @@ def map(self, arg, na_action=None): Series.apply: For applying more complex functions on a Series DataFrame.apply: Apply a function row-/column-wise DataFrame.applymap: Apply a function elementwise on a whole DataFrame + + Notes + ----- + When `arg` is a dictionary, values in Series that are not in the + dictionary (as keys) are converted to ``NaN``. However, if the + dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e. + provides a method for default values), then this default is used + rather than ``NaN``: + + >>> from collections import Counter + >>> counter = Counter() + >>> counter['bar'] += 1 + >>> y.map(counter) + 1 0 + 2 1 + 3 0 + dtype: int64 """ if is_extension_type(self.dtype):