From 91b720e51148e6c8c0d768c42ca9ab8bc78a8fb5 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 15 Nov 2018 19:52:32 +0100 Subject: [PATCH 1/9] TST: add method/dtype coverage to str-accessor; precursor to #23167 --- pandas/tests/test_strings.py | 274 ++++++++++++++++++++++++++++++++++- 1 file changed, 266 insertions(+), 8 deletions(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 7cd9182b4dff4..e37ee1762ce77 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -1,17 +1,20 @@ # -*- coding: utf-8 -*- # pylint: disable-msg=E1101,W0612 -from datetime import datetime, timedelta +from datetime import datetime, date, timedelta, time import pytest import re +from decimal import Decimal from numpy import nan as NA import numpy as np from numpy.random import randint -from pandas.compat import range, u +from pandas.compat import range, u, PY3 import pandas.compat as compat -from pandas import Index, Series, DataFrame, isna, MultiIndex, notna, concat +from pandas import (Index, Series, DataFrame, isna, MultiIndex, notna, concat, + Timestamp, Period, NaT, Interval) +import pandas._libs.lib as lib from pandas.util.testing import assert_series_equal, assert_index_equal import pandas.util.testing as tm @@ -26,6 +29,157 @@ def assert_series_or_index_equal(left, right): assert_index_equal(left, right) +# method names plus minimal set of arguments to call +_all_string_methods = [ + ('get', [0]), + ('join', [',']), + ('contains', ['some_pattern']), + ('match', ['some_pattern']), + ('count', ['some_pattern']), + ('startswith', ['some_pattern']), + ('endswith', ['some_pattern']), + ('findall', ['some_pattern']), + ('find', ['some_pattern']), + ('rfind', ['some_pattern']), + # because "index"/"rindex" fail (intentionally) if the string is not found + # (and we're testing on generic data), search only for empty string + ('index', ['']), + ('rindex', ['']), + ('extract', [r'(some_pattern)']), + ('extractall', [r'(some_pattern)']), + ('replace', ['some_pattern', 'other_pattern']), + ('repeat', [10]), + ('pad', [10]), + ('center', [10]), + ('ljust', [10]), + ('rjust', [10]), + ('zfill', [10]), + ('wrap', [10]), + ('encode', ['utf8']), + ('decode', ['utf8']), + ('translate', [{97: 100}]), # translating 'a' to 'd' + ('normalize', ['NFC']) +] + list(zip([ + # methods without positional arguments: zip with empty tuple + 'cat', 'len', 'split', 'rsplit', + 'partition', 'rpartition', 'get_dummies', + 'slice', 'slice_replace', + 'strip', 'lstrip', 'rstrip', + 'lower', 'upper', 'capitalize', + 'title', 'swapcase', + 'isalpha', 'isnumeric', 'isalnum', + 'isdigit', 'isdecimal', 'isspace', + 'islower', 'isupper', 'istitle' +], [tuple()] * 100)) +ids, _ = zip(*_all_string_methods) # use method name as fixture-id + + +@pytest.fixture(params=_all_string_methods, ids=ids) +def all_string_methods(request): + """ + Fixture for all public methods of `StringMethods` + + This fixture returns a tuple of the method name and a list of sample values + for the required positional arguments of that method. + """ + return request.param + + +_all_allowed_skipna_inferred_dtypes = [ + ('string', ['a', np.nan, 'c']), + ('unicode' if not PY3 else 'string', [u('a'), np.nan, u('c')]), + ('bytes' if PY3 else 'string', [b'a', np.nan, b'c']), + ('empty', [np.nan, np.nan, np.nan]), + ('empty', []), + ('mixed-integer', ['a', np.nan, 2]), + ('mixed', ['a', np.nan, 2.0])] +ids, _ = zip(*_all_allowed_skipna_inferred_dtypes) # use inferred type as id + + +@pytest.fixture(params=_all_allowed_skipna_inferred_dtypes, ids=ids) +def all_allowed_skipna_inferred_dtypes(request): + """ + Fixture for all (inferred) dtypes allowed in StringMethods.__init__ + + Returns an np.ndarray that will be inferred to have the given dtype (when + skipping missing values). + + The allowed (inferred) types are: + * 'string' + * 'unicode' (if PY2) + * 'empty' + * 'bytes' (if PY3) + * 'mixed' + * 'mixed-integer' + """ + inferred_dtype, values = request.param + values = np.array(values, dtype=object) # object dtype to avoid casting + + # make sure the inferred dtype of the fixture is as requested + assert inferred_dtype == lib.infer_dtype(values, skipna=True) + + return inferred_dtype, values + + +# categoricals are handled separately +_all_skipna_inferred_dtypes = _all_allowed_skipna_inferred_dtypes + [ + ('floating', [1.0, np.nan, 2.0]), + ('integer', [1, np.nan, 2]), + ('mixed-integer-float', [1, np.nan, 2.0]), + ('decimal', [Decimal(1), np.nan, Decimal(2)]), + ('boolean', [True, np.nan, False]), + ('datetime64', [np.datetime64('2013-01-01'), np.nan, + np.datetime64('2018-01-01')]), + ('datetime', [Timestamp('20130101'), np.nan, Timestamp('20180101')]), + ('date', [date(2013, 1, 1), np.nan, date(2018, 1, 1)]), + # The following two dtypes are commented out due to GH 23554 + # ('complex', [1 + 1j, np.nan, 2 + 2j]), + # ('timedelta64', [np.timedelta64(1, 'D'), + # np.nan, np.timedelta64(2, 'D')]), + ('timedelta', [timedelta(1), np.nan, timedelta(2)]), + ('time', [time(1), np.nan, time(2)]), + ('period', [Period(2013), NaT, Period(2018)]), + ('interval', [Interval(0, 1), np.nan, Interval(0, 2)])] +ids, _ = zip(*_all_skipna_inferred_dtypes) # use inferred type as fixture-id + + +@pytest.fixture(params=_all_skipna_inferred_dtypes, ids=ids) +def all_skipna_inferred_dtypes(request): + """ + Fixture for all inferred dtypes from _libs.lib.infer_dtype + + Returns an np.ndarray that will be inferred to have the given dtype (when + skipping missing values). + + The covered (inferred) types are: + * 'string' + * 'unicode' (if PY2) + * 'empty' + * 'bytes' (if PY3) + * 'mixed' + * 'mixed-integer' + * 'mixed-integer-float' + * 'floating' + * 'integer' + * 'decimal' + * 'boolean' + * 'datetime64' + * 'datetime' + * 'date' + * 'timedelta' + * 'time' + * 'period' + * 'interval' + """ + inferred_dtype, values = request.param + values = np.array(values, dtype=object) # object dtype to avoid casting + + # make sure the inferred dtype of the fixture is as requested + assert inferred_dtype == lib.infer_dtype(values, skipna=True) + + return inferred_dtype, values + + class TestStringMethods(object): def test_api(self): @@ -34,11 +188,115 @@ def test_api(self): assert Series.str is strings.StringMethods assert isinstance(Series(['']).str, strings.StringMethods) - # GH 9184 - invalid = Series([1]) - with pytest.raises(AttributeError, match="only use .str accessor"): - invalid.str - assert not hasattr(invalid, 'str') + @pytest.mark.parametrize('dtype', [object, 'category']) + @pytest.mark.parametrize('box', [Series, Index]) + def test_api_per_dtype(self, box, dtype, all_skipna_inferred_dtypes): + # one instance of parametrized fixture + inferred_dtype, values = all_skipna_inferred_dtypes + + t = box(values, dtype=dtype) # explicit dtype to avoid casting + + # TODO: get rid of these xfails + if dtype == 'category' and inferred_dtype in ['period', 'interval']: + pytest.xfail(reason='Conversion to numpy array fails because ' + 'the ._values-attribute is not a numpy array for ' + 'PeriodArray/IntervalArray; see GH 23553') + if box == Index and inferred_dtype in ['empty', 'bytes']: + pytest.xfail(reason='Raising too restrictively; ' + 'solved by GH 23167') + if (box == Index and dtype == object + and inferred_dtype in ['boolean', 'date', 'time']): + pytest.xfail(reason='Inferring incorrectly because of NaNs; ' + 'solved by GH 23167') + if (box == Series + and (dtype == object and inferred_dtype not in [ + 'string', 'unicode', 'empty', + 'bytes', 'mixed', 'mixed-integer']) + or (dtype == 'category' + and inferred_dtype in ['decimal', 'boolean', 'time'])): + pytest.xfail(reason='Not raising correctly; solved by GH 23167') + + types_passing_constructor = ['string', 'unicode', 'empty', + 'bytes', 'mixed', 'mixed-integer'] + if inferred_dtype in types_passing_constructor: + # GH 6106 + assert isinstance(t.str, strings.StringMethods) + else: + # GH 9184, GH 23011, GH 23163 + with tm.assert_raises_regex(AttributeError, 'Can only use .str ' + 'accessor with string values.*'): + t.str + assert not hasattr(t, 'str') + + @pytest.mark.xfail(reason='not correctly raising on master; ' + 'solved by GH 23167') + def test_api_mi_raises(self): + mi = MultiIndex.from_arrays([['a', 'b', 'c']]) + with tm.assert_raises_regex(AttributeError, 'Can only use .str ' + 'accessor with Index, not MultiIndex'): + mi.str + assert not hasattr(mi, 'str') + + @pytest.mark.parametrize('dtype', [object, 'category']) + @pytest.mark.parametrize('box', [Series, Index]) + def test_api_per_method(self, box, dtype, + all_allowed_skipna_inferred_dtypes, + all_string_methods): + # this test does not check correctness of the different methods, + # just that the methods work on the specified (inferred) dtypes, + # and raise on all others + + # one instance of each parametrized fixture + inferred_dtype, values = all_allowed_skipna_inferred_dtypes + method_name, minimal_args = all_string_methods + + # TODO: get rid of these xfails + if (method_name not in ['encode', 'decode', 'len'] + and inferred_dtype == 'bytes'): + pytest.xfail(reason='Not raising for "bytes", see GH 23011;' + 'Also: malformed method names, see GH 23551; ' + 'solved by GH 23167') + if (method_name == 'cat' + and inferred_dtype in ['mixed', 'mixed-integer']): + pytest.xfail(reason='Bad error message; should raise better; ' + 'solved by GH 23167') + if box == Index and inferred_dtype in ['empty', 'bytes']: + pytest.xfail(reason='Raising too restrictively; ' + 'solved by GH 23167') + if (box == Index and dtype == object + and inferred_dtype in ['boolean', 'date', 'time']): + pytest.xfail(reason='Inferring incorrectly because of NaNs; ' + 'solved by GH 23167') + if box == Index and dtype == 'category': + pytest.xfail(reason='Broken methods on CategoricalIndex; ' + 'see GH 23556') + if (method_name in ['partition', 'rpartition'] and box == Index + and inferred_dtype != 'bytes'): + pytest.xfail(reason='Method not nan-safe on Index; see GH 23558') + + t = box(values, dtype=dtype) # explicit dtype to avoid casting + method = getattr(t.str, method_name) + + bytes_allowed = method_name in ['encode', 'decode', 'len'] + # as of v0.23.4, all methods except 'cat' are very lenient with the + # allowed data types, just returning NaN for entries that error. + # This could be changed with an 'errors'-kwarg to the `str`-accessor, + # see discussion in GH 13877 + mixed_allowed = method_name not in ['cat'] + + allowed_types = (['string', 'unicode', 'empty'] + + ['bytes'] * bytes_allowed + + ['mixed', 'mixed-integer'] * mixed_allowed) + + if inferred_dtype in allowed_types: + method(*minimal_args) # works! + else: + # GH 23011, GH 23163 + msg = ('Cannot use .str.{name} with values of inferred dtype ' + '{inferred_dtype!r}.'.format(name=method_name, + inferred_dtype=inferred_dtype)) + with tm.assert_raises_regex(TypeError, msg): + method(*minimal_args) def test_iter(self): # GH3638 From dcee05aeca810396e90c9343228fcfa23741e3da Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Tue, 13 Nov 2018 08:40:09 +0100 Subject: [PATCH 2/9] Move dtype-fixtures to pandas.conftest --- pandas/conftest.py | 82 ++++++++++++++++- pandas/tests/dtypes/test_inference.py | 7 ++ pandas/tests/test_strings.py | 121 ++++++++------------------ 3 files changed, 122 insertions(+), 88 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 03e09175bdb09..70228b2fb268e 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1,3 +1,5 @@ +from datetime import date, time, timedelta +from decimal import Decimal import importlib import os @@ -6,7 +8,7 @@ import numpy as np import pytest -from pandas.compat import PY3 +from pandas.compat import PY3, u import pandas.util._test_decorators as td import pandas as pd @@ -497,6 +499,84 @@ def any_numpy_dtype(request): return request.param +# categoricals are handled separately +_any_skipna_inferred_dtype = [ + ('string', ['a', np.nan, 'c']), + ('unicode' if not PY3 else 'string', [u('a'), np.nan, u('c')]), + ('bytes' if PY3 else 'string', [b'a', np.nan, b'c']), + ('empty', [np.nan, np.nan, np.nan]), + ('empty', []), + ('mixed-integer', ['a', np.nan, 2]), + ('mixed', ['a', np.nan, 2.0]), + ('floating', [1.0, np.nan, 2.0]), + ('integer', [1, np.nan, 2]), + ('mixed-integer-float', [1, np.nan, 2.0]), + ('decimal', [Decimal(1), np.nan, Decimal(2)]), + ('boolean', [True, np.nan, False]), + ('datetime64', [np.datetime64('2013-01-01'), np.nan, + np.datetime64('2018-01-01')]), + ('datetime', [pd.Timestamp('20130101'), np.nan, pd.Timestamp('20180101')]), + ('date', [date(2013, 1, 1), np.nan, date(2018, 1, 1)]), + # The following two dtypes are commented out due to GH 23554 + # ('complex', [1 + 1j, np.nan, 2 + 2j]), + # ('timedelta64', [np.timedelta64(1, 'D'), + # np.nan, np.timedelta64(2, 'D')]), + ('timedelta', [timedelta(1), np.nan, timedelta(2)]), + ('time', [time(1), np.nan, time(2)]), + ('period', [pd.Period(2013), pd.NaT, pd.Period(2018)]), + ('interval', [pd.Interval(0, 1), np.nan, pd.Interval(0, 2)])] +ids, _ = zip(*_any_skipna_inferred_dtype) # use inferred type as fixture-id + + +@pytest.fixture(params=_any_skipna_inferred_dtype, ids=ids) +def any_skipna_inferred_dtype(request): + """ + Fixture for all inferred dtypes from _libs.lib.infer_dtype + + The covered (inferred) types are: + * 'string' + * 'unicode' (if PY2) + * 'empty' + * 'bytes' (if PY3) + * 'mixed' + * 'mixed-integer' + * 'mixed-integer-float' + * 'floating' + * 'integer' + * 'decimal' + * 'boolean' + * 'datetime64' + * 'datetime' + * 'date' + * 'timedelta' + * 'time' + * 'period' + * 'interval' + + Returns + ------- + inferred_dtype : str + The string for the inferred dtype from _libs.lib.infer_dtype + values : np.ndarray + An array of object dtype that will be inferred to have + `inferred_dtype` + + Examples + -------- + >>> import pandas._libs.lib as lib + >>> + >>> def test_something(any_skipna_inferred_dtype): + ... inferred_dtype, values = any_skipna_inferred_dtype + ... # will pass + ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype + """ + inferred_dtype, values = request.param + values = np.array(values, dtype=object) # object dtype to avoid casting + + # correctness of inference tested in tests/dtypes/test_inference.py + return inferred_dtype, values + + @pytest.fixture def mock(): """ diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index f2552cffc6651..8159861260e89 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -495,6 +495,13 @@ class TestTypeInference(object): class Dummy(): pass + def test_dtype_fixture(self, any_skipna_inferred_dtype): + # see pandas/conftest.py + inferred_dtype, values = any_skipna_inferred_dtype + + # make sure the inferred dtype of the fixture is as requested + assert inferred_dtype == lib.infer_dtype(values, skipna=True) + def test_length_zero(self): result = lib.infer_dtype(np.array([], dtype='i4')) assert result == 'integer' diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index e37ee1762ce77..d83fb1a023457 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -1,27 +1,24 @@ # -*- coding: utf-8 -*- # pylint: disable-msg=E1101,W0612 -from datetime import datetime, date, timedelta, time +from datetime import datetime, timedelta import pytest import re -from decimal import Decimal from numpy import nan as NA import numpy as np from numpy.random import randint -from pandas.compat import range, u, PY3 +from pandas.compat import range, u import pandas.compat as compat -from pandas import (Index, Series, DataFrame, isna, MultiIndex, notna, concat, - Timestamp, Period, NaT, Interval) -import pandas._libs.lib as lib +from pandas import Index, Series, DataFrame, isna, MultiIndex, notna, concat from pandas.util.testing import assert_series_equal, assert_index_equal import pandas.util.testing as tm +import pandas.conftest as top_level_conftest import pandas.core.strings as strings - def assert_series_or_index_equal(left, right): if isinstance(left, Series): assert_series_equal(left, right) @@ -85,72 +82,19 @@ def all_string_methods(request): return request.param -_all_allowed_skipna_inferred_dtypes = [ - ('string', ['a', np.nan, 'c']), - ('unicode' if not PY3 else 'string', [u('a'), np.nan, u('c')]), - ('bytes' if PY3 else 'string', [b'a', np.nan, b'c']), - ('empty', [np.nan, np.nan, np.nan]), - ('empty', []), - ('mixed-integer', ['a', np.nan, 2]), - ('mixed', ['a', np.nan, 2.0])] -ids, _ = zip(*_all_allowed_skipna_inferred_dtypes) # use inferred type as id +_any_allowed_skipna_inferred_dtype = [ + (dtype, values) for dtype, values + in top_level_conftest._any_skipna_inferred_dtype + if dtype in {'series', 'unicode', 'empty', + 'bytes', 'mixed', 'mixed-integer'}] +ids, _ = zip(*_any_allowed_skipna_inferred_dtype) # use inferred type as id -@pytest.fixture(params=_all_allowed_skipna_inferred_dtypes, ids=ids) -def all_allowed_skipna_inferred_dtypes(request): +@pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids) +def any_allowed_skipna_inferred_dtype(request): """ Fixture for all (inferred) dtypes allowed in StringMethods.__init__ - Returns an np.ndarray that will be inferred to have the given dtype (when - skipping missing values). - - The allowed (inferred) types are: - * 'string' - * 'unicode' (if PY2) - * 'empty' - * 'bytes' (if PY3) - * 'mixed' - * 'mixed-integer' - """ - inferred_dtype, values = request.param - values = np.array(values, dtype=object) # object dtype to avoid casting - - # make sure the inferred dtype of the fixture is as requested - assert inferred_dtype == lib.infer_dtype(values, skipna=True) - - return inferred_dtype, values - - -# categoricals are handled separately -_all_skipna_inferred_dtypes = _all_allowed_skipna_inferred_dtypes + [ - ('floating', [1.0, np.nan, 2.0]), - ('integer', [1, np.nan, 2]), - ('mixed-integer-float', [1, np.nan, 2.0]), - ('decimal', [Decimal(1), np.nan, Decimal(2)]), - ('boolean', [True, np.nan, False]), - ('datetime64', [np.datetime64('2013-01-01'), np.nan, - np.datetime64('2018-01-01')]), - ('datetime', [Timestamp('20130101'), np.nan, Timestamp('20180101')]), - ('date', [date(2013, 1, 1), np.nan, date(2018, 1, 1)]), - # The following two dtypes are commented out due to GH 23554 - # ('complex', [1 + 1j, np.nan, 2 + 2j]), - # ('timedelta64', [np.timedelta64(1, 'D'), - # np.nan, np.timedelta64(2, 'D')]), - ('timedelta', [timedelta(1), np.nan, timedelta(2)]), - ('time', [time(1), np.nan, time(2)]), - ('period', [Period(2013), NaT, Period(2018)]), - ('interval', [Interval(0, 1), np.nan, Interval(0, 2)])] -ids, _ = zip(*_all_skipna_inferred_dtypes) # use inferred type as fixture-id - - -@pytest.fixture(params=_all_skipna_inferred_dtypes, ids=ids) -def all_skipna_inferred_dtypes(request): - """ - Fixture for all inferred dtypes from _libs.lib.infer_dtype - - Returns an np.ndarray that will be inferred to have the given dtype (when - skipping missing values). - The covered (inferred) types are: * 'string' * 'unicode' (if PY2) @@ -158,25 +102,28 @@ def all_skipna_inferred_dtypes(request): * 'bytes' (if PY3) * 'mixed' * 'mixed-integer' - * 'mixed-integer-float' - * 'floating' - * 'integer' - * 'decimal' - * 'boolean' - * 'datetime64' - * 'datetime' - * 'date' - * 'timedelta' - * 'time' - * 'period' - * 'interval' + + Returns + ------- + inferred_dtype : str + The string for the inferred dtype from _libs.lib.infer_dtype + values : np.ndarray + An array of object dtype that will be inferred to have + `inferred_dtype` + + Examples + -------- + >>> import pandas._libs.lib as lib + >>> + >>> def test_something(any_allowed_skipna_inferred_dtype): + ... inferred_dtype, values = any_skipna_inferred_dtype + ... # will pass + ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype """ inferred_dtype, values = request.param values = np.array(values, dtype=object) # object dtype to avoid casting - # make sure the inferred dtype of the fixture is as requested - assert inferred_dtype == lib.infer_dtype(values, skipna=True) - + # correctness of inference tested in tests/dtypes/test_inference.py return inferred_dtype, values @@ -190,9 +137,9 @@ def test_api(self): @pytest.mark.parametrize('dtype', [object, 'category']) @pytest.mark.parametrize('box', [Series, Index]) - def test_api_per_dtype(self, box, dtype, all_skipna_inferred_dtypes): + def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype): # one instance of parametrized fixture - inferred_dtype, values = all_skipna_inferred_dtypes + inferred_dtype, values = any_skipna_inferred_dtype t = box(values, dtype=dtype) # explicit dtype to avoid casting @@ -240,14 +187,14 @@ def test_api_mi_raises(self): @pytest.mark.parametrize('dtype', [object, 'category']) @pytest.mark.parametrize('box', [Series, Index]) def test_api_per_method(self, box, dtype, - all_allowed_skipna_inferred_dtypes, + any_allowed_skipna_inferred_dtype, all_string_methods): # this test does not check correctness of the different methods, # just that the methods work on the specified (inferred) dtypes, # and raise on all others # one instance of each parametrized fixture - inferred_dtype, values = all_allowed_skipna_inferred_dtypes + inferred_dtype, values = any_allowed_skipna_inferred_dtype method_name, minimal_args = all_string_methods # TODO: get rid of these xfails From 41cecb9dd9e17db0a2c701c1dc0ac1bbd5e402df Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 15 Nov 2018 19:55:05 +0100 Subject: [PATCH 3/9] Unify test_str_accessor_api_for_categorical with parametrized tests --- pandas/tests/series/test_api.py | 76 ----------------- pandas/tests/test_strings.py | 146 +++++++++++++++++++++++--------- 2 files changed, 104 insertions(+), 118 deletions(-) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index f944d6f8c9d08..65f5c59deba36 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -602,82 +602,6 @@ def f(): ordered=True)) tm.assert_series_equal(result, expected) - def test_str_accessor_api_for_categorical(self): - # https://github.com/pandas-dev/pandas/issues/10661 - from pandas.core.strings import StringMethods - s = Series(list('aabb')) - s = s + " " + s - c = s.astype('category') - assert isinstance(c.str, StringMethods) - - # str functions, which need special arguments - special_func_defs = [ - ('cat', (list("zyxw"),), {"sep": ","}), - ('center', (10,), {}), - ('contains', ("a",), {}), - ('count', ("a",), {}), - ('decode', ("UTF-8",), {}), - ('encode', ("UTF-8",), {}), - ('endswith', ("a",), {}), - ('extract', ("([a-z]*) ",), {"expand": False}), - ('extract', ("([a-z]*) ",), {"expand": True}), - ('extractall', ("([a-z]*) ",), {}), - ('find', ("a",), {}), - ('findall', ("a",), {}), - ('index', (" ",), {}), - ('ljust', (10,), {}), - ('match', ("a"), {}), # deprecated... - ('normalize', ("NFC",), {}), - ('pad', (10,), {}), - ('partition', (" ",), {"expand": False}), # not default - ('partition', (" ",), {"expand": True}), # default - ('repeat', (3,), {}), - ('replace', ("a", "z"), {}), - ('rfind', ("a",), {}), - ('rindex', (" ",), {}), - ('rjust', (10,), {}), - ('rpartition', (" ",), {"expand": False}), # not default - ('rpartition', (" ",), {"expand": True}), # default - ('slice', (0, 1), {}), - ('slice_replace', (0, 1, "z"), {}), - ('split', (" ",), {"expand": False}), # default - ('split', (" ",), {"expand": True}), # not default - ('startswith', ("a",), {}), - ('wrap', (2,), {}), - ('zfill', (10,), {}) - ] - _special_func_names = [f[0] for f in special_func_defs] - - # * get, join: they need a individual elements of type lists, but - # we can't make a categorical with lists as individual categories. - # -> `s.str.split(" ").astype("category")` will error! - # * `translate` has different interfaces for py2 vs. py3 - _ignore_names = ["get", "join", "translate"] - - str_func_names = [f for f in dir(s.str) if not ( - f.startswith("_") or - f in _special_func_names or - f in _ignore_names)] - - func_defs = [(f, (), {}) for f in str_func_names] - func_defs.extend(special_func_defs) - - for func, args, kwargs in func_defs: - res = getattr(c.str, func)(*args, **kwargs) - exp = getattr(s.str, func)(*args, **kwargs) - - if isinstance(res, DataFrame): - tm.assert_frame_equal(res, exp) - else: - tm.assert_series_equal(res, exp) - - invalid = Series([1, 2, 3]).astype('category') - msg = "Can only use .str accessor with string" - - with pytest.raises(AttributeError, match=msg): - invalid.str - assert not hasattr(invalid, 'str') - def test_dt_accessor_api_for_categorical(self): # https://github.com/pandas-dev/pandas/issues/10661 from pandas.core.indexes.accessors import Properties diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index d83fb1a023457..64f277efeec8e 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -19,6 +19,7 @@ import pandas.conftest as top_level_conftest import pandas.core.strings as strings + def assert_series_or_index_equal(left, right): if isinstance(left, Series): assert_series_equal(left, right) @@ -26,38 +27,50 @@ def assert_series_or_index_equal(left, right): assert_index_equal(left, right) -# method names plus minimal set of arguments to call -_all_string_methods = [ - ('get', [0]), - ('join', [',']), - ('contains', ['some_pattern']), - ('match', ['some_pattern']), - ('count', ['some_pattern']), - ('startswith', ['some_pattern']), - ('endswith', ['some_pattern']), - ('findall', ['some_pattern']), - ('find', ['some_pattern']), - ('rfind', ['some_pattern']), - # because "index"/"rindex" fail (intentionally) if the string is not found - # (and we're testing on generic data), search only for empty string - ('index', ['']), - ('rindex', ['']), - ('extract', [r'(some_pattern)']), - ('extractall', [r'(some_pattern)']), - ('replace', ['some_pattern', 'other_pattern']), - ('repeat', [10]), - ('pad', [10]), - ('center', [10]), - ('ljust', [10]), - ('rjust', [10]), - ('zfill', [10]), - ('wrap', [10]), - ('encode', ['utf8']), - ('decode', ['utf8']), - ('translate', [{97: 100}]), # translating 'a' to 'd' - ('normalize', ['NFC']) +_any_string_method = [ + ('cat', (), {'sep': ','}), # noqa: E241 + ('cat', (Series(list('zyx')),), {'sep': ',', # noqa: E241 + 'join': 'left'}), + ('center', (10,), {}), # noqa: E241 + ('contains', ('a',), {}), # noqa: E241 + ('count', ('a',), {}), # noqa: E241 + ('decode', ('UTF-8',), {}), # noqa: E241 + ('encode', ('UTF-8',), {}), # noqa: E241 + ('endswith', ('a',), {}), # noqa: E241 + ('extract', ('([a-z]*)',), {'expand': False}), # noqa: E241 + ('extract', ('([a-z]*)',), {'expand': True}), # noqa: E241 + ('extractall', ('([a-z]*)',), {}), # noqa: E241 + ('find', ('a',), {}), # noqa: E241 + ('findall', ('a',), {}), # noqa: E241 + ('get', (0,), {}), # noqa: E241 + # because "index" (and "rindex") fail intentionally + # if the string is not found, search only for empty string + ('index', ('',), {}), # noqa: E241 + ('join', (',',), {}), # noqa: E241 + ('ljust', (10,), {}), # noqa: E241 + ('match', ('a',), {}), # noqa: E241 + ('normalize', ('NFC',), {}), # noqa: E241 + ('pad', (10,), {}), # noqa: E241 + ('partition', (' ',), {'expand': False}), # noqa: E241 + ('partition', (' ',), {'expand': True}), # noqa: E241 + ('repeat', (3,), {}), # noqa: E241 + ('replace', ('a', 'z',), {}), # noqa: E241 + ('rfind', ('a',), {}), # noqa: E241 + ('rindex', ('',), {}), # noqa: E241 + ('rjust', (10,), {}), # noqa: E241 + ('rpartition', (' ',), {'expand': False}), # noqa: E241 + ('rpartition', (' ',), {'expand': True}), # noqa: E241 + ('slice', (0, 1,), {}), # noqa: E241 + ('slice_replace', (0, 1, 'z',), {}), # noqa: E241 + ('split', (' ',), {'expand': False}), # noqa: E241 + ('split', (' ',), {'expand': True}), # noqa: E241 + ('startswith', ('a',), {}), # noqa: E241 + # translating unicode points of "a" to "d" + ('translate', ({97: 100},), {}), # noqa: E241 + ('wrap', (2,), {}), # noqa: E241 + ('zfill', (10,), {}) # noqa: E241 ] + list(zip([ - # methods without positional arguments: zip with empty tuple + # methods without positional arguments: zip with empty tuple and empty dict 'cat', 'len', 'split', 'rsplit', 'partition', 'rpartition', 'get_dummies', 'slice', 'slice_replace', @@ -67,17 +80,42 @@ def assert_series_or_index_equal(left, right): 'isalpha', 'isnumeric', 'isalnum', 'isdigit', 'isdecimal', 'isspace', 'islower', 'isupper', 'istitle' -], [tuple()] * 100)) -ids, _ = zip(*_all_string_methods) # use method name as fixture-id +], [()] * 100, [{}] * 100)) +ids, _, _ = zip(*_any_string_method) # use method name as fixture-id + +# test that the above list captures all methods of StringMethods +missing_methods = {f for f in dir(strings.StringMethods) + if not f.startswith('_')} - set(ids) +assert not missing_methods -@pytest.fixture(params=_all_string_methods, ids=ids) -def all_string_methods(request): + +@pytest.fixture(params=_any_string_method, ids=ids) +def any_string_method(request): """ Fixture for all public methods of `StringMethods` - This fixture returns a tuple of the method name and a list of sample values - for the required positional arguments of that method. + This fixture returns a tuple of the method name and sample arguments + necessary to call the method. + + Returns + ------- + method_name : str + The name of the method in `StringMethods` + args : tuple + Sample values for the positional arguments + kwargs : dict + Sample values for the keyword arguments + + Examples + -------- + >>> def test_something(any_string_method): + ... s = pd.Series(['a', 'b', np.nan, 'd']) + ... + ... method_name, args, kwargs = any_string_method + ... method = getattr(s.str, method_name) + ... # will not raise + ... method(*args, **kwargs) """ return request.param @@ -188,14 +226,14 @@ def test_api_mi_raises(self): @pytest.mark.parametrize('box', [Series, Index]) def test_api_per_method(self, box, dtype, any_allowed_skipna_inferred_dtype, - all_string_methods): + any_string_method): # this test does not check correctness of the different methods, # just that the methods work on the specified (inferred) dtypes, # and raise on all others # one instance of each parametrized fixture inferred_dtype, values = any_allowed_skipna_inferred_dtype - method_name, minimal_args = all_string_methods + method_name, args, kwargs = any_string_method # TODO: get rid of these xfails if (method_name not in ['encode', 'decode', 'len'] @@ -220,6 +258,10 @@ def test_api_per_method(self, box, dtype, if (method_name in ['partition', 'rpartition'] and box == Index and inferred_dtype != 'bytes'): pytest.xfail(reason='Method not nan-safe on Index; see GH 23558') + if (method_name == 'split' and box == Index + and inferred_dtype in ['mixed', 'mixed-integer'] + and dtype == object and kwargs.get('expand', None) == True): + pytest.xfail(reason='Method not nan-safe on Index; see GH 23677') t = box(values, dtype=dtype) # explicit dtype to avoid casting method = getattr(t.str, method_name) @@ -236,14 +278,34 @@ def test_api_per_method(self, box, dtype, + ['mixed', 'mixed-integer'] * mixed_allowed) if inferred_dtype in allowed_types: - method(*minimal_args) # works! + method(*args, **kwargs) # works! else: # GH 23011, GH 23163 msg = ('Cannot use .str.{name} with values of inferred dtype ' '{inferred_dtype!r}.'.format(name=method_name, inferred_dtype=inferred_dtype)) - with tm.assert_raises_regex(TypeError, msg): - method(*minimal_args) + with pytest.raises(TypeError, match=msg): + method(*args, **kwargs) + + def test_api_for_categorical(self, any_string_method): + # https://github.com/pandas-dev/pandas/issues/10661 + s = Series(list('aabb')) + s = s + " " + s + c = s.astype('category') + assert isinstance(c.str, strings.StringMethods) + + method_name, args, kwargs = any_string_method + + result = getattr(c.str, method_name)(*args, **kwargs) + expected = getattr(s.str, method_name)(*args, **kwargs) + + if isinstance(result, DataFrame): + tm.assert_frame_equal(result, expected) + elif isinstance(result, Series): + tm.assert_series_equal(result, expected) + else: + # str.cat(others=None) returns string, for example + assert result == expected def test_iter(self): # GH3638 From 01a3c10f7b333a206ce5221c34258df26aef35d9 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Tue, 13 Nov 2018 23:51:13 +0100 Subject: [PATCH 4/9] Change test name in dtypes/test_inference.py --- pandas/tests/dtypes/test_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 8159861260e89..e5698b7f5bc66 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -495,7 +495,7 @@ class TestTypeInference(object): class Dummy(): pass - def test_dtype_fixture(self, any_skipna_inferred_dtype): + def test_inferred_dtype_fixture(self, any_skipna_inferred_dtype): # see pandas/conftest.py inferred_dtype, values = any_skipna_inferred_dtype From a94f569628fd58dffd08be8c5d8519e4981dc5ee Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 14 Nov 2018 00:06:24 +0100 Subject: [PATCH 5/9] Alphabetize methods without positional kwargs in fixture-parametrization --- pandas/tests/test_strings.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 64f277efeec8e..166b0ecb59a62 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -71,15 +71,14 @@ def assert_series_or_index_equal(left, right): ('zfill', (10,), {}) # noqa: E241 ] + list(zip([ # methods without positional arguments: zip with empty tuple and empty dict - 'cat', 'len', 'split', 'rsplit', - 'partition', 'rpartition', 'get_dummies', - 'slice', 'slice_replace', - 'strip', 'lstrip', 'rstrip', - 'lower', 'upper', 'capitalize', - 'title', 'swapcase', - 'isalpha', 'isnumeric', 'isalnum', - 'isdigit', 'isdecimal', 'isspace', - 'islower', 'isupper', 'istitle' + 'capitalize', 'cat', 'get_dummies', + 'isalnum', 'isalpha', 'isdecimal', + 'isdigit', 'islower', 'isnumeric', + 'isspace', 'istitle', 'isupper', + 'len', 'lower', 'lstrip', 'partition', + 'rpartition', 'rsplit', 'rstrip', + 'slice', 'slice_replace', 'split', + 'strip', 'swapcase', 'title', 'upper' ], [()] * 100, [{}] * 100)) ids, _, _ = zip(*_any_string_method) # use method name as fixture-id From f0ae1db3f1dd60281938811d3a1c76f18288a20f Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 14 Nov 2018 00:06:50 +0100 Subject: [PATCH 6/9] Add missed PY2-xfail --- pandas/tests/test_strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 166b0ecb59a62..4b2a418ca42c9 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -258,7 +258,7 @@ def test_api_per_method(self, box, dtype, and inferred_dtype != 'bytes'): pytest.xfail(reason='Method not nan-safe on Index; see GH 23558') if (method_name == 'split' and box == Index - and inferred_dtype in ['mixed', 'mixed-integer'] + and inferred_dtype in ['unicode', 'mixed', 'mixed-integer'] and dtype == object and kwargs.get('expand', None) == True): pytest.xfail(reason='Method not nan-safe on Index; see GH 23677') From 16fe71c525a164e91274f4dc4f8119a9cbcf3da4 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 15 Nov 2018 19:59:53 +0100 Subject: [PATCH 7/9] Lint; switch to pytest.raises --- pandas/tests/test_strings.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 4b2a418ca42c9..0bdded54fc0d9 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -207,20 +207,11 @@ def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype): assert isinstance(t.str, strings.StringMethods) else: # GH 9184, GH 23011, GH 23163 - with tm.assert_raises_regex(AttributeError, 'Can only use .str ' - 'accessor with string values.*'): + with pytest.raises(AttributeError, match='Can only use .str ' + 'accessor with string values.*'): t.str assert not hasattr(t, 'str') - @pytest.mark.xfail(reason='not correctly raising on master; ' - 'solved by GH 23167') - def test_api_mi_raises(self): - mi = MultiIndex.from_arrays([['a', 'b', 'c']]) - with tm.assert_raises_regex(AttributeError, 'Can only use .str ' - 'accessor with Index, not MultiIndex'): - mi.str - assert not hasattr(mi, 'str') - @pytest.mark.parametrize('dtype', [object, 'category']) @pytest.mark.parametrize('box', [Series, Index]) def test_api_per_method(self, box, dtype, @@ -259,7 +250,7 @@ def test_api_per_method(self, box, dtype, pytest.xfail(reason='Method not nan-safe on Index; see GH 23558') if (method_name == 'split' and box == Index and inferred_dtype in ['unicode', 'mixed', 'mixed-integer'] - and dtype == object and kwargs.get('expand', None) == True): + and dtype == object and kwargs.get('expand', None) is True): pytest.xfail(reason='Method not nan-safe on Index; see GH 23677') t = box(values, dtype=dtype) # explicit dtype to avoid casting From 9b36a506f5d48116334da5affa95665395afe166 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Tue, 20 Nov 2018 07:47:02 +0100 Subject: [PATCH 8/9] Clean-up post-merge --- pandas/tests/test_strings.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 0197f423ebced..322ace5acee58 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -245,9 +245,6 @@ def test_api_per_method(self, box, dtype, if box == Index and dtype == 'category': pytest.xfail(reason='Broken methods on CategoricalIndex; ' 'see GH 23556') - if (method_name in ['partition', 'rpartition'] and box == Index - and inferred_dtype != 'bytes'): - pytest.xfail(reason='Method not nan-safe on Index; see GH 23558') if (method_name == 'split' and box == Index and inferred_dtype in ['unicode', 'mixed', 'mixed-integer'] and dtype == object and kwargs.get('expand', None) is True): @@ -297,26 +294,6 @@ def test_api_for_categorical(self, any_string_method): # str.cat(others=None) returns string, for example assert result == expected - def test_api_for_categorical(self, any_string_method): - # https://github.com/pandas-dev/pandas/issues/10661 - s = Series(list('aabb')) - s = s + " " + s - c = s.astype('category') - assert isinstance(c.str, strings.StringMethods) - - method_name, args, kwargs = any_string_method - - result = getattr(c.str, method_name)(*args, **kwargs) - expected = getattr(s.str, method_name)(*args, **kwargs) - - if isinstance(result, DataFrame): - tm.assert_frame_equal(result, expected) - elif isinstance(result, Series): - tm.assert_series_equal(result, expected) - else: - # str.cat(others=None) returns string, for example - assert result == expected - def test_iter(self): # GH3638 strs = 'google', 'wikimedia', 'wikipedia', 'wikitravel' From a53a28e70393e5ba853dbcfbdd610d339f97acb5 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 26 Nov 2018 09:10:50 +0100 Subject: [PATCH 9/9] Review (jreback) --- pandas/tests/test_strings.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 322ace5acee58..117984ce89743 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -9,14 +9,13 @@ import numpy as np from numpy.random import randint -from pandas.compat import range, u +from pandas.compat import range, u, PY3 import pandas.compat as compat from pandas import Index, Series, DataFrame, isna, MultiIndex, notna, concat from pandas.util.testing import assert_series_equal, assert_index_equal import pandas.util.testing as tm -import pandas.conftest as top_level_conftest import pandas.core.strings as strings @@ -119,11 +118,15 @@ def any_string_method(request): return request.param +# subset of the full set from pandas/conftest.py _any_allowed_skipna_inferred_dtype = [ - (dtype, values) for dtype, values - in top_level_conftest._any_skipna_inferred_dtype - if dtype in {'series', 'unicode', 'empty', - 'bytes', 'mixed', 'mixed-integer'}] + ('string', ['a', np.nan, 'c']), + ('unicode' if not PY3 else 'string', [u('a'), np.nan, u('c')]), + ('bytes' if PY3 else 'string', [b'a', np.nan, b'c']), + ('empty', [np.nan, np.nan, np.nan]), + ('empty', []), + ('mixed-integer', ['a', np.nan, 2]) +] ids, _ = zip(*_any_allowed_skipna_inferred_dtype) # use inferred type as id @@ -245,10 +248,6 @@ def test_api_per_method(self, box, dtype, if box == Index and dtype == 'category': pytest.xfail(reason='Broken methods on CategoricalIndex; ' 'see GH 23556') - if (method_name == 'split' and box == Index - and inferred_dtype in ['unicode', 'mixed', 'mixed-integer'] - and dtype == object and kwargs.get('expand', None) is True): - pytest.xfail(reason='Method not nan-safe on Index; see GH 23677') t = box(values, dtype=dtype) # explicit dtype to avoid casting method = getattr(t.str, method_name)