From 11c26e2b0d2b344b14f2c6b33ccf8166f73b8dbb Mon Sep 17 00:00:00 2001 From: Matsuoka Kota Date: Mon, 20 Apr 2020 22:08:50 +0900 Subject: [PATCH 01/13] BUG: value_counts not working correctly on ExtensionArrays --- pandas/core/algorithms.py | 2 +- pandas/tests/arrays/string_/test_string.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 62a3808d36ba2..56b5dd0c4cd5e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -724,7 +724,7 @@ def value_counts( result = result.sort_values(ascending=ascending) if normalize: - result = result / float(counts.sum()) + result = result / float(sum(counts)) return result diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index fe770eed84b62..f3a36b0303736 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -277,3 +277,13 @@ def test_value_counts_na(): result = arr.value_counts(dropna=True) expected = pd.Series([2, 1], index=["a", "b"], dtype="Int64") tm.assert_series_equal(result, expected) + + +def test_normalize_value_counts(): + result = ( + pd.Series(list("abcd"), dtype="string") + .value_counts(normalize=True) + .sort_index() + ) + expected = pd.Series([0.25, 0.25, 0.25, 0.25], index=["a", "b", "c", "d"]) + tm.assert_series_equal(expected, result) From b0252320feb969bbcc604df87fd19905944099ec Mon Sep 17 00:00:00 2001 From: Matsuoka Kota Date: Mon, 20 Apr 2020 23:19:20 +0900 Subject: [PATCH 02/13] Change sum() to ndarray.sum() --- pandas/core/algorithms.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 56b5dd0c4cd5e..f6a03501cb546 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -724,7 +724,9 @@ def value_counts( result = result.sort_values(ascending=ascending) if normalize: - result = result / float(sum(counts)) + if not isinstance(counts, np.ndarray): + counts = counts.to_numpy() + result = result / float(counts.sum()) return result From 700720589a11fdffb1cdf1bc64b8e21c15c29e02 Mon Sep 17 00:00:00 2001 From: Matsuoka Kota Date: Wed, 22 Apr 2020 01:30:06 +0900 Subject: [PATCH 03/13] Expand tests --- pandas/core/algorithms.py | 2 + pandas/core/arrays/masked.py | 2 +- pandas/core/groupby/generic.py | 12 ++++- pandas/tests/arrays/integer/test_function.py | 6 +-- pandas/tests/arrays/string_/test_string.py | 10 ---- pandas/tests/arrays/test_datetimes.py | 2 +- pandas/tests/base/test_value_counts.py | 38 ++++++------- pandas/tests/extension/base/methods.py | 11 +++- pandas/tests/extension/test_datetime.py | 4 -- pandas/tests/extension/test_string.py | 4 +- pandas/tests/frame/test_api.py | 2 +- pandas/tests/frame/test_arithmetic.py | 6 ++- pandas/tests/indexes/datetimes/test_ops.py | 6 +-- pandas/tests/indexes/period/test_ops.py | 6 +-- pandas/tests/indexes/timedeltas/test_ops.py | 6 +-- pandas/tests/io/pytables/test_store.py | 3 +- pandas/tests/reshape/test_get_dummies.py | 4 +- pandas/tests/series/methods/test_asfreq.py | 4 +- .../tests/series/methods/test_value_counts.py | 34 ++++++------ pandas/tests/test_algos.py | 53 ++++++++++--------- web/pandas_web.py | 5 +- 21 files changed, 119 insertions(+), 101 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index f6a03501cb546..20f48dfcf62cd 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -727,6 +727,8 @@ def value_counts( if not isinstance(counts, np.ndarray): counts = counts.to_numpy() result = result / float(counts.sum()) + else: + result = result.astype("Int64") return result diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index fc5b307bd5754..2203a4e389774 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -259,7 +259,7 @@ def value_counts(self, dropna: bool = True) -> "Series": # if we want nans, count the mask if dropna: - counts = value_counts._values + counts = np.array(value_counts._values, dtype=int) else: counts = np.empty(len(value_counts) + 1, dtype="int64") counts[:-1] = value_counts diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 13938c41a0f6b..0605ae1378b7a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -731,7 +731,12 @@ def value_counts( ) if is_integer_dtype(out): - out = ensure_int64(out) + return Series( + ensure_int64(out), + index=mi, + name=self._selection_name, + dtype="Int64", + ) return Series(out, index=mi, name=self._selection_name) # for compat. with libgroupby.value_counts need to ensure every @@ -763,7 +768,10 @@ def build_codes(lev_codes: np.ndarray) -> np.ndarray: mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False) if is_integer_dtype(out): - out = ensure_int64(out) + return Series( + ensure_int64(out), index=mi, name=self._selection_name, dtype="Int64" + ) + return Series(out, index=mi, name=self._selection_name) def count(self) -> Series: diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index bdf902d1aca62..93800d724fb10 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -93,7 +93,7 @@ def test_stat_method(pandasmethname, kwargs): def test_value_counts_na(): - arr = pd.array([1, 2, 1, pd.NA], dtype="Int64") + arr = pd.array([1, 2, 1, pd.NA]) result = arr.value_counts(dropna=False) expected = pd.Series([2, 1, 1], index=[1, 2, pd.NA], dtype="Int64") tm.assert_series_equal(result, expected) @@ -105,10 +105,10 @@ def test_value_counts_na(): def test_value_counts_empty(): # https://github.com/pandas-dev/pandas/issues/33317 - s = pd.Series([], dtype="Int64") + s = pd.Series([]) result = s.value_counts() # TODO: The dtype of the index seems wrong (it's int64 for non-empty) - idx = pd.Index([], dtype="object") + idx = pd.Float64Index([], dtype="float64") expected = pd.Series([], index=idx, dtype="Int64") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index f3a36b0303736..fe770eed84b62 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -277,13 +277,3 @@ def test_value_counts_na(): result = arr.value_counts(dropna=True) expected = pd.Series([2, 1], index=["a", "b"], dtype="Int64") tm.assert_series_equal(result, expected) - - -def test_normalize_value_counts(): - result = ( - pd.Series(list("abcd"), dtype="string") - .value_counts(normalize=True) - .sort_index() - ) - expected = pd.Series([0.25, 0.25, 0.25, 0.25], index=["a", "b", "c", "d"]) - tm.assert_series_equal(expected, result) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 7d80ad3d8c6be..7a26952379c3a 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -248,7 +248,7 @@ def test_value_counts_preserves_tz(self): arr[-2] = pd.NaT result = arr.value_counts() - expected = pd.Series([1, 4, 2], index=[pd.NaT, dti[0], dti[1]]) + expected = pd.Series([1, 4, 2], index=[pd.NaT, dti[0], dti[1]], dtype="Int64") tm.assert_series_equal(result, expected) @pytest.mark.parametrize("method", ["pad", "backfill"]) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index d45feaff68dde..f96df86027d28 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -30,7 +30,7 @@ def test_value_counts(index_or_series_obj): result = obj.value_counts() counter = collections.Counter(obj) - expected = pd.Series(dict(counter.most_common()), dtype=np.int64, name=obj.name) + expected = pd.Series(dict(counter.most_common()), dtype="Int64", name=obj.name) expected.index = expected.index.astype(obj.dtype) if isinstance(obj, pd.MultiIndex): expected.index = pd.Index(expected.index) @@ -67,7 +67,7 @@ def test_value_counts_null(null_obj, index_or_series_obj): # because np.nan == np.nan is False, but None == None is True # np.nan would be duplicated, whereas None wouldn't counter = collections.Counter(obj.dropna()) - expected = pd.Series(dict(counter.most_common()), dtype=np.int64) + expected = pd.Series(dict(counter.most_common()), dtype="Int64") expected.index = expected.index.astype(obj.dtype) result = obj.value_counts() @@ -80,7 +80,7 @@ def test_value_counts_null(null_obj, index_or_series_obj): # can't use expected[null_obj] = 3 as # IntervalIndex doesn't allow assignment - new_entry = pd.Series({np.nan: 3}, dtype=np.int64) + new_entry = pd.Series({np.nan: 3}, dtype="Int64") expected = expected.append(new_entry) result = obj.value_counts(dropna=False) @@ -96,7 +96,7 @@ def test_value_counts_inferred(index_or_series): klass = index_or_series s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] s = klass(s_values) - expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"]) + expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"], dtype="Int64") tm.assert_series_equal(s.value_counts(), expected) if isinstance(s, Index): @@ -110,17 +110,17 @@ def test_value_counts_inferred(index_or_series): # don't sort, have to sort after the fact as not sorting is # platform-dep hist = s.value_counts(sort=False).sort_values() - expected = Series([3, 1, 4, 2], index=list("acbd")).sort_values() + expected = Series([3, 1, 4, 2], index=list("acbd"), dtype="Int64").sort_values() tm.assert_series_equal(hist, expected) # sort ascending hist = s.value_counts(ascending=True) - expected = Series([1, 2, 3, 4], index=list("cdab")) + expected = Series([1, 2, 3, 4], index=list("cdab"), dtype="Int64") tm.assert_series_equal(hist, expected) # relative histogram. hist = s.value_counts(normalize=True) - expected = Series([0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"]) + expected = Series([0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"], dtype="float64") tm.assert_series_equal(hist, expected) @@ -136,16 +136,16 @@ def test_value_counts_bins(index_or_series): s1 = Series([1, 1, 2, 3]) res1 = s1.value_counts(bins=1) - exp1 = Series({Interval(0.997, 3.0): 4}) + exp1 = Series({Interval(0.997, 3.0): 4}, dtype="Int64") tm.assert_series_equal(res1, exp1) res1n = s1.value_counts(bins=1, normalize=True) - exp1n = Series({Interval(0.997, 3.0): 1.0}) + exp1n = Series({Interval(0.997, 3.0): 1.0}, dtype="float64") tm.assert_series_equal(res1n, exp1n) if isinstance(s1, Index): tm.assert_index_equal(s1.unique(), Index([1, 2, 3])) else: - exp = np.array([1, 2, 3], dtype=np.int64) + exp = np.array([1, 2, 3], dtype="Int64") tm.assert_numpy_array_equal(s1.unique(), exp) assert s1.nunique() == 3 @@ -153,22 +153,24 @@ def test_value_counts_bins(index_or_series): # these return the same res4 = s1.value_counts(bins=4, dropna=True) intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) - exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]), dtype="Int64") tm.assert_series_equal(res4, exp4) res4 = s1.value_counts(bins=4, dropna=False) intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) - exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]), dtype="Int64") tm.assert_series_equal(res4, exp4) res4n = s1.value_counts(bins=4, normalize=True) - exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 3, 1, 2])) + exp4n = Series( + [0.5, 0.25, 0.25, 0], index=intervals.take([0, 3, 1, 2]), dtype="float64" + ) tm.assert_series_equal(res4n, exp4n) # handle NA's properly s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"] s = klass(s_values) - expected = Series([4, 3, 2], index=["b", "a", "d"]) + expected = Series([4, 3, 2], index=["b", "a", "d"], dtype="Int64") tm.assert_series_equal(s.value_counts(), expected) if isinstance(s, Index): @@ -180,7 +182,7 @@ def test_value_counts_bins(index_or_series): assert s.nunique() == 3 s = klass({}) if klass is dict else klass({}, dtype=object) - expected = Series([], dtype=np.int64) + expected = Series([], dtype="Int64") tm.assert_series_equal(s.value_counts(), expected, check_index_type=False) # returned dtype differs depending on original if isinstance(s, Index): @@ -216,7 +218,7 @@ def test_value_counts_datetime64(index_or_series): idx = pd.to_datetime( ["2010-01-01 00:00:00", "2008-09-09 00:00:00", "2009-01-01 00:00:00"] ) - expected_s = Series([3, 2, 1], index=idx) + expected_s = Series([3, 2, 1], index=idx, dtype="Int64") tm.assert_series_equal(s.value_counts(), expected_s) expected = np_array_datetime64_compat( @@ -240,7 +242,7 @@ def test_value_counts_datetime64(index_or_series): result = s.value_counts(dropna=False) expected_s[pd.NaT] = 1 - tm.assert_series_equal(result, expected_s) + tm.assert_series_equal(result, expected_s.astype("Int64")) unique = s.unique() assert unique.dtype == "datetime64[ns]" @@ -261,7 +263,7 @@ def test_value_counts_datetime64(index_or_series): td = klass(td, name="dt") result = td.value_counts() - expected_s = Series([6], index=[Timedelta("1day")], name="dt") + expected_s = Series([6], index=[Timedelta("1day")], name="dt", dtype="Int64") tm.assert_series_equal(result, expected_s) expected = TimedeltaIndex(["1 days"], name="dt") diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 22e53dbc89f01..cad4d3f1cf4fd 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -17,7 +17,7 @@ class BaseMethodsTests(BaseExtensionTests): @pytest.mark.parametrize("dropna", [True, False]) def test_value_counts(self, all_data, dropna): - all_data = all_data[:10] + all_data = np.unique(all_data[:5]) if dropna: other = np.array(all_data[~all_data.isna()]) else: @@ -28,6 +28,15 @@ def test_value_counts(self, all_data, dropna): self.assert_series_equal(result, expected) + result = ( + pd.Series(all_data, dtype=all_data.dtype) + .value_counts(dropna=dropna, normalize=True) + .sort_index() + ) + + expected = pd.Series([1 / len(other)] * len(other), index=result.index) + tm.assert_series_equal(expected, result) + def test_count(self, data_missing): df = pd.DataFrame({"A": data_missing}) result = df.count(axis="columns") diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index 3aa188098620d..5a4cb1f2ec12c 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -90,10 +90,6 @@ class TestGetitem(BaseDatetimeTests, base.BaseGetitemTests): class TestMethods(BaseDatetimeTests, base.BaseMethodsTests): - @pytest.mark.skip(reason="Incorrect expected") - def test_value_counts(self, all_data, dropna): - pass - def test_combine_add(self, data_repeated): # Timestamp.__add__(Timestamp) not defined pass diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 86aed671f1b88..8519c2999ade3 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -81,9 +81,7 @@ class TestNoReduce(base.BaseNoReduceTests): class TestMethods(base.BaseMethodsTests): - @pytest.mark.skip(reason="returns nullable") - def test_value_counts(self, all_data, dropna): - return super().test_value_counts(all_data, dropna) + pass class TestCasting(base.BaseCastingTests): diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index ec8613faaa663..5de918e875bf7 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -443,7 +443,7 @@ def test_with_datetimelikes(self): t = df.T result = t.dtypes.value_counts() - expected = Series({np.dtype("object"): 10}) + expected = Series({np.dtype("object"): 10}, dtype="Int64") tm.assert_series_equal(result, expected) def test_values(self, float_frame): diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index d929d3e030508..17987903edb5a 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -325,7 +325,8 @@ def test_df_flex_cmp_constant_return_types(self, opname): const = 2 result = getattr(df, opname)(const).dtypes.value_counts() - tm.assert_series_equal(result, pd.Series([2], index=[np.dtype(bool)])) + expected = pd.Series([2], index=[np.dtype(bool)], dtype="Int64") + tm.assert_series_equal(result, expected) @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"]) def test_df_flex_cmp_constant_return_types_empty(self, opname): @@ -335,7 +336,8 @@ def test_df_flex_cmp_constant_return_types_empty(self, opname): empty = df.iloc[:0] result = getattr(empty, opname)(const).dtypes.value_counts() - tm.assert_series_equal(result, pd.Series([2], index=[np.dtype(bool)])) + expected = pd.Series([2], index=[np.dtype(bool)], dtype="Int64") + tm.assert_series_equal(result, expected) # ------------------------------------------------------------------- diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index c55b0481c1041..fff3f86797324 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -133,7 +133,7 @@ def test_value_counts_unique(self, tz_naive_fixture): idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)), tz=tz) exp_idx = pd.date_range("2011-01-01 18:00", freq="-1H", periods=10, tz=tz) - expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64") + expected = Series(range(10, 0, -1), index=exp_idx, dtype="Int64") for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) @@ -154,13 +154,13 @@ def test_value_counts_unique(self, tz_naive_fixture): ) exp_idx = DatetimeIndex(["2013-01-01 09:00", "2013-01-01 08:00"], tz=tz) - expected = Series([3, 2], index=exp_idx) + expected = Series([3, 2], index=exp_idx, dtype="Int64") for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) exp_idx = DatetimeIndex(["2013-01-01 09:00", "2013-01-01 08:00", pd.NaT], tz=tz) - expected = Series([3, 2, 1], index=exp_idx) + expected = Series([3, 2, 1], index=exp_idx, dtype="Int64") for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(dropna=False), expected) diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index fc44226f9d72f..495f0c172eb0e 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -47,7 +47,7 @@ def test_value_counts_unique(self): ], freq="H", ) - expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64") + expected = Series(range(10, 0, -1), index=exp_idx, dtype="Int64") for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) @@ -68,13 +68,13 @@ def test_value_counts_unique(self): ) exp_idx = PeriodIndex(["2013-01-01 09:00", "2013-01-01 08:00"], freq="H") - expected = Series([3, 2], index=exp_idx) + expected = Series([3, 2], index=exp_idx, dtype="Int64") for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) exp_idx = PeriodIndex(["2013-01-01 09:00", "2013-01-01 08:00", NaT], freq="H") - expected = Series([3, 2, 1], index=exp_idx) + expected = Series([3, 2, 1], index=exp_idx, dtype="Int64") for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(dropna=False), expected) diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index aa1bf997fc66b..09887931c010a 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -18,7 +18,7 @@ def test_value_counts_unique(self): idx = TimedeltaIndex(np.repeat(idx.values, range(1, len(idx) + 1))) exp_idx = timedelta_range("1 days 18:00:00", freq="-1H", periods=10) - expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64") + expected = Series(range(10, 0, -1), index=exp_idx, dtype="Int64") for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) @@ -38,13 +38,13 @@ def test_value_counts_unique(self): ) exp_idx = TimedeltaIndex(["1 days 09:00:00", "1 days 08:00:00"]) - expected = Series([3, 2], index=exp_idx) + expected = Series([3, 2], index=exp_idx, dtype="Int64") for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) exp_idx = TimedeltaIndex(["1 days 09:00:00", "1 days 08:00:00", pd.NaT]) - expected = Series([3, 2, 1], index=exp_idx) + expected = Series([3, 2, 1], index=exp_idx, dtype="Int64") for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(dropna=False), expected) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 536f4aa760b9c..1b46011e7986c 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -1990,7 +1990,8 @@ def test_table_values_dtypes_roundtrip(self, setup_path): "int64": 1, "object": 1, "datetime64[ns]": 2, - } + }, + dtype="Int64", ) result = result.sort_index() expected = expected.sort_index() diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index c003bfa6a239a..0457cdaf23e44 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -90,7 +90,7 @@ def test_get_dummies_basic_types(self, sparse, dtype): else: dtype_name = self.effective_dtype(dtype).name - expected = Series({dtype_name: 8}) + expected = Series({dtype_name: 8}, dtype="Int64") result = result.dtypes.value_counts() result.index = [str(i) for i in result.index] tm.assert_series_equal(result, expected) @@ -100,7 +100,7 @@ def test_get_dummies_basic_types(self, sparse, dtype): expected_counts = {"int64": 1, "object": 1} expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0) - expected = Series(expected_counts).sort_index() + expected = Series(expected_counts, dtype="Int64").sort_index() result = result.dtypes.value_counts() result.index = [str(i) for i in result.index] result = result.sort_index() diff --git a/pandas/tests/series/methods/test_asfreq.py b/pandas/tests/series/methods/test_asfreq.py index d94b60384a07c..4523cbc4a1b8c 100644 --- a/pandas/tests/series/methods/test_asfreq.py +++ b/pandas/tests/series/methods/test_asfreq.py @@ -64,7 +64,9 @@ def test_asfreq(self): daily_ts = ts.asfreq("D", fill_value=-1) result = daily_ts.value_counts().sort_index() - expected = Series([60, 1, 1, 1], index=[-1.0, 2.0, 1.0, 0.0]).sort_index() + expected = Series( + [60, 1, 1, 1], index=[-1.0, 2.0, 1.0, 0.0], dtype="Int64" + ).sort_index() tm.assert_series_equal(result, expected) def test_asfreq_datetimeindex_empty_series(self): diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py index f97362ce9c2a9..7c40259aebb0d 100644 --- a/pandas/tests/series/methods/test_value_counts.py +++ b/pandas/tests/series/methods/test_value_counts.py @@ -21,7 +21,7 @@ def test_value_counts_datetime(self): exp_idx = pd.DatetimeIndex( ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"] ) - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx", dtype="Int64") ser = pd.Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) @@ -48,7 +48,7 @@ def test_value_counts_datetime_tz(self): ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"], tz="US/Eastern", ) - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx", dtype="Int64") ser = pd.Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) @@ -70,7 +70,7 @@ def test_value_counts_period(self): ] exp_idx = pd.PeriodIndex(["2011-01", "2011-03", "2011-02"], freq="M") - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx", dtype="Int64") ser = pd.Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) @@ -88,7 +88,7 @@ def test_value_counts_categorical_ordered(self): values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=True) exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=True) - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx", dtype="Int64") ser = pd.Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) @@ -105,7 +105,7 @@ def test_value_counts_categorical_not_ordered(self): values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=False) exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=False) - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx", dtype="Int64") ser = pd.Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) @@ -125,20 +125,20 @@ def test_value_counts_categorical(self): res = ser.value_counts(sort=False) exp_index = CategoricalIndex(list("cabd"), categories=cats.categories) - exp = Series([3, 1, 2, 0], name="xxx", index=exp_index) + exp = Series([3, 1, 2, 0], name="xxx", index=exp_index, dtype="Int64") tm.assert_series_equal(res, exp) res = ser.value_counts(sort=True) exp_index = CategoricalIndex(list("cbad"), categories=cats.categories) - exp = Series([3, 2, 1, 0], name="xxx", index=exp_index) + exp = Series([3, 2, 1, 0], name="xxx", index=exp_index, dtype="Int64") tm.assert_series_equal(res, exp) # check object dtype handles the Series.name as the same # (tested in tests/base) ser = Series(["a", "b", "c", "c", "c", "b"], name="xxx") res = ser.value_counts() - exp = Series([3, 2, 1], name="xxx", index=["c", "b", "a"]) + exp = Series([3, 2, 1], name="xxx", index=["c", "b", "a"], dtype="Int64") tm.assert_series_equal(res, exp) def test_value_counts_categorical_with_nan(self): @@ -146,7 +146,7 @@ def test_value_counts_categorical_with_nan(self): # sanity check ser = Series(["a", "b", "a"], dtype="category") - exp = Series([2, 1], index=CategoricalIndex(["a", "b"])) + exp = Series([2, 1], index=CategoricalIndex(["a", "b"]), dtype="Int64") res = ser.value_counts(dropna=True) tm.assert_series_equal(res, exp) @@ -164,18 +164,22 @@ def test_value_counts_categorical_with_nan(self): for ser in series: # None is a NaN value, so we exclude its count here - exp = Series([2, 1], index=CategoricalIndex(["a", "b"])) + exp = Series([2, 1], index=CategoricalIndex(["a", "b"]), dtype="Int64") res = ser.value_counts(dropna=True) tm.assert_series_equal(res, exp) # we don't exclude the count of None and sort by counts - exp = Series([3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"])) + exp = Series( + [3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"]), dtype="Int64" + ) res = ser.value_counts(dropna=False) tm.assert_series_equal(res, exp) # When we aren't sorting by counts, and np.nan isn't a # category, it should be last. - exp = Series([2, 1, 3], index=CategoricalIndex(["a", "b", np.nan])) + exp = Series( + [2, 1, 3], index=CategoricalIndex(["a", "b", np.nan]), dtype="Int64" + ) res = ser.value_counts(dropna=False, sort=False) tm.assert_series_equal(res, exp) @@ -185,17 +189,17 @@ def test_value_counts_categorical_with_nan(self): ( pd.Series([False, True, True, pd.NA]), False, - pd.Series([2, 1, 1], index=[True, False, pd.NA]), + pd.Series([2, 1, 1], index=[True, False, pd.NA], dtype="Int64"), ), ( pd.Series([False, True, True, pd.NA]), True, - pd.Series([2, 1], index=[True, False]), + pd.Series([2, 1], index=[True, False], dtype="Int64"), ), ( pd.Series(range(3), index=[True, False, np.nan]).index, False, - pd.Series([1, 1, 1], index=[True, False, pd.NA]), + pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64"), ), ], ) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index ad7028702ec8c..f3609275b1aae 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -663,7 +663,6 @@ def test_do_not_mangle_na_values(self, unique_nulls_fixture, unique_nulls_fixtur class TestIsin: def test_invalid(self): - msg = ( r"only list-like objects are allowed to be passed to isin\(\), " r"you passed a \[int\]" @@ -676,7 +675,6 @@ def test_invalid(self): algos.isin([1], 1) def test_basic(self): - result = algos.isin([1, 2], [1]) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) @@ -714,7 +712,6 @@ def test_basic(self): tm.assert_numpy_array_equal(result, expected) def test_i8(self): - arr = pd.date_range("20130101", periods=3).values result = algos.isin(arr, [arr[0]]) expected = np.array([True, False, False]) @@ -742,7 +739,6 @@ def test_i8(self): tm.assert_numpy_array_equal(result, expected) def test_large(self): - s = pd.date_range("20000101", periods=2000000, freq="s").values result = algos.isin(s, s[0:2]) expected = np.zeros(len(s), dtype=bool) @@ -886,18 +882,22 @@ def test_value_counts(self): result = algos.value_counts(factor) breaks = [-1.194, -0.535, 0.121, 0.777, 1.433] index = IntervalIndex.from_breaks(breaks).astype(CDT(ordered=True)) - expected = Series([1, 1, 1, 1], index=index) + expected = Series([1, 1, 1, 1], index=index, dtype="Int64") tm.assert_series_equal(result.sort_index(), expected.sort_index()) def test_value_counts_bins(self): s = [1, 2, 3, 4] result = algos.value_counts(s, bins=1) - expected = Series([4], index=IntervalIndex.from_tuples([(0.996, 4.0)])) + expected = Series( + [4], index=IntervalIndex.from_tuples([(0.996, 4.0)]), dtype="Int64" + ) tm.assert_series_equal(result, expected) result = algos.value_counts(s, bins=2, sort=False) expected = Series( - [2, 2], index=IntervalIndex.from_tuples([(0.996, 2.5), (2.5, 4.0)]) + [2, 2], + index=IntervalIndex.from_tuples([(0.996, 2.5), (2.5, 4.0)]), + dtype="Int64", ) tm.assert_series_equal(result, expected) @@ -925,7 +925,7 @@ def test_value_counts_nat(self): assert len(vc) == 1 assert len(vc_with_na) == 2 - exp_dt = Series({Timestamp("2014-01-01 00:00:00"): 1}) + exp_dt = Series({Timestamp("2014-01-01 00:00:00"): 1}, dtype="Int64") tm.assert_series_equal(algos.value_counts(dt), exp_dt) # TODO same for (timedelta) @@ -947,7 +947,7 @@ def test_value_counts_datetime_outofbounds(self): [datetime(3000, 1, 1), datetime(5000, 1, 1), datetime(6000, 1, 1)], dtype=object, ) - exp = Series([3, 2, 1], index=exp_index) + exp = Series([3, 2, 1], index=exp_index, dtype="Int64") tm.assert_series_equal(res, exp) # GH 12424 @@ -958,7 +958,9 @@ def test_value_counts_datetime_outofbounds(self): def test_categorical(self): s = Series(Categorical(list("aaabbc"))) result = s.value_counts() - expected = Series([3, 2, 1], index=CategoricalIndex(["a", "b", "c"])) + expected = Series( + [3, 2, 1], index=CategoricalIndex(["a", "b", "c"]), dtype="Int64" + ) tm.assert_series_equal(result, expected, check_index_type=True) @@ -975,10 +977,13 @@ def test_categorical_nans(self): expected = Series( [4, 3, 2], index=CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c"]), + dtype="Int64", ) tm.assert_series_equal(result, expected, check_index_type=True) result = s.value_counts(dropna=False) - expected = Series([4, 3, 2, 1], index=CategoricalIndex(["a", "b", "c", np.nan])) + expected = Series( + [4, 3, 2, 1], index=CategoricalIndex(["a", "b", "c", np.nan]), dtype="Int64" + ) tm.assert_series_equal(result, expected, check_index_type=True) # out of order @@ -992,6 +997,7 @@ def test_categorical_nans(self): index=CategoricalIndex( ["a", "b", "c"], categories=["b", "a", "c"], ordered=True ), + dtype="Int64", ) tm.assert_series_equal(result, expected, check_index_type=True) @@ -1001,6 +1007,7 @@ def test_categorical_nans(self): index=CategoricalIndex( ["a", "b", "c", np.nan], categories=["b", "a", "c"], ordered=True ), + dtype="Int64", ) tm.assert_series_equal(result, expected, check_index_type=True) @@ -1013,6 +1020,7 @@ def test_categorical_zeroes(self): index=Categorical( ["b", "a", "c", "d"], categories=list("abcd"), ordered=True ), + dtype="Int64", ) tm.assert_series_equal(result, expected, check_index_type=True) @@ -1021,39 +1029,39 @@ def test_dropna(self): tm.assert_series_equal( Series([True, True, False]).value_counts(dropna=True), - Series([2, 1], index=[True, False]), + Series([2, 1], index=[True, False], dtype="Int64"), ) tm.assert_series_equal( Series([True, True, False]).value_counts(dropna=False), - Series([2, 1], index=[True, False]), + Series([2, 1], index=[True, False], dtype="Int64"), ) tm.assert_series_equal( Series([True, True, False, None]).value_counts(dropna=True), - Series([2, 1], index=[True, False]), + Series([2, 1], index=[True, False], dtype="Int64"), ) tm.assert_series_equal( Series([True, True, False, None]).value_counts(dropna=False), - Series([2, 1, 1], index=[True, False, np.nan]), + Series([2, 1, 1], index=[True, False, np.nan], dtype="Int64"), ) tm.assert_series_equal( Series([10.3, 5.0, 5.0]).value_counts(dropna=True), - Series([2, 1], index=[5.0, 10.3]), + Series([2, 1], index=[5.0, 10.3], dtype="Int64"), ) tm.assert_series_equal( Series([10.3, 5.0, 5.0]).value_counts(dropna=False), - Series([2, 1], index=[5.0, 10.3]), + Series([2, 1], index=[5.0, 10.3], dtype="Int64"), ) tm.assert_series_equal( Series([10.3, 5.0, 5.0, None]).value_counts(dropna=True), - Series([2, 1], index=[5.0, 10.3]), + Series([2, 1], index=[5.0, 10.3], dtype="Int64"), ) # 32-bit linux has a different ordering if not compat.is_platform_32bit(): result = Series([10.3, 5.0, 5.0, None]).value_counts(dropna=False) - expected = Series([2, 1, 1], index=[5.0, 10.3, np.nan]) + expected = Series([2, 1, 1], index=[5.0, 10.3, np.nan], dtype="Int64") tm.assert_series_equal(result, expected) def test_value_counts_normalized(self): @@ -1074,13 +1082,13 @@ def test_value_counts_normalized(self): def test_value_counts_uint64(self): arr = np.array([2 ** 63], dtype=np.uint64) - expected = Series([1], index=[2 ** 63]) + expected = Series([1], index=[2 ** 63], dtype="Int64") result = algos.value_counts(arr) tm.assert_series_equal(result, expected) arr = np.array([-1, 2 ** 63], dtype=object) - expected = Series([1, 1], index=[-1, 2 ** 63]) + expected = Series([1, 1], index=[-1, 2 ** 63], dtype="Int64") result = algos.value_counts(arr) # 32-bit linux has a different ordering @@ -1400,7 +1408,6 @@ class TestGroupVarFloat64(GroupVarTestMixin): rtol = 1e-5 def test_group_var_large_inputs(self): - prng = RandomState(1234) out = np.array([[np.nan]], dtype=self.dtype) @@ -1632,7 +1639,6 @@ def test_quantile(): def test_unique_label_indices(): - a = np.random.randint(1, 1 << 10, 1 << 15).astype("i8") left = ht.unique_label_indices(a) @@ -1698,7 +1704,6 @@ def test_pct_max_many_rows(self, values): def test_pad_backfill_object_segfault(): - old = np.array([], dtype="O") new = np.array([datetime(2010, 12, 31)], dtype="O") diff --git a/web/pandas_web.py b/web/pandas_web.py index e62deaa8cdc7f..7dd63175e69ac 100755 --- a/web/pandas_web.py +++ b/web/pandas_web.py @@ -34,13 +34,12 @@ import time import typing +import feedparser import jinja2 +import markdown import requests import yaml -import feedparser -import markdown - class Preprocessors: """ From af971263f4c5e30f00cb5476f2aad571efb51d9e Mon Sep 17 00:00:00 2001 From: Matsuoka Kota Date: Thu, 23 Apr 2020 00:06:29 +0900 Subject: [PATCH 04/13] Update test --- pandas/tests/extension/base/methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index cad4d3f1cf4fd..9a9eae43750c1 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -17,7 +17,7 @@ class BaseMethodsTests(BaseExtensionTests): @pytest.mark.parametrize("dropna", [True, False]) def test_value_counts(self, all_data, dropna): - all_data = np.unique(all_data[:5]) + all_data = all_data[:10].unique() if dropna: other = np.array(all_data[~all_data.isna()]) else: From cbabcad9ac83ac54023a3c04f60821ef3e7c0840 Mon Sep 17 00:00:00 2001 From: Matsuoka Kota Date: Fri, 24 Apr 2020 12:56:39 +0900 Subject: [PATCH 05/13] Resolve warning --- pandas/tests/arrays/integer/test_function.py | 2 +- pandas/tests/base/test_value_counts.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index 93800d724fb10..a9324e9b8f318 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -105,7 +105,7 @@ def test_value_counts_na(): def test_value_counts_empty(): # https://github.com/pandas-dev/pandas/issues/33317 - s = pd.Series([]) + s = pd.Series([], dtype="float64") result = s.value_counts() # TODO: The dtype of the index seems wrong (it's int64 for non-empty) idx = pd.Float64Index([], dtype="float64") diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index f96df86027d28..d1559c655168e 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -145,7 +145,7 @@ def test_value_counts_bins(index_or_series): if isinstance(s1, Index): tm.assert_index_equal(s1.unique(), Index([1, 2, 3])) else: - exp = np.array([1, 2, 3], dtype="Int64") + exp = np.array([1, 2, 3]) tm.assert_numpy_array_equal(s1.unique(), exp) assert s1.nunique() == 3 @@ -170,7 +170,7 @@ def test_value_counts_bins(index_or_series): # handle NA's properly s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"] s = klass(s_values) - expected = Series([4, 3, 2], index=["b", "a", "d"], dtype="Int64") + expected = Series(data=[4, 3, 2], index=["b", "a", "d"], dtype="Int64") tm.assert_series_equal(s.value_counts(), expected) if isinstance(s, Index): From dac1af541337f33db6a04592653ea53c9b23ae53 Mon Sep 17 00:00:00 2001 From: Matsuoka Kota Date: Fri, 24 Apr 2020 23:46:21 +0900 Subject: [PATCH 06/13] Pass test in Windows py36_np15 --- pandas/core/algorithms.py | 2 +- pandas/tests/base/test_value_counts.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 20f48dfcf62cd..f1308924264e5 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -718,7 +718,7 @@ def value_counts( else: keys, counts = _value_counts_arraylike(values, dropna) - result = Series(counts, index=keys, name=name) + result = Series(counts, index=keys, name=name, dtype="Int64") if sort: result = result.sort_values(ascending=ascending) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index d1559c655168e..8075ae9a8775b 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -145,7 +145,7 @@ def test_value_counts_bins(index_or_series): if isinstance(s1, Index): tm.assert_index_equal(s1.unique(), Index([1, 2, 3])) else: - exp = np.array([1, 2, 3]) + exp = np.array([1, 2, 3], dtype="int64") tm.assert_numpy_array_equal(s1.unique(), exp) assert s1.nunique() == 3 From 0be33fce19dd4bfcdcddea18f3b99459d30e3be9 Mon Sep 17 00:00:00 2001 From: Matsuoka Kota Date: Sat, 25 Apr 2020 13:36:43 +0900 Subject: [PATCH 07/13] Change Int32 to Int64 --- pandas/core/arrays/masked.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 2203a4e389774..f1d9a50550d3d 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -273,4 +273,4 @@ def value_counts(self, dropna: bool = True) -> "Series": mask = np.zeros(len(counts), dtype="bool") counts = IntegerArray(counts, mask) - return Series(counts, index=index) + return Series(counts, index=index, dtype="Int64") From d5eec7627e33ccd922c247f5b17c82af713f8c56 Mon Sep 17 00:00:00 2001 From: Matsuoka Kota Date: Sat, 25 Apr 2020 18:30:17 +0900 Subject: [PATCH 08/13] Fix docs --- pandas/core/base.py | 6 +++--- pandas/tests/extension/base/methods.py | 2 +- web/pandas_web.py | 5 +++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index ee514888c6331..9281869d9f2f8 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1206,7 +1206,7 @@ def value_counts( 4.0 1 2.0 1 1.0 1 - dtype: int64 + dtype: Int64 With `normalize` set to `True`, returns the relative frequency by dividing all values by the sum of values. @@ -1230,7 +1230,7 @@ def value_counts( (2.0, 3.0] 2 (0.996, 2.0] 2 (3.0, 4.0] 1 - dtype: int64 + dtype: Int64 **dropna** @@ -1242,7 +1242,7 @@ def value_counts( 4.0 1 2.0 1 1.0 1 - dtype: int64 + dtype: Int64 """ result = value_counts( self, diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 9a9eae43750c1..8e038d02360e8 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -35,7 +35,7 @@ def test_value_counts(self, all_data, dropna): ) expected = pd.Series([1 / len(other)] * len(other), index=result.index) - tm.assert_series_equal(expected, result) + self.assert_series_equal(expected, result) def test_count(self, data_missing): df = pd.DataFrame({"A": data_missing}) diff --git a/web/pandas_web.py b/web/pandas_web.py index 7dd63175e69ac..e62deaa8cdc7f 100755 --- a/web/pandas_web.py +++ b/web/pandas_web.py @@ -34,12 +34,13 @@ import time import typing -import feedparser import jinja2 -import markdown import requests import yaml +import feedparser +import markdown + class Preprocessors: """ From db8ea2eec954f2c19a87d7eef503e1f5d7c650f8 Mon Sep 17 00:00:00 2001 From: Matsuoka Kota Date: Sat, 25 Apr 2020 19:07:25 +0900 Subject: [PATCH 09/13] Remove product code update --- pandas/core/algorithms.py | 4 +--- pandas/core/groupby/generic.py | 20 ++++++++------------ pandas/tests/extension/base/methods.py | 2 +- 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6a2b41f6ed7fc..046f6fed3e1dc 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -712,14 +712,12 @@ def value_counts( else: keys, counts = _value_counts_arraylike(values, dropna) - result = Series(counts, index=keys, name=name, dtype="Int64") + result = Series(counts, index=keys, name=name) if sort: result = result.sort_values(ascending=ascending) if normalize: - if not isinstance(counts, np.ndarray): - counts = counts.to_numpy() result = result / float(counts.sum()) else: result = result.astype("Int64") diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2ab81406dd331..5f520c8fc67f8 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -757,15 +757,12 @@ def value_counts( mi = MultiIndex( levels=levels, codes=codes, names=names, verify_integrity=False ) + dtype = "float64" if is_integer_dtype(out): - return Series( - ensure_int64(out), - index=mi, - name=self._selection_name, - dtype="Int64", - ) - return Series(out, index=mi, name=self._selection_name) + out = ensure_int64(out) + dtype = "Int64" + return Series(out, index=mi, name=self._selection_name, dtype=dtype) # for compat. with libgroupby.value_counts need to ensure every # bin is present at every index level, null filled with zeros @@ -794,13 +791,12 @@ def build_codes(lev_codes: np.ndarray) -> np.ndarray: codes.append(left[-1]) mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False) + dtype = "float64" if is_integer_dtype(out): - return Series( - ensure_int64(out), index=mi, name=self._selection_name, dtype="Int64" - ) - - return Series(out, index=mi, name=self._selection_name) + out = ensure_int64(out) + dtype = "Int64" + return Series(out, index=mi, name=self._selection_name, dtype=dtype) def count(self) -> Series: """ diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 8e038d02360e8..b557509c05afa 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -35,7 +35,7 @@ def test_value_counts(self, all_data, dropna): ) expected = pd.Series([1 / len(other)] * len(other), index=result.index) - self.assert_series_equal(expected, result) + self.assert_series_equal(result, expected) def test_count(self, data_missing): df = pd.DataFrame({"A": data_missing}) From 243d3ece01e28c0d07f34c47c926b85f06403fb3 Mon Sep 17 00:00:00 2001 From: Matsuoka Kota Date: Sun, 26 Apr 2020 12:28:22 +0900 Subject: [PATCH 10/13] Remove unnecessary changes --- pandas/core/algorithms.py | 2 - pandas/core/arrays/masked.py | 4 +- pandas/core/arrays/string_.py | 2 +- pandas/core/base.py | 6 +-- pandas/core/groupby/generic.py | 8 +-- pandas/tests/arrays/integer/test_function.py | 6 +-- pandas/tests/arrays/string_/test_string.py | 4 +- pandas/tests/arrays/test_datetimes.py | 2 +- pandas/tests/base/test_value_counts.py | 38 +++++++------ pandas/tests/extension/test_datetime.py | 4 ++ pandas/tests/frame/test_api.py | 2 +- pandas/tests/frame/test_arithmetic.py | 6 +-- pandas/tests/indexes/datetimes/test_ops.py | 7 +-- pandas/tests/indexes/period/test_ops.py | 6 +-- pandas/tests/indexes/timedeltas/test_ops.py | 6 +-- pandas/tests/io/pytables/test_store.py | 3 +- pandas/tests/reshape/test_get_dummies.py | 4 +- pandas/tests/series/methods/test_asfreq.py | 4 +- .../tests/series/methods/test_value_counts.py | 34 ++++++------ pandas/tests/test_algos.py | 53 +++++++++---------- 20 files changed, 92 insertions(+), 109 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 046f6fed3e1dc..e6967630b97ac 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -719,8 +719,6 @@ def value_counts( if normalize: result = result / float(counts.sum()) - else: - result = result.astype("Int64") return result diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index f1d9a50550d3d..fc5b307bd5754 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -259,7 +259,7 @@ def value_counts(self, dropna: bool = True) -> "Series": # if we want nans, count the mask if dropna: - counts = np.array(value_counts._values, dtype=int) + counts = value_counts._values else: counts = np.empty(len(value_counts) + 1, dtype="int64") counts[:-1] = value_counts @@ -273,4 +273,4 @@ def value_counts(self, dropna: bool = True) -> "Series": mask = np.zeros(len(counts), dtype="bool") counts = IntegerArray(counts, mask) - return Series(counts, index=index, dtype="Int64") + return Series(counts, index=index) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 51bbe182a002b..6b6f5935ea66a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -290,7 +290,7 @@ def _reduce(self, name, skipna=True, **kwargs): def value_counts(self, dropna=False): from pandas import value_counts - return value_counts(self._ndarray, dropna=dropna).astype("Int64") + return value_counts(self._ndarray, dropna=dropna) # Override parent because we have different return types. @classmethod diff --git a/pandas/core/base.py b/pandas/core/base.py index 9281869d9f2f8..ee514888c6331 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1206,7 +1206,7 @@ def value_counts( 4.0 1 2.0 1 1.0 1 - dtype: Int64 + dtype: int64 With `normalize` set to `True`, returns the relative frequency by dividing all values by the sum of values. @@ -1230,7 +1230,7 @@ def value_counts( (2.0, 3.0] 2 (0.996, 2.0] 2 (3.0, 4.0] 1 - dtype: Int64 + dtype: int64 **dropna** @@ -1242,7 +1242,7 @@ def value_counts( 4.0 1 2.0 1 1.0 1 - dtype: Int64 + dtype: int64 """ result = value_counts( self, diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 5f520c8fc67f8..504de404b2509 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -757,12 +757,10 @@ def value_counts( mi = MultiIndex( levels=levels, codes=codes, names=names, verify_integrity=False ) - dtype = "float64" if is_integer_dtype(out): out = ensure_int64(out) - dtype = "Int64" - return Series(out, index=mi, name=self._selection_name, dtype=dtype) + return Series(out, index=mi, name=self._selection_name) # for compat. with libgroupby.value_counts need to ensure every # bin is present at every index level, null filled with zeros @@ -791,12 +789,10 @@ def build_codes(lev_codes: np.ndarray) -> np.ndarray: codes.append(left[-1]) mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False) - dtype = "float64" if is_integer_dtype(out): out = ensure_int64(out) - dtype = "Int64" - return Series(out, index=mi, name=self._selection_name, dtype=dtype) + return Series(out, index=mi, name=self._selection_name) def count(self) -> Series: """ diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index b2312083f998d..44c3077228e80 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -93,7 +93,7 @@ def test_stat_method(pandasmethname, kwargs): def test_value_counts_na(): - arr = pd.array([1, 2, 1, pd.NA]) + arr = pd.array([1, 2, 1, pd.NA], dtype="Int64") result = arr.value_counts(dropna=False) expected = pd.Series([2, 1, 1], index=[1, 2, pd.NA], dtype="Int64") tm.assert_series_equal(result, expected) @@ -105,10 +105,10 @@ def test_value_counts_na(): def test_value_counts_empty(): # https://github.com/pandas-dev/pandas/issues/33317 - s = pd.Series([], dtype="float64") + s = pd.Series([], dtype="Int64") result = s.value_counts() # TODO: The dtype of the index seems wrong (it's int64 for non-empty) - idx = pd.Float64Index([], dtype="float64") + idx = pd.Index([], dtype="object") expected = pd.Series([], index=idx, dtype="Int64") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index eb89798a1ad96..a3e6f4e62918f 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -297,9 +297,9 @@ def test_arrow_roundtrip(): def test_value_counts_na(): arr = pd.array(["a", "b", "a", pd.NA], dtype="string") result = arr.value_counts(dropna=False) - expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64") + expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA]) tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) - expected = pd.Series([2, 1], index=["a", "b"], dtype="Int64") + expected = pd.Series([2, 1], index=["a", "b"]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 7a26952379c3a..7d80ad3d8c6be 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -248,7 +248,7 @@ def test_value_counts_preserves_tz(self): arr[-2] = pd.NaT result = arr.value_counts() - expected = pd.Series([1, 4, 2], index=[pd.NaT, dti[0], dti[1]], dtype="Int64") + expected = pd.Series([1, 4, 2], index=[pd.NaT, dti[0], dti[1]]) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("method", ["pad", "backfill"]) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 8075ae9a8775b..d45feaff68dde 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -30,7 +30,7 @@ def test_value_counts(index_or_series_obj): result = obj.value_counts() counter = collections.Counter(obj) - expected = pd.Series(dict(counter.most_common()), dtype="Int64", name=obj.name) + expected = pd.Series(dict(counter.most_common()), dtype=np.int64, name=obj.name) expected.index = expected.index.astype(obj.dtype) if isinstance(obj, pd.MultiIndex): expected.index = pd.Index(expected.index) @@ -67,7 +67,7 @@ def test_value_counts_null(null_obj, index_or_series_obj): # because np.nan == np.nan is False, but None == None is True # np.nan would be duplicated, whereas None wouldn't counter = collections.Counter(obj.dropna()) - expected = pd.Series(dict(counter.most_common()), dtype="Int64") + expected = pd.Series(dict(counter.most_common()), dtype=np.int64) expected.index = expected.index.astype(obj.dtype) result = obj.value_counts() @@ -80,7 +80,7 @@ def test_value_counts_null(null_obj, index_or_series_obj): # can't use expected[null_obj] = 3 as # IntervalIndex doesn't allow assignment - new_entry = pd.Series({np.nan: 3}, dtype="Int64") + new_entry = pd.Series({np.nan: 3}, dtype=np.int64) expected = expected.append(new_entry) result = obj.value_counts(dropna=False) @@ -96,7 +96,7 @@ def test_value_counts_inferred(index_or_series): klass = index_or_series s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] s = klass(s_values) - expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"], dtype="Int64") + expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"]) tm.assert_series_equal(s.value_counts(), expected) if isinstance(s, Index): @@ -110,17 +110,17 @@ def test_value_counts_inferred(index_or_series): # don't sort, have to sort after the fact as not sorting is # platform-dep hist = s.value_counts(sort=False).sort_values() - expected = Series([3, 1, 4, 2], index=list("acbd"), dtype="Int64").sort_values() + expected = Series([3, 1, 4, 2], index=list("acbd")).sort_values() tm.assert_series_equal(hist, expected) # sort ascending hist = s.value_counts(ascending=True) - expected = Series([1, 2, 3, 4], index=list("cdab"), dtype="Int64") + expected = Series([1, 2, 3, 4], index=list("cdab")) tm.assert_series_equal(hist, expected) # relative histogram. hist = s.value_counts(normalize=True) - expected = Series([0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"], dtype="float64") + expected = Series([0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"]) tm.assert_series_equal(hist, expected) @@ -136,16 +136,16 @@ def test_value_counts_bins(index_or_series): s1 = Series([1, 1, 2, 3]) res1 = s1.value_counts(bins=1) - exp1 = Series({Interval(0.997, 3.0): 4}, dtype="Int64") + exp1 = Series({Interval(0.997, 3.0): 4}) tm.assert_series_equal(res1, exp1) res1n = s1.value_counts(bins=1, normalize=True) - exp1n = Series({Interval(0.997, 3.0): 1.0}, dtype="float64") + exp1n = Series({Interval(0.997, 3.0): 1.0}) tm.assert_series_equal(res1n, exp1n) if isinstance(s1, Index): tm.assert_index_equal(s1.unique(), Index([1, 2, 3])) else: - exp = np.array([1, 2, 3], dtype="int64") + exp = np.array([1, 2, 3], dtype=np.int64) tm.assert_numpy_array_equal(s1.unique(), exp) assert s1.nunique() == 3 @@ -153,24 +153,22 @@ def test_value_counts_bins(index_or_series): # these return the same res4 = s1.value_counts(bins=4, dropna=True) intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) - exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]), dtype="Int64") + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) tm.assert_series_equal(res4, exp4) res4 = s1.value_counts(bins=4, dropna=False) intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) - exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]), dtype="Int64") + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) tm.assert_series_equal(res4, exp4) res4n = s1.value_counts(bins=4, normalize=True) - exp4n = Series( - [0.5, 0.25, 0.25, 0], index=intervals.take([0, 3, 1, 2]), dtype="float64" - ) + exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 3, 1, 2])) tm.assert_series_equal(res4n, exp4n) # handle NA's properly s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"] s = klass(s_values) - expected = Series(data=[4, 3, 2], index=["b", "a", "d"], dtype="Int64") + expected = Series([4, 3, 2], index=["b", "a", "d"]) tm.assert_series_equal(s.value_counts(), expected) if isinstance(s, Index): @@ -182,7 +180,7 @@ def test_value_counts_bins(index_or_series): assert s.nunique() == 3 s = klass({}) if klass is dict else klass({}, dtype=object) - expected = Series([], dtype="Int64") + expected = Series([], dtype=np.int64) tm.assert_series_equal(s.value_counts(), expected, check_index_type=False) # returned dtype differs depending on original if isinstance(s, Index): @@ -218,7 +216,7 @@ def test_value_counts_datetime64(index_or_series): idx = pd.to_datetime( ["2010-01-01 00:00:00", "2008-09-09 00:00:00", "2009-01-01 00:00:00"] ) - expected_s = Series([3, 2, 1], index=idx, dtype="Int64") + expected_s = Series([3, 2, 1], index=idx) tm.assert_series_equal(s.value_counts(), expected_s) expected = np_array_datetime64_compat( @@ -242,7 +240,7 @@ def test_value_counts_datetime64(index_or_series): result = s.value_counts(dropna=False) expected_s[pd.NaT] = 1 - tm.assert_series_equal(result, expected_s.astype("Int64")) + tm.assert_series_equal(result, expected_s) unique = s.unique() assert unique.dtype == "datetime64[ns]" @@ -263,7 +261,7 @@ def test_value_counts_datetime64(index_or_series): td = klass(td, name="dt") result = td.value_counts() - expected_s = Series([6], index=[Timedelta("1day")], name="dt", dtype="Int64") + expected_s = Series([6], index=[Timedelta("1day")], name="dt") tm.assert_series_equal(result, expected_s) expected = TimedeltaIndex(["1 days"], name="dt") diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index 5a4cb1f2ec12c..3aa188098620d 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -90,6 +90,10 @@ class TestGetitem(BaseDatetimeTests, base.BaseGetitemTests): class TestMethods(BaseDatetimeTests, base.BaseMethodsTests): + @pytest.mark.skip(reason="Incorrect expected") + def test_value_counts(self, all_data, dropna): + pass + def test_combine_add(self, data_repeated): # Timestamp.__add__(Timestamp) not defined pass diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index a0a8e6e2f53a4..5cf74d3205a13 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -444,7 +444,7 @@ def test_with_datetimelikes(self): t = df.T result = t.dtypes.value_counts() - expected = Series({np.dtype("object"): 10}, dtype="Int64") + expected = Series({np.dtype("object"): 10}) tm.assert_series_equal(result, expected) def test_values(self, float_frame): diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 17987903edb5a..d929d3e030508 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -325,8 +325,7 @@ def test_df_flex_cmp_constant_return_types(self, opname): const = 2 result = getattr(df, opname)(const).dtypes.value_counts() - expected = pd.Series([2], index=[np.dtype(bool)], dtype="Int64") - tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, pd.Series([2], index=[np.dtype(bool)])) @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"]) def test_df_flex_cmp_constant_return_types_empty(self, opname): @@ -336,8 +335,7 @@ def test_df_flex_cmp_constant_return_types_empty(self, opname): empty = df.iloc[:0] result = getattr(empty, opname)(const).dtypes.value_counts() - expected = pd.Series([2], index=[np.dtype(bool)], dtype="Int64") - tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, pd.Series([2], index=[np.dtype(bool)])) # ------------------------------------------------------------------- diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 6b4b9a4421e0b..f0fe5e9b293fc 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -133,7 +133,8 @@ def test_value_counts_unique(self, tz_naive_fixture): idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)), tz=tz) exp_idx = pd.date_range("2011-01-01 18:00", freq="-1H", periods=10, tz=tz) - expected = Series(range(10, 0, -1), index=exp_idx, dtype="Int64") + expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64") + expected.index._set_freq(None) for obj in [idx, Series(idx)]: @@ -156,13 +157,13 @@ def test_value_counts_unique(self, tz_naive_fixture): ) exp_idx = DatetimeIndex(["2013-01-01 09:00", "2013-01-01 08:00"], tz=tz) - expected = Series([3, 2], index=exp_idx, dtype="Int64") + expected = Series([3, 2], index=exp_idx) for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) exp_idx = DatetimeIndex(["2013-01-01 09:00", "2013-01-01 08:00", pd.NaT], tz=tz) - expected = Series([3, 2, 1], index=exp_idx, dtype="Int64") + expected = Series([3, 2, 1], index=exp_idx) for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(dropna=False), expected) diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 495f0c172eb0e..fc44226f9d72f 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -47,7 +47,7 @@ def test_value_counts_unique(self): ], freq="H", ) - expected = Series(range(10, 0, -1), index=exp_idx, dtype="Int64") + expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64") for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) @@ -68,13 +68,13 @@ def test_value_counts_unique(self): ) exp_idx = PeriodIndex(["2013-01-01 09:00", "2013-01-01 08:00"], freq="H") - expected = Series([3, 2], index=exp_idx, dtype="Int64") + expected = Series([3, 2], index=exp_idx) for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) exp_idx = PeriodIndex(["2013-01-01 09:00", "2013-01-01 08:00", NaT], freq="H") - expected = Series([3, 2, 1], index=exp_idx, dtype="Int64") + expected = Series([3, 2, 1], index=exp_idx) for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(dropna=False), expected) diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index ab62c625b29b8..0e5abe2f5ccd1 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -18,7 +18,7 @@ def test_value_counts_unique(self): idx = TimedeltaIndex(np.repeat(idx.values, range(1, len(idx) + 1))) exp_idx = timedelta_range("1 days 18:00:00", freq="-1H", periods=10) - expected = Series(range(10, 0, -1), index=exp_idx, dtype="Int64") + expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64") for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) @@ -38,13 +38,13 @@ def test_value_counts_unique(self): ) exp_idx = TimedeltaIndex(["1 days 09:00:00", "1 days 08:00:00"]) - expected = Series([3, 2], index=exp_idx, dtype="Int64") + expected = Series([3, 2], index=exp_idx) for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) exp_idx = TimedeltaIndex(["1 days 09:00:00", "1 days 08:00:00", pd.NaT]) - expected = Series([3, 2, 1], index=exp_idx, dtype="Int64") + expected = Series([3, 2, 1], index=exp_idx) for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(dropna=False), expected) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 44bc537d840dc..299ae2f41d676 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -1992,8 +1992,7 @@ def test_table_values_dtypes_roundtrip(self, setup_path): "int64": 1, "object": 1, "datetime64[ns]": 2, - }, - dtype="Int64", + } ) result = result.sort_index() expected = expected.sort_index() diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 0457cdaf23e44..c003bfa6a239a 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -90,7 +90,7 @@ def test_get_dummies_basic_types(self, sparse, dtype): else: dtype_name = self.effective_dtype(dtype).name - expected = Series({dtype_name: 8}, dtype="Int64") + expected = Series({dtype_name: 8}) result = result.dtypes.value_counts() result.index = [str(i) for i in result.index] tm.assert_series_equal(result, expected) @@ -100,7 +100,7 @@ def test_get_dummies_basic_types(self, sparse, dtype): expected_counts = {"int64": 1, "object": 1} expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0) - expected = Series(expected_counts, dtype="Int64").sort_index() + expected = Series(expected_counts).sort_index() result = result.dtypes.value_counts() result.index = [str(i) for i in result.index] result = result.sort_index() diff --git a/pandas/tests/series/methods/test_asfreq.py b/pandas/tests/series/methods/test_asfreq.py index 4e34223f4c4c0..cd61c510c75f5 100644 --- a/pandas/tests/series/methods/test_asfreq.py +++ b/pandas/tests/series/methods/test_asfreq.py @@ -67,9 +67,7 @@ def test_asfreq(self): daily_ts = ts.asfreq("D", fill_value=-1) result = daily_ts.value_counts().sort_index() - expected = Series( - [60, 1, 1, 1], index=[-1.0, 2.0, 1.0, 0.0], dtype="Int64" - ).sort_index() + expected = Series([60, 1, 1, 1], index=[-1.0, 2.0, 1.0, 0.0]).sort_index() tm.assert_series_equal(result, expected) def test_asfreq_datetimeindex_empty_series(self): diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py index 7c40259aebb0d..f97362ce9c2a9 100644 --- a/pandas/tests/series/methods/test_value_counts.py +++ b/pandas/tests/series/methods/test_value_counts.py @@ -21,7 +21,7 @@ def test_value_counts_datetime(self): exp_idx = pd.DatetimeIndex( ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"] ) - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx", dtype="Int64") + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") ser = pd.Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) @@ -48,7 +48,7 @@ def test_value_counts_datetime_tz(self): ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"], tz="US/Eastern", ) - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx", dtype="Int64") + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") ser = pd.Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) @@ -70,7 +70,7 @@ def test_value_counts_period(self): ] exp_idx = pd.PeriodIndex(["2011-01", "2011-03", "2011-02"], freq="M") - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx", dtype="Int64") + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") ser = pd.Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) @@ -88,7 +88,7 @@ def test_value_counts_categorical_ordered(self): values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=True) exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=True) - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx", dtype="Int64") + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") ser = pd.Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) @@ -105,7 +105,7 @@ def test_value_counts_categorical_not_ordered(self): values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=False) exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=False) - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx", dtype="Int64") + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") ser = pd.Series(values, name="xxx") tm.assert_series_equal(ser.value_counts(), exp) @@ -125,20 +125,20 @@ def test_value_counts_categorical(self): res = ser.value_counts(sort=False) exp_index = CategoricalIndex(list("cabd"), categories=cats.categories) - exp = Series([3, 1, 2, 0], name="xxx", index=exp_index, dtype="Int64") + exp = Series([3, 1, 2, 0], name="xxx", index=exp_index) tm.assert_series_equal(res, exp) res = ser.value_counts(sort=True) exp_index = CategoricalIndex(list("cbad"), categories=cats.categories) - exp = Series([3, 2, 1, 0], name="xxx", index=exp_index, dtype="Int64") + exp = Series([3, 2, 1, 0], name="xxx", index=exp_index) tm.assert_series_equal(res, exp) # check object dtype handles the Series.name as the same # (tested in tests/base) ser = Series(["a", "b", "c", "c", "c", "b"], name="xxx") res = ser.value_counts() - exp = Series([3, 2, 1], name="xxx", index=["c", "b", "a"], dtype="Int64") + exp = Series([3, 2, 1], name="xxx", index=["c", "b", "a"]) tm.assert_series_equal(res, exp) def test_value_counts_categorical_with_nan(self): @@ -146,7 +146,7 @@ def test_value_counts_categorical_with_nan(self): # sanity check ser = Series(["a", "b", "a"], dtype="category") - exp = Series([2, 1], index=CategoricalIndex(["a", "b"]), dtype="Int64") + exp = Series([2, 1], index=CategoricalIndex(["a", "b"])) res = ser.value_counts(dropna=True) tm.assert_series_equal(res, exp) @@ -164,22 +164,18 @@ def test_value_counts_categorical_with_nan(self): for ser in series: # None is a NaN value, so we exclude its count here - exp = Series([2, 1], index=CategoricalIndex(["a", "b"]), dtype="Int64") + exp = Series([2, 1], index=CategoricalIndex(["a", "b"])) res = ser.value_counts(dropna=True) tm.assert_series_equal(res, exp) # we don't exclude the count of None and sort by counts - exp = Series( - [3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"]), dtype="Int64" - ) + exp = Series([3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"])) res = ser.value_counts(dropna=False) tm.assert_series_equal(res, exp) # When we aren't sorting by counts, and np.nan isn't a # category, it should be last. - exp = Series( - [2, 1, 3], index=CategoricalIndex(["a", "b", np.nan]), dtype="Int64" - ) + exp = Series([2, 1, 3], index=CategoricalIndex(["a", "b", np.nan])) res = ser.value_counts(dropna=False, sort=False) tm.assert_series_equal(res, exp) @@ -189,17 +185,17 @@ def test_value_counts_categorical_with_nan(self): ( pd.Series([False, True, True, pd.NA]), False, - pd.Series([2, 1, 1], index=[True, False, pd.NA], dtype="Int64"), + pd.Series([2, 1, 1], index=[True, False, pd.NA]), ), ( pd.Series([False, True, True, pd.NA]), True, - pd.Series([2, 1], index=[True, False], dtype="Int64"), + pd.Series([2, 1], index=[True, False]), ), ( pd.Series(range(3), index=[True, False, np.nan]).index, False, - pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64"), + pd.Series([1, 1, 1], index=[True, False, pd.NA]), ), ], ) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 60ac43addb4d6..5f904241da485 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -662,6 +662,7 @@ def test_do_not_mangle_na_values(self, unique_nulls_fixture, unique_nulls_fixtur class TestIsin: def test_invalid(self): + msg = ( r"only list-like objects are allowed to be passed to isin\(\), " r"you passed a \[int\]" @@ -674,6 +675,7 @@ def test_invalid(self): algos.isin([1], 1) def test_basic(self): + result = algos.isin([1, 2], [1]) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) @@ -711,6 +713,7 @@ def test_basic(self): tm.assert_numpy_array_equal(result, expected) def test_i8(self): + arr = pd.date_range("20130101", periods=3).values result = algos.isin(arr, [arr[0]]) expected = np.array([True, False, False]) @@ -738,6 +741,7 @@ def test_i8(self): tm.assert_numpy_array_equal(result, expected) def test_large(self): + s = pd.date_range("20000101", periods=2000000, freq="s").values result = algos.isin(s, s[0:2]) expected = np.zeros(len(s), dtype=bool) @@ -881,22 +885,18 @@ def test_value_counts(self): result = algos.value_counts(factor) breaks = [-1.194, -0.535, 0.121, 0.777, 1.433] index = IntervalIndex.from_breaks(breaks).astype(CDT(ordered=True)) - expected = Series([1, 1, 1, 1], index=index, dtype="Int64") + expected = Series([1, 1, 1, 1], index=index) tm.assert_series_equal(result.sort_index(), expected.sort_index()) def test_value_counts_bins(self): s = [1, 2, 3, 4] result = algos.value_counts(s, bins=1) - expected = Series( - [4], index=IntervalIndex.from_tuples([(0.996, 4.0)]), dtype="Int64" - ) + expected = Series([4], index=IntervalIndex.from_tuples([(0.996, 4.0)])) tm.assert_series_equal(result, expected) result = algos.value_counts(s, bins=2, sort=False) expected = Series( - [2, 2], - index=IntervalIndex.from_tuples([(0.996, 2.5), (2.5, 4.0)]), - dtype="Int64", + [2, 2], index=IntervalIndex.from_tuples([(0.996, 2.5), (2.5, 4.0)]) ) tm.assert_series_equal(result, expected) @@ -924,7 +924,7 @@ def test_value_counts_nat(self): assert len(vc) == 1 assert len(vc_with_na) == 2 - exp_dt = Series({Timestamp("2014-01-01 00:00:00"): 1}, dtype="Int64") + exp_dt = Series({Timestamp("2014-01-01 00:00:00"): 1}) tm.assert_series_equal(algos.value_counts(dt), exp_dt) # TODO same for (timedelta) @@ -946,7 +946,7 @@ def test_value_counts_datetime_outofbounds(self): [datetime(3000, 1, 1), datetime(5000, 1, 1), datetime(6000, 1, 1)], dtype=object, ) - exp = Series([3, 2, 1], index=exp_index, dtype="Int64") + exp = Series([3, 2, 1], index=exp_index) tm.assert_series_equal(res, exp) # GH 12424 @@ -957,9 +957,7 @@ def test_value_counts_datetime_outofbounds(self): def test_categorical(self): s = Series(Categorical(list("aaabbc"))) result = s.value_counts() - expected = Series( - [3, 2, 1], index=CategoricalIndex(["a", "b", "c"]), dtype="Int64" - ) + expected = Series([3, 2, 1], index=CategoricalIndex(["a", "b", "c"])) tm.assert_series_equal(result, expected, check_index_type=True) @@ -976,13 +974,10 @@ def test_categorical_nans(self): expected = Series( [4, 3, 2], index=CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c"]), - dtype="Int64", ) tm.assert_series_equal(result, expected, check_index_type=True) result = s.value_counts(dropna=False) - expected = Series( - [4, 3, 2, 1], index=CategoricalIndex(["a", "b", "c", np.nan]), dtype="Int64" - ) + expected = Series([4, 3, 2, 1], index=CategoricalIndex(["a", "b", "c", np.nan])) tm.assert_series_equal(result, expected, check_index_type=True) # out of order @@ -996,7 +991,6 @@ def test_categorical_nans(self): index=CategoricalIndex( ["a", "b", "c"], categories=["b", "a", "c"], ordered=True ), - dtype="Int64", ) tm.assert_series_equal(result, expected, check_index_type=True) @@ -1006,7 +1000,6 @@ def test_categorical_nans(self): index=CategoricalIndex( ["a", "b", "c", np.nan], categories=["b", "a", "c"], ordered=True ), - dtype="Int64", ) tm.assert_series_equal(result, expected, check_index_type=True) @@ -1019,7 +1012,6 @@ def test_categorical_zeroes(self): index=Categorical( ["b", "a", "c", "d"], categories=list("abcd"), ordered=True ), - dtype="Int64", ) tm.assert_series_equal(result, expected, check_index_type=True) @@ -1028,39 +1020,39 @@ def test_dropna(self): tm.assert_series_equal( Series([True, True, False]).value_counts(dropna=True), - Series([2, 1], index=[True, False], dtype="Int64"), + Series([2, 1], index=[True, False]), ) tm.assert_series_equal( Series([True, True, False]).value_counts(dropna=False), - Series([2, 1], index=[True, False], dtype="Int64"), + Series([2, 1], index=[True, False]), ) tm.assert_series_equal( Series([True, True, False, None]).value_counts(dropna=True), - Series([2, 1], index=[True, False], dtype="Int64"), + Series([2, 1], index=[True, False]), ) tm.assert_series_equal( Series([True, True, False, None]).value_counts(dropna=False), - Series([2, 1, 1], index=[True, False, np.nan], dtype="Int64"), + Series([2, 1, 1], index=[True, False, np.nan]), ) tm.assert_series_equal( Series([10.3, 5.0, 5.0]).value_counts(dropna=True), - Series([2, 1], index=[5.0, 10.3], dtype="Int64"), + Series([2, 1], index=[5.0, 10.3]), ) tm.assert_series_equal( Series([10.3, 5.0, 5.0]).value_counts(dropna=False), - Series([2, 1], index=[5.0, 10.3], dtype="Int64"), + Series([2, 1], index=[5.0, 10.3]), ) tm.assert_series_equal( Series([10.3, 5.0, 5.0, None]).value_counts(dropna=True), - Series([2, 1], index=[5.0, 10.3], dtype="Int64"), + Series([2, 1], index=[5.0, 10.3]), ) # 32-bit linux has a different ordering if not compat.is_platform_32bit(): result = Series([10.3, 5.0, 5.0, None]).value_counts(dropna=False) - expected = Series([2, 1, 1], index=[5.0, 10.3, np.nan], dtype="Int64") + expected = Series([2, 1, 1], index=[5.0, 10.3, np.nan]) tm.assert_series_equal(result, expected) def test_value_counts_normalized(self): @@ -1081,13 +1073,13 @@ def test_value_counts_normalized(self): def test_value_counts_uint64(self): arr = np.array([2 ** 63], dtype=np.uint64) - expected = Series([1], index=[2 ** 63], dtype="Int64") + expected = Series([1], index=[2 ** 63]) result = algos.value_counts(arr) tm.assert_series_equal(result, expected) arr = np.array([-1, 2 ** 63], dtype=object) - expected = Series([1, 1], index=[-1, 2 ** 63], dtype="Int64") + expected = Series([1, 1], index=[-1, 2 ** 63]) result = algos.value_counts(arr) # 32-bit linux has a different ordering @@ -1407,6 +1399,7 @@ class TestGroupVarFloat64(GroupVarTestMixin): rtol = 1e-5 def test_group_var_large_inputs(self): + prng = RandomState(1234) out = np.array([[np.nan]], dtype=self.dtype) @@ -1638,6 +1631,7 @@ def test_quantile(): def test_unique_label_indices(): + a = np.random.randint(1, 1 << 10, 1 << 15).astype("i8") left = ht.unique_label_indices(a) @@ -1703,6 +1697,7 @@ def test_pct_max_many_rows(self, values): def test_pad_backfill_object_segfault(): + old = np.array([], dtype="O") new = np.array([datetime(2010, 12, 31)], dtype="O") From 67ee98015b3fba894685db14bb9b4d281cea4870 Mon Sep 17 00:00:00 2001 From: Matsuoka Kota Date: Mon, 27 Apr 2020 22:02:36 +0900 Subject: [PATCH 11/13] split new test --- pandas/core/arrays/string_.py | 2 +- pandas/tests/arrays/string_/test_string.py | 4 ++-- pandas/tests/extension/base/methods.py | 11 ++++++----- pandas/tests/extension/decimal/test_decimal.py | 4 ++++ pandas/tests/extension/json/test_json.py | 4 ++++ pandas/tests/extension/test_boolean.py | 4 ++++ pandas/tests/extension/test_integer.py | 4 ++++ pandas/tests/extension/test_numpy.py | 4 ++++ pandas/tests/extension/test_sparse.py | 4 ++++ pandas/tests/extension/test_string.py | 4 +++- 10 files changed, 36 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 6b6f5935ea66a..51bbe182a002b 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -290,7 +290,7 @@ def _reduce(self, name, skipna=True, **kwargs): def value_counts(self, dropna=False): from pandas import value_counts - return value_counts(self._ndarray, dropna=dropna) + return value_counts(self._ndarray, dropna=dropna).astype("Int64") # Override parent because we have different return types. @classmethod diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index a3e6f4e62918f..eb89798a1ad96 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -297,9 +297,9 @@ def test_arrow_roundtrip(): def test_value_counts_na(): arr = pd.array(["a", "b", "a", pd.NA], dtype="string") result = arr.value_counts(dropna=False) - expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA]) + expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64") tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) - expected = pd.Series([2, 1], index=["a", "b"]) + expected = pd.Series([2, 1], index=["a", "b"], dtype="Int64") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index b557509c05afa..25fabe98851f1 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -17,7 +17,7 @@ class BaseMethodsTests(BaseExtensionTests): @pytest.mark.parametrize("dropna", [True, False]) def test_value_counts(self, all_data, dropna): - all_data = all_data[:10].unique() + all_data = all_data[:10] if dropna: other = np.array(all_data[~all_data.isna()]) else: @@ -28,13 +28,14 @@ def test_value_counts(self, all_data, dropna): self.assert_series_equal(result, expected) + def test_value_counts_with_normalize(self, data): + data = data[:10].unique() + result = ( - pd.Series(all_data, dtype=all_data.dtype) - .value_counts(dropna=dropna, normalize=True) - .sort_index() + pd.Series(data, dtype=data.dtype).value_counts(normalize=True).sort_index() ) - expected = pd.Series([1 / len(other)] * len(other), index=result.index) + expected = pd.Series([1 / len(data)] * len(data), index=result.index) self.assert_series_equal(result, expected) def test_count(self, data_missing): diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index f4ffcb8d0f109..233b658d29782 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -181,6 +181,10 @@ def test_value_counts(self, all_data, dropna): tm.assert_series_equal(result, expected) + @pytest.mark.xfail(reason="value_counts not implemented yet.") + def test_value_counts_with_normalize(self, data): + return super().test_value_counts_with_normalize(data) + class TestCasting(BaseDecimal, base.BaseCastingTests): pass diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index d086896fb09c3..85a2ec8bc70ac 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -193,6 +193,10 @@ class TestMethods(BaseJSON, base.BaseMethodsTests): def test_value_counts(self, all_data, dropna): pass + @unhashable + def test_value_counts_with_normalize(self, data): + pass + @unhashable def test_sort_values_frame(self): # TODO (EA.factorize): see if _values_for_factorize allows this. diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index e2331b69916fb..673020c64fff6 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -230,6 +230,10 @@ def test_searchsorted(self, data_for_sorting, as_series): def test_value_counts(self, all_data, dropna): return super().test_value_counts(all_data, dropna) + @pytest.mark.skip(reason="uses nullable integer") + def test_value_counts_with_normalize(self, data): + return super().test_value_counts_with_normalize(data) + class TestCasting(base.BaseCastingTests): pass diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 725533765ca2c..2585ebbb592af 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -224,6 +224,10 @@ def test_value_counts(self, all_data, dropna): self.assert_series_equal(result, expected) + @pytest.mark.xfail(reason="not working with nan") + def test_value_counts_with_normalize(self, data): + super().test_value_counts_with_normalize(data) + class TestCasting(base.BaseCastingTests): pass diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index aa5a99282131a..b653b09055fb3 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -199,6 +199,10 @@ class TestMethods(BaseNumPyTests, base.BaseMethodsTests): def test_value_counts(self, all_data, dropna): pass + @pytest.mark.xfail(reason="not working") + def test_value_counts_with_normalize(self, data): + return super().test_value_counts_with_normalize(data) + @pytest.mark.skip(reason="Incorrect expected") # We have a bool dtype, so the result is an ExtensionArray # but expected is not diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 694bbee59606f..4e814906fde00 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -229,6 +229,10 @@ def test_fillna_frame(self, data_missing): class TestMethods(BaseSparseTests, base.BaseMethodsTests): + @pytest.mark.xfail(reason="not working with nan") + def test_value_counts_with_normalize(self): + super().test_value_counts_with_normalize(data) + def test_combine_le(self, data_repeated): # We return a Series[SparseArray].__le__ returns a # Series[Sparse[bool]] diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 3bfa81a742064..27a157d2127f6 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -90,7 +90,9 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): class TestMethods(base.BaseMethodsTests): - pass + @pytest.mark.skip(reason="returns nullable") + def test_value_counts(self, all_data, dropna): + return super().test_value_counts(all_data, dropna) class TestCasting(base.BaseCastingTests): From 405037f565ff3d1e4d8f09c7cf2999c5139348b8 Mon Sep 17 00:00:00 2001 From: Matsuoka Kota Date: Wed, 29 Apr 2020 21:35:02 +0900 Subject: [PATCH 12/13] Update normalize value_counts test --- pandas/tests/extension/base/methods.py | 3 ++- pandas/tests/extension/test_boolean.py | 4 ---- pandas/tests/extension/test_integer.py | 4 ---- pandas/tests/extension/test_sparse.py | 4 ---- 4 files changed, 2 insertions(+), 13 deletions(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 25fabe98851f1..a275355a25de8 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -30,12 +30,13 @@ def test_value_counts(self, all_data, dropna): def test_value_counts_with_normalize(self, data): data = data[:10].unique() + values = np.array(data[~data.isna()]) result = ( pd.Series(data, dtype=data.dtype).value_counts(normalize=True).sort_index() ) - expected = pd.Series([1 / len(data)] * len(data), index=result.index) + expected = pd.Series([1 / len(values)] * len(values), index=result.index) self.assert_series_equal(result, expected) def test_count(self, data_missing): diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 673020c64fff6..e2331b69916fb 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -230,10 +230,6 @@ def test_searchsorted(self, data_for_sorting, as_series): def test_value_counts(self, all_data, dropna): return super().test_value_counts(all_data, dropna) - @pytest.mark.skip(reason="uses nullable integer") - def test_value_counts_with_normalize(self, data): - return super().test_value_counts_with_normalize(data) - class TestCasting(base.BaseCastingTests): pass diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 2585ebbb592af..725533765ca2c 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -224,10 +224,6 @@ def test_value_counts(self, all_data, dropna): self.assert_series_equal(result, expected) - @pytest.mark.xfail(reason="not working with nan") - def test_value_counts_with_normalize(self, data): - super().test_value_counts_with_normalize(data) - class TestCasting(base.BaseCastingTests): pass diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 4e814906fde00..694bbee59606f 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -229,10 +229,6 @@ def test_fillna_frame(self, data_missing): class TestMethods(BaseSparseTests, base.BaseMethodsTests): - @pytest.mark.xfail(reason="not working with nan") - def test_value_counts_with_normalize(self): - super().test_value_counts_with_normalize(data) - def test_combine_le(self, data_repeated): # We return a Series[SparseArray].__le__ returns a # Series[Sparse[bool]] From 5d54a0583f5cec59c562285ddd305fa7776851a1 Mon Sep 17 00:00:00 2001 From: Matsuoka Kota Date: Thu, 30 Apr 2020 22:23:40 +0900 Subject: [PATCH 13/13] add coments --- pandas/tests/extension/base/methods.py | 1 + pandas/tests/extension/test_numpy.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index a275355a25de8..d8ba6ced187cb 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -29,6 +29,7 @@ def test_value_counts(self, all_data, dropna): self.assert_series_equal(result, expected) def test_value_counts_with_normalize(self, data): + # GH 33172 data = data[:10].unique() values = np.array(data[~data.isna()]) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index b653b09055fb3..1c887cc4371b6 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -199,7 +199,7 @@ class TestMethods(BaseNumPyTests, base.BaseMethodsTests): def test_value_counts(self, all_data, dropna): pass - @pytest.mark.xfail(reason="not working") + @pytest.mark.xfail(reason="not working. will be covered by #32028") def test_value_counts_with_normalize(self, data): return super().test_value_counts_with_normalize(data)