diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 15b4128424eb1..0b010bb67a89a 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -283,14 +283,8 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then pytest -q --doctest-modules pandas/core/tools/datetimes.py RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Doctests top-level reshaping functions' ; echo $MSG - pytest -q --doctest-modules \ - pandas/core/reshape/concat.py \ - pandas/core/reshape/pivot.py \ - pandas/core/reshape/reshape.py \ - pandas/core/reshape/tile.py \ - pandas/core/reshape/melt.py \ - -k"-crosstab -pivot_table -cut" + MSG='Doctests reshaping functions' ; echo $MSG + pytest -q --doctest-modules pandas/core/reshape/ RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Doctests interval classes' ; echo $MSG diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 3b3802875b36b..d1222ce04ac87 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -222,7 +222,14 @@ def merge_ordered( Examples -------- - >>> A + >>> df1 = pd.DataFrame( + ... { + ... "key": ["a", "c", "e", "a", "c", "e"], + ... "lvalue": [1, 2, 3, 1, 2, 3], + ... "group": ["a", "a", "a", "b", "b", "b"] + ... } + ... ) + >>> df1 key lvalue group 0 a 1 a 1 c 2 a @@ -231,24 +238,25 @@ def merge_ordered( 4 c 2 b 5 e 3 b - >>> B - Key rvalue - 0 b 1 - 1 c 2 - 2 d 3 - - >>> merge_ordered(A, B, fill_method='ffill', left_by='group') - group key lvalue rvalue - 0 a a 1 NaN - 1 a b 1 1.0 - 2 a c 2 2.0 - 3 a d 2 3.0 - 4 a e 3 3.0 - 5 b a 1 NaN - 6 b b 1 1.0 - 7 b c 2 2.0 - 8 b d 2 3.0 - 9 b e 3 3.0 + >>> df2 = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]}) + >>> df2 + key rvalue + 0 b 1 + 1 c 2 + 2 d 3 + + >>> merge_ordered(df1, df2, fill_method="ffill", left_by="group") + key lvalue group rvalue + 0 a 1 a NaN + 1 b 1 a 1.0 + 2 c 2 a 2.0 + 3 d 2 a 3.0 + 4 e 3 a 3.0 + 5 a 1 b NaN + 6 b 1 b 1.0 + 7 c 2 b 2.0 + 8 d 2 b 3.0 + 9 e 3 b 3.0 """ def _merger(x, y): @@ -369,15 +377,14 @@ def merge_asof( Examples -------- - >>> left = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c']}) + >>> left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) >>> left a left_val 0 1 a 1 5 b 2 10 c - >>> right = pd.DataFrame({'a': [1, 2, 3, 6, 7], - ... 'right_val': [1, 2, 3, 6, 7]}) + >>> right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}) >>> right a right_val 0 1 1 @@ -386,25 +393,25 @@ def merge_asof( 3 6 6 4 7 7 - >>> pd.merge_asof(left, right, on='a') + >>> pd.merge_asof(left, right, on="a") a left_val right_val 0 1 a 1 1 5 b 3 2 10 c 7 - >>> pd.merge_asof(left, right, on='a', allow_exact_matches=False) + >>> pd.merge_asof(left, right, on="a", allow_exact_matches=False) a left_val right_val 0 1 a NaN 1 5 b 3.0 2 10 c 7.0 - >>> pd.merge_asof(left, right, on='a', direction='forward') + >>> pd.merge_asof(left, right, on="a", direction="forward") a left_val right_val 0 1 a 1.0 1 5 b 6.0 2 10 c NaN - >>> pd.merge_asof(left, right, on='a', direction='nearest') + >>> pd.merge_asof(left, right, on="a", direction="nearest") a left_val right_val 0 1 a 1 1 5 b 6 @@ -412,15 +419,14 @@ def merge_asof( We can use indexed DataFrames as well. - >>> left = pd.DataFrame({'left_val': ['a', 'b', 'c']}, index=[1, 5, 10]) + >>> left = pd.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10]) >>> left left_val 1 a 5 b 10 c - >>> right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7]}, - ... index=[1, 2, 3, 6, 7]) + >>> right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7]) >>> right right_val 1 1 @@ -437,6 +443,32 @@ def merge_asof( Here is a real-world times-series example + >>> quotes = pd.DataFrame( + ... { + ... "time": [ + ... pd.Timestamp("2016-05-25 13:30:00.023"), + ... pd.Timestamp("2016-05-25 13:30:00.023"), + ... pd.Timestamp("2016-05-25 13:30:00.030"), + ... pd.Timestamp("2016-05-25 13:30:00.041"), + ... pd.Timestamp("2016-05-25 13:30:00.048"), + ... pd.Timestamp("2016-05-25 13:30:00.049"), + ... pd.Timestamp("2016-05-25 13:30:00.072"), + ... pd.Timestamp("2016-05-25 13:30:00.075") + ... ], + ... "ticker": [ + ... "GOOG", + ... "MSFT", + ... "MSFT", + ... "MSFT", + ... "GOOG", + ... "AAPL", + ... "GOOG", + ... "MSFT" + ... ], + ... "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], + ... "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03] + ... } + ... ) >>> quotes time ticker bid ask 0 2016-05-25 13:30:00.023 GOOG 720.50 720.93 @@ -448,6 +480,20 @@ def merge_asof( 6 2016-05-25 13:30:00.072 GOOG 720.50 720.88 7 2016-05-25 13:30:00.075 MSFT 52.01 52.03 + >>> trades = pd.DataFrame( + ... { + ... "time": [ + ... pd.Timestamp("2016-05-25 13:30:00.023"), + ... pd.Timestamp("2016-05-25 13:30:00.038"), + ... pd.Timestamp("2016-05-25 13:30:00.048"), + ... pd.Timestamp("2016-05-25 13:30:00.048"), + ... pd.Timestamp("2016-05-25 13:30:00.048") + ... ], + ... "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + ... "price": [51.95, 51.95, 720.77, 720.92, 98.0], + ... "quantity": [75, 155, 100, 100, 100] + ... } + ... ) >>> trades time ticker price quantity 0 2016-05-25 13:30:00.023 MSFT 51.95 75 @@ -458,9 +504,7 @@ def merge_asof( By default we are taking the asof of the quotes - >>> pd.merge_asof(trades, quotes, - ... on='time', - ... by='ticker') + >>> pd.merge_asof(trades, quotes, on="time", by="ticker") time ticker price quantity bid ask 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 @@ -470,10 +514,9 @@ def merge_asof( We only asof within 2ms between the quote time and the trade time - >>> pd.merge_asof(trades, quotes, - ... on='time', - ... by='ticker', - ... tolerance=pd.Timedelta('2ms')) + >>> pd.merge_asof( + ... trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms") + ... ) time ticker price quantity bid ask 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96 1 2016-05-25 13:30:00.038 MSFT 51.95 155 NaN NaN @@ -485,11 +528,14 @@ def merge_asof( and we exclude exact matches on time. However *prior* data will propagate forward - >>> pd.merge_asof(trades, quotes, - ... on='time', - ... by='ticker', - ... tolerance=pd.Timedelta('10ms'), - ... allow_exact_matches=False) + >>> pd.merge_asof( + ... trades, + ... quotes, + ... on="time", + ... by="ticker", + ... tolerance=pd.Timedelta("10ms"), + ... allow_exact_matches=False + ... ) time ticker price quantity bid ask 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index b9eb89b4d14c6..11fb8cc121fb8 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -171,24 +171,26 @@ def cut( ... index=['a', 'b', 'c', 'd', 'e']) >>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False) ... # doctest: +ELLIPSIS - (a 0.0 - b 1.0 - c 2.0 - d 3.0 - e 4.0 - dtype: float64, array([0, 2, 4, 6, 8])) + (a 1.0 + b 2.0 + c 3.0 + d 4.0 + e NaN + dtype: float64, + array([ 0, 2, 4, 6, 8, 10])) Use `drop` optional when bins is not unique >>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True, ... right=False, duplicates='drop') ... # doctest: +ELLIPSIS - (a 0.0 - b 1.0 - c 2.0 + (a 1.0 + b 2.0 + c 3.0 d 3.0 - e 3.0 - dtype: float64, array([0, 2, 4, 6, 8])) + e NaN + dtype: float64, + array([ 0, 2, 4, 6, 10])) Passing an IntervalIndex for `bins` results in those categories exactly. Notice that values not covered by the IntervalIndex are set to NaN. 0 @@ -197,7 +199,7 @@ def cut( >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins) - [NaN, (0, 1], NaN, (2, 3], (4, 5]] + [NaN, (0.0, 1.0], NaN, (2.0, 3.0], (4.0, 5.0]] Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]] """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 diff --git a/pandas/core/reshape/util.py b/pandas/core/reshape/util.py index 7abb14303f8cc..6949270317f7c 100644 --- a/pandas/core/reshape/util.py +++ b/pandas/core/reshape/util.py @@ -19,8 +19,7 @@ def cartesian_product(X): Examples -------- >>> cartesian_product([list('ABC'), [1, 2]]) - [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='|S1'), - array([1, 2, 1, 2, 1, 2])] + [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='