Allows for merging of SparseDataFrames, and fixes __array__ interface #19488
Changes from 1 commit
@@ -2918,14 +2918,15 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None,
        # GH#19265 pyarrow is passing this
        warnings.warn("fastpath argument is deprecated, will be removed "
                      "in a future release.", DeprecationWarning)

    if klass is None:
        dtype = dtype or values.dtype
        klass = get_block_type(values, dtype)

    elif klass is DatetimeTZBlock and not is_datetimetz(values):
        return klass(values, ndim=ndim,
                     placement=placement, dtype=dtype)

Reviewer: You might want to install a flake8 plugin for your editor ;)

Author: ok ok, installing now. I fully admit that I'm awful when it comes to linting failures and should just tool the fix. Might as well install Ctrl-P while I'm at it.

    return klass(values, ndim=ndim, placement=placement)


# TODO: flexible with index=None and/or items=None
@@ -5120,14 +5121,28 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
        elif is_uniform_join_units(join_units):
            b = join_units[0].block.concat_same_type(
                [ju.block for ju in join_units], placement=placement)
        elif is_sparse_join_units(join_units):

Reviewer: Do you ever go down the initial …

            values = concatenate_join_units(join_units, concat_axis, copy=copy)

Reviewer: this is a mess

Author: yes, yes it is. I'll work on cleaning this up tomorrow, since there are too many branches in this code.

            values = values[0]

Reviewer: I don't understand this line... A comment maybe? Or is it incorrect?
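One hedged reading of that line, assuming concatenate_join_units returns a 2-D array of shape (1, n) when a single sparse column is concatenated; the shapes and values below are illustrative, not taken from the PR:

```python
import numpy as np

# Assumed result of concatenate_join_units for one sparse column: shape (1, n)
values = np.array([[0.0, 1.0, np.nan, 3.0]])

# values[0] would then extract the 1-D row that the SparseArray
# constructor expects
row = values[0]
print(row.shape)  # (4,)
```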
            block = join_units[0].block

            if block:
                fill_value = block.fill_value
            else:
                fill_value = np.nan
            array = SparseArray(values, fill_value=fill_value)
            b = make_block(array, klass=SparseBlock, placement=placement)
        else:
            b = make_block(
                concatenate_join_units(join_units, concat_axis, copy=copy),
                placement=placement
            )
        blocks.append(b)

    return BlockManager(blocks, axes)


def is_sparse_join_units(join_units):

Reviewer: Docstring would be nice, at least noting that this is true if any blocks are sparse.

    return any(type(ju.block) is SparseBlock for ju in join_units)
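One way the requested docstring could read (the wording is a suggestion, not taken from the PR):

```python
def is_sparse_join_units(join_units):
    """
    Return True if any of the join units' blocks is a SparseBlock,
    in which case the concatenated values are rebuilt as a SparseArray.
    """
    return any(type(ju.block) is SparseBlock for ju in join_units)
```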
def is_uniform_join_units(join_units):
    """
    …
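Stepping back from the internals, a sketch of the user-visible behavior this branch enables, assuming a 0.22-era pandas with this patch applied (SparseDataFrame and to_sparse were later removed in pandas 1.0):

```python
import numpy as np
import pandas as pd

left = pd.DataFrame({'A': [0, 2, 4], 'B': [1.0, np.nan, 3.0]}).to_sparse()
right = pd.DataFrame({'A': [0, 3, 4], 'C': [4.0, 5.0, np.nan]}).to_sparse()

# Per the PR title, merging two SparseDataFrames now works and
# returns a SparseDataFrame rather than failing.
merged = left.merge(right, on='A', how='inner')
print(type(merged))
```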
@@ -38,6 +38,8 @@
                               concatenate_block_managers)
from pandas.util._decorators import Appender, Substitution

from pandas.core.sparse.array import SparseArray

from pandas.core.sorting import is_int64_overflow_possible
import pandas.core.algorithms as algos
import pandas.core.sorting as sorting
@@ -731,7 +733,11 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
                if mask.all():
                    key_col = rvals
                else:
                    # Might need to be IntIndex not Index

Reviewer: don't do this, use …

                    if isinstance(lvals, SparseArray):
                        key_col = Index(lvals.get_values()).where(~mask, rvals)

Author: I'm not sure if this has memory or performance issues, but this is the best solution I could come up with. The other option would be to look at using lvals.sp_index and implementing a where on it that works. One thing I have noticed is that IntIndex doesn't act quite like Index, which makes doing these masks tricky in sparse land.

Reviewer: It'd be nice to avoid …

                    else:
                        key_col = Index(lvals).where(~mask, rvals)

                if result._is_label_reference(name):
                    result[name] = key_col
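For concreteness, a minimal sketch of the densify-then-mask pattern under discussion, assuming the 0.22-era API in which SparseArray.get_values() returns a dense ndarray; the inputs stand in for lvals, rvals, and mask above:

```python
import numpy as np
import pandas as pd

lvals = pd.SparseArray([1.0, np.nan, 3.0], fill_value=np.nan)
rvals = np.array([10.0, 20.0, 30.0])
mask = np.array([False, True, False])  # True where the left key is missing

# Densify the sparse values, then fill the masked slots from the right side
key_col = pd.Index(lvals.get_values()).where(~mask, rvals)
print(list(key_col))  # [1.0, 20.0, 3.0]
```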
@@ -28,6 +28,8 @@
import pandas.core.ops as ops
import pandas.core.common as com

from collections import Counter

_shared_doc_kwargs = dict(klass='SparseDataFrame')
@@ -73,6 +75,9 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None,
            if columns is None:
                raise Exception("cannot pass a series w/o a name or columns")
            data = {columns[0]: data}
        elif isinstance(data, BlockManager):
            if default_fill_value is None:
                default_fill_value, _ = Counter([b.fill_value for b in data.blocks]).most_common(1)[0]

Author: I don't know if this is kosher or not... Basically I kept running into the issue that if you create a SparseDataFrame from a bunch of SparseSeries / SparseArrays with fill_values of, say, 1 or 0, it doesn't persist that to default_fill_value. That seems like an enhancement, and I could take it out of this PR, but it helped me test.

Reviewer: This seems like a recipe for surprise. I'd hate for …

Author: I can take it out, it didn't add much tbh. It was just making my testing easier ;)

        if default_fill_value is None:
            default_fill_value = np.nan
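To make the heuristic concrete, a small illustration of how Counter.most_common picks the dominant fill_value across blocks (the values are made up):

```python
from collections import Counter

import numpy as np

fill_values = [0, 0, np.nan]  # one fill_value per block; two blocks use 0

# most_common(1) returns [(value, count)]; the most frequent value wins
default_fill_value, count = Counter(fill_values).most_common(1)[0]
print(default_fill_value, count)  # 0 2
```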
@@ -7,6 +7,7 @@
import numpy as np
import random
import re
import itertools

import pandas as pd
from pandas.compat import lrange, lzip

@@ -1800,3 +1801,31 @@ def test_merge_on_indexes(self, left_df, right_df, how, sort, expected):
                           how=how,
                           sort=sort)
        tm.assert_frame_equal(result, expected)
class TestMergeSparseDataFrames(object):
    # Cannot seem to get 0 or 1 working with sparse data frames
    @pytest.mark.parametrize('fill_value,how', itertools.product(
        [np.nan], ['left', 'right', 'outer', 'inner']))
    def test_merge_two_sparse_frames(self, fill_value, how):
        dense_evens = pd.DataFrame({'A': list(range(0, 200, 2)),
                                    'B': np.random.randint(0, 100, size=100)})
        dense_threes = pd.DataFrame({'A': list(range(0, 300, 3)),
                                     'B': np.random.randint(0, 100, size=100)})

        dense_merge = dense_evens.merge(dense_threes, how=how, on='A')

        # Merging two dense frames tends to default to float64, not the
        # original dtype
        dense_merge['B_x'] = dense_merge['B_x'].astype(np.int64, errors='ignore')

Author: This seems kind of bizarre to me, and I couldn't find an issue for it, but basically: if you merge the two dense frames defined above, the dtype goes from int64 to float64. I think I know where the code doing that is, so I could fix it, but I didn't want to get too sidetracked in this work. See the repro sketch after this diff.

Reviewer: That's since …

        dense_merge['B_y'] = dense_merge['B_y'].astype(np.int64, errors='ignore')
        sparse_evens = dense_evens.to_sparse(fill_value=fill_value)
        sparse_threes = dense_threes.to_sparse(fill_value=fill_value)

        sparse_merge = sparse_evens.merge(sparse_threes, how=how, on='A')

        assert sparse_merge.default_fill_value is fill_value

        tm.assert_sp_frame_equal(dense_merge.to_sparse(fill_value=fill_value),
                                 sparse_merge, exact_indices=False,
                                 check_dtype=False)

    @pytest.mark.parametrize('fill_value,how', itertools.product(
        [0, 1, np.nan, None], ['left', 'right', 'outer', 'inner']))
    def test_merge_dense_sparse_frames(self, fill_value, how):
        "pass"
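On the int64-to-float64 surprise mentioned above, a minimal self-contained repro; no sparse frames are involved, only a non-inner merge with unmatched keys:

```python
import numpy as np
import pandas as pd

left = pd.DataFrame({'A': [0, 2, 4], 'B': np.array([1, 2, 3], dtype=np.int64)})
right = pd.DataFrame({'A': [0, 3, 6], 'B': np.array([4, 5, 6], dtype=np.int64)})

# Unmatched keys produce missing rows, and int64 cannot represent NaN,
# so the merged B_x / B_y columns are upcast to float64.
merged = left.merge(right, how='outer', on='A')
print(merged.dtypes)  # B_x and B_y come back as float64
```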
Reviewer: This can go under enhancements, saying "Merging sparse DataFrames is now supported."

Author: Oh, it's an enhancement, alrighty. Thanks for the guidance.
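A sketch of what that whatsnew entry might look like, following pandas' usual rst conventions; the exact wording and the :issue: reference to this PR's number are assumptions:

```rst
- :class:`SparseDataFrame` objects can now be merged with :meth:`DataFrame.merge` (:issue:`19488`)
```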