Skip to content

Grouped dataframe "name" attribute overrides column access / not well documented #25457

Open
@alkasm

Description

@alkasm

Code Sample

>>> df = pd.DataFrame({'val': [9, 10, 3, 6, 2, 3], 'name': list('xxyxyy'), 'group': list('aaabbb')})
>>> df

	val	name	group
0	9	x	a
1	10	x	a
2	3	y	a
3	6	x	b
4	2	y	b
5	3	y	b

Works correctly:

>>> df.groupby('group').apply(lambda g: g[g['name'] == 'x'])

		val	name	group
group				
a	0	9	x	a
	1	10	x	a
b	3	6	x	b

Errors out:

>>> df.groupby('group').apply(lambda g: g[g.name == 'x'])

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)

Rest of the traceback is here:

~/venv/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2655             try:
-> 2656                 return self._engine.get_loc(key)
   2657             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: False

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
~/venv/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in apply(self, func, *args, **kwargs)
    688             try:
--> 689                 result = self._python_apply_general(f)
    690             except Exception:

~/venv/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in _python_apply_general(self, f)
    706         keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 707                                                    self.axis)
    708 

~/venv/lib/python3.7/site-packages/pandas/core/groupby/ops.py in apply(self, f, data, axis)
    189             group_axes = _get_axes(group)
--> 190             res = f(group)
    191             if not _is_indexed_like(res, group_axes):

<ipython-input-282-522c70a9fa21> in <lambda>(g)
----> 1 df.groupby('group').apply(lambda g: g[g.name == 'x'])

~/venv/lib/python3.7/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2926                 return self._getitem_multilevel(key)
-> 2927             indexer = self.columns.get_loc(key)
   2928             if is_integer(indexer):

~/venv/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2657             except KeyError:
-> 2658                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2659         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: False

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
~/venv/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2655             try:
-> 2656                 return self._engine.get_loc(key)
   2657             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: False

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-282-522c70a9fa21> in <module>
----> 1 df.groupby('group').apply(lambda g: g[g.name == 'x'])

~/venv/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in apply(self, func, *args, **kwargs)
    699 
    700                 with _group_selection_context(self):
--> 701                     return self._python_apply_general(f)
    702 
    703         return result

~/venv/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in _python_apply_general(self, f)
    705     def _python_apply_general(self, f):
    706         keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 707                                                    self.axis)
    708 
    709         return self._wrap_applied_output(

~/venv/lib/python3.7/site-packages/pandas/core/groupby/ops.py in apply(self, f, data, axis)
    188             # group might be modified
    189             group_axes = _get_axes(group)
--> 190             res = f(group)
    191             if not _is_indexed_like(res, group_axes):
    192                 mutated = True

<ipython-input-282-522c70a9fa21> in <lambda>(g)
----> 1 df.groupby('group').apply(lambda g: g[g.name == 'x'])

~/venv/lib/python3.7/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2925             if self.columns.nlevels > 1:
   2926                 return self._getitem_multilevel(key)
-> 2927             indexer = self.columns.get_loc(key)
   2928             if is_integer(indexer):
   2929                 indexer = [indexer]

~/venv/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2656                 return self._engine.get_loc(key)
   2657             except KeyError:
-> 2658                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2659         indexer = self.get_indexer([key], method=method, tolerance=tolerance)
   2660         if indexer.ndim > 1 or indexer.size > 1:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: False

Problem description

When you call a groupby method that expects a function (such as apply()), the documentation states that the function should expect a DataFrame. Indeed, if you inspect the type, a DataFrame is passed. However, this DataFrame has an extra attribute added to it --- name --- which shadows dot access to a column called "name" if one exists. This is a little troubling, since dotted access to that column works outside of the groupby but not inside the function passed to the groupby.

At a minimum, this needs to be documented more explicitly. I once found a mention in the documentation for one of the related functions that the .name attribute gets added, but I could not find it again, so I wasn't sure which one it was. Edit: It was the transform() method's docstring, as shown in a comment below.

Related

#9545

Expected Output

The expected output is what happens when you use the [] indexer, instead of dot access.

Suggestions

Not necessarily mutually exclusive:

  • Give a better error message that makes it clear .name is an attribute of the group DataFrame
  • Improve the documentation of all related methods so that users know the .name attribute exists and will override dot access to a column of the same name
  • Only add the .name attribute if a column of that name doesn't exist
  • Append an underscore to the attribute so there's a much lower chance of conflicts, e.g. groupdf.name_
  • Change the attribute name entirely for similar lower chance of conflicts, e.g. groupdf.grouped_value
  • Move into a method call instead, e.g. groupdf.get_group_name()
  • Add a kwarg to apply() / transform() which toggles whether to send a second argument into the function, that second argument being the group name

Output of pd.show_versions()

INSTALLED VERSIONS

commit: None
python: 3.7.0.beta.4
python-bits: 64
OS: Darwin
OS-release: 17.6.0
machine: x86_64
processor: i386
byteorder: little
LC_ALL: en_US.UTF-8
LANG: en_US.UTF-8
LOCALE: en_US.UTF-8

pandas: 0.24.1
pytest: None
pip: 19.0.3
setuptools: 39.0.1
Cython: None
numpy: 1.16.1
scipy: None
pyarrow: None
xarray: None
IPython: 7.3.0
sphinx: None
patsy: None
dateutil: 2.8.0
pytz: 2018.9
blosc: None
bottleneck: None
tables: None
numexpr: None
feather: None
matplotlib: None
openpyxl: None
xlrd: None
xlwt: None
xlsxwriter: None
lxml.etree: None
bs4: None
html5lib: None
sqlalchemy: 1.2.18
pymysql: None
psycopg2: 2.7.7 (dt dec pq3 ext lo64)
jinja2: 2.10
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: None
gcsfs: None

Metadata

Metadata

Assignees

No one assigned

    Labels

    Apply (Apply, Aggregate, Transform, Map), Bug, Groupby

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions