Skip to content

Consistent Naming Standard When Using Dict in GroupBy .agg #21806

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 37 additions & 48 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,52 +364,46 @@ def nested_renaming_depr(level=4):
"version"),
FutureWarning, stacklevel=level)

# if we have a dict of any non-scalars
# eg. {'A' : ['mean']}, normalize all to
# be list-likes
if any(is_aggregator(x) for x in compat.itervalues(arg)):
new_arg = compat.OrderedDict()
for k, v in compat.iteritems(arg):
if not isinstance(v, (tuple, list, dict)):
new_arg[k] = [v]
else:
new_arg[k] = v

# the keys must be in the columns
# for ndim=2, or renamers for ndim=1

# ok for now, but deprecated
# {'A': { 'ra': 'mean' }}
# {'A': { 'ra': ['mean'] }}
# {'ra': ['mean']}

# not ok
# {'ra' : { 'A' : 'mean' }}
if isinstance(v, dict):
is_nested_renamer = True

if k not in obj.columns:
msg = ('cannot perform renaming for {key} with a '
'nested dictionary').format(key=k)
raise SpecificationError(msg)
nested_renaming_depr(4 + (_level or 0))

elif isinstance(obj, ABCSeries):
nested_renaming_depr()
elif isinstance(obj, ABCDataFrame) and \
k not in obj.columns:
raise KeyError(
"Column '{col}' does not exist!".format(col=k))

arg = new_arg

else:
if any(issubclass(type(x), dict) for x in compat.itervalues(arg)):
# deprecation of renaming keys
# GH 15931
keys = list(compat.iterkeys(arg))
if (isinstance(obj, ABCDataFrame) and
len(obj.columns.intersection(keys)) != len(keys)):
nested_renaming_depr()

# normalize all values to be list-likes: anything that is not a tuple, list or dict is wrapped in a list
new_arg = compat.OrderedDict()
for k, v in compat.iteritems(arg):
if not isinstance(v, (tuple, list, dict)):
new_arg[k] = [v]
else:
new_arg[k] = v

# the keys must be in the columns
# for ndim=2, or renamers for ndim=1

# ok for now, but deprecated
# {'A': { 'ra': 'mean' }}
# {'A': { 'ra': ['mean'] }}
# {'ra': ['mean']}

# not ok
# {'ra' : { 'A' : 'mean' }}
if isinstance(v, dict):
is_nested_renamer = True

if k not in obj.columns:
msg = ('cannot perform renaming for {key} with a '
'nested dictionary').format(key=k)
raise SpecificationError(msg)
nested_renaming_depr(4 + (_level or 0))

elif isinstance(obj, ABCSeries):
nested_renaming_depr()
elif isinstance(obj, ABCDataFrame) and \
k not in obj.columns:
raise KeyError(
"Column '{col}' does not exist!".format(col=k))

arg = new_arg

from pandas.core.reshape.concat import concat

Expand Down Expand Up @@ -456,11 +450,6 @@ def _agg(arg, func):
result.update(r)
keys = list(compat.iterkeys(result))

else:

if self._selection is not None:
keys = None

# some selection on the object
elif self._selection is not None:

Expand Down
52 changes: 32 additions & 20 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,11 +117,11 @@ def test_agg_python_multiindex(mframe):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize('groupbyfunc', [
lambda x: x.weekday(),
[lambda x: x.month, lambda x: x.weekday()],
@pytest.mark.parametrize('groupbyfunc,multiple', [
(lambda x: x.weekday(), False),
([lambda x: x.month, lambda x: x.weekday()], True),
])
def test_aggregate_str_func(tsframe, groupbyfunc):
def test_aggregate_str_func(tsframe, groupbyfunc, multiple):
grouped = tsframe.groupby(groupbyfunc)

# single series
Expand All @@ -139,10 +139,19 @@ def test_aggregate_str_func(tsframe, groupbyfunc):
['B', 'std'],
['C', 'mean'],
['D', 'sem']]))
expected = DataFrame(OrderedDict([['A', grouped['A'].var()],
['B', grouped['B'].std()],
['C', grouped['C'].mean()],
['D', grouped['D'].sem()]]))

columns = pd.MultiIndex.from_arrays([
list('ABCD'), ['var', 'std', 'mean', 'sem']])
expected = DataFrame(list(zip(grouped['A'].var(),
grouped['B'].std(),
grouped['C'].mean(),
grouped['D'].sem())), columns=columns)

if multiple:
mi = pd.MultiIndex.from_product([
range(1, len(groupbyfunc) + 1, 1), range(5)])
expected = expected.set_index(mi)

tm.assert_frame_equal(result, expected)


Expand Down Expand Up @@ -225,27 +234,24 @@ def test_more_flexible_frame_multi_function(df):

exmean = grouped.agg(OrderedDict([['C', np.mean], ['D', np.mean]]))
exstd = grouped.agg(OrderedDict([['C', np.std], ['D', np.std]]))

expected = concat([exmean, exstd], keys=['mean', 'std'], axis=1)
expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1)
expected = concat([exmean, exstd], axis=1)
expected = expected.sort_index(axis=1)

d = OrderedDict([['C', [np.mean, np.std]], ['D', [np.mean, np.std]]])
result = grouped.aggregate(d)

tm.assert_frame_equal(result, expected)

# be careful
result = grouped.aggregate(OrderedDict([['C', np.mean],
['D', [np.mean, np.std]]]))
expected = grouped.aggregate(OrderedDict([['C', np.mean],
['D', [np.mean, np.std]]]))

tm.assert_frame_equal(result, expected)

def foo(x):
return np.mean(x)

def bar(x):
return np.std(x, ddof=1)
def test_more_flexible_frame_mult_function_warns(df):
grouped = df.groupby('A')

# this uses column selection & renaming
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
Expand All @@ -254,6 +260,12 @@ def bar(x):
['bar', np.std]])]])
result = grouped.aggregate(d)

def foo(x):
return np.mean(x)

def bar(x):
return np.std(x, ddof=1)

d = OrderedDict([['C', [np.mean]], ['D', [foo, bar]]])
expected = grouped.aggregate(d)

Expand All @@ -271,18 +283,18 @@ def test_multi_function_flexible_mix(df):
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
expected = grouped.aggregate(d)

# Test 1
d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])],
['D', 'sum']])
['D', ['sum']]])
# this uses column selection & renaming
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = grouped.aggregate(d)
tm.assert_frame_equal(result, expected)

# Test 2
d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])],
['D', ['sum']]])
['D', 'sum']])

# this uses column selection & renaming
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = grouped.aggregate(d)

tm.assert_frame_equal(result, expected)
63 changes: 47 additions & 16 deletions pandas/tests/groupby/aggregate/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,19 +99,23 @@ def test_agg_dict_parameter_cast_result_dtypes():
grouped = df.groupby('class')
tm.assert_frame_equal(grouped.first(), exp)
tm.assert_frame_equal(grouped.agg('first'), exp)
tm.assert_frame_equal(grouped.agg({'time': 'first'}), exp)
tm.assert_series_equal(grouped.time.first(), exp['time'])
tm.assert_series_equal(grouped.time.agg('first'), exp['time'])

exp.columns = pd.MultiIndex.from_tuples([('time', 'first')])
tm.assert_frame_equal(grouped.agg({'time': 'first'}), exp)

# test for `last` function
exp = df.loc[[0, 3, 4, 7]].set_index('class')
grouped = df.groupby('class')
tm.assert_frame_equal(grouped.last(), exp)
tm.assert_frame_equal(grouped.agg('last'), exp)
tm.assert_frame_equal(grouped.agg({'time': 'last'}), exp)
tm.assert_series_equal(grouped.time.last(), exp['time'])
tm.assert_series_equal(grouped.time.agg('last'), exp['time'])

exp.columns = pd.MultiIndex.from_tuples([('time', 'last')])
tm.assert_frame_equal(grouped.agg({'time': 'last'}), exp)

# count
exp = pd.Series([2, 2, 2, 2],
index=Index(list('ABCD'), name='class'),
Expand Down Expand Up @@ -192,7 +196,9 @@ def test_aggregate_api_consistency():
tm.assert_frame_equal(result, expected, check_like=True)

result = grouped.agg({'C': 'mean', 'D': 'sum'})
expected = pd.concat([d_sum, c_mean], axis=1)
expected = pd.concat([c_mean, d_sum], axis=1)
expected.columns = MultiIndex.from_arrays([['C', 'D'],
['mean', 'sum']])
tm.assert_frame_equal(result, expected, check_like=True)

result = grouped.agg({'C': ['mean', 'sum'],
Expand All @@ -201,13 +207,19 @@ def test_aggregate_api_consistency():
expected.columns = MultiIndex.from_product([['C', 'D'],
['mean', 'sum']])

with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
def test_aggregate_api_raises():
    """Dict keys that are not among the selected columns must raise ``KeyError``.

    The groupby selection is restricted to columns ``'D'`` and ``'C'``,
    while the aggregation dict is keyed by ``'r'`` and ``'r2'``.
    """
    df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': np.random.randn(8) + 1.0,
                    'D': np.arange(8)})

    grouped = df.groupby(['A', 'B'])

    # Nothing is bound to the call's result: it is expected to raise
    # before returning, so a local name here would never be read.
    with pytest.raises(KeyError):
        grouped[['D', 'C']].agg({'r': np.sum,
                                 'r2': np.mean})
expected = pd.concat([d_sum, c_sum, d_mean, c_mean], axis=1)
expected.columns = MultiIndex.from_product([['r', 'r2'],
['D', 'C']])
tm.assert_frame_equal(result, expected, check_like=True)


def test_agg_dict_renaming_deprecation():
Expand All @@ -222,14 +234,21 @@ def test_agg_dict_renaming_deprecation():
'C': {'bar': ['count', 'min']}})
assert "using a dict with renaming" in str(w[0].message)

with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
df.groupby('A')[['B', 'C']].agg({'ma': 'max'})

# TODO: Shouldn't the below fail as well?
with tm.assert_produces_warning(FutureWarning) as w:
df.groupby('A').B.agg({'foo': 'count'})
assert "using a dict on a Series for aggregation" in str(w[0].message)


def test_agg_dict_renaming_deprecation_raises():
    """Check that a dict key outside the selected columns raises ``KeyError``.

    Columns ``'B'`` and ``'C'`` are selected from the groupby, and the
    aggregation dict is keyed by ``'ma'``.
    """
    columns = {'A': [1, 1, 1, 2, 2],
               'B': range(5),
               'C': range(5)}
    frame = pd.DataFrame(columns)
    renamer = {'ma': 'max'}

    # The whole groupby/selection/agg chain stays inside the context
    # manager, exactly as the expectation was originally scoped.
    with pytest.raises(KeyError):
        frame.groupby('A')[['B', 'C']].agg(renamer)


def test_agg_compat():
# GH 12334
df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
Expand Down Expand Up @@ -267,11 +286,6 @@ def test_agg_nested_dicts():

g = df.groupby(['A', 'B'])

msg = r'cannot perform renaming for r[1-2] with a nested dictionary'
with tm.assert_raises_regex(SpecificationError, msg):
g.aggregate({'r1': {'C': ['mean', 'sum']},
'r2': {'D': ['mean', 'sum']}})

with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = g.agg({'C': {'ra': ['mean', 'std']},
'D': {'rb': ['mean', 'std']}})
Expand All @@ -283,6 +297,23 @@ def test_agg_nested_dicts():
('rb', 'mean'), ('rb', 'std')])
tm.assert_frame_equal(result, expected, check_like=True)


def test_agg_nested_dicts_raises():
# API change for disallowing these types of nested dicts
df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B': ['one', 'one', 'two', 'two',
'two', 'two', 'one', 'two'],
'C': np.random.randn(8) + 1.0,
'D': np.arange(8)})

g = df.groupby(['A', 'B'])

msg = r'cannot perform renaming for r[1-2] with a nested dictionary'
with tm.assert_raises_regex(SpecificationError, msg):
g.aggregate({'r1': {'C': ['mean', 'sum']},
'r2': {'D': ['mean', 'sum']}})

# same name as the original column
# GH9052
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
Expand Down
30 changes: 30 additions & 0 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,8 @@ def test_groupby_as_index_agg(df):
result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]]))
expected2 = grouped.mean()
expected2['D'] = grouped.sum()['D']
expected2.columns = pd.MultiIndex.from_arrays([
expected2.columns, ['', 'mean', 'sum']])
assert_frame_equal(result2, expected2)

grouped = df.groupby('A', as_index=True)
Expand All @@ -561,6 +563,7 @@ def test_groupby_as_index_agg(df):
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
result3 = grouped['C'].agg({'Q': np.sum})

assert_frame_equal(result3, expected3)

# multi-key
Expand All @@ -574,10 +577,14 @@ def test_groupby_as_index_agg(df):
result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]]))
expected2 = grouped.mean()
expected2['D'] = grouped.sum()['D']
expected2.columns = pd.MultiIndex.from_arrays([
expected2.columns, ['', '', 'mean', 'sum']])
assert_frame_equal(result2, expected2)

expected3 = grouped['C'].sum()
expected3 = DataFrame(expected3).rename(columns={'C': 'Q'})
expected3.columns = pd.MultiIndex.from_arrays([
expected3.columns, ['', '', 'sum']])
result3 = grouped['C'].agg({'Q': np.sum})
assert_frame_equal(result3, expected3)

Expand Down Expand Up @@ -1340,6 +1347,7 @@ def test_multifunc_sum_bug():

grouped = x.groupby('test')
result = grouped.agg({'fl': 'sum', 2: 'size'})
result.columns = result.columns.droplevel(-1)
assert result['fl'].dtype == np.float64


Expand Down Expand Up @@ -1693,3 +1701,25 @@ def test_groupby_agg_ohlc_non_first():
result = df.groupby(pd.Grouper(freq='D')).agg(['sum', 'ohlc'])

tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("select_columns", [True, False])
@pytest.mark.parametrize("agg_argument", [
{'B': 'sum', 'C': 'min'}, # Scalar result
{'B': 'sum', 'C': ['min']}, # Scalar and list
{'B': ['sum'], 'C': ['min']}, # Lists
{'B': {'sum': 'sum'}, 'C': {'min': 'min'}} # deprecated call
])
def test_agg_dict_naming_consistency(select_columns, agg_argument):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI: I plan to move this test to the aggregate sub-directory of the groupby tests.

df = pd.DataFrame([['foo', 1, 1], ['bar', 1, 1]], columns=['A', 'B', 'C'])
expected = pd.DataFrame([[1, 1], [1, 1]], index=pd.Index(
['bar', 'foo'], name='A'), columns=pd.MultiIndex.from_tuples(
(('B', 'sum'), ('C', 'min'))))

with catch_warnings(record=True):
if select_columns:
result = df.groupby('A')[['B', 'C']].agg(agg_argument)
else:
result = df.groupby('A').agg(agg_argument)

tm.assert_frame_equal(result, expected)