Skip to content

Commit

Permalink
HIVE-28675: Maximize the removal of redundant columns from GROUP BY c…
Browse files Browse the repository at this point in the history
…lauses (Stamatis Zampetakis reviewed by Soumyakanti Das, Ramesh Kumar)

Enhance HiveRelFieldTrimmer to remove the maximum number of redundant columns from the GROUP BY clause.

The optimization has the following benefits:
1. Generate more efficient plans by pruning as many columns as possible (less CPU/IO/network cost).
2. Avoid missing optimization opportunities by examining all candidates.

Close apache#5586
  • Loading branch information
zabetak committed Jan 6, 2025
1 parent d824e2f commit b7a3e8b
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -502,28 +502,16 @@ private ImmutableBitSet generateNewGroupset(Aggregate aggregate, ImmutableBitSet
return generateGroupSetIfCardinalitySame(aggregate, originalGroupSet, fieldsUsed);
}

// we have set of unique key, get to the key which is same as group by key
ImmutableBitSet groupByUniqueKey = null;

// Find the maximum number of columns that can be removed by retaining a certain unique key
ImmutableBitSet columnsToRemove = ImmutableBitSet.of();
final ImmutableBitSet unusedGroupingColumns = aggregate.getGroupSet().except(fieldsUsed);
for (ImmutableBitSet key : uniqueKeys) {
if (aggregate.getGroupSet().contains(key)) {
groupByUniqueKey = key;
break;
ImmutableBitSet removeCandidate = unusedGroupingColumns.except(key);
if (aggregate.getGroupSet().contains(key) && removeCandidate.cardinality() > columnsToRemove.cardinality()) {
columnsToRemove = removeCandidate;
}
}

if (groupByUniqueKey == null) {
// group by keys do not represent unique keys
return originalGroupSet;
}

// we know group by key contains primary key and there is at least one column in group by which is not being used
// if that column is not part of key it should be removed
ImmutableBitSet nonKeyColumns = aggregate.getGroupSet().except(groupByUniqueKey);
ImmutableBitSet columnsToRemove = nonKeyColumns.except(fieldsUsed);
ImmutableBitSet newGroupSet = aggregate.getGroupSet().except(columnsToRemove);

return newGroupSet;
return aggregate.getGroupSet().except(columnsToRemove);
}

/**
Expand Down
18 changes: 18 additions & 0 deletions ql/src/test/queries/clientpositive/cbo_groupby_remove_key.q
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
-- Fixture table for GROUP BY column-pruning tests. It declares three independent
-- unique keys over NOT NULL columns: (id), (passport) and the composite (fname, lname).
-- NOTE(review): DISABLE RELY presumably means the constraints are not enforced but may
-- still be trusted by the optimizer; confirm against the Hive constraint documentation.
CREATE TABLE passenger
(
id INT NOT NULL,
fname STRING NOT NULL,
lname STRING NOT NULL,
passport STRING NOT NULL,
UNIQUE (id) DISABLE RELY,
UNIQUE (passport) DISABLE RELY,
UNIQUE (fname, lname) DISABLE RELY
);

-- Every query below groups by more columns than it selects, so some grouping columns
-- are unused by the output. EXPLAIN CBO prints the CBO plan, which shows the group set
-- that remains in the aggregate.
-- Case 1: GROUP BY holds the two single-column keys; exactly one of them is selected.
EXPLAIN CBO SELECT id, COUNT(1) FROM passenger GROUP BY id, passport;
EXPLAIN CBO SELECT passport, COUNT(1) FROM passenger GROUP BY id, passport;
-- Case 2: GROUP BY holds all three keys; a single-column key is selected.
EXPLAIN CBO SELECT id, COUNT(1) FROM passenger GROUP BY id, fname, lname, passport;
EXPLAIN CBO SELECT passport, COUNT(1) FROM passenger GROUP BY id, fname, lname, passport;
-- Case 3: GROUP BY holds all three keys; only one column of the composite key
-- (fname, lname) is selected, so the selected column alone is not a unique key.
EXPLAIN CBO SELECT fname, COUNT(1) FROM passenger GROUP BY id, fname, lname, passport;
EXPLAIN CBO SELECT lname, COUNT(1) FROM passenger GROUP BY id, fname, lname, passport;
-- Case 4: GROUP BY holds all three keys; the full composite key (fname, lname) is selected.
EXPLAIN CBO SELECT fname, lname, COUNT(1) FROM passenger GROUP BY id, fname, lname, passport;
112 changes: 112 additions & 0 deletions ql/src/test/results/clientpositive/llap/cbo_groupby_remove_key.q.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
PREHOOK: query: CREATE TABLE passenger
(
id INT NOT NULL,
fname STRING NOT NULL,
lname STRING NOT NULL,
passport STRING NOT NULL,
UNIQUE (id) DISABLE RELY,
UNIQUE (passport) DISABLE RELY,
UNIQUE (fname, lname) DISABLE RELY
)
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@passenger
POSTHOOK: query: CREATE TABLE passenger
(
id INT NOT NULL,
fname STRING NOT NULL,
lname STRING NOT NULL,
passport STRING NOT NULL,
UNIQUE (id) DISABLE RELY,
UNIQUE (passport) DISABLE RELY,
UNIQUE (fname, lname) DISABLE RELY
)
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@passenger
PREHOOK: query: EXPLAIN CBO SELECT id, COUNT(1) FROM passenger GROUP BY id, passport
PREHOOK: type: QUERY
PREHOOK: Input: default@passenger
#### A masked pattern was here ####
POSTHOOK: query: EXPLAIN CBO SELECT id, COUNT(1) FROM passenger GROUP BY id, passport
POSTHOOK: type: QUERY
POSTHOOK: Input: default@passenger
#### A masked pattern was here ####
CBO PLAN:
HiveAggregate(group=[{0}], agg#0=[count()])
HiveTableScan(table=[[default, passenger]], table:alias=[passenger])

PREHOOK: query: EXPLAIN CBO SELECT passport, COUNT(1) FROM passenger GROUP BY id, passport
PREHOOK: type: QUERY
PREHOOK: Input: default@passenger
#### A masked pattern was here ####
POSTHOOK: query: EXPLAIN CBO SELECT passport, COUNT(1) FROM passenger GROUP BY id, passport
POSTHOOK: type: QUERY
POSTHOOK: Input: default@passenger
#### A masked pattern was here ####
CBO PLAN:
HiveAggregate(group=[{3}], agg#0=[count()])
HiveTableScan(table=[[default, passenger]], table:alias=[passenger])

PREHOOK: query: EXPLAIN CBO SELECT id, COUNT(1) FROM passenger GROUP BY id, fname, lname, passport
PREHOOK: type: QUERY
PREHOOK: Input: default@passenger
#### A masked pattern was here ####
POSTHOOK: query: EXPLAIN CBO SELECT id, COUNT(1) FROM passenger GROUP BY id, fname, lname, passport
POSTHOOK: type: QUERY
POSTHOOK: Input: default@passenger
#### A masked pattern was here ####
CBO PLAN:
HiveAggregate(group=[{0}], agg#0=[count()])
HiveTableScan(table=[[default, passenger]], table:alias=[passenger])

PREHOOK: query: EXPLAIN CBO SELECT passport, COUNT(1) FROM passenger GROUP BY id, fname, lname, passport
PREHOOK: type: QUERY
PREHOOK: Input: default@passenger
#### A masked pattern was here ####
POSTHOOK: query: EXPLAIN CBO SELECT passport, COUNT(1) FROM passenger GROUP BY id, fname, lname, passport
POSTHOOK: type: QUERY
POSTHOOK: Input: default@passenger
#### A masked pattern was here ####
CBO PLAN:
HiveAggregate(group=[{3}], agg#0=[count()])
HiveTableScan(table=[[default, passenger]], table:alias=[passenger])

PREHOOK: query: EXPLAIN CBO SELECT fname, COUNT(1) FROM passenger GROUP BY id, fname, lname, passport
PREHOOK: type: QUERY
PREHOOK: Input: default@passenger
#### A masked pattern was here ####
POSTHOOK: query: EXPLAIN CBO SELECT fname, COUNT(1) FROM passenger GROUP BY id, fname, lname, passport
POSTHOOK: type: QUERY
POSTHOOK: Input: default@passenger
#### A masked pattern was here ####
CBO PLAN:
HiveProject(fname=[$1], _o__c1=[$2])
HiveAggregate(group=[{0, 1}], agg#0=[count()])
HiveTableScan(table=[[default, passenger]], table:alias=[passenger])

PREHOOK: query: EXPLAIN CBO SELECT lname, COUNT(1) FROM passenger GROUP BY id, fname, lname, passport
PREHOOK: type: QUERY
PREHOOK: Input: default@passenger
#### A masked pattern was here ####
POSTHOOK: query: EXPLAIN CBO SELECT lname, COUNT(1) FROM passenger GROUP BY id, fname, lname, passport
POSTHOOK: type: QUERY
POSTHOOK: Input: default@passenger
#### A masked pattern was here ####
CBO PLAN:
HiveProject(lname=[$1], _o__c1=[$2])
HiveAggregate(group=[{0, 2}], agg#0=[count()])
HiveTableScan(table=[[default, passenger]], table:alias=[passenger])

PREHOOK: query: EXPLAIN CBO SELECT fname, lname, COUNT(1) FROM passenger GROUP BY id, fname, lname, passport
PREHOOK: type: QUERY
PREHOOK: Input: default@passenger
#### A masked pattern was here ####
POSTHOOK: query: EXPLAIN CBO SELECT fname, lname, COUNT(1) FROM passenger GROUP BY id, fname, lname, passport
POSTHOOK: type: QUERY
POSTHOOK: Input: default@passenger
#### A masked pattern was here ####
CBO PLAN:
HiveAggregate(group=[{1, 2}], agg#0=[count()])
HiveTableScan(table=[[default, passenger]], table:alias=[passenger])

0 comments on commit b7a3e8b

Please sign in to comment.