Skip to content

Commit

Permalink
Add filter selectivity estimation for auto search strategy (apache#3848)
Browse files Browse the repository at this point in the history
* Add filter selectivity estimation for auto search strategy

* Addressed comments

* Lazy bitmap materialization for bitmap sampling and java docs

* Addressed comments.

- Fixed wrong non-overlap ratio computation and added unit tests.
- Change Iterable<Integer> to IntIterable
- Remove unnecessary Iterable<Integer>

* Addressed comments

- Split a long ternary operation into if-else blocks
- Add IntListUtils.fromTo()

* Fix test failure and add a test for RangeIntList

* fix code style

* Disabled selectivity estimation for multi-valued dimensions

* Address comment
  • Loading branch information
jihoonson authored and gianm committed Feb 6, 2017
1 parent 8a13a85 commit ddd8c9e
Show file tree
Hide file tree
Showing 20 changed files with 1,107 additions and 277 deletions.
187 changes: 150 additions & 37 deletions benchmarks/src/main/java/io/druid/benchmark/query/SearchBenchmark.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Suppliers;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.hash.Hashing;
Expand All @@ -39,16 +40,25 @@
import io.druid.java.util.common.guava.Sequences;
import io.druid.java.util.common.logger.Logger;
import io.druid.query.Druids;
import io.druid.query.Druids.SearchQueryBuilder;
import io.druid.query.FinalizeResultsQueryRunner;
import io.druid.query.Query;
import io.druid.query.QueryRunner;
import io.druid.query.QueryRunnerFactory;
import io.druid.query.QueryToolChest;
import io.druid.query.Result;
import io.druid.query.aggregation.hyperloglog.HyperUniquesSerde;
import io.druid.query.extraction.DimExtractionFn;
import io.druid.query.extraction.IdentityExtractionFn;
import io.druid.query.extraction.LowerExtractionFn;
import io.druid.query.extraction.StrlenExtractionFn;
import io.druid.query.extraction.SubstringDimExtractionFn;
import io.druid.query.extraction.UpperExtractionFn;
import io.druid.query.filter.AndDimFilter;
import io.druid.query.filter.BoundDimFilter;
import io.druid.query.filter.DimFilter;
import io.druid.query.filter.InDimFilter;
import io.druid.query.filter.SelectorDimFilter;
import io.druid.query.search.SearchQueryQueryToolChest;
import io.druid.query.search.SearchQueryRunnerFactory;
import io.druid.query.search.SearchResultValue;
Expand Down Expand Up @@ -146,56 +156,156 @@ public int columnCacheSizeBytes()
private void setupQueries()
{
// queries for the basic schema
Map<String, Druids.SearchQueryBuilder> basicQueries = new LinkedHashMap<>();
BenchmarkSchemaInfo basicSchema = BenchmarkSchemas.SCHEMA_MAP.get("basic");
final Map<String, SearchQueryBuilder> basicQueries = new LinkedHashMap<>();
final BenchmarkSchemaInfo basicSchema = BenchmarkSchemas.SCHEMA_MAP.get("basic");

{ // basic.A
QuerySegmentSpec intervalSpec = new MultipleIntervalSegmentSpec(Arrays.asList(basicSchema.getDataInterval()));
final List<String> queryTypes = ImmutableList.of("A", "B", "C", "D");
for (final String eachType : queryTypes) {
basicQueries.put(eachType, makeQuery(eachType, basicSchema));
}

SCHEMA_QUERY_MAP.put("basic", basicQueries);
}

/**
 * Looks up the benchmark search query builder for the given query type.
 *
 * @param name        benchmark query type; one of "A", "B", "C", or "D"
 * @param basicSchema schema describing the benchmark data set the query runs against
 *
 * @return the {@link SearchQueryBuilder} for the named query type, or {@code null}
 *         when the name does not match a known type
 */
private static SearchQueryBuilder makeQuery(final String name, final BenchmarkSchemaInfo basicSchema)
{
  // Dispatch on the query-type name; unknown names fall through to null.
  if (name.equals("A")) {
    return basicA(basicSchema);
  } else if (name.equals("B")) {
    return basicB(basicSchema);
  } else if (name.equals("C")) {
    return basicC(basicSchema);
  } else if (name.equals("D")) {
    return basicD(basicSchema);
  }
  return null;
}

private static SearchQueryBuilder basicA(final BenchmarkSchemaInfo basicSchema)
{
final QuerySegmentSpec intervalSpec = new MultipleIntervalSegmentSpec(Arrays.asList(basicSchema.getDataInterval()));

Druids.SearchQueryBuilder queryBuilderA =
Druids.newSearchQueryBuilder()
.dataSource("blah")
.granularity(QueryGranularities.ALL)
.intervals(intervalSpec)
.query("123");
return Druids.newSearchQueryBuilder()
.dataSource("blah")
.granularity(QueryGranularities.ALL)
.intervals(intervalSpec)
.query("123");
}

basicQueries.put("A", queryBuilderA);
private static SearchQueryBuilder basicB(final BenchmarkSchemaInfo basicSchema)
{
final QuerySegmentSpec intervalSpec = new MultipleIntervalSegmentSpec(Arrays.asList(basicSchema.getDataInterval()));

final List<String> dimUniformFilterVals = Lists.newArrayList();
int resultNum = (int) (100000 * 0.1);
int step = 100000 / resultNum;
for (int i = 1; i < 100001 && dimUniformFilterVals.size() < resultNum; i += step) {
dimUniformFilterVals.add(String.valueOf(i));
}

{ // basic.B
QuerySegmentSpec intervalSpec = new MultipleIntervalSegmentSpec(Arrays.asList(basicSchema.getDataInterval()));
List<String> dimHyperUniqueFilterVals = Lists.newArrayList();
resultNum = (int) (100000 * 0.1);
step = 100000 / resultNum;
for (int i = 0; i < 100001 && dimHyperUniqueFilterVals.size() < resultNum; i += step) {
dimHyperUniqueFilterVals.add(String.valueOf(i));
}

List<String> dimUniformFilterVals = Lists.newArrayList();
int resultNum = (int) (100000 * 0.1);
int step = 100000 / resultNum;
for (int i = 1; i < 100001 && dimUniformFilterVals.size() < resultNum; i += step) {
dimUniformFilterVals.add(String.valueOf(i));
final List<DimFilter> dimFilters = Lists.newArrayList();
dimFilters.add(new InDimFilter("dimUniform", dimUniformFilterVals, null));
dimFilters.add(new InDimFilter("dimHyperUnique", dimHyperUniqueFilterVals, null));

return Druids.newSearchQueryBuilder()
.dataSource("blah")
.granularity(QueryGranularities.ALL)
.intervals(intervalSpec)
.query("")
.dimensions(Lists.newArrayList("dimUniform", "dimHyperUnique"))
.filters(new AndDimFilter(dimFilters));
}

private static SearchQueryBuilder basicC(final BenchmarkSchemaInfo basicSchema)
{
final QuerySegmentSpec intervalSpec = new MultipleIntervalSegmentSpec(Arrays.asList(basicSchema.getDataInterval()));

final List<String> dimUniformFilterVals = Lists.newArrayList();
final int resultNum = (int) (100000 * 0.1);
final int step = 100000 / resultNum;
for (int i = 1; i < 100001 && dimUniformFilterVals.size() < resultNum; i += step) {
dimUniformFilterVals.add(String.valueOf(i));
}

final String dimName = "dimUniform";
final List<DimFilter> dimFilters = Lists.newArrayList();
dimFilters.add(new InDimFilter(dimName, dimUniformFilterVals, IdentityExtractionFn.getInstance()));
dimFilters.add(new SelectorDimFilter(dimName, "3", StrlenExtractionFn.instance()));
dimFilters.add(new BoundDimFilter(dimName, "100", "10000", true, true, true, new DimExtractionFn()
{
@Override
public byte[] getCacheKey()
{
return new byte[]{0xF};
}

List<String> dimHyperUniqueFilterVals = Lists.newArrayList();
resultNum = (int) (100000 * 0.1);
step = 100000 / resultNum;
for (int i = 0; i < 100001 && dimHyperUniqueFilterVals.size() < resultNum; i += step) {
dimHyperUniqueFilterVals.add(String.valueOf(i));
@Override
public String apply(String value)
{
return String.valueOf(Long.parseLong(value) + 1);
}

final List<DimFilter> dimFilters = Lists.newArrayList();
dimFilters.add(new InDimFilter("dimUniform", dimUniformFilterVals, null));
dimFilters.add(new InDimFilter("dimHyperUnique", dimHyperUniqueFilterVals, null));
@Override
public boolean preservesOrdering()
{
return false;
}

Druids.SearchQueryBuilder queryBuilderB =
Druids.newSearchQueryBuilder()
.dataSource("blah")
.granularity(QueryGranularities.ALL)
.intervals(intervalSpec)
.query("")
.dimensions(Lists.newArrayList("dimUniform", "dimHyperUnique"))
.filters(new AndDimFilter(dimFilters));
@Override
public ExtractionType getExtractionType()
{
return ExtractionType.ONE_TO_ONE;
}
}, null));
dimFilters.add(new InDimFilter(dimName, dimUniformFilterVals, new LowerExtractionFn(null)));
dimFilters.add(new InDimFilter(dimName, dimUniformFilterVals, new UpperExtractionFn(null)));
dimFilters.add(new InDimFilter(dimName, dimUniformFilterVals, new SubstringDimExtractionFn(1, 3)));

return Druids.newSearchQueryBuilder()
.dataSource("blah")
.granularity(QueryGranularities.ALL)
.intervals(intervalSpec)
.query("")
.dimensions(Lists.newArrayList("dimUniform"))
.filters(new AndDimFilter(dimFilters));
}

private static SearchQueryBuilder basicD(final BenchmarkSchemaInfo basicSchema)
{
final QuerySegmentSpec intervalSpec = new MultipleIntervalSegmentSpec(Arrays.asList(basicSchema.getDataInterval()));

basicQueries.put("B", queryBuilderB);
final List<String> dimUniformFilterVals = Lists.newArrayList();
final int resultNum = (int) (100000 * 0.1);
final int step = 100000 / resultNum;
for (int i = 1; i < 100001 && dimUniformFilterVals.size() < resultNum; i += step) {
dimUniformFilterVals.add(String.valueOf(i));
}

SCHEMA_QUERY_MAP.put("basic", basicQueries);
final String dimName = "dimUniform";
final List<DimFilter> dimFilters = Lists.newArrayList();
dimFilters.add(new InDimFilter(dimName, dimUniformFilterVals, null));
dimFilters.add(new SelectorDimFilter(dimName, "3", null));
dimFilters.add(new BoundDimFilter(dimName, "100", "10000", true, true, true, null, null));
dimFilters.add(new InDimFilter(dimName, dimUniformFilterVals, null));
dimFilters.add(new InDimFilter(dimName, dimUniformFilterVals, null));
dimFilters.add(new InDimFilter(dimName, dimUniformFilterVals, null));

return Druids.newSearchQueryBuilder()
.dataSource("blah")
.granularity(QueryGranularities.ALL)
.intervals(intervalSpec)
.query("")
.dimensions(Lists.newArrayList("dimUniform"))
.filters(new AndDimFilter(dimFilters));
}

@Setup
Expand Down Expand Up @@ -357,7 +467,10 @@ public void queryMultiQueryableIndex(Blackhole blackhole) throws Exception
);

Sequence<Result<SearchResultValue>> queryResult = theRunner.run(query, Maps.<String, Object>newHashMap());
List<Result<SearchResultValue>> results = Sequences.toList(queryResult, Lists.<Result<SearchResultValue>>newArrayList());
List<Result<SearchResultValue>> results = Sequences.toList(
queryResult,
Lists.<Result<SearchResultValue>>newArrayList()
);

for (Result<SearchResultValue> result : results) {
List<SearchHit> hits = result.getValue().getValue();
Expand Down
44 changes: 40 additions & 4 deletions processing/src/main/java/io/druid/query/filter/Filter.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
package io.druid.query.filter;

import io.druid.collections.bitmap.ImmutableBitmap;
import io.druid.segment.ColumnSelector;
import io.druid.segment.ColumnSelectorFactory;

/**
Expand All @@ -30,26 +31,61 @@ public interface Filter
* Get a bitmap index, indicating rows that match this filter.
*
* @param selector Object used to retrieve bitmap indexes
*
* @return A bitmap indicating rows that match this filter.
*
* @see Filter#estimateSelectivity(BitmapIndexSelector)
*/
ImmutableBitmap getBitmapIndex(BitmapIndexSelector selector);


/**
* Estimate selectivity of this filter.
* This method can be used for cost-based query planning like in {@link io.druid.query.search.search.AutoStrategy}.
* To avoid significant performance degradation for calculating the exact cost,
* implementation of this method targets to achieve rapid selectivity estimation
* with reasonable sacrifice of the accuracy.
* As a result, the estimated selectivity might be different from the exact value.
*
* @param indexSelector Object used to retrieve bitmap indexes
*
* @return an estimated selectivity ranging from 0 (filter selects no rows) to 1 (filter selects all rows).
*
* @see Filter#getBitmapIndex(BitmapIndexSelector)
*/
public ImmutableBitmap getBitmapIndex(BitmapIndexSelector selector);
double estimateSelectivity(BitmapIndexSelector indexSelector);


/**
* Get a ValueMatcher that applies this filter to row values.
*
* @param factory Object used to create ValueMatchers
*
* @return ValueMatcher that applies this filter to row values.
*/
public ValueMatcher makeMatcher(ColumnSelectorFactory factory);
ValueMatcher makeMatcher(ColumnSelectorFactory factory);


/**
* Indicates whether this filter can return a bitmap index for filtering, based on
* the information provided by the input BitmapIndexSelector.
*
* @param selector Object used to retrieve bitmap indexes
* @return true if this Filter can provide a bitmap index using the selector, false otherwise
*
* @return true if this Filter can provide a bitmap index using the selector, false otherwise.
*/
boolean supportsBitmapIndex(BitmapIndexSelector selector);


/**
* Indicates whether this filter supports selectivity estimation.
* A filter supports selectivity estimation if it supports bitmap index and
* the dimension which the filter evaluates does not have multi values.
*
* @param columnSelector Object to check the dimension has multi values.
* @param indexSelector Object used to retrieve bitmap indexes
*
* @return true if this Filter supports selectivity estimation, false otherwise.
*/
public boolean supportsBitmapIndex(BitmapIndexSelector selector);
boolean supportsSelectivityEstimation(ColumnSelector columnSelector, BitmapIndexSelector indexSelector);
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
package io.druid.query.search.search;

import com.metamx.emitter.EmittingLogger;
import io.druid.collections.bitmap.ImmutableBitmap;
import io.druid.query.dimension.DimensionSpec;
import io.druid.query.filter.BitmapIndexSelector;
import io.druid.segment.ColumnSelectorBitmapIndexSelector;
Expand Down Expand Up @@ -58,18 +57,12 @@ public List<SearchQueryExecutor> getExecutionPlan(SearchQuery query, Segment seg
index
);

// Index-only plan is used only when any filter is not specified or every filter supports bitmap indexes.
// Index-only plan is used only when any filter is not specified or the filter supports bitmap indexes.
//
// Note: if some filters support bitmap indexes but others are not, the current implementation always employs
// the cursor-based plan. This can be more optimized. One possible optimization is generating a bitmap index
// from the non-bitmap-support filter, and then use it to compute the filtered result by intersecting bitmaps.
if (filter == null || filter.supportsBitmapIndex(selector)) {
final ImmutableBitmap timeFilteredBitmap = UseIndexesStrategy.makeTimeFilteredBitmap(
index,
segment,
filter,
interval
);
// from the non-bitmap-support filters, and then use it to compute the filtered result by intersecting bitmaps.
if (filter == null || filter.supportsSelectivityEstimation(index, selector)) {
final List<DimensionSpec> dimsToSearch = getDimsToSearch(
index.getAvailableDimensions(),
query.getDimensions()
Expand All @@ -84,15 +77,19 @@ public List<SearchQueryExecutor> getExecutionPlan(SearchQuery query, Segment seg
// * (search predicate processing cost)
final SearchQueryDecisionHelper helper = getDecisionHelper(index);
final double useIndexStrategyCost = helper.getBitmapIntersectCost() * computeTotalCard(index, dimsToSearch);
final double cursorOnlyStrategyCost =
(timeFilteredBitmap == null ? index.getNumRows() : timeFilteredBitmap.size()) * dimsToSearch.size();
log.debug("Use-index strategy cost: %f, cursor-only strategy cost: %f",
useIndexStrategyCost, cursorOnlyStrategyCost
final double cursorOnlyStrategyCost = (filter == null ? 1. : filter.estimateSelectivity(selector))
* selector.getNumRows()
* dimsToSearch.size();

log.debug(
"Use-index strategy cost: %f, cursor-only strategy cost: %f",
useIndexStrategyCost,
cursorOnlyStrategyCost
);

if (useIndexStrategyCost < cursorOnlyStrategyCost) {
log.debug("Use-index execution strategy is selected, query id [%s]", query.getId());
return UseIndexesStrategy.withTimeFilteredBitmap(query, timeFilteredBitmap).getExecutionPlan(query, segment);
return UseIndexesStrategy.of(query).getExecutionPlan(query, segment);
} else {
log.debug("Cursor-only execution strategy is selected, query id [%s]", query.getId());
return CursorOnlyStrategy.of(query).getExecutionPlan(query, segment);
Expand Down
Loading

0 comments on commit ddd8c9e

Please sign in to comment.