Skip to content

Commit

Permalink
Reduce memory via Collector
Browse files: browse the repository at this point in the history
TopDocs with MatchAllDocsQuery can use a lot of memory if the index has
many docs, even when the hits are only being counted. Using the Collector
interface avoids this. Also, there's no longer a real distinction between
--output-limit and --query-limit.
  • Loading branch information
Joel Barry committed May 16, 2016
1 parent 6957aec commit e33b583
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 27 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ project:
%id-file file) (required, scriptFile may
contain -q and -o)
--query-field <arg> default field for query
--query-limit <arg> max number of query hits to process
--query-limit <arg> same as --output-limit
--regex <arg> filter query by regex, syntax is field:/regex/
--show-hits show total hit count
--show-id show Lucene document id in results
Expand Down
77 changes: 51 additions & 26 deletions src/main/java/com/basistech/lucene/tools/LuceneQueryTool.java
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.tools.ant.types.Commandline;
Expand Down Expand Up @@ -99,7 +100,7 @@
* %id-file file) (required, scriptFile may
* contain -q and -o)
* --query-field &lt;arg&gt; default field for query
* --query-limit &lt;arg&gt; max number of query hits to process
* --query-limit &lt;arg&gt; same as --output-limit
* --regex &lt;arg&gt; filter query by regex, syntax is field:/regex/
* --show-hits show total hit count
* --show-id show Lucene document id in results
Expand All @@ -112,7 +113,6 @@
public final class LuceneQueryTool {
private List<String> fieldNames;
private Set<String> allFieldNames;
private int queryLimit;
private int outputLimit;
private String regexField;
private Pattern regex;
Expand All @@ -129,7 +129,6 @@ public final class LuceneQueryTool {

LuceneQueryTool(IndexReader reader, PrintStream out) throws IOException {
this.indexReader = reader;
this.queryLimit = Integer.MAX_VALUE;
this.outputLimit = Integer.MAX_VALUE;
this.analyzer = new KeywordAnalyzer();
this.fieldNames = Lists.newArrayList();
Expand Down Expand Up @@ -171,8 +170,9 @@ void setAnalyzer(String analyzerString) {
}
}

// same as outputLimit; for compatibility
void setQueryLimit(int queryLimit) {
this.queryLimit = queryLimit;
this.outputLimit = queryLimit;
}

void setOutputLimit(int outputLimit) {
Expand Down Expand Up @@ -217,7 +217,7 @@ void setDefaultField(String defaultField) {
this.defaultField = defaultField;
}

public void setFormatter(Formatter formatter) {
void setFormatter(Formatter formatter) {
this.formatter = formatter;
}

Expand Down Expand Up @@ -384,9 +384,9 @@ private void countFields() throws IOException {
}
}

private void runQuery(String queryString, PrintStream out)
private void runQuery(String queryString, final PrintStream out)
throws IOException, org.apache.lucene.queryparser.classic.ParseException {
IndexSearcher searcher = new IndexSearcher(indexReader);
final IndexSearcher searcher = new IndexSearcher(indexReader);
docsPrinted = 0;
Query query;
if (queryString == null) {
Expand All @@ -411,27 +411,52 @@ private void runQuery(String queryString, PrintStream out)
}
}

TopDocs topDocs = searcher.search(query, queryLimit);
if (showHits) {
out.println("totalHits: " + topDocs.totalHits);
out.println();
}
Set<String> fieldSet = Sets.newHashSet(fieldNames);
for (int i = 0; i < topDocs.scoreDocs.length && docsPrinted < outputLimit; i++) {
int id = topDocs.scoreDocs[i].doc;
float score = topDocs.scoreDocs[i].score;
Document doc = fieldSet.isEmpty() ? searcher.doc(id) : searcher.doc(id, fieldSet);
boolean passedFilter = regexField == null;
if (regexField != null) {
String value = doc.get(regexField);
if (value != null && regex.matcher(value).matches()) {
passedFilter = true;
final Set<String> fieldSet = Sets.newHashSet(fieldNames);

// use a Collector instead of TopDocs for memory efficiency, especially
// for the %all query
class MyCollector extends SimpleCollector {
private Scorer scorer;
private long totalHits;

@Override
public void collect(int id) throws IOException {
totalHits++;
if (docsPrinted >= outputLimit) {
return;
}

Document doc = fieldSet.isEmpty() ? searcher.doc(id) : searcher.doc(id, fieldSet);
boolean passedFilter = regexField == null;
if (regexField != null) {
String value = doc.get(regexField);
if (value != null && regex.matcher(value).matches()) {
passedFilter = true;
}
}
if (passedFilter) {
float score = scorer.score();
printDocument(doc, id, score, out);
}
}

@Override
public boolean needsScores() {
return true;
}
if (passedFilter) {
printDocument(doc, id, score, out);

@Override
public void setScorer(Scorer scorer) throws IOException {
this.scorer = scorer;
}
}

MyCollector collector = new MyCollector();
searcher.search(query, collector);
if (showHits) {
out.println("totalHits: " + collector.totalHits);
out.println();
}
}

private String formatBinary(byte[] bytes) {
Expand Down Expand Up @@ -536,7 +561,7 @@ private static Options createOptions() {
option = new Option(null, "sort-fields", false, "sort fields within document");
options.addOption(option);

option = new Option(null, "query-limit", true, "max number of query hits to process");
option = new Option(null, "query-limit", true, "same as output-limit");
option.setArgs(1);
options.addOption(option);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;

import java.io.ByteArrayOutputStream;
Expand Down

0 comments on commit e33b583

Please sign in to comment.