Reduce memory via Collector

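Using TopDocs with MatchAllDocsQuery can consume a lot of memory when the index contains many documents, even when the results are only being counted. Using the Collector interface avoids this, because each hit is handled as it is collected. Also, there is no longer a real distinction between --output-limit and --query-limit: --query-limit is kept for compatibility and now behaves the same as --output-limit.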
joelb-git · May 16, 2016 · e33b583 · e33b583
1 parent 6957aec
commit e33b583
Show file tree

Hide file tree

Showing 3 changed files with 53 additions and 27 deletions.
diff --git a/README.md b/README.md
@@ -44,7 +44,7 @@ project:
                              %id-file file) (required, scriptFile may
                              contain -q and -o)
       --query-field <arg>    default field for query
-      --query-limit <arg>    max number of query hits to process
+      --query-limit <arg>    same as --output-limit
       --regex <arg>          filter query by regex, syntax is field:/regex/
       --show-hits            show total hit count
       --show-id              show Lucene document id in results

diff --git a/src/main/java/com/basistech/lucene/tools/LuceneQueryTool.java b/src/main/java/com/basistech/lucene/tools/LuceneQueryTool.java
@@ -51,7 +51,8 @@
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.SimpleCollector;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.tools.ant.types.Commandline;
@@ -99,7 +100,7 @@
  *                            %id-file file) (required, scriptFile may
  *                            contain -q and -o)
  *     --query-field &lt;arg&gt;    default field for query
- *     --query-limit &lt;arg&gt;    max number of query hits to process
+ *     --query-limit &lt;arg&gt;    same as --output-limit
  *     --regex &lt;arg&gt;          filter query by regex, syntax is field:/regex/
  *     --show-hits            show total hit count
  *     --show-id              show Lucene document id in results
@@ -112,7 +113,6 @@
 public final class LuceneQueryTool {
     private List<String> fieldNames;
     private Set<String> allFieldNames;
-    private int queryLimit;
     private int outputLimit;
     private String regexField;
     private Pattern regex;
@@ -129,7 +129,6 @@ public final class LuceneQueryTool {
 
     LuceneQueryTool(IndexReader reader, PrintStream out) throws IOException {
         this.indexReader = reader;
-        this.queryLimit = Integer.MAX_VALUE;
         this.outputLimit = Integer.MAX_VALUE;
         this.analyzer = new KeywordAnalyzer();
         this.fieldNames = Lists.newArrayList();
@@ -171,8 +170,9 @@ void setAnalyzer(String analyzerString) {
         }
     }
 
+    // same as outputLimit; for compatibility
     void setQueryLimit(int queryLimit) {
-        this.queryLimit = queryLimit;
+        this.outputLimit = queryLimit;
     }
 
     void setOutputLimit(int outputLimit) {
@@ -217,7 +217,7 @@ void setDefaultField(String defaultField) {
         this.defaultField = defaultField;
     }
 
-    public void setFormatter(Formatter formatter) {
+    void setFormatter(Formatter formatter) {
         this.formatter = formatter;
     }
 
@@ -384,9 +384,9 @@ private void countFields() throws IOException {
         }
     }
 
-    private void runQuery(String queryString, PrintStream out)
+    private void runQuery(String queryString, final PrintStream out)
         throws IOException, org.apache.lucene.queryparser.classic.ParseException {
-        IndexSearcher searcher = new IndexSearcher(indexReader);
+        final IndexSearcher searcher = new IndexSearcher(indexReader);
         docsPrinted = 0;
         Query query;
         if (queryString == null) {
@@ -411,27 +411,52 @@ private void runQuery(String queryString, PrintStream out)
             }
         }
 
-        TopDocs topDocs = searcher.search(query, queryLimit);
-        if (showHits) {
-            out.println("totalHits: " + topDocs.totalHits);
-            out.println();
-        }
-        Set<String> fieldSet = Sets.newHashSet(fieldNames);
-        for (int i = 0; i < topDocs.scoreDocs.length && docsPrinted < outputLimit; i++) {
-            int id = topDocs.scoreDocs[i].doc;
-            float score = topDocs.scoreDocs[i].score;
-            Document doc = fieldSet.isEmpty() ? searcher.doc(id) : searcher.doc(id, fieldSet);
-            boolean passedFilter = regexField == null;
-            if (regexField != null) {
-                String value = doc.get(regexField);
-                if (value != null && regex.matcher(value).matches()) {
-                    passedFilter = true;
+        final Set<String> fieldSet = Sets.newHashSet(fieldNames);
+
+        // use a Collector instead of TopDocs for memory efficiency, especially
+        // for the %all query
+        class MyCollector extends SimpleCollector {
+            private Scorer scorer;
+            private long totalHits;
+
+            @Override
+            public void collect(int id) throws IOException {
+                totalHits++;
+                if (docsPrinted >= outputLimit) {
+                    return;
+                }
+
+                Document doc = fieldSet.isEmpty() ? searcher.doc(id) : searcher.doc(id, fieldSet);
+                boolean passedFilter = regexField == null;
+                if (regexField != null) {
+                    String value = doc.get(regexField);
+                    if (value != null && regex.matcher(value).matches()) {
+                        passedFilter = true;
+                    }
                 }
+                if (passedFilter) {
+                    float score = scorer.score();
+                    printDocument(doc, id, score, out);
+                }
+            }
+
+            @Override
+            public boolean needsScores() {
+                return true;
             }
-            if (passedFilter) {
-                printDocument(doc, id, score, out);
+
+            @Override
+            public void setScorer(Scorer scorer) throws IOException {
+                this.scorer = scorer;
             }
         }
+
+        MyCollector collector = new MyCollector();
+        searcher.search(query, collector);
+        if (showHits) {
+            out.println("totalHits: " + collector.totalHits);
+            out.println();
+        }
     }
 
     private String formatBinary(byte[] bytes) {
@@ -536,7 +561,7 @@ private static Options createOptions() {
         option = new Option(null, "sort-fields", false, "sort fields within document");
         options.addOption(option);
 
-        option = new Option(null, "query-limit", true, "max number of query hits to process");
+        option = new Option(null, "query-limit", true, "same as output-limit");
         option.setArgs(1);
         options.addOption(option);
 

diff --git a/src/test/java/com/basistech/lucene/tools/LuceneQueryToolTest.java b/src/test/java/com/basistech/lucene/tools/LuceneQueryToolTest.java
@@ -39,6 +39,7 @@
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.RAMDirectory;
 import org.junit.BeforeClass;
+import org.junit.Ignore;
 import org.junit.Test;
 
 import java.io.ByteArrayOutputStream;