NUTCH-762: Generator can generate several segments in one parse of the crawlDB

git-svn-id: https://svn.apache.org/repos/asf/lucene/nutch/trunk@926155 13f79535-47bb-0310-9956-ffa450edef68
jnioche committed Mar 22, 2010
1 parent 671bf17 commit 133cc06
Showing 9 changed files with 439 additions and 310 deletions.
2 changes: 2 additions & 0 deletions CHANGES.txt
@@ -2,6 +2,8 @@ Nutch Change Log

 Unreleased Changes

+* NUTCH-762 Generator can generate several segments in one parse of the crawlDB (jnioche)
+
 * NUTCH-740 Configuration option to override default language for fetched pages (Marcin Okraszewski via jnioche)

 * NUTCH-803 Upgrade to Hadoop 0.20.2 (ab)
53 changes: 39 additions & 14 deletions conf/nutch-default.xml
@@ -514,24 +514,21 @@
 <!-- generate properties -->

 <property>
-  <name>generate.max.per.host</name>
+  <name>generate.max.count</name>
   <value>-1</value>
-  <description>The maximum number of urls per host in a single
-  fetchlist. -1 if unlimited.</description>
+  <description>The maximum number of urls in a single
+  fetchlist. -1 if unlimited. The urls are counted according
+  to the value of the parameter generate.count.mode.
+  </description>
 </property>

 <property>
-  <name>generate.max.per.host.by.ip</name>
-  <value>false</value>
-  <description>If false, same host names are counted. If true,
-  hosts' IP addresses are resolved and the same IP-s are counted.
-
-  -+-+-+- WARNING !!! -+-+-+-
-  When set to true, Generator will create a lot of DNS lookup
-  requests, rapidly. This may cause a DOS attack on
-  remote DNS servers, not to mention increased external traffic
-  and latency. For these reasons when using this option it is
-  required that a local caching DNS be used.</description>
+  <name>generate.count.mode</name>
+  <value>host</value>
+  <description>Determines how the URLs are counted for generate.max.count.
+  Default value is 'host' but can be 'domain'. Note that we do not count
+  per IP in the new version of the Generator.
+  </description>
 </property>
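For readers unfamiliar with how such a cap works, here is a minimal, hypothetical Java sketch of the counting logic behind generate.max.count and generate.count.mode. This is not Nutch's actual Generator code, and the naive "last two labels" domain extraction stands in for a real public-suffix-aware parser:

```java
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

/**
 * Illustrative sketch (not Nutch's actual Generator code) of the semantics
 * of generate.max.count and generate.count.mode: cap the number of URLs
 * admitted to a fetchlist per host or per domain.
 */
public class MaxCountSketch {
  private final int maxCount;                 // generate.max.count (-1 = unlimited)
  private final String countMode;             // generate.count.mode: "host" or "domain"
  private final Map<String, Integer> counts = new HashMap<>();

  public MaxCountSketch(int maxCount, String countMode) {
    this.maxCount = maxCount;
    this.countMode = countMode;
  }

  /**
   * Key used for counting: the full host name, or a naive "last two labels"
   * approximation of the domain (real code would consult a public-suffix list).
   */
  static String countKey(String url, String mode) {
    String host = URI.create(url).getHost();
    if (!"domain".equals(mode)) return host;
    String[] labels = host.split("\\.");
    int n = labels.length;
    return n <= 2 ? host : labels[n - 2] + "." + labels[n - 1];
  }

  /** Records the URL and returns true if it still fits under the cap. */
  public boolean admit(String url) {
    if (maxCount < 0) return true;            // -1 means unlimited
    String key = countKey(url, countMode);
    int seen = counts.getOrDefault(key, 0);
    if (seen >= maxCount) return false;       // cap reached for this key
    counts.put(key, seen + 1);
    return true;
  }

  public static void main(String[] args) {
    MaxCountSketch sel = new MaxCountSketch(2, "host");
    System.out.println(sel.admit("http://a.example.com/1")); // true
    System.out.println(sel.admit("http://a.example.com/2")); // true
    System.out.println(sel.admit("http://a.example.com/3")); // false: cap hit
  }
}
```

With generate.count.mode set to 'domain', URLs from a.example.com and b.example.com would share one counter keyed on example.com.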

<property>
@@ -545,6 +542,34 @@
   updatedb will generate identical fetchlists.</description>
 </property>

+<property>
+  <name>generate.max.per.host</name>
+  <value>-1</value>
+  <description>(Deprecated). Use generate.max.count and generate.count.mode instead.
+  The maximum number of urls per host in a single
+  fetchlist. -1 if unlimited.</description>
+</property>
+
+<!-- urlpartitioner properties -->
+<property>
+  <name>partition.url.mode</name>
+  <value>byHost</value>
+  <description>Determines how to partition URLs. Default value is 'byHost',
+  also takes 'byDomain' or 'byIP'.
+  </description>
+</property>
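The idea behind partition.url.mode can be illustrated with a small, hypothetical sketch: URLs that share a partition key (host, domain, or resolved IP) are routed to the same partition, so a single fetcher task handles them. This is not Nutch's URLPartitioner, just the underlying mechanism, and the domain extraction is again a naive approximation:

```java
import java.net.InetAddress;
import java.net.URI;
import java.net.UnknownHostException;

/**
 * Illustrative sketch of the idea behind partition.url.mode (not Nutch's
 * URLPartitioner): URLs with the same key land in the same partition, so one
 * fetcher task handles them and politeness limits can be enforced.
 */
public class PartitionSketch {
  /** Partition key: host name, naive domain, or resolved IP address. */
  static String partitionKey(String url, String mode) {
    String host = URI.create(url).getHost();
    switch (mode) {
      case "byIP":
        try {
          return InetAddress.getByName(host).getHostAddress(); // DNS lookup!
        } catch (UnknownHostException e) {
          throw new RuntimeException(e);
        }
      case "byDomain": {
        String[] l = host.split("\\.");                        // naive, no suffix list
        return l.length <= 2 ? host : l[l.length - 2] + "." + l[l.length - 1];
      }
      default:                                                 // "byHost"
        return host;
    }
  }

  /** Map the key onto one of numPartitions reduce tasks. */
  static int getPartition(String key, int numPartitions) {
    return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
  }

  public static void main(String[] args) {
    String k1 = partitionKey("http://news.example.org/a", "byHost");
    String k2 = partitionKey("http://news.example.org/b", "byHost");
    // Same host => same key => same partition.
    System.out.println(getPartition(k1, 10) == getPartition(k2, 10)); // true
  }
}
```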

+<property>
+  <name>crawl.gen.delay</name>
+  <value>604800000</value>
+  <description>
+  This value, expressed in milliseconds, defines how long we should keep the lock on records
+  in CrawlDb that were just selected for fetching. If these records are not updated
+  in the meantime, the lock is canceled, i.e. they become eligible for selecting.
+  The default value corresponds to 7 days.
+  </description>
+</property>
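A hedged sketch of the crawl.gen.delay semantics, assumed from the property description rather than taken from Nutch's code: a record selected at generation time stays locked until the delay elapses, after which it is eligible for selection again.

```java
/**
 * Illustrative sketch of the crawl.gen.delay lock, assumed from the property
 * description (not taken from Nutch's code): a record selected for fetching
 * at genTimeMs stays locked until the delay expires, then becomes eligible
 * for selection again.
 */
public class GenDelaySketch {
  static final long GEN_DELAY_MS = 604800000L; // default: 7 days in milliseconds

  /** True if the record may be selected for fetching (again). */
  static boolean eligible(long genTimeMs, long nowMs) {
    return genTimeMs <= 0                      // never generated before
        || nowMs - genTimeMs > GEN_DELAY_MS;   // lock expired without an update
  }

  public static void main(String[] args) {
    long now = System.currentTimeMillis();
    System.out.println(eligible(now - 1000L, now));                 // false: just locked
    System.out.println(eligible(now - 8L * 24 * 3600 * 1000, now)); // true: lock expired
  }
}
```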

<!-- fetcher properties -->

<property>
10 changes: 5 additions & 5 deletions src/java/org/apache/nutch/crawl/Crawl.java
@@ -124,17 +124,17 @@ public static void main(String args[]) throws Exception {
     injector.inject(crawlDb, rootUrlDir);
     int i;
     for (i = 0; i < depth; i++) {             // generate new segment
-      Path segment = generator.generate(crawlDb, segments, -1, topN, System
+      Path[] segs = generator.generate(crawlDb, segments, -1, topN, System
           .currentTimeMillis());
-      if (segment == null) {
+      if (segs == null) {
         LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
         break;
       }
-      fetcher.fetch(segment, threads, org.apache.nutch.fetcher.Fetcher.isParsing(conf)); // fetch it
+      fetcher.fetch(segs[0], threads, org.apache.nutch.fetcher.Fetcher.isParsing(conf)); // fetch it
       if (!Fetcher.isParsing(job)) {
-        parseSegment.parse(segment);    // parse it, if needed
+        parseSegment.parse(segs[0]);    // parse it, if needed
       }
-      crawlDbTool.update(crawlDb, new Path[]{segment}, true, true); // update crawldb
+      crawlDbTool.update(crawlDb, segs, true, true); // update crawldb
     }
     if (i > 0) {
       linkDbTool.invert(linkDb, segments, true, true, false); // invert links
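The loop above consumes only segs[0], which fits its one-segment-per-depth structure; a caller that asked the new Generator for several segments per pass would process each one in turn. A hypothetical sketch of that pattern, where processSegment stands in for the fetch and parse steps (this is not code from the commit):

```java
import java.util.ArrayList;
import java.util.List;

/**
 * Hypothetical sketch of consuming every segment returned by a
 * multi-segment generate() call, rather than only the first one.
 */
public class MultiSegmentSketch {
  /** Stand-in for fetching and parsing one generated segment. */
  static String processSegment(String segment) {
    return "fetched " + segment;
  }

  /** Process all segments, then update the crawldb with the whole batch. */
  static List<String> processAll(String[] segs) {
    List<String> results = new ArrayList<>();
    if (segs == null) return results;        // nothing left to fetch
    for (String seg : segs) {
      results.add(processSegment(seg));      // fetch (and parse) each in turn
    }
    // an updatedb step could then take the whole array at once
    return results;
  }

  public static void main(String[] args) {
    System.out.println(processAll(new String[] {"segment-1", "segment-2"}));
  }
}
```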
