NUTCH-442 - Integrate Solr/Nutch
git-svn-id: https://svn.apache.org/repos/asf/lucene/nutch/trunk@733738 13f79535-47bb-0310-9956-ffa450edef68
Tacettin Guney committed Jan 12, 2009
1 parent 2e44f18 commit 21acba4
Showing 71 changed files with 2,605 additions and 1,220 deletions.
2 changes: 2 additions & 0 deletions CHANGES.txt
@@ -303,6 +303,8 @@ Unreleased changes (1.0-dev)

113. NUTCH-594 - Serve Nutch search results in multiple formats including
XML and JSON. (kubes)

+ 114. NUTCH-442 - Integrate Solr/Nutch. (dogacan, original version by siren)

Release 0.9 - 2007-04-02

3 changes: 3 additions & 0 deletions bin/nutch
@@ -49,6 +49,7 @@ if [ $# = 0 ]; then
echo " invertlinks create a linkdb from parsed segments"
echo " mergelinkdb merge linkdb-s, with optional filtering"
echo " index run the indexer on parsed segments and linkdb"
echo " solrindex run the solr indexer on parsed segments and linkdb"
echo " merge merge several segment indexes"
echo " dedup remove duplicates from a set of segment indexes"
echo " plugin load a plugin and run one of its classes main()"
@@ -230,6 +231,8 @@ elif [ "$COMMAND" = "mergelinkdb" ] ; then
CLASS=org.apache.nutch.crawl.LinkDbMerger
elif [ "$COMMAND" = "index" ] ; then
CLASS=org.apache.nutch.indexer.Indexer
elif [ "$COMMAND" = "solrindex" ] ; then
CLASS=org.apache.nutch.indexer.solr.SolrIndexer
elif [ "$COMMAND" = "dedup" ] ; then
CLASS=org.apache.nutch.indexer.DeleteDuplicates
elif [ "$COMMAND" = "merge" ] ; then
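The new solrindex command dispatches to the SolrIndexer class this commit adds. A hypothetical invocation, assuming the tool takes the Solr URL followed by the crawldb, linkdb, and one or more segment directories (the exact argument order is not shown in this diff; check the class's usage string). The URL and paths below are illustrative, pointing at a local Solr 1.3 instance and a typical crawl layout:

    bin/nutch solrindex http://localhost:8983/solr crawl/crawldb crawl/linkdb crawl/segments/20090112123456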
3 changes: 3 additions & 0 deletions build.xml
@@ -184,6 +184,9 @@
<include name="dom4j-*.jar"/>
<include name="xerces-*.jar"/>
<include name="tika-*.jar"/>
<include name="apache-solr-*.jar"/>
<include name="commons-httpclient-*.jar"/>
<include name="commons-codec-*.jar"/>
<include name="commons-collections-*.jar"/>
<include name="commons-beanutils-*.jar"/>
<include name="commons-cli-*.jar"/>
Binary file added lib/apache-solr-common-1.3.0.jar
Binary file added lib/apache-solr-solrj-1.3.0.jar
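The two jars above bundle the SolrJ client library that the new indexer builds on; the commons-httpclient and commons-codec entries added to build.xml are its HTTP transport dependencies. As a rough sketch of the SolrJ 1.3 API these jars provide (the URL and field names are illustrative, not taken from this patch):

    import org.apache.solr.client.solrj.SolrServer;
    import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
    import org.apache.solr.common.SolrInputDocument;

    public class SolrJSketch {
      public static void main(String[] args) throws Exception {
        // Connect to a running Solr 1.3 instance over HTTP (example URL).
        SolrServer server = new CommonsHttpSolrServer("http://localhost:8983/solr");

        // Build a document; field names must match the Solr schema in use.
        SolrInputDocument doc = new SolrInputDocument();
        doc.addField("id", "http://www.example.com/");
        doc.addField("title", "Example page");
        doc.addField("content", "Fetched and parsed page text goes here.");

        // Send the document and commit so it becomes searchable.
        server.add(doc);
        server.commit();
      }
    }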
37 changes: 20 additions & 17 deletions src/java/org/apache/nutch/crawl/Crawl.java
@@ -62,7 +62,6 @@ public static void main(String args[]) throws Exception {
int threads = job.getInt("fetcher.threads.fetch", 10);
int depth = 5;
long topN = Long.MAX_VALUE;
-
for (int i = 0; i < args.length; i++) {
if ("-dir".equals(args[i])) {
dir = new Path(args[i+1]);
@@ -74,8 +73,8 @@ public static void main(String args[]) throws Exception {
depth = Integer.parseInt(args[i+1]);
i++;
} else if ("-topN".equals(args[i])) {
-   topN = Integer.parseInt(args[i+1]);
-   i++;
+ topN = Integer.parseInt(args[i+1]);
+ i++;
} else if (args[i] != null) {
rootUrlDir = new Path(args[i]);
}
@@ -128,24 +127,28 @@ public static void main(String args[]) throws Exception {
if (i > 0) {
linkDbTool.invert(linkDb, segments, true, true, false); // invert links

- // Delete old indexes
- if (fs.exists(indexes)) {
-   LOG.info("Deleting old indexes: " + indexes);
-   fs.delete(indexes, true);
- }
-
- // Delete old index
- if (fs.exists(index)) {
-   LOG.info("Deleting old merged index: " + index);
-   fs.delete(index, true);
- }
+ if(indexes != null) {
+   // Delete old indexes
+   if (fs.exists(indexes)) {
+     LOG.info("Deleting old indexes: " + indexes);
+     fs.delete(indexes, true);
+   }
+
+   // Delete old index
+   if (fs.exists(index)) {
+     LOG.info("Deleting old merged index: " + index);
+     fs.delete(index, true);
+   }
+ }

// index, dedup & merge
FileStatus[] fstats = fs.listStatus(segments, HadoopFSUtil.getPassDirectoriesFilter(fs));
- indexer.index(indexes, crawlDb, linkDb, HadoopFSUtil.getPaths(fstats));
- dedup.dedup(new Path[] { indexes });
- fstats = fs.listStatus(indexes, HadoopFSUtil.getPassDirectoriesFilter(fs));
- merger.merge(HadoopFSUtil.getPaths(fstats), index, tmpDir);
+ indexer.index(indexes, crawlDb, linkDb, Arrays.asList(HadoopFSUtil.getPaths(fstats)));
+ if(indexes != null) {
+   dedup.dedup(new Path[] { indexes });
+   fstats = fs.listStatus(indexes, HadoopFSUtil.getPassDirectoriesFilter(fs));
+   merger.merge(HadoopFSUtil.getPaths(fstats), index, tmpDir);
+ }
} else {
LOG.warn("No URLs to fetch - check your seed list and URL filters.");
}
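The null guards above let Crawl skip the Lucene index deletion, dedup, and merge steps when no local index directory is in play, presumably the case when output goes to Solr instead. For driving the new indexer from Java rather than the shell script, a hedged sketch: it assumes SolrIndexer follows the Hadoop Tool convention used by Nutch's other tools, and the argument order is a guess, as in the shell example earlier:

    import org.apache.hadoop.util.ToolRunner;
    import org.apache.nutch.indexer.solr.SolrIndexer;
    import org.apache.nutch.util.NutchConfiguration;

    public class SolrIndexDriver {
      public static void main(String[] args) throws Exception {
        // Assumed argument order: <solr url> <crawldb> <linkdb> <segment ...>.
        int res = ToolRunner.run(NutchConfiguration.create(), new SolrIndexer(),
            new String[] { "http://localhost:8983/solr", "crawl/crawldb",
                           "crawl/linkdb", "crawl/segments/20090112123456" });
        System.exit(res);
      }
    }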
3 changes: 1 addition & 2 deletions src/java/org/apache/nutch/crawl/Inlinks.java
@@ -69,7 +69,7 @@ public String toString() {

/** Return the set of anchor texts. Only a single anchor with a given text
* is permitted from a given domain. */
- public String[] getAnchors() throws IOException {
+ public String[] getAnchors() {
HashMap<String, Set<String>> domainToAnchors =
new HashMap<String, Set<String>>();
ArrayList<String> results = new ArrayList<String>();
@@ -97,5 +97,4 @@ public String[] getAnchors() throws IOException {
return results.toArray(new String[results.size()]);
}

-
}