NUTCH-442 - Integrate Solr/Nutch
git-svn-id: https://svn.apache.org/repos/asf/lucene/nutch/trunk@733738 13f79535-47bb-0310-9956-ffa450edef68
Tacettin Guney committed Jan 12, 2009
1 parent 2e44f18 commit 21acba4
Showing 71 changed files with 2,605 additions and 1,220 deletions.
2 changes: 2 additions & 0 deletions CHANGES.txt
@@ -303,6 +303,8 @@ Unreleased changes (1.0-dev)

113. NUTCH-594 - Serve Nutch search results in multiple formats including
XML and JSON. (kubes)

+ 114. NUTCH-442 - Integrate Solr/Nutch. (dogacan, original version by siren)

Release 0.9 - 2007-04-02

3 changes: 3 additions & 0 deletions bin/nutch
@@ -49,6 +49,7 @@ if [ $# = 0 ]; then
echo " invertlinks create a linkdb from parsed segments"
echo " mergelinkdb merge linkdb-s, with optional filtering"
echo " index run the indexer on parsed segments and linkdb"
echo " solrindex run the solr indexer on parsed segments and linkdb"
echo " merge merge several segment indexes"
echo " dedup remove duplicates from a set of segment indexes"
echo " plugin load a plugin and run one of its classes main()"
@@ -230,6 +231,8 @@ elif [ "$COMMAND" = "mergelinkdb" ] ; then
CLASS=org.apache.nutch.crawl.LinkDbMerger
elif [ "$COMMAND" = "index" ] ; then
CLASS=org.apache.nutch.indexer.Indexer
elif [ "$COMMAND" = "solrindex" ] ; then
CLASS=org.apache.nutch.indexer.solr.SolrIndexer
elif [ "$COMMAND" = "dedup" ] ; then
CLASS=org.apache.nutch.indexer.DeleteDuplicates
elif [ "$COMMAND" = "merge" ] ; then
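The new solrindex command dispatches to the SolrIndexer class this commit adds. A hypothetical invocation, assuming the tool takes the Solr URL followed by the crawldb, linkdb, and one or more segment directories (the exact argument order is not shown in this diff; check the class's usage string). The URL and paths below are illustrative, pointing at a local Solr 1.3 instance and a typical crawl layout:

    bin/nutch solrindex http://localhost:8983/solr crawl/crawldb crawl/linkdb crawl/segments/20090112123456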
3 changes: 3 additions & 0 deletions build.xml
@@ -184,6 +184,9 @@
<include name="dom4j-*.jar"/>
<include name="xerces-*.jar"/>
<include name="tika-*.jar"/>
<include name="apache-solr-*.jar"/>
<include name="commons-httpclient-*.jar"/>
<include name="commons-codec-*.jar"/>
<include name="commons-collections-*.jar"/>
<include name="commons-beanutils-*.jar"/>
<include name="commons-cli-*.jar"/>
Binary file added lib/apache-solr-common-1.3.0.jar
Binary file added lib/apache-solr-solrj-1.3.0.jar
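The two jars above bundle the SolrJ client library that the new indexer builds on; the commons-httpclient and commons-codec entries added to build.xml are its HTTP transport dependencies. As a rough sketch of the SolrJ 1.3 API these jars provide (the URL and field names are illustrative, not taken from this patch):

    import org.apache.solr.client.solrj.SolrServer;
    import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
    import org.apache.solr.common.SolrInputDocument;

    public class SolrJSketch {
      public static void main(String[] args) throws Exception {
        // Connect to a running Solr 1.3 instance over HTTP (example URL).
        SolrServer server = new CommonsHttpSolrServer("http://localhost:8983/solr");

        // Build a document; field names must match the Solr schema in use.
        SolrInputDocument doc = new SolrInputDocument();
        doc.addField("id", "http://www.example.com/");
        doc.addField("title", "Example page");
        doc.addField("content", "Fetched and parsed page text goes here.");

        // Send the document and commit so it becomes searchable.
        server.add(doc);
        server.commit();
      }
    }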
37 changes: 20 additions & 17 deletions src/java/org/apache/nutch/crawl/Crawl.java
@@ -62,7 +62,6 @@ public static void main(String args[]) throws Exception {
int threads = job.getInt("fetcher.threads.fetch", 10);
int depth = 5;
long topN = Long.MAX_VALUE;
-
for (int i = 0; i < args.length; i++) {
if ("-dir".equals(args[i])) {
dir = new Path(args[i+1]);
@@ -74,8 +73,8 @@ public static void main(String args[]) throws Exception {
depth = Integer.parseInt(args[i+1]);
i++;
} else if ("-topN".equals(args[i])) {
-   topN = Integer.parseInt(args[i+1]);
-   i++;
+ topN = Integer.parseInt(args[i+1]);
+ i++;
} else if (args[i] != null) {
rootUrlDir = new Path(args[i]);
}
@@ -128,24 +127,28 @@ public static void main(String args[]) throws Exception {
if (i > 0) {
linkDbTool.invert(linkDb, segments, true, true, false); // invert links

- // Delete old indexes
- if (fs.exists(indexes)) {
-   LOG.info("Deleting old indexes: " + indexes);
-   fs.delete(indexes, true);
- }
-
- // Delete old index
- if (fs.exists(index)) {
-   LOG.info("Deleting old merged index: " + index);
-   fs.delete(index, true);
- }
+ if(indexes != null) {
+   // Delete old indexes
+   if (fs.exists(indexes)) {
+     LOG.info("Deleting old indexes: " + indexes);
+     fs.delete(indexes, true);
+   }
+
+   // Delete old index
+   if (fs.exists(index)) {
+     LOG.info("Deleting old merged index: " + index);
+     fs.delete(index, true);
+   }
+ }

// index, dedup & merge
FileStatus[] fstats = fs.listStatus(segments, HadoopFSUtil.getPassDirectoriesFilter(fs));
- indexer.index(indexes, crawlDb, linkDb, HadoopFSUtil.getPaths(fstats));
- dedup.dedup(new Path[] { indexes });
- fstats = fs.listStatus(indexes, HadoopFSUtil.getPassDirectoriesFilter(fs));
- merger.merge(HadoopFSUtil.getPaths(fstats), index, tmpDir);
+ indexer.index(indexes, crawlDb, linkDb, Arrays.asList(HadoopFSUtil.getPaths(fstats)));
+ if(indexes != null) {
+   dedup.dedup(new Path[] { indexes });
+   fstats = fs.listStatus(indexes, HadoopFSUtil.getPassDirectoriesFilter(fs));
+   merger.merge(HadoopFSUtil.getPaths(fstats), index, tmpDir);
+ }
} else {
LOG.warn("No URLs to fetch - check your seed list and URL filters.");
}
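The null guards above let Crawl skip the Lucene index deletion, dedup, and merge steps when no local index directory is in play, presumably the case when output goes to Solr instead. For driving the new indexer from Java rather than the shell script, a hedged sketch: it assumes SolrIndexer follows the Hadoop Tool convention used by Nutch's other tools, and the argument order is a guess, as in the shell example earlier:

    import org.apache.hadoop.util.ToolRunner;
    import org.apache.nutch.indexer.solr.SolrIndexer;
    import org.apache.nutch.util.NutchConfiguration;

    public class SolrIndexDriver {
      public static void main(String[] args) throws Exception {
        // Assumed argument order: <solr url> <crawldb> <linkdb> <segment ...>.
        int res = ToolRunner.run(NutchConfiguration.create(), new SolrIndexer(),
            new String[] { "http://localhost:8983/solr", "crawl/crawldb",
                           "crawl/linkdb", "crawl/segments/20090112123456" });
        System.exit(res);
      }
    }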
3 changes: 1 addition & 2 deletions src/java/org/apache/nutch/crawl/Inlinks.java
@@ -69,7 +69,7 @@ public String toString() {

/** Return the set of anchor texts. Only a single anchor with a given text
* is permitted from a given domain. */
- public String[] getAnchors() throws IOException {
+ public String[] getAnchors() {
HashMap<String, Set<String>> domainToAnchors =
new HashMap<String, Set<String>>();
ArrayList<String> results = new ArrayList<String>();
@@ -97,5 +97,4 @@ public String[] getAnchors() throws IOException {
return results.toArray(new String[results.size()]);
}

-
}