Commit de1ea50

NUTCH-676 - MapWritable is written inefficiently and confusingly.
git-svn-id: https://svn.apache.org/repos/asf/lucene/nutch/trunk@736385 13f79535-47bb-0310-9956-ffa450edef68
Tacettin Guney committed Jan 21, 2009
1 parent 478c83b commit de1ea50
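
In short, this change replaces Nutch's custom org.apache.nutch.crawl.MapWritable (now deprecated) with Hadoop's org.apache.hadoop.io.MapWritable for CrawlDatum metadata, and bumps CrawlDatum's on-disk version from 6 to 7, converting pre-7 metadata on the fly when it is read back. A minimal sketch of how callers use the metadata after this change (the key and value strings are illustrative, not from the commit):

import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;

public class MetadataUsage {
  public static void main(String[] args) {
    // The (status, fetchInterval) constructor appears in the diff below;
    // the interval value here is arbitrary.
    CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 30);

    // getMetaData() lazily instantiates an empty Hadoop MapWritable.
    MapWritable meta = datum.getMetaData();
    meta.put(new Text("example-key"), new Text("example-value"));

    System.out.println(datum);  // toString() now prints each metadata entry
  }
}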
Showing 7 changed files with 52 additions and 247 deletions.
3 changes: 3 additions & 0 deletions CHANGES.txt
@@ -316,6 +316,9 @@ Unreleased changes (1.0-dev)

118. NUTCH-681 - parse-mp3 compilation problem.
(Wildan Maulana via dogacan)

119. NUTCH-676 - MapWritable is written inefficiently and confusingly.
(dogacan)

Release 0.9 - 2007-04-02

75 changes: 45 additions & 30 deletions src/java/org/apache/nutch/crawl/CrawlDatum.java
@@ -19,17 +19,18 @@

import java.io.*;
import java.util.*;
import java.util.Map.Entry;

import org.apache.hadoop.io.*;
import org.apache.nutch.util.*;

/* The crawl state of a url. */
public class CrawlDatum implements WritableComparable, Cloneable {
public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
public static final String GENERATE_DIR_NAME = "crawl_generate";
public static final String FETCH_DIR_NAME = "crawl_fetch";
public static final String PARSE_DIR_NAME = "crawl_parse";

private final static byte CUR_VERSION = 6;
private final static byte CUR_VERSION = 7;

/** Compatibility values for on-the-fly conversion from versions < 5. */
private static final byte OLD_STATUS_SIGNATURE = 0;
@@ -118,7 +119,7 @@ public class CrawlDatum implements WritableComparable, Cloneable {
private float score = 1.0f;
private byte[] signature = null;
private long modifiedTime;
private MapWritable metaData;
private org.apache.hadoop.io.MapWritable metaData;

public static boolean hasDbStatus(CrawlDatum datum) {
if (datum.status <= STATUS_DB_MAX) return true;
@@ -131,10 +132,11 @@ public static boolean hasFetchStatus(CrawlDatum datum) {
}

public CrawlDatum() {
metaData = new MapWritable();
metaData = new org.apache.hadoop.io.MapWritable();
}

public CrawlDatum(int status, int fetchInterval) {
this();
this.status = (byte)status;
this.fetchInterval = fetchInterval;
}
@@ -201,14 +203,16 @@ public void setSignature(byte[] signature) {
this.signature = signature;
}

public void setMetaData(MapWritable mapWritable) {this.metaData = mapWritable; }
public void setMetaData(org.apache.hadoop.io.MapWritable mapWritable) {
this.metaData = mapWritable;
}

/**
* returns a MapWritable if it was set or read in @see readFields(DataInput),
* returns empty map in case CrawlDatum was freshly created (lazily instantiated).
*/
public MapWritable getMetaData() {
if (this.metaData == null) this.metaData = new MapWritable();
public org.apache.hadoop.io.MapWritable getMetaData() {
if (this.metaData == null) this.metaData = new org.apache.hadoop.io.MapWritable();
return this.metaData;
}

@@ -223,7 +227,6 @@ public static CrawlDatum read(DataInput in) throws IOException {
return result;
}


public void readFields(DataInput in) throws IOException {
byte version = in.readByte(); // read version
if (version > CUR_VERSION) // check version
@@ -244,10 +247,20 @@ public void readFields(DataInput in) throws IOException {
in.readFully(signature);
} else signature = null;
}
metaData = new org.apache.hadoop.io.MapWritable();
if (version > 3) {
metaData.clear();
if (in.readBoolean()) {
metaData.readFields(in);
if (version < 7) {
MapWritable oldMetaData = new MapWritable();
if (in.readBoolean()) {
oldMetaData.readFields(in);
}
for (Writable key : oldMetaData.keySet()) {
metaData.put(key, oldMetaData.get(key));
}
} else {
if (in.readBoolean()) {
metaData.readFields(in);
}
}
}
// translate status codes
@@ -278,7 +291,7 @@ public void write(DataOutput out) throws IOException {
out.writeByte(signature.length);
out.write(signature);
}
if (metaData != null && metaData.size() > 0) {
if (metaData.size() > 0) {
out.writeBoolean(true);
metaData.write(out);
} else {
@@ -295,7 +308,7 @@ public void set(CrawlDatum that) {
this.score = that.score;
this.modifiedTime = that.modifiedTime;
this.signature = that.signature;
this.metaData = new MapWritable(that.metaData); // make a deep copy
this.metaData = new org.apache.hadoop.io.MapWritable(that.metaData); // make a deep copy
}


@@ -304,8 +317,7 @@ public void set(CrawlDatum that) {
//

/** Sort by decreasing score. */
public int compareTo(Object o) {
CrawlDatum that = (CrawlDatum)o;
public int compareTo(CrawlDatum that) {
if (that.score != this.score)
return (that.score - this.score) > 0 ? 1 : -1;
if (that.status != this.status)
@@ -367,7 +379,7 @@ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
//

public String toString() {
StringBuffer buf = new StringBuffer();
StringBuilder buf = new StringBuilder();
buf.append("Version: " + CUR_VERSION + "\n");
buf.append("Status: " + getStatus() + " (" + getStatusName(getStatus()) + ")\n");
buf.append("Fetch time: " + new Date(getFetchTime()) + "\n");
@@ -377,9 +389,23 @@
(getFetchInterval() / FetchSchedule.SECONDS_PER_DAY) + " days)\n");
buf.append("Score: " + getScore() + "\n");
buf.append("Signature: " + StringUtil.toHexString(getSignature()) + "\n");
buf.append("Metadata: " + (metaData != null ? metaData.toString() : "null") + "\n");
buf.append("Metadata: ");
for (Entry<Writable, Writable> e : metaData.entrySet()) {
buf.append(e.getKey());
buf.append(": ");
buf.append(e.getValue());
}
buf.append('\n');
return buf.toString();
}

private boolean metadataEquals(org.apache.hadoop.io.MapWritable otherMetaData) {
HashSet<Entry<Writable, Writable>> set1 =
new HashSet<Entry<Writable,Writable>>(metaData.entrySet());
HashSet<Entry<Writable, Writable>> set2 =
new HashSet<Entry<Writable,Writable>>(otherMetaData.entrySet());
return set1.equals(set2);
}

public boolean equals(Object o) {
if (!(o instanceof CrawlDatum))
@@ -394,18 +420,7 @@ public boolean equals(Object o) {
(SignatureComparator._compare(this.signature, other.signature) == 0) &&
(this.score == other.score);
if (!res) return res;
// allow zero-sized metadata to be equal to null metadata
if (this.metaData == null) {
if (other.metaData != null && other.metaData.size() > 0) return false;
else return true;
} else {
if (other.metaData == null) {
if (this.metaData.size() == 0) return true;
else return false;
} else {
return this.metaData.equals(other.metaData);
}
}
return metadataEquals(other.metaData);
}

public int hashCode() {
@@ -416,7 +431,7 @@ public int hashCode() {
signature[i+2] << 8 + signature[i+3]);
}
}
if (metaData != null) res ^= metaData.hashCode();
res ^= metaData.entrySet().hashCode();
return
res ^ status ^
((int)fetchTime) ^
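
The readFields/write changes above are the usual versioned-Writable pattern: data written before this change (version < 7) is read through the deprecated Nutch MapWritable and copied entry by entry into the Hadoop one, and metaData is now allocated eagerly in the constructor and in readFields(), which is what lets write(), equals(), and hashCode() drop their null checks. A round-trip sketch of the new version-7 format (hypothetical test code, not part of the commit):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;

public class CrawlDatumRoundTrip {
  public static void main(String[] args) throws IOException {
    CrawlDatum written = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 30);
    written.getMetaData().put(new Text("k"), new Text("v"));

    // write() emits the version byte (7), the fixed fields, then a boolean
    // presence flag followed by the MapWritable when metadata is non-empty.
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    written.write(new DataOutputStream(bytes));

    CrawlDatum read = new CrawlDatum();
    read.readFields(new DataInputStream(
        new ByteArrayInputStream(bytes.toByteArray())));

    // equals() compares metadata via entry sets (see metadataEquals above).
    System.out.println(written.equals(read));  // expected: true
  }
}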
2 changes: 1 addition & 1 deletion src/java/org/apache/nutch/crawl/CrawlDbMerger.java
@@ -53,7 +53,7 @@ public class CrawlDbMerger extends Configured implements Tool {
private static final Log LOG = LogFactory.getLog(CrawlDbMerger.class);

public static class Merger extends MapReduceBase implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {
private MapWritable meta = new MapWritable();
private org.apache.hadoop.io.MapWritable meta = new org.apache.hadoop.io.MapWritable();
private CrawlDatum res = new CrawlDatum();
private FetchSchedule schedule;

1 change: 1 addition & 0 deletions src/java/org/apache/nutch/crawl/MapWritable.java
@@ -57,6 +57,7 @@
* into the header of each MapWritable that uses these types.
*
* @author Stefan Groschupf
* @deprecated Use org.apache.hadoop.io.MapWritable instead.
*/
public class MapWritable implements Writable {

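
For code still on the deprecated class, migration is essentially the import swap performed in the two tools below: org.apache.hadoop.io.MapWritable implements java.util.Map<Writable, Writable>, so existing put/get call sites compile unchanged. A before/after sketch (the map contents are illustrative):

// before: import org.apache.nutch.crawl.MapWritable;
// after:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

public class MigrationSketch {
  public static void main(String[] args) {
    MapWritable meta = new MapWritable();
    meta.put(new Text("retries"), new IntWritable(2));
    Writable value = meta.get(new Text("retries"));  // lookup by key equality
    System.out.println(value);  // prints: 2
  }
}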
@@ -26,6 +26,7 @@
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.UTF8;
import org.apache.hadoop.io.Writable;
@@ -44,7 +45,6 @@
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.crawl.MapWritable;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

2 changes: 1 addition & 1 deletion src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java
@@ -21,6 +21,7 @@
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileInputFormat;
@@ -37,7 +38,6 @@
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.crawl.MapWritable;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.scoring.webgraph.Node;
import org.apache.nutch.util.FSUtils;
(diff for the remaining changed file did not load)
