From 5d5d9385123d14a9a9bf0bda0c9024326d851927 Mon Sep 17 00:00:00 2001 From: qaate47 Date: Tue, 16 May 2023 14:18:23 +0200 Subject: [PATCH] Create QuadString and add simple parser extension to parse NQuad --- .../hdt/exceptions/ParserException.java | 10 + .../java/org/rdfhdt/hdt/quad/QuadString.java | 119 ++++++++++ .../org/rdfhdt/hdt/triples/TripleString.java | 205 +++++++++++++++--- .../org/rdfhdt/hdt/rdf/RDFParserFactory.java | 2 +- .../hdt/rdf/parsers/JenaNodeFormatter.java | 3 + .../rdfhdt/hdt/rdf/parsers/RDFParserRIOT.java | 131 +++++------ .../hdt/rdf/parsers/RDFParserSimple.java | 59 +++-- .../hdt/rdf/parsers/RDFParserSimpleTest.java | 147 ++++++++----- .../util/LargeFakeDataSetStreamSupplier.java | 145 ++++++++----- 9 files changed, 605 insertions(+), 216 deletions(-) create mode 100644 hdt-api/src/main/java/org/rdfhdt/hdt/quad/QuadString.java diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/exceptions/ParserException.java b/hdt-api/src/main/java/org/rdfhdt/hdt/exceptions/ParserException.java index 98ca736d..7f1e4b9a 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/exceptions/ParserException.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/exceptions/ParserException.java @@ -35,6 +35,12 @@ public class ParserException extends Exception { private static final long serialVersionUID = -5159336711525269027L; + private static String createMessage(String message, String line, int location) { + int start = Math.max(0, location - 10); + int end = Math.min(line.length(), location + 10); + return message + " near " + line.substring(start, end); + } + public ParserException() { super(); } @@ -43,6 +49,10 @@ public ParserException(String message) { super(message); } + public ParserException(String message, String line, int location) { + this(createMessage(message, line, location)); + } + public ParserException(Throwable e) { super(e); } diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/quad/QuadString.java b/hdt-api/src/main/java/org/rdfhdt/hdt/quad/QuadString.java 
new file mode 100644 index 00000000..7df8963f --- /dev/null +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/quad/QuadString.java @@ -0,0 +1,119 @@ +package org.rdfhdt.hdt.quad; + +import org.rdfhdt.hdt.exceptions.ParserException; +import org.rdfhdt.hdt.triples.TripleString; + +public class QuadString extends TripleString { + protected CharSequence context; + + public QuadString() { + super(); + context = ""; + } + + public QuadString(CharSequence subject, CharSequence predicate, CharSequence object, CharSequence context) { + super(subject, predicate, object); + this.context = context; + } + + public QuadString(TripleString other) { + super(other); + this.context = other.getGraph(); /* copy the graph label of the source (empty for a plain triple), not its object */ + } + + @Override + public void clear() { + super.clear(); + context = ""; + } + + @Override + public boolean equals(Object other) { + if (context.length() == 0) { + return super.equals(other); + } + if (!(other instanceof QuadString)) { + return false; + } + QuadString qs = (QuadString) other; + return equalsCharSequence(subject, qs.subject) + && equalsCharSequence(predicate, qs.predicate) + && equalsCharSequence(object, qs.object) + && equalsCharSequence(context, qs.context); + } + + @Override + public CharSequence getGraph() { + return context; + } + + @Override + public void setGraph(CharSequence context) { + this.context = context; + } + + @Override + public void setAll(CharSequence subject, CharSequence predicate, CharSequence object) { + setAll(subject, predicate, object, ""); + } + + /** + * Sets all components at once. Useful to reuse existing object instead of + * creating new ones for performance. 
+ * + * @param subject subject + * @param predicate predicate + * @param object object + * @param context context + */ + public void setAll(CharSequence subject, CharSequence predicate, CharSequence object, CharSequence context) { + super.setAll(subject, predicate, object); + this.context = context; + } + + @Override + public boolean match(TripleString pattern) { + if (context.length() != 0 + && !(pattern instanceof QuadString && equalsCharSequence(((QuadString) pattern).context, context))) { + // if a context is defined, we don't match + return false; + } + if (pattern.getSubject().length() == 0 || equalsCharSequence(pattern.getSubject(), this.subject)) { + if (pattern.getPredicate().length() == 0 || equalsCharSequence(pattern.getPredicate(), this.predicate)) { + return pattern.getObject().length() == 0 || equalsCharSequence(pattern.getObject(), this.object); + } + } + return false; + } + + @Override + public boolean isEmpty() { + return super.isEmpty() && context.length() == 0; + } + + @Override + public void read(String line) throws ParserException { + super.read(line, true); + } + + @Override + public void read(String line, int start, int end) throws ParserException { + super.read(line, start, end, true); + } + + @Override + public int hashCode() { + // Same as Objects.hashCode(subject, predicate, object), with fewer + // calls + int s = subject == null ? 0 : subject.hashCode(); + int p = predicate == null ? 0 : predicate.hashCode(); + int o = object == null ? 0 : object.hashCode(); + int c = context == null ? 
0 : context.hashCode(); + return 31 * (31 * (31 * (31 * s) + p) + o) + c; + } + + @Override + public QuadString tripleToString() { + return new QuadString(subject.toString(), predicate.toString(), object.toString(), context.toString()); + } +} diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleString.java b/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleString.java index c8317eb9..a25eae94 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleString.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleString.java @@ -37,9 +37,9 @@ */ public class TripleString { - private CharSequence subject; - private CharSequence predicate; - private CharSequence object; + protected CharSequence subject; + protected CharSequence predicate; + protected CharSequence object; public TripleString() { this.subject = this.predicate = this.object = null; @@ -128,7 +128,7 @@ public void setAll(CharSequence subject, CharSequence predicate, CharSequence ob this.object = object; } - private boolean equalsCharSequence(CharSequence cs1, CharSequence cs2) { + protected boolean equalsCharSequence(CharSequence cs1, CharSequence cs2) { if (cs1 instanceof String && cs2 instanceof String) return cs1.equals(cs2); // use string method if we can @@ -198,11 +198,23 @@ public boolean hasEmpty() { /** * Read from a line, where each component is separated by space. + * * @param line line to read * @throws ParserException if the line is not RDF complient */ public void read(String line) throws ParserException { - read(line, 0, line.length()); + read(line, false); + } + + /** + * Read from a line, where each component is separated by space. 
+ * + * @param line line to read + * @param processQuad process quad + * @throws ParserException if the line is not RDF complient + */ + public void read(String line, boolean processQuad) throws ParserException { + read(line, 0, line.length(), processQuad); } private int searchNextTabOrSpace(String line, int start, int end) { @@ -222,13 +234,72 @@ private int searchNextTabOrSpace(String line, int start, int end) { return -1; } + private int searchBNodeBackward(String line, int start, int end) { + // bn grammar + // BLANK_NODE_LABEL ::= '_:' (PN_CHARS_U | [0-9]) ((PN_CHARS | '.')* + // PN_CHARS)? + // PN_CHARS_BASE ::= [A-Z] | [a-z] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | + // [#x00F8-#x02FF] + // | [#x0370-#x037D] | [#x037F-#x1FFF] | [#x200C-#x200D] | + // [#x2070-#x218F] + // | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | + // [#xFDF0-#xFFFD] + // | [#x10000-#xEFFFF] + // PN_CHARS_U ::= PN_CHARS_BASE | '_' | ':' + // PN_CHARS ::= PN_CHARS_U | '-' | [0-9] | #x00B7 | [#x0300-#x036F] | + // [#x203F-#x2040] + + int loc = end - 1; /* end is exclusive: begin on the last character of the candidate label */ + + while (loc > start) { /* walk backward; the character at start belongs to the object itself */ + switch (line.charAt(loc)) { + case ' ': + case '\t': + if (loc + 2 >= end) { /* no room for "_:" inside [start, end) */ + return -1; + } + if (line.charAt(loc + 1) == '_' && line.charAt(loc + 2) == ':') { + return loc + 1; /* index of the '_' opening the blank node label */ + } + break; + case '^': + case '@': + case '>': + case '<': + case '"': + // it wasn't a bnode + return -1; + default: break; // ignore, we don't check the format + } + loc--; + } + return -1; + } + /** * Read from a line, where each component is separated by space. - * @param line line to read + * + * @param line line to read + * @param start start in the string + * @param end in the string * @throws ParserException if the line is not RDF complient */ public void read(String line, int start, int end) throws ParserException { + read(line, start, end, false); + } + + /** + * Read from a line, where each component is separated by space. 
+ * + * @param line line to read + * @param start start in the string + * @param end in the string + * @param processQuad process quad + * @throws ParserException if the line is not RDF complient + */ + public void read(String line, int start, int end, boolean processQuad) throws ParserException { int split, posa, posb; + // for quad implementation, don't forget to clear the graph this.clear(); // SET SUBJECT @@ -240,9 +311,9 @@ public void read(String line, int start, int end) throws ParserException { return; } if (line.charAt(posa) == '<') { - posa++; // Remove < - if (line.charAt(posb-1) == '>') { - posb--; // Remove > + posa++; // Remove < + if (line.charAt(posb - 1) == '>') { + posb--; // Remove > } } @@ -264,24 +335,78 @@ public void read(String line, int start, int end) throws ParserException { this.setPredicate(UnicodeEscape.unescapeString(line, posa, posb)); - // SET OBJECT - posa = split + 1; - posb = end; + if (processQuad) { + // SET OBJECT + posa = split + 1; + posb = end; - // Remove trailing from NTRIPLES. - if (line.charAt(posb-1) == '.') { - posb--; - } - char prev = line.charAt(posb-1); - if (prev == ' ' || prev == '\t') { - posb--; + // Remove trailing from NTRIPLES. 
+ if (line.charAt(posb - 1) == '.') { + posb--; + } + char prev = line.charAt(posb - 1); + if (prev == ' ' || prev == '\t') { + posb--; + } + + char lastElem = line.charAt(posb - 1); + if (lastElem != '"') { + if (lastElem == '>') { + // can describe an IRI, can be: + // datatype of a literal + // object IRI + // graph IRI + + int iriStart = line.lastIndexOf('<', posb); + + if (iriStart < posa) { + throw new ParserException("end of a '>' without a start '<'", line, posb); + } + if (posa != iriStart && line.charAt(iriStart - 1) != '^') { + this.setGraph(UnicodeEscape.unescapeString(line, iriStart + 1, posb - 1)); + posb = iriStart - 1; + } + // not the current element, literal or object iri + } else { + // end of a lang tag for a literal + // end of an object BNode + // end of a graph BNode + + // '_:' (PN_CHARS_U | [0-9]) ((PN_CHARS | '.')* PN_CHARS)? + // PN_CHARS_U ::= PN_CHARS_BASE | '_' | ':' + // PN_CHARS ::= PN_CHARS_U | '-' | [0-9] | #x00B7 | + // [#x0300-#x036F] | [#x203F-#x2040] + + int bnodeStart = searchBNodeBackward(line, posa, posb); + if (bnodeStart > posa) { + this.setGraph(UnicodeEscape.unescapeString(line, bnodeStart, posb)); /* keep the whole "_:label": a blank node has no delimiters to strip */ + posb = bnodeStart - 1; /* the object ends before the whitespace separating it from the graph */ + } + // not the current element, literal language or object bnode + } + } + // a literal can't describe a graph + } else { + // SET OBJECT + posa = split + 1; + posb = end; + + // Remove trailing from NTRIPLES. + if (line.charAt(posb - 1) == '.') { + posb--; + } + char prev = line.charAt(posb - 1); + if (prev == ' ' || prev == '\t') { + posb--; + } } if (line.charAt(posa) == '<') { posa++; - // Remove trailing > only if < appears, so "some"^^ is kept as-is. - if (posb > posa && line.charAt(posb-1)=='>') { + // Remove trailing > only if < appears, so "some"^^ + // is kept as-is. 
+ if (posb > posa && line.charAt(posb - 1) == '>') { posb--; } } @@ -289,6 +414,7 @@ public void read(String line, int start, int end) throws ParserException { this.setObject(UnicodeEscape.unescapeString(line, posa, posb)); } + /* * (non-Javadoc) * @@ -326,14 +452,25 @@ public final void dumpNtriple(Appendable out) throws IOException { } char o0 = object.charAt(0); - if(o0=='"') { + if (o0 == '"') { UnicodeEscape.escapeString(object.toString(), out); - out.append(" .\n"); - } else if(o0=='_' ||o0=='<' ) { - out.append(object).append(" .\n"); + } else if (o0 == '_' || o0 == '<') { + out.append(object); } else { - out.append('<').append(object).append("> .\n"); + out.append('<').append(object).append(">"); } + + CharSequence graph = getGraph(); + if (graph.length() != 0) { + char g0 = graph.charAt(0); + if (g0 == '<' || g0 == '_') { /* already-delimited IRI or blank node label: written as-is, like the object */ + out.append(' ').append(graph); + } else { + out.append(" <").append(graph).append(">"); + } + } + + out.append(" .\n"); } /** @@ -347,4 +484,20 @@ public TripleString tripleToString() { object.toString() ); } + + /** + * implementation for the graph context + * + * @param context context + */ + public void setGraph(CharSequence context) { + // nothing + } + + /** + * @return graph + */ + public CharSequence getGraph() { + return ""; + } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/RDFParserFactory.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/RDFParserFactory.java index 981930e6..eab64384 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/RDFParserFactory.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/RDFParserFactory.java @@ -58,10 +58,10 @@ public static RDFParserCallback getParserCallback(RDFNotation notation) { public static RDFParserCallback getParserCallback(RDFNotation notation, HDTOptions spec) { switch(notation) { case NTRIPLES: + case NQUAD: if (useSimple(spec)) { return new RDFParserSimple(); } - case NQUAD: case TURTLE: case N3: case RDFXML: diff --git 
a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/JenaNodeFormatter.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/JenaNodeFormatter.java index effb45d0..7cb8ff68 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/JenaNodeFormatter.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/JenaNodeFormatter.java @@ -42,6 +42,9 @@ public static String format(RDFNode n) { } public static String format(Node node) { + if (node == null) { + return ""; + } if (node.isURI()) { return node.getURI(); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserRIOT.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserRIOT.java index 8bd99295..7754dbad 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserRIOT.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserRIOT.java @@ -3,71 +3,73 @@ * Revision: $Rev: 191 $ * Last modified: $Date: 2013-03-03 11:41:43 +0000 (dom, 03 mar 2013) $ * Last modified by: $Author: mario.arias $ - * + *

* This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; * version 3.0 of the License. - * + *

* This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. - * + *

* You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * + *

* Contacting the authors: - * Mario Arias: mario.arias@deri.org - * Javier D. Fernandez: jfergar@infor.uva.es - * Miguel A. Martinez-Prieto: migumar2@infor.uva.es + * Mario Arias: mario.arias@deri.org + * Javier D. Fernandez: jfergar@infor.uva.es + * Miguel A. Martinez-Prieto: migumar2@infor.uva.es */ package org.rdfhdt.hdt.rdf.parsers; -import java.io.FileNotFoundException; -import java.io.InputStream; - import org.apache.jena.graph.Triple; import org.apache.jena.riot.Lang; -import org.apache.jena.riot.RDFDataMgr; import org.apache.jena.riot.RDFParser; -import org.apache.jena.riot.RDFParserBuilder; import org.apache.jena.riot.lang.LabelToNode; import org.apache.jena.riot.system.StreamRDF; import org.apache.jena.sparql.core.Quad; import org.rdfhdt.hdt.enums.RDFNotation; import org.rdfhdt.hdt.exceptions.NotImplementedException; import org.rdfhdt.hdt.exceptions.ParserException; +import org.rdfhdt.hdt.quad.QuadString; import org.rdfhdt.hdt.rdf.RDFParserCallback; import org.rdfhdt.hdt.triples.TripleString; import org.rdfhdt.hdt.util.io.IOUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.FileNotFoundException; +import java.io.InputStream; + /** * @author mario.arias * */ -public class RDFParserRIOT implements RDFParserCallback, StreamRDF { +public class RDFParserRIOT implements RDFParserCallback { private static final Logger log = LoggerFactory.getLogger(RDFParserRIOT.class); - private void parse(InputStream stream, String baseUri, Lang lang, boolean keepBNode) { + + private void parse(InputStream stream, String baseUri, Lang lang, boolean keepBNode, ElemStringBuffer buffer) { if (keepBNode) { - RDFParser.source(stream).base(baseUri).lang(lang).labelToNode(LabelToNode.createUseLabelAsGiven()).parse(this); + RDFParser.source(stream).base(baseUri).lang(lang).labelToNode(LabelToNode.createUseLabelAsGiven()) + .parse(buffer); } else { - RDFParser.source(stream).base(baseUri).lang(lang).parse(this); + 
RDFParser.source(stream).base(baseUri).lang(lang).parse(buffer); } } - private RDFCallback callback; - private final TripleString triple = new TripleString(); - - /* (non-Javadoc) - * @see hdt.rdf.RDFParserCallback#doParse(java.lang.String, java.lang.String, hdt.enums.RDFNotation, hdt.rdf.RDFParserCallback.Callback) + /* + * (non-Javadoc) + * @see hdt.rdf.RDFParserCallback#doParse(java.lang.String, + * java.lang.String, hdt.enums.RDFNotation, + * hdt.rdf.RDFParserCallback.Callback) */ @Override - public void doParse(String fileName, String baseUri, RDFNotation notation, boolean keepBNode, RDFCallback callback) throws ParserException { + public void doParse(String fileName, String baseUri, RDFNotation notation, boolean keepBNode, RDFCallback callback) + throws ParserException { try (InputStream input = IOUtil.getFileInputStream(fileName)) { doParse(input, baseUri, notation, keepBNode, callback); } catch (FileNotFoundException e) { @@ -79,25 +81,26 @@ public void doParse(String fileName, String baseUri, RDFNotation notation, boole } @Override - public void doParse(InputStream input, String baseUri, RDFNotation notation, boolean keepBNode, RDFCallback callback) throws ParserException { - this.callback = callback; + public void doParse(InputStream input, String baseUri, RDFNotation notation, boolean keepBNode, + RDFCallback callback) throws ParserException { try { - switch(notation) { + ElemStringBuffer buffer = new ElemStringBuffer(notation == RDFNotation.NQUAD, callback); + switch (notation) { case NTRIPLES: - parse(input, baseUri, Lang.NTRIPLES, keepBNode); + parse(input, baseUri, Lang.NTRIPLES, keepBNode, buffer); break; case NQUAD: - parse(input, baseUri, Lang.NQUADS, keepBNode); + parse(input, baseUri, Lang.NQUADS, keepBNode, buffer); break; case RDFXML: - parse(input, baseUri, Lang.RDFXML, keepBNode); + parse(input, baseUri, Lang.RDFXML, keepBNode, buffer); break; case N3: case TURTLE: - parse(input, baseUri, Lang.TURTLE, keepBNode); + parse(input, baseUri, 
Lang.TURTLE, keepBNode, buffer); break; default: - throw new NotImplementedException("Parser not found for format "+notation); + throw new NotImplementedException("Parser not found for format " + notation); } } catch (Exception e) { log.error("Unexpected exception.", e); @@ -105,45 +108,45 @@ public void doParse(InputStream input, String baseUri, RDFNotation notation, boo } } - @Override - public void start() { - // TODO Auto-generated method stub - - } - - @Override - public void triple(Triple parsedTriple) { - triple.setAll( - JenaNodeFormatter.format(parsedTriple.getSubject()), - JenaNodeFormatter.format(parsedTriple.getPredicate()), - JenaNodeFormatter.format(parsedTriple.getObject())); - callback.processTriple(triple, 0); - } + private static class ElemStringBuffer implements StreamRDF { + private final TripleString triple; + private final RDFCallback callback; - @Override - public void quad(Quad quad) { - triple.setAll( - JenaNodeFormatter.format(quad.getSubject()), - JenaNodeFormatter.format(quad.getPredicate()), - JenaNodeFormatter.format(quad.getObject())); - callback.processTriple(triple, 0); - } + private ElemStringBuffer(boolean quad, RDFCallback callback) { + this.triple = quad ? 
new QuadString() : new TripleString(); + this.callback = callback; + } - @Override - public void base(String base) { -// System.out.println("Base: "+base); - } + @Override + public void triple(Triple parsedTriple) { + triple.setAll(JenaNodeFormatter.format(parsedTriple.getSubject()), + JenaNodeFormatter.format(parsedTriple.getPredicate()), + JenaNodeFormatter.format(parsedTriple.getObject())); + callback.processTriple(triple, 0); + } - @Override - public void prefix(String prefix, String iri) { -// System.out.println("Prefix: "+prefix+" iri "+iri); - } + @Override + public void quad(Quad quad) { + triple.setAll(JenaNodeFormatter.format(quad.getSubject()), JenaNodeFormatter.format(quad.getPredicate()), + JenaNodeFormatter.format(quad.getObject())); + triple.setGraph(JenaNodeFormatter.format(quad.getGraph())); + callback.processTriple(triple, 0); + } - @Override - public void finish() { - // TODO Auto-generated method stub + @Override + public void start() { + } - } + @Override + public void base(String base) { + } + @Override + public void prefix(String prefix, String iri) { + } + @Override + public void finish() { + } + } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserSimple.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserSimple.java index 2f264455..3f94f290 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserSimple.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserSimple.java @@ -19,27 +19,28 @@ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * * Contacting the authors: - * Mario Arias: mario.arias@deri.org - * Javier D. Fernandez: jfergar@infor.uva.es - * Miguel A. Martinez-Prieto: migumar2@infor.uva.es - * Alejandro Andres: fuzzy.alej@gmail.com + * Mario Arias: mario.arias@deri.org + * Javier D. Fernandez: jfergar@infor.uva.es + * Miguel A. 
Martinez-Prieto: migumar2@infor.uva.es + * Alejandro Andres: fuzzy.alej@gmail.com */ package org.rdfhdt.hdt.rdf.parsers; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; - import org.rdfhdt.hdt.enums.RDFNotation; import org.rdfhdt.hdt.exceptions.ParserException; +import org.rdfhdt.hdt.quad.QuadString; import org.rdfhdt.hdt.rdf.RDFParserCallback; import org.rdfhdt.hdt.triples.TripleString; import org.rdfhdt.hdt.util.io.IOUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; + /** * @author mario.arias * @@ -47,11 +48,15 @@ public class RDFParserSimple implements RDFParserCallback { private static final Logger log = LoggerFactory.getLogger(RDFParserSimple.class); - /* (non-Javadoc) - * @see hdt.rdf.RDFParserCallback#doParse(java.lang.String, java.lang.String, hdt.enums.RDFNotation, hdt.rdf.RDFParserCallback.RDFCallback) + /* + * (non-Javadoc) + * @see hdt.rdf.RDFParserCallback#doParse(java.lang.String, + * java.lang.String, hdt.enums.RDFNotation, + * hdt.rdf.RDFParserCallback.RDFCallback) */ @Override - public void doParse(String fileName, String baseUri, RDFNotation notation, boolean keepBNode, RDFCallback callback) throws ParserException { + public void doParse(String fileName, String baseUri, RDFNotation notation, boolean keepBNode, RDFCallback callback) + throws ParserException { BufferedReader reader; try { reader = IOUtil.getFileReader(fileName); @@ -67,7 +72,8 @@ public void doParse(String fileName, String baseUri, RDFNotation notation, boole } @Override - public void doParse(InputStream input, String baseUri, RDFNotation notation, boolean keepBNode, RDFCallback callback) throws ParserException { + public void doParse(InputStream input, String baseUri, RDFNotation notation, boolean keepBNode, + RDFCallback callback) throws ParserException { 
BufferedReader reader = new BufferedReader(new InputStreamReader(input)); try { doParse(reader, baseUri, notation, keepBNode, callback); @@ -76,12 +82,19 @@ public void doParse(InputStream input, String baseUri, RDFNotation notation, boo } } - private void doParse(BufferedReader reader, String baseUri, RDFNotation notation, boolean keepBNode, RDFCallback callback) throws ParserException { - try { + private void doParse(BufferedReader reader, String baseUri, RDFNotation notation, boolean keepBNode, + RDFCallback callback) throws ParserException { + boolean readQuad = notation == RDFNotation.NQUAD; + try (reader) { String line; long numLine = 1; - TripleString triple = new TripleString(); - while((line=reader.readLine())!=null) { + TripleString triple; + if (readQuad) { + triple = new QuadString(); + } else { + triple = new TripleString(); + } + while ((line = reader.readLine()) != null) { // trim, find start int start = 0; while (start < line.length()) { @@ -93,18 +106,19 @@ private void doParse(BufferedReader reader, String baseUri, RDFNotation notation } // trim, find end int end = line.length() - 1; - while (end >= 0 ) { + while (end >= 0) { char c = line.charAt(end); if (c != ' ' && c != '\t') { break; } end--; } - // check that we have at least one element and this line isn't a comment + // check that we have at least one element and this line isn't a + // comment if (start + 1 < end && line.charAt(start) != '#') { - triple.read(line, start, end); + triple.read(line, start, end, readQuad); if (!triple.hasEmpty()) { - //System.out.println(triple); + // System.out.println(triple); callback.processTriple(triple, 0); } else { log.warn("Could not parse triple at line {}, ignored and not processed.\n{}", numLine, line); @@ -112,8 +126,7 @@ private void doParse(BufferedReader reader, String baseUri, RDFNotation notation } numLine++; } - reader.close(); - }catch(Exception e) { + } catch (Exception e) { log.error("Unexpected exception.", e); throw new 
ParserException(e); } diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/rdf/parsers/RDFParserSimpleTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/rdf/parsers/RDFParserSimpleTest.java index 72a30761..3a1302fb 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/rdf/parsers/RDFParserSimpleTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/rdf/parsers/RDFParserSimpleTest.java @@ -2,6 +2,8 @@ import org.junit.Assert; import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Suite; import org.rdfhdt.hdt.enums.RDFNotation; import org.rdfhdt.hdt.exceptions.ParserException; import org.rdfhdt.hdt.rdf.RDFParserCallback; @@ -15,63 +17,102 @@ import java.io.PrintStream; import java.util.Iterator; -public class RDFParserSimpleTest extends AbstractNTriplesParserTest { - @Override - protected RDFParserCallback createParser() { - return new RDFParserSimple(); - } +@RunWith(Suite.class) +@Suite.SuiteClasses({RDFParserSimpleTest.NTriplesTest.class, RDFParserSimpleTest.NQuadTest.class, + RDFParserSimpleTest.NQuadNoGraphTest.class}) +public class RDFParserSimpleTest { + public static abstract class AbstractRDFParserSimpleTest extends AbstractNTriplesParserTest { + protected final RDFNotation notation; - @Test - public void ingestTest() throws IOException, InterruptedException, ParserException { - LargeFakeDataSetStreamSupplier supplier = - LargeFakeDataSetStreamSupplier.createSupplierWithMaxTriples(1_000_001, 42); - LargeFakeDataSetStreamSupplier supplier2 = - LargeFakeDataSetStreamSupplier.createSupplierWithMaxTriples(1_000_001, 42); - - PipedOutputStream out = new PipedOutputStream(); - PipedInputStream in = new PipedInputStream(); - in.connect(out); - - RuntimeException[] re = new RuntimeException[1]; - Thread t = new Thread(() -> { - try { - Iterator it = supplier.createTripleStringStream(); - PrintStream ps = new PrintStream(out); - while (it.hasNext()) { - TripleString next = it.next(); - next.dumpNtriple(ps); - ps.flush(); - } - 
out.close(); - } catch (RuntimeException tt) { - re[0] = tt; - } catch (Throwable tt) { - re[0] = new RuntimeException(tt); - } - }); - t.start(); - - RDFParserCallback parser = createParser(); - - Iterator it = supplier2.createTripleStringStream(); - - int[] count = new int[1]; - StopWatch watch = new StopWatch(); - watch.reset(); - parser.doParse(in, "http://example.org/#", RDFNotation.NTRIPLES, true, - (triple, pos) -> { - Assert.assertTrue(it.hasNext()); - Assert.assertEquals(it.next(), triple); - if (count[0] % 100_000 == 0) { - System.out.println(count[0] + " triples " +watch.stopAndShow()); + protected AbstractRDFParserSimpleTest(RDFNotation notation) { + this.notation = notation; + } + + protected abstract LargeFakeDataSetStreamSupplier createSupplier(); + + @Override + protected RDFParserCallback createParser() { + return new RDFParserSimple(); + } + + @Test + public void ingestTest() throws IOException, InterruptedException, ParserException { + LargeFakeDataSetStreamSupplier supplier = createSupplier(); + LargeFakeDataSetStreamSupplier supplier2 = createSupplier(); + + PipedOutputStream out = new PipedOutputStream(); + PipedInputStream in = new PipedInputStream(); + in.connect(out); + + RuntimeException[] re = new RuntimeException[1]; + Thread t = new Thread(() -> { + try { + Iterator it = supplier.createTripleStringStream(); + PrintStream ps = new PrintStream(out); + while (it.hasNext()) { + TripleString next = it.next(); + next.dumpNtriple(ps); + ps.flush(); } - count[0]++; + out.close(); + } catch (RuntimeException tt) { + re[0] = tt; + } catch (Throwable tt) { + re[0] = new RuntimeException(tt); } - ); + }); + t.start(); + + RDFParserCallback parser = createParser(); + + Iterator it = supplier2.createTripleStringStream(); + + StopWatch watch = new StopWatch(); + watch.reset(); + parser.doParse(in, "http://example.org/#", notation, true, (triple, pos) -> { + Assert.assertTrue(it.hasNext()); + Assert.assertEquals(it.next(), triple); + }); + + t.join(); 
+ if (re[0] != null) { + throw re[0]; + } + } + } + + public static class NTriplesTest extends AbstractRDFParserSimpleTest { + public NTriplesTest() { + super(RDFNotation.NTRIPLES); + } - t.join(); - if (re[0] != null) { - throw re[0]; + @Override + protected LargeFakeDataSetStreamSupplier createSupplier() { + return LargeFakeDataSetStreamSupplier.createSupplierWithMaxTriples(1_000_001, 42).withQuads(true) + .withMaxGraph(0); } } + + public static class NQuadTest extends AbstractRDFParserSimpleTest { + public NQuadTest() { + super(RDFNotation.NQUAD); + } + + @Override + protected LargeFakeDataSetStreamSupplier createSupplier() { + return LargeFakeDataSetStreamSupplier.createSupplierWithMaxTriples(1_000_001, 42).withQuads(true); + } + } + + public static class NQuadNoGraphTest extends AbstractRDFParserSimpleTest { + public NQuadNoGraphTest() { + super(RDFNotation.NQUAD); + } + + @Override + protected LargeFakeDataSetStreamSupplier createSupplier() { + return LargeFakeDataSetStreamSupplier.createSupplierWithMaxTriples(1_000_001, 42); + } + } + } diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java index 2fda34bd..c36b4a17 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java @@ -3,19 +3,18 @@ import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; import org.apache.commons.compress.compressors.xz.XZCompressorOutputStream; import org.rdfhdt.hdt.enums.CompressionType; -import org.rdfhdt.hdt.enums.RDFNotation; import org.rdfhdt.hdt.exceptions.NotImplementedException; import org.rdfhdt.hdt.exceptions.ParserException; import org.rdfhdt.hdt.hdt.HDT; import org.rdfhdt.hdt.hdt.HDTManager; +import org.rdfhdt.hdt.iterator.utils.MapIterator; import org.rdfhdt.hdt.options.HDTOptions; -import 
org.rdfhdt.hdt.options.HDTOptionsKeys; +import org.rdfhdt.hdt.quad.QuadString; import org.rdfhdt.hdt.triples.TripleString; import org.rdfhdt.hdt.util.concurrent.ExceptionThread; import org.rdfhdt.hdt.util.string.ByteStringUtil; import java.io.BufferedWriter; -import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; @@ -40,7 +39,8 @@ public class LargeFakeDataSetStreamSupplier { private static final Charset DEFAULT_CHARSET = ByteStringUtil.STRING_ENCODING; /** - * create a lowercase name from a number, to create string without any number in it + * create a lowercase name from a number, to create string without any + * number in it * * @param i id * @return string @@ -61,7 +61,8 @@ public static String stringNameOfInt(int i, boolean unicode) { } /** - * create a lowercase name from a number, to create string without any number in it + * create a lowercase name from a number, to create string without any + * number in it * * @param i id * @return string @@ -88,7 +89,8 @@ public static long estimateTripleSize(TripleString triple) { * create a supplier with a max size * * @param maxSize the max size - * @param seed the seed of the supplier, the same seed will create the same supplier + * @param seed the seed of the supplier, the same seed will create the + * same supplier * @return supplier */ public static LargeFakeDataSetStreamSupplier createSupplierWithMaxSize(long maxSize, long seed) { @@ -99,7 +101,8 @@ public static LargeFakeDataSetStreamSupplier createSupplierWithMaxSize(long maxS * create a supplier with a max count * * @param maxTriples the max number of triples - * @param seed the seed of the supplier, the same seed will create the same supplier + * @param seed the seed of the supplier, the same seed will create the + * same supplier * @return supplier */ public static LargeFakeDataSetStreamSupplier createSupplierWithMaxTriples(long maxTriples, long seed) { @@ -109,7 +112,8 @@ public static 
LargeFakeDataSetStreamSupplier createSupplierWithMaxTriples(long m /** * create a supplier without a max count * - * @param seed the seed of the supplier, the same seed will create the same supplier + * @param seed the seed of the supplier, the same seed will create the same + * supplier * @return supplier */ public static LargeFakeDataSetStreamSupplier createInfinite(long seed) { @@ -122,11 +126,13 @@ public static LargeFakeDataSetStreamSupplier createInfinite(long seed) { private long maxTriples; public int maxFakeType = 10; public int maxLiteralSize = 2; + public int maxGraph = 10; public int maxElementSplit = Integer.MAX_VALUE; private long slowStream; private boolean unicode; private TripleString buffer; private TripleString next; + private boolean nquad; private LargeFakeDataSetStreamSupplier(long maxSize, long maxTriples, long seed) { this.maxSize = maxSize; @@ -229,32 +235,12 @@ public ThreadedStream createNTInputStream(CompressionType compressionType) throw it.next().dumpNtriple(ps); } } - }, - "ThreadedFakedStream"); + }, "ThreadedFakedStream"); run.start(); return new ThreadedStream(run, is); } - /** - * create an HDT from the stream using two-pass algorithm - * - * @param spec hdt options - * @return hdt - * @throws ParserException parsing exception - * @throws IOException io exception - */ - public HDT createFakeHDTTwoPass(HDTOptions spec) throws ParserException, IOException { - Path f = Path.of("tempNtFile.nt").toAbsolutePath(); - try { - createNTFile(f); - spec.set(HDTOptionsKeys.LOADER_TYPE_KEY, HDTOptionsKeys.LOADER_TYPE_VALUE_TWO_PASS); - return HDTManager.generateHDT(f.toString(), "http://w", RDFNotation.NTRIPLES, spec, null); - } finally { - Files.deleteIfExists(f); - } - } - /** * create an HDT from the stream * @@ -275,26 +261,38 @@ public HDT createFakeHDT(HDTOptions spec) throws ParserException, IOException { * @throws ParserException parsing exception * @throws IOException io exception */ - public void createAndSaveFakeHDT(HDTOptions spec, 
String location) throws ParserException, IOException { - try (HDT hdt = createFakeHDT(spec)) { - hdt.saveToHDT(location, null); - } + public void createAndSaveFakeHDT(HDTOptions spec, Path location) throws ParserException, IOException { + createAndSaveFakeHDT(spec, location.toAbsolutePath().toString()); } /** - * create an HDT from the stream using 2pass algorithm and save it to a file + * create an HDT from the stream and save it to a file * * @param spec hdt options * @param location save location * @throws ParserException parsing exception * @throws IOException io exception */ - public void createAndSaveFakeHDTTwoPass(HDTOptions spec, String location) throws ParserException, IOException { - try (HDT hdt = createFakeHDTTwoPass(spec)) { + public void createAndSaveFakeHDT(HDTOptions spec, String location) throws ParserException, IOException { + try (HDT hdt = createFakeHDT(spec)) { hdt.saveToHDT(location, null); } } + private CharSequence createGraph() { + if (maxGraph <= 0) { // no named graph allowed, always use the default graph + return ""; + } + int rnd = random.nextInt(10); + if (rnd < 4) { + return ""; // no graph + } + if (rnd == 4 && maxGraph > 1) { // bnode graphs need maxGraph / 2 >= 1, Random.nextInt(0) throws + return "_:bnode" + random.nextInt(maxGraph / 2); + } + return "http://test.org/#graph" + random.nextInt(maxGraph - maxGraph / 2); // bound >= 1, equals maxGraph / 2 for even maxGraph + } + private CharSequence createResource() { if (random.nextInt(10) == 0) { return "_:bnode" + random.nextInt(maxElementSplit / 10); @@ -317,7 +315,9 @@ private CharSequence createValue() { int size = random.nextInt(maxLiteralSize); StringBuilder litText = new StringBuilder(); for (int i = 0; i < size; i++) { - litText.append(stringNameOfInt(unicode ? random.nextInt(Character.MAX_CODE_POINT - 30) + 30 : random.nextInt(maxElementSplit), unicode)); + litText.append(stringNameOfInt( + unicode ?
random.nextInt(Character.MAX_CODE_POINT - 30) + 30 : random.nextInt(maxElementSplit), + unicode)); } String text = "\"" + litText + "\""; int litType = random.nextInt(3); @@ -333,6 +333,13 @@ private CharSequence createValue() { } } + /** + * @return the stream of the objects + */ + public Iterator objectIterator() { + return new MapIterator<>(createTripleStringStream(), TripleString::getObject); + } + private class FakeStatementIterator implements Iterator { private long size; private long count = 0; @@ -368,18 +375,17 @@ public boolean hasNext() { CharSequence value = createValue(); if (buffer != null) { - buffer.setAll( - resource, - iri, - value - ); + buffer.setAll(resource, iri, value); + if (nquad) { + buffer.setGraph(createGraph()); + } next = buffer; } else { - next = new TripleString( - resource, - iri, - value - ); + if (nquad) { + next = new QuadString(resource, iri, value, createGraph()); + } else { + next = new TripleString(resource, iri, value); + } } if (slowStream > 0) { @@ -410,6 +416,7 @@ public TripleString next() { /** * set the max size + * * @param maxSize max size * @return this */ @@ -420,6 +427,7 @@ public LargeFakeDataSetStreamSupplier withMaxSize(long maxSize) { /** * set the max triples count + * * @param maxTriples max triples count * @return this */ @@ -485,20 +493,59 @@ public LargeFakeDataSetStreamSupplier withSlowStream(long slowStream) { } /** - * use the same {@link org.rdfhdt.hdt.triples.TripleString} object, better to simulate the RDFParser outputs + * use the same {@link TripleString} object, better to simulate the + * RDFParser outputs * * @param sameTripleString use same triple * @return this */ public LargeFakeDataSetStreamSupplier withSameTripleString(boolean sameTripleString) { if (sameTripleString) { - buffer = new TripleString(); + if (nquad) { + buffer = new QuadString(); + } else { + buffer = new TripleString(); + } } else { buffer = null; } return this; } + /** + * generate quad with the triple strings + * + * @param 
quad quads + * @return this + */ + public LargeFakeDataSetStreamSupplier withQuads(boolean quad) { + if (this.nquad == quad) { + return this; + } + this.nquad = quad; + if (buffer != null) { + // we need to reset the buffer + TripleString old = buffer; + if (quad) { + buffer = new QuadString(old); + } else { + buffer = new TripleString(old); + } + } + return this; + } + + /** + * set the maximum number of graph with quad generation + * + * @param maxGraph max number of graph (excluding the default graph) + * @return this + */ + public LargeFakeDataSetStreamSupplier withMaxGraph(int maxGraph) { + this.maxGraph = maxGraph; + return this; + } + /** * Stream connected to a thread to interrupt in case of Exception */