Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

allow using nt simple parser in RDF2HDT cli/generateHDT #163

Merged
merged 1 commit into from
May 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 66 additions & 21 deletions hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleString.java
Original file line number Diff line number Diff line change
Expand Up @@ -204,46 +204,91 @@ public boolean hasEmpty() {
* @throws ParserException if the line is not RDF compliant
*/
public void read(String line) throws ParserException {
    // Parse the whole string: from its first character up to its length.
    final int lineEnd = line.length();
    read(line, 0, lineEnd);
}

/**
 * Finds the first space or tab of {@code line} inside the region {@code [start, end)}.
 *
 * <p>Both delimiters are looked for in a single pass so that the one occurring first wins;
 * searching for a space first and falling back to a tab would skip a tab delimiter whenever
 * a space appears later in the region (for example inside a literal).
 *
 * @param line  string to scan
 * @param start index of the first character to inspect (inclusive); negative values are
 *              treated as 0
 * @param end   index at which the scan stops (exclusive); values beyond the string length
 *              are clamped to it
 * @return index of the first space or tab in the region, or -1 if there is none
 */
private static int searchNextTabOrSpace(String line, int start, int end) {
    // Never read past the string, whatever bound the caller supplied.
    final int limit = Math.min(end, line.length());
    for (int i = Math.max(start, 0); i < limit; i++) {
        final char c = line.charAt(i);
        if (c == ' ' || c == '\t') {
            return i;
        }
    }

    // not found
    return -1;
}

/**
* Read from a line, where each component is separated by space.
* @param line line to read
* @throws ParserException if the line is not RDF complient
*/
public void read(String line, int start, int end) throws ParserException {
int split, posa, posb;
this.clear();

line = line.replace("\\t"," ");

// SET SUBJECT
posa = 0;
posb = split = line.indexOf(' ', posa);
posa = start;
posb = split = searchNextTabOrSpace(line, posa, end);

if(posb==-1) return; // Not found, error.
if(line.charAt(posa)=='<') posa++; // Remove <
if(line.charAt(posb-1)=='>') posb--; // Remove >
if (posb == -1) {
// Not found, error.
return;
}
if (line.charAt(posa) == '<') {
posa++; // Remove <
if (line.charAt(posb-1) == '>') {
posb--; // Remove >
}
}

this.setSubject(UnicodeEscape.unescapeString(line.substring(posa, posb)));
this.setSubject(UnicodeEscape.unescapeString(line, posa, posb));

// SET PREDICATE
posa = split+1;
posb = split = line.indexOf(' ', posa);
posa = split + 1;
posb = split = searchNextTabOrSpace(line, posa, end);

if(posb==-1) return;
if(line.charAt(posa)=='<') posa++;
if(posb>posa && line.charAt(posb-1)=='>') posb--;
if (posb == -1) {
return;
}
if (line.charAt(posa) == '<') {
posa++;
if (posb > posa && line.charAt(posb - 1) == '>') {
posb--;
}
}

this.setPredicate(UnicodeEscape.unescapeString(line.substring(posa, posb)));
this.setPredicate(UnicodeEscape.unescapeString(line, posa, posb));

// SET OBJECT
posa = split+1;
posb = line.length();
posa = split + 1;
posb = end;

if(line.charAt(posb-1)=='.') posb--; // Remove trailing <space> <dot> from NTRIPLES.
if(line.charAt(posb-1)==' ') posb--;
// Remove trailing <space> <dot> from NTRIPLES.
if (line.charAt(posb-1) == '.') {
posb--;
}
char prev = line.charAt(posb-1);
if (prev == ' ' || prev == '\t') {
posb--;
}

if(line.charAt(posa)=='<') {
if (line.charAt(posa) == '<') {
posa++;

// Remove trailing > only if < appears, so "some"^^<http://datatype> is kept as-is.
if(posb>posa && line.charAt(posb-1)=='>') posb--;
if (posb > posa && line.charAt(posb-1)=='>') {
posb--;
}
}

this.setObject(UnicodeEscape.unescapeString(line.substring(posa, posb)));
this.setObject(UnicodeEscape.unescapeString(line, posa, posb));
}

/*
Expand Down
33 changes: 23 additions & 10 deletions hdt-api/src/main/java/org/rdfhdt/hdt/util/UnicodeEscape.java
Original file line number Diff line number Diff line change
Expand Up @@ -140,32 +140,45 @@ else if (cInt >= 0x10000 && cInt <= 0x10FFFF) {

appendable.append(label.subSequence(last+1, label.length()));
}

/**
 * Unescapes an escaped Unicode string. Any Unicode sequences
 * (<code>&#x5C;uxxxx</code> and <code>&#x5C;Uxxxxxxxx</code>) are restored to the
 * value indicated by the hexadecimal argument and any backslash-escapes
 * (<code>\"</code>, <code>\\</code>, etc.) are decoded to their original form.
 *
 * <p>This is a convenience overload that unescapes the whole string.
 *
 * @param s An escaped Unicode string.
 * @return The unescaped string.
 * @throws IllegalArgumentException If the supplied string is not a
 * correctly escaped N-Triples string.
 */
public static String unescapeString(String s) {
    // Delegate to the ranged variant over the full length of s.
    return unescapeString(s, 0, s.length());
}
/**
* Unescapes an escaped Unicode string. Any Unicode sequences
* (<code>&#x5C;uxxxx</code> and <code>&#x5C;Uxxxxxxxx</code>) are restored to the
* value indicated by the hexadecimal argument and any backslash-escapes
* (<code>\"</code>, <code>\\</code>, etc.) are decoded to their original form.
*
* @param s An escaped Unicode string.
* @return The unescaped string.
* @throws IllegalArgumentException If the supplied string is not a
* correctly escaped N-Triples string.
*/
public static String unescapeString(String s, int start, int sLength) {
int backSlashIdx = s.indexOf('\\', start);

if (backSlashIdx == -1) {
if (backSlashIdx == -1 || backSlashIdx >= sLength) {
// No escaped characters found
return s;
return s.substring(start, sLength);
}

int startIdx = 0;
int sLength = s.length();
int startIdx = start;
StringBuilder sb = new StringBuilder(sLength);

while (backSlashIdx != -1) {
sb.append(s.substring(startIdx, backSlashIdx));
while (backSlashIdx != -1 && backSlashIdx < sLength) {
sb.append(s, startIdx, backSlashIdx);

if (backSlashIdx + 1 >= sLength) {
throw new IllegalArgumentException("Unescaped backslash in: " + s);
Expand Down Expand Up @@ -238,7 +251,7 @@ else if (c == 'U') {
backSlashIdx = s.indexOf('\\', startIdx);
}

sb.append(s.substring(startIdx));
sb.append(s, startIdx, sLength);

return sb.toString();
}
Expand Down
11 changes: 9 additions & 2 deletions hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,12 @@ public class RDF2HDT implements ProgressListener {

@Parameter(names = "-index", description = "Generate also external indices to solve all queries")
public boolean generateIndex;

@Parameter(names = "-quiet", description = "Do not show progress of the conversion")
public boolean quiet;

@Parameter(names = "-canonicalntfile", description = "Only for NTriples input. Use a Fast NT file parser the input should be in a canonical form. See https://www.w3.org/TR/n-triples/#h2_canonical-ntriples")
public boolean ntSimpleLoading;

public void execute() throws ParserException, IOException {
HDTSpecification spec;
Expand All @@ -88,7 +91,7 @@ public void execute() throws ParserException, IOException {
if(baseURI==null) {
baseURI = "file://"+rdfInput;
}

RDFNotation notation=null;
if(rdfType!=null) {
try {
Expand All @@ -107,6 +110,10 @@ public void execute() throws ParserException, IOException {
}
}

if (ntSimpleLoading) {
spec.set("parser.ntSimpleParser", "true");
}

StopWatch sw = new StopWatch();
HDT hdt = HDTManager.generateHDT(rdfInput, baseURI,notation , spec, this);
System.out.println("File converted in: "+sw.stopAndShow());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@

public class HDTManagerImpl extends HDTManager {

/**
 * Tells whether the {@code parser.ntSimpleParser} option is switched on in {@code spec}.
 * Any non-empty value other than exactly {@code "false"} counts as enabled.
 *
 * @param spec options to read the flag from
 * @return {@code true} if the option is set to an enabling value, {@code false} otherwise
 */
private boolean useSimple(HDTOptions spec) {
    final String flag = spec.get("parser.ntSimpleParser");
    if (flag == null || flag.isEmpty()) {
        // Missing or blank option: keep the default parser.
        return false;
    }
    return !"false".equals(flag);
}

@Override
public HDTOptions doReadOptions(String file) throws IOException {
return new HDTSpecification(file);
Expand Down Expand Up @@ -90,9 +95,9 @@ public HDT doGenerateHDT(String rdfFileName, String baseURI, RDFNotation rdfNota
String loaderType = spec.get("loader.type");
TempHDTImporter loader;
if ("two-pass".equals(loaderType)) {
loader = new TempHDTImporterTwoPass();
loader = new TempHDTImporterTwoPass(useSimple(spec));
} else {
loader = new TempHDTImporterOnePass();
loader = new TempHDTImporterOnePass(useSimple(spec));
}

// Create TempHDT
Expand All @@ -118,7 +123,7 @@ public HDT doGenerateHDT(String rdfFileName, String baseURI, RDFNotation rdfNota
@Override
public HDT doGenerateHDT(Iterator<TripleString> triples, String baseURI, HDTOptions spec, ProgressListener listener) throws IOException {
//choose the importer
TempHDTImporterOnePass loader = new TempHDTImporterOnePass();
TempHDTImporterOnePass loader = new TempHDTImporterOnePass(false);

// Create TempHDT
TempHDT modHdt = loader.loadFromTriples(spec, triples, baseURI, listener);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,17 @@ public void processTriple(TripleString triple, long pos) {
}
}

private final boolean useSimple;

/**
 * Creates a one-pass importer.
 *
 * @param useSimple flag handed to {@code RDFParserFactory.getParserCallback(notation, useSimple)}
 *                  when {@code loadFromRDF} selects the parser
 */
public TempHDTImporterOnePass(boolean useSimple) {
this.useSimple = useSimple;
}

@Override
public TempHDT loadFromRDF(HDTOptions specs, String filename, String baseUri, RDFNotation notation, ProgressListener listener)
throws ParserException {

RDFParserCallback parser = RDFParserFactory.getParserCallback(notation);
RDFParserCallback parser = RDFParserFactory.getParserCallback(notation, useSimple);

// Create Modifiable Instance
TempHDT modHDT = new TempHDTImpl(specs, baseUri, ModeOfLoading.ONE_PASS);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,17 @@ public void processTriple(TripleString triple, long pos) {
}
}

@Override
private final boolean useSimple;

/**
 * Creates a two-pass importer.
 *
 * @param useSimple flag handed to {@code RDFParserFactory.getParserCallback(notation, useSimple)}
 *                  when {@code loadFromRDF} selects the parser
 */
public TempHDTImporterTwoPass(boolean useSimple) {
this.useSimple = useSimple;
}

@Override
public TempHDT loadFromRDF(HDTOptions specs, String filename, String baseUri, RDFNotation notation, ProgressListener listener)
throws ParserException {

RDFParserCallback parser = RDFParserFactory.getParserCallback(notation);
RDFParserCallback parser = RDFParserFactory.getParserCallback(notation, useSimple);

// Create Modifiable Instance and parser
TempHDT modHDT = new TempHDTImpl(specs, baseUri, ModeOfLoading.TWO_PASS);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import org.rdfhdt.hdt.rdf.parsers.RDFParserList;
import org.rdfhdt.hdt.rdf.parsers.RDFParserRAR;
import org.rdfhdt.hdt.rdf.parsers.RDFParserRIOT;
import org.rdfhdt.hdt.rdf.parsers.RDFParserSimple;
import org.rdfhdt.hdt.rdf.parsers.RDFParserTar;
import org.rdfhdt.hdt.rdf.parsers.RDFParserZip;

Expand All @@ -43,24 +44,29 @@
*/
public class RDFParserFactory {
/**
 * Returns the parser callback for {@code notation} with the simple-parser option disabled.
 *
 * @param notation notation of the input to parse
 * @return the result of {@code getParserCallback(notation, false)}
 */
public static RDFParserCallback getParserCallback(RDFNotation notation) {
    final boolean useSimple = false;
    return getParserCallback(notation, useSimple);
}
public static RDFParserCallback getParserCallback(RDFNotation notation, boolean useSimple) {
switch(notation) {
case NTRIPLES:
case NTRIPLES:
if (useSimple) {
return new RDFParserSimple();
}
case NQUAD:
case TURTLE:
case N3:
case RDFXML:
return new RDFParserRIOT();
case DIR:
return new RDFParserDir();
return new RDFParserDir(useSimple);
case LIST:
return new RDFParserList();
case ZIP:
return new RDFParserZip();
return new RDFParserZip(useSimple);
case TAR:
return new RDFParserTar();
return new RDFParserTar(useSimple);
case RAR:
return new RDFParserRAR();
return new RDFParserRAR(useSimple);
case HDT:
return new RDFParserHDT();
case JSONLD:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,15 @@
*/
public class RDFParserDir implements RDFParserCallback {
private static final Logger log = LoggerFactory.getLogger(RDFParserDir.class);
private final boolean simple;

/**
 * Creates a directory parser.
 *
 * @param simple flag passed to {@code RDFParserFactory.getParserCallback(childNotation, simple)}
 *               for each child file that is parsed
 */
public RDFParserDir(boolean simple) {
this.simple = simple;
}

/**
 * Creates a directory parser with the {@code simple} flag set to {@code false}.
 */
public RDFParserDir() {
this(false);
}

@Override
public void doParse(String fileName, String baseUri, RDFNotation notation, boolean keepBNode, RDFCallback callback) throws ParserException {
Expand All @@ -46,7 +55,7 @@ private void doParse(Path p, String baseUri, RDFNotation notation, boolean keepB
try {
// get the notation of the file
childNotation = RDFNotation.guess(child.toFile());
rdfParserCallback = RDFParserFactory.getParserCallback(childNotation);
rdfParserCallback = RDFParserFactory.getParserCallback(childNotation, simple);
} catch (IllegalArgumentException e) {
log.warn("Ignore file {}", child, e);
return;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,15 @@
*
*/
public class RDFParserList implements RDFParserCallback {
private final boolean simple;

/**
 * Creates a list parser.
 *
 * @param simple flag passed to {@code RDFParserFactory.getParserCallback(guessnot, simple)}
 *               for each entry of the list that is parsed
 */
public RDFParserList(boolean simple) {
this.simple = simple;
}

/**
 * Creates a list parser with the {@code simple} flag set to {@code false}.
 */
public RDFParserList() {
this(false);
}

/* (non-Javadoc)
* @see hdt.rdf.RDFParserCallback#doParse(java.lang.String, java.lang.String, hdt.enums.RDFNotation, hdt.rdf.RDFParserCallback.RDFCallback)
Expand Down Expand Up @@ -88,7 +97,7 @@ private void doParse(BufferedReader reader, String baseUri, RDFNotation notation

RDFNotation guessnot = RDFNotation.guess(line);
System.out.println("Parse from list: "+line+" as "+guessnot);
RDFParserCallback parser = RDFParserFactory.getParserCallback(guessnot);
RDFParserCallback parser = RDFParserFactory.getParserCallback(guessnot, simple);

parser.doParse(line, baseUri, guessnot, keepBNode, callback);
}
Expand Down
Loading