Skip to content

Commit

Permalink
Merge pull request #160 from ate47/hdtloader
Browse files Browse the repository at this point in the history
Implement dir and hdt parser
  • Loading branch information
D063520 authored May 10, 2022
2 parents 2112bdf + 3c036a9 commit 38dd2e8
Show file tree
Hide file tree
Showing 7 changed files with 306 additions and 6 deletions.
29 changes: 25 additions & 4 deletions hdt-api/src/main/java/org/rdfhdt/hdt/enums/RDFNotation.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
package org.rdfhdt.hdt.enums;

import java.io.File;
import java.nio.file.Files;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;

/**
* Enumeration of the different valid notations for RDF data.
Expand Down Expand Up @@ -93,8 +96,13 @@ public enum RDFNotation {
/**
* Directory with RDF content
*/
DIR

DIR,

/**
* HDT file
*/
HDT

;

public static RDFNotation parse(String str) {
Expand All @@ -120,12 +128,22 @@ public static RDFNotation parse(String str) {
return ZIP;
} else if(str.equals("list")) {
return LIST;
} else if(str.equals("hdt")) {
return HDT;
}
throw new IllegalArgumentException();
}

public static RDFNotation guess(String fileName) throws IllegalArgumentException {
String str = fileName.toLowerCase();

try {
if (Files.isDirectory(Path.of(fileName))) {
return DIR;
}
} catch (InvalidPathException e) {
// not a valid path, so can't be a directory, ignore
}

int idx = str.lastIndexOf('.');
if(idx!=-1) {
Expand All @@ -152,8 +170,11 @@ public static RDFNotation guess(String fileName) throws IllegalArgumentException
} else if(str.endsWith("zip")){
return ZIP;
} else if(str.endsWith("list")){
return LIST;
}
return LIST;
} else if(str.endsWith("hdt")){
return HDT;
}

throw new IllegalArgumentException("Could not guess the format for "+fileName);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@

import org.rdfhdt.hdt.enums.RDFNotation;
import org.rdfhdt.hdt.exceptions.NotImplementedException;
import org.rdfhdt.hdt.rdf.parsers.RDFParserDir;
import org.rdfhdt.hdt.rdf.parsers.RDFParserHDT;
import org.rdfhdt.hdt.rdf.parsers.RDFParserList;
import org.rdfhdt.hdt.rdf.parsers.RDFParserRAR;
import org.rdfhdt.hdt.rdf.parsers.RDFParserRIOT;
Expand All @@ -50,8 +52,7 @@ public static RDFParserCallback getParserCallback(RDFNotation notation) {
case RDFXML:
return new RDFParserRIOT();
case DIR:
// FIXME: Implement
throw new NotImplementedException("RDFParserDir not implemented");
return new RDFParserDir();
case LIST:
return new RDFParserList();
case ZIP:
Expand All @@ -60,6 +61,8 @@ public static RDFParserCallback getParserCallback(RDFNotation notation) {
return new RDFParserTar();
case RAR:
return new RDFParserRAR();
case HDT:
return new RDFParserHDT();
case JSONLD:
// FIXME: Implement
throw new NotImplementedException("RDFParserJSONLD not implemented");
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package org.rdfhdt.hdt.rdf.parsers;

import org.rdfhdt.hdt.enums.RDFNotation;
import org.rdfhdt.hdt.exceptions.NotImplementedException;
import org.rdfhdt.hdt.exceptions.ParserException;
import org.rdfhdt.hdt.rdf.RDFParserCallback;
import org.rdfhdt.hdt.rdf.RDFParserFactory;
import org.rdfhdt.hdt.util.ContainerException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;

/**
 * Parser for the {@link RDFNotation#DIR} notation: walks a directory
 * recursively and hands every file whose notation can be guessed to the
 * parser returned by {@link RDFParserFactory#getParserCallback(RDFNotation)}.
 * Files whose notation can't be guessed are logged and skipped.
 *
 * @author Antoine Willerval
 */
public class RDFParserDir implements RDFParserCallback {
    private static final Logger log = LoggerFactory.getLogger(RDFParserDir.class);

    /**
     * Parse every readable RDF file under the directory {@code fileName}.
     *
     * @param fileName  path of the directory to walk
     * @param baseUri   base URI forwarded to the child parsers
     * @param notation  must be {@link RDFNotation#DIR}
     * @param keepBNode forwarded to the child parsers
     * @param callback  receives the triples of every parsed file
     * @throws ParserException          if the path is invalid, the directory can't be listed or a child parser fails
     * @throws IllegalArgumentException if {@code notation} isn't {@link RDFNotation#DIR}
     */
    @Override
    public void doParse(String fileName, String baseUri, RDFNotation notation, boolean keepBNode, RDFCallback callback) throws ParserException {
        try {
            doParse(Path.of(fileName), baseUri, notation, keepBNode, callback);
        } catch (InvalidPathException e) {
            throw new ParserException(e);
        }
    }

    /**
     * Parse the directory {@code p}, recursing into its sub-directories.
     */
    private void doParse(Path p, String baseUri, RDFNotation notation, boolean keepBNode, RDFCallback callback) throws ParserException {
        if (notation != RDFNotation.DIR) {
            throw new IllegalArgumentException("Can't parse notation different than " + RDFNotation.DIR + "!");
        }
        // Files.list holds an open directory handle until the stream is closed,
        // so it has to live in a try-with-resources (one handle per recursion level otherwise leaks).
        try (java.util.stream.Stream<Path> children = Files.list(p)) {
            children.forEach(child -> {
                try {
                    parseChild(child, baseUri, keepBNode, callback);
                } catch (ParserException e) {
                    // tunnel the checked exception through the lambda, unwrapped below
                    throw new ContainerException(e);
                }
            });
        } catch (IOException | SecurityException e) {
            throw new ParserException(e);
        } catch (ContainerException e) {
            // only ParserException instances are wrapped above
            throw (ParserException) e.getCause();
        }
    }

    /**
     * Parse one directory entry: recurse into directories, delegate files to the parser of their guessed notation.
     */
    private void parseChild(Path child, String baseUri, boolean keepBNode, RDFCallback callback) throws ParserException {
        if (Files.isDirectory(child)) {
            doParse(child, baseUri, RDFNotation.DIR, keepBNode, callback);
            return;
        }
        RDFParserCallback rdfParserCallback;
        RDFNotation childNotation;
        try {
            // get the notation of the file
            childNotation = RDFNotation.guess(child.toFile());
            rdfParserCallback = RDFParserFactory.getParserCallback(childNotation);
        } catch (IllegalArgumentException e) {
            log.warn("Ignore file {}", child, e);
            return;
        }
        log.debug("parse {}", child);
        // we can parse it, parsing it
        rdfParserCallback.doParse(child.toAbsolutePath().toString(), baseUri, childNotation, keepBNode, callback);
    }

    /**
     * Not supported: a directory can't be read from a stream.
     *
     * @throws NotImplementedException always
     */
    @Override
    public void doParse(InputStream in, String baseUri, RDFNotation notation, boolean keepBNode, RDFCallback callback) throws ParserException {
        throw new NotImplementedException("Can't parse a stream of directory!");
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package org.rdfhdt.hdt.rdf.parsers;

import org.rdfhdt.hdt.enums.RDFNotation;
import org.rdfhdt.hdt.exceptions.NotFoundException;
import org.rdfhdt.hdt.exceptions.ParserException;
import org.rdfhdt.hdt.hdt.HDT;
import org.rdfhdt.hdt.hdt.HDTManager;
import org.rdfhdt.hdt.rdf.RDFParserCallback;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;

/**
 * Parser for the {@link RDFNotation#HDT} notation: maps an HDT file and
 * sends each of its triples to the callback.
 *
 * @author Antoine Willerval
 */
public class RDFParserHDT implements RDFParserCallback {
    private static final Logger log = LoggerFactory.getLogger(RDFParserHDT.class);

    /**
     * Map the HDT file {@code fileName} and pass every triple to {@code callback}
     * (the position argument given to the callback is always 0).
     *
     * @throws ParserException if the HDT can't be mapped or searched
     */
    @Override
    public void doParse(String fileName, String baseUri, RDFNotation notation, boolean keepBNode, RDFCallback callback) throws ParserException {
        try (HDT hdt = HDTManager.mapHDT(fileName)) {
            hdt.search("", "", "").forEachRemaining(t -> callback.processTriple(t, 0));
        } catch (IOException | NotFoundException e) {
            log.error("Unexpected exception.", e);
            throw new ParserException(e);
        }
    }

    /**
     * Copy the stream into a temporary file, parse that file, then delete it.
     *
     * @throws ParserException if the temporary file can't be written or the HDT can't be parsed
     */
    @Override
    public void doParse(InputStream in, String baseUri, RDFNotation notation, boolean keepBNode, RDFCallback callback) throws ParserException {
        try {
            // create a temp
            Path tempFile = Files.createTempFile("hdtjava-reader", ".hdt");
            log.warn("Create temp file to store the HDT stream {}", tempFile);
            try {
                // createTempFile has already created the (empty) file: without REPLACE_EXISTING
                // Files.copy fails with FileAlreadyExistsException every time.
                Files.copy(in, tempFile, java.nio.file.StandardCopyOption.REPLACE_EXISTING);
                doParse(tempFile.toAbsolutePath().toString(), baseUri, notation, keepBNode, callback);
            } finally {
                Files.deleteIfExists(tempFile);
            }
        } catch (IOException e) {
            log.error("Unexpected exception.", e);
            throw new ParserException(e);
        }
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package org.rdfhdt.hdt.util;

/**
* A simple runtime exception to contain a cause
* @author Antoine Willerval
*/
public class ContainerException extends RuntimeException {

public ContainerException(Throwable cause) {
super(cause);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
package org.rdfhdt.hdt.rdf.parsers;

import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import org.rdfhdt.hdt.enums.RDFNotation;
import org.rdfhdt.hdt.exceptions.ParserException;
import org.rdfhdt.hdt.header.HeaderUtil;
import org.rdfhdt.hdt.rdf.RDFParserCallback;
import org.rdfhdt.hdt.rdf.RDFParserFactory;
import org.rdfhdt.hdt.triples.TripleString;
import org.rdfhdt.hdt.util.LargeFakeDataSetStreamSupplier;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * Checks that {@link RDFParserDir} parses every RDF file of a directory tree
 * and ignores the files it can't guess a notation for.
 */
public class RDFParserDirTest {

    @Rule
    public TemporaryFolder tempDir = new TemporaryFolder();

    @Test
    public void dirTest() throws IOException, ParserException {
        // newFolder() already creates the directory
        Path root = tempDir.newFolder().toPath();

        Path testDir1 = root.resolve("testDir1");
        Path testDir2 = root.resolve("testDir2");
        Path testDir3 = root.resolve("testDir3");
        Path testDir4 = testDir3.resolve("testDir4");

        Files.createDirectories(testDir1);
        Files.createDirectories(testDir2);
        Files.createDirectories(testDir3);
        Files.createDirectories(testDir4);

        LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier
                .createSupplierWithMaxTriples(20, 34);

        supplier.createNTFile(root.resolve("test.nt").toAbsolutePath().toString());
        supplier.createNTFile(testDir1.resolve("test1.nt").toAbsolutePath().toString());
        supplier.createNTFile(testDir2.resolve("test21.nt").toAbsolutePath().toString());
        supplier.createNTFile(testDir2.resolve("test22.nt").toAbsolutePath().toString());
        supplier.createNTFile(testDir3.resolve("test31.nt").toAbsolutePath().toString());
        supplier.createNTFile(testDir3.resolve("test32.nt").toAbsolutePath().toString());

        // files without a known RDF extension, the parser has to skip them
        Files.writeString(testDir2.resolve("thing.txt"), "Not parsable RDF DATA");
        Files.writeString(root.resolve("thing.py"), "print('Not parsable RDF DATA')");
        Files.writeString(testDir4.resolve("thing.sh"), "echo \"Not Parsable RDF data\"");

        supplier.reset();

        List<TripleString> expected = new ArrayList<>();
        // 6 for the 6 files
        for (int i = 0; i < 6; i++) {
            Iterator<TripleString> it = supplier.createTripleStringStream();
            while (it.hasNext()) {
                TripleString ts = it.next();
                expected.add(new TripleString(
                        HeaderUtil.cleanURI(ts.getSubject().toString()),
                        HeaderUtil.cleanURI(ts.getPredicate().toString()),
                        HeaderUtil.cleanURI(ts.getObject().toString())
                ));
            }
        }

        String filename = root.toAbsolutePath().toString();
        RDFNotation dir = RDFNotation.guess(filename);
        Assert.assertEquals(RDFNotation.DIR, dir);
        RDFParserCallback callback = RDFParserFactory.getParserCallback(dir);
        Assert.assertTrue(callback instanceof RDFParserDir);

        callback.doParse(filename, "http://example.org/#", dir, true, (triple, pos) ->
                Assert.assertTrue("triple " + triple + " wasn't expected", expected.remove(triple))
        );

        // without this check a parser emitting nothing at all would pass the test
        Assert.assertTrue("triples never emitted by the parser: " + expected, expected.isEmpty());
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package org.rdfhdt.hdt.rdf.parsers;

import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import org.rdfhdt.hdt.enums.RDFNotation;
import org.rdfhdt.hdt.exceptions.NotFoundException;
import org.rdfhdt.hdt.exceptions.ParserException;
import org.rdfhdt.hdt.hdt.HDT;
import org.rdfhdt.hdt.hdt.HDTManager;
import org.rdfhdt.hdt.options.HDTSpecification;
import org.rdfhdt.hdt.rdf.RDFParserCallback;
import org.rdfhdt.hdt.rdf.RDFParserFactory;
import org.rdfhdt.hdt.triples.IteratorTripleString;
import org.rdfhdt.hdt.util.LargeFakeDataSetStreamSupplier;

import java.io.IOException;
import java.nio.file.Path;

/**
 * Checks that {@link RDFParserHDT} emits exactly the triples stored in an HDT file.
 */
public class RDFParserHDTTest {

    @Rule
    public TemporaryFolder tempDir = new TemporaryFolder();

    @Test
    public void hdtTest() throws IOException, ParserException, NotFoundException {
        Path root = tempDir.newFile("test.hdt").toPath();
        String filename = root.toAbsolutePath().toString();

        LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier
                .createSupplierWithMaxTriples(20, 34);

        // try-with-resources: the HDT is closed even when an assertion fails
        try (HDT hdt = HDTManager.generateHDT(
                supplier.createTripleStringStream(),
                "http://example.org/#",
                new HDTSpecification(),
                null
        )) {
            hdt.saveToHDT(filename, null);

            RDFNotation notation = RDFNotation.guess(filename);
            Assert.assertEquals(RDFNotation.HDT, notation);
            RDFParserCallback callback = RDFParserFactory.getParserCallback(notation);
            Assert.assertTrue(callback instanceof RDFParserHDT);

            // reference iteration over the HDT the file was saved from
            IteratorTripleString it = hdt.search("", "", "");

            callback.doParse(filename, "http://example.org/#", notation, true, (triple, pos) ->
                    Assert.assertEquals(it.next(), triple)
            );

            // the parser must have emitted every triple, not only a prefix of them
            Assert.assertFalse("the parser emitted fewer triples than the HDT contains", it.hasNext());
        }
    }
}

0 comments on commit 38dd2e8

Please sign in to comment.