Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HDTCatTree + HDTGenDisk #179

Merged
merged 9 commits into from
Nov 21, 2022
Prev Previous commit
Next Next commit
add HDTGenerateDisk method with tests
  • Loading branch information
ate47 committed Oct 21, 2022
commit 2b32171b89f3bd4be7ee1e9940d19f3a235c8987
52 changes: 52 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/enums/CompressionType.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package org.rdfhdt.hdt.enums;

/**
 * Compression formats that can wrap an RDF input, each associated with the file
 * extensions used to recognise it.
 *
 * @author Antoine Willerval
 */
public enum CompressionType {

    /**
     * gzip compression (.gz .tgz)
     */
    GZIP("gz", "tgz"),
    /**
     * bzip2 compression (.bz2 .bz)
     */
    BZIP("bz2", "bz"),
    /**
     * xz compression (.xz)
     */
    XZ("xz"),
    /**
     * no compression
     */
    NONE;

    /**
     * Lower-case extensions (without the leading dot) identifying this compression type.
     */
    private final String[] ext;

    CompressionType(String... ext) {
        this.ext = ext;
    }

    /**
     * Try to guess the compression of a file from its name. Only the text after the last
     * {@code '.'} is considered, and the comparison ignores case.
     *
     * @param fileName the file name to inspect, may be {@code null}
     * @return the compression type, or {@link #NONE} if it can't be guessed (no extension,
     * unknown extension or {@code null} name)
     */
    public static CompressionType guess(String fileName) {
        if (fileName == null) {
            // nothing to inspect, so nothing can be guessed
            return NONE;
        }

        int idx = fileName.lastIndexOf('.');
        if (idx == -1) {
            return NONE;
        }

        // Locale.ROOT keeps the lower-casing independent from the default locale of the JVM
        String extension = fileName.substring(idx + 1).toLowerCase(java.util.Locale.ROOT);
        for (CompressionType type : values()) {
            for (String typeExt : type.ext) {
                if (typeExt.equals(extension)) {
                    return type;
                }
            }
        }
        return NONE;
    }
}
152 changes: 152 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import java.util.Iterator;

import org.rdfhdt.hdt.compact.bitmap.Bitmap;
import org.rdfhdt.hdt.enums.CompressionType;
import org.rdfhdt.hdt.enums.RDFNotation;
import org.rdfhdt.hdt.exceptions.ParserException;
import org.rdfhdt.hdt.listener.ProgressListener;
Expand Down Expand Up @@ -291,6 +292,153 @@ public static HDT generateHDT(String rdfFileName, String baseURI, RDFNotation rd
public static HDT generateHDT(Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
    // the actual work is done by the instance returned by getInstance()
    final HDTManager manager = HDTManager.getInstance();
    return manager.doGenerateHDT(iterator, baseURI, hdtFormat, listener);
}
/**
 * Create an HDT from an RDF stream, guessing the RDF notation and the compression of the stream
 * from a file name.
 * @param fileStream RDF stream to parse.
 * @param baseURI Base URI for the dataset.
 * @param filename the RDF file name, given to {@code RDFNotation.guess} and {@code CompressionType.guess}
 *                 to pick the stream format and compression.
 * @param hdtFormat Parameters to tune the generated HDT.
 * @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
 * @return HDT
 * @throws IOException when the stream cannot be used
 * @throws ParserException when the RDF stream can't be parsed
 */
public static HDT generateHDT(InputStream fileStream, String baseURI, String filename, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
    final HDTManager manager = HDTManager.getInstance();
    final RDFNotation guessedNotation = RDFNotation.guess(filename);
    final CompressionType guessedCompression = CompressionType.guess(filename);
    return manager.doGenerateHDT(fileStream, baseURI, guessedNotation, guessedCompression, hdtFormat, listener);
}
/**
 * Create an HDT from an RDF stream whose notation and compression are given by the caller.
 * @param fileStream RDF stream to parse.
 * @param baseURI Base URI for the dataset.
 * @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...)
 * @param compressionType Compression type of the RDF stream. (GZIP, BZIP, XZ...)
 * @param hdtFormat Parameters to tune the generated HDT.
 * @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
 * @return HDT
 * @throws IOException when the stream cannot be used
 * @throws ParserException when the RDF stream can't be parsed
 */
public static HDT generateHDT(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
    final HDTManager manager = HDTManager.getInstance();
    return manager.doGenerateHDT(fileStream, baseURI, rdfNotation, compressionType, hdtFormat, listener);
}
/**
 * Create an HDT from an uncompressed RDF stream: the stream is read with
 * {@link CompressionType#NONE}.
 * @param fileStream RDF stream to parse.
 * @param baseURI Base URI for the dataset.
 * @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...)
 * @param hdtFormat Parameters to tune the generated HDT.
 * @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
 * @return HDT
 * @throws IOException when the stream cannot be used
 * @throws ParserException when the RDF stream can't be parsed
 */
public static HDT generateHDT(InputStream fileStream, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
    final HDTManager manager = HDTManager.getInstance();
    final CompressionType uncompressed = CompressionType.NONE;
    return manager.doGenerateHDT(fileStream, baseURI, rdfNotation, uncompressed, hdtFormat, listener);
}

/**
 * Create an HDT from an RDF file by sorting the triples on disk, which reduces the memory required
 * at the cost of more IO usage.
 * @param rdfFileName RDF file to parse.
 * @param baseURI Base URI for the dataset.
 * @param rdfNotation Format of the source RDF File (NTriples, N3, RDF-XML...)
 * @param compressionType Compression type of the RDF file. (GZIP, BZIP, XZ...)
 * @param hdtFormat Parameters to tune the generated HDT.
 * @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
 * @return HDT
 * @throws IOException when the file cannot be found
 * @throws ParserException when the RDF file can't be parsed
 */
public static HDT generateHDTDisk(String rdfFileName, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
    final HDTManager manager = HDTManager.getInstance();
    return manager.doGenerateHDTDisk(rdfFileName, baseURI, rdfNotation, compressionType, hdtFormat, listener);
}
/**
 * Create an HDT from an uncompressed RDF file by sorting the triples on disk, which reduces the
 * memory required at the cost of more IO usage. The file is read with {@link CompressionType#NONE}.
 * @param rdfFileName RDF file to parse.
 * @param baseURI Base URI for the dataset.
 * @param rdfNotation Format of the source RDF File (NTriples, N3, RDF-XML...)
 * @param hdtFormat Parameters to tune the generated HDT.
 * @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
 * @return HDT
 * @throws IOException when the file cannot be found
 * @throws ParserException when the RDF file can't be parsed
 */
public static HDT generateHDTDisk(String rdfFileName, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
    final HDTManager manager = HDTManager.getInstance();
    final CompressionType uncompressed = CompressionType.NONE;
    return manager.doGenerateHDTDisk(rdfFileName, baseURI, rdfNotation, uncompressed, hdtFormat, listener);
}
/**
 * Create an HDT from an RDF file by sorting the triples on disk, which reduces the memory required
 * at the cost of more IO usage. The RDF notation and the compression are guessed from the file
 * name with {@code RDFNotation.guess} and {@code CompressionType.guess}.
 * @param rdfFileName RDF file to parse.
 * @param baseURI Base URI for the dataset.
 * @param hdtFormat Parameters to tune the generated HDT.
 * @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
 * @return HDT
 * @throws IOException when the file cannot be found
 * @throws ParserException when the RDF file can't be parsed
 */
public static HDT generateHDTDisk(String rdfFileName, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
    final HDTManager manager = HDTManager.getInstance();
    final RDFNotation guessedNotation = RDFNotation.guess(rdfFileName);
    final CompressionType guessedCompression = CompressionType.guess(rdfFileName);
    return manager.doGenerateHDTDisk(rdfFileName, baseURI, guessedNotation, guessedCompression, hdtFormat, listener);
}
/**
 * Create an HDT from an RDF stream by sorting the triples on disk, which reduces the memory
 * required at the cost of more IO usage. The RDF notation and the compression of the stream are
 * guessed from a file name.
 * @param fileStream RDF stream to parse.
 * @param baseURI Base URI for the dataset.
 * @param filename the RDF file name, given to {@code RDFNotation.guess} and {@code CompressionType.guess}
 *                 to pick the stream format and compression.
 * @param hdtFormat Parameters to tune the generated HDT.
 * @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
 * @return HDT
 * @throws IOException when the stream cannot be used
 * @throws ParserException when the RDF stream can't be parsed
 */
public static HDT generateHDTDisk(InputStream fileStream, String baseURI, String filename, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
    final HDTManager manager = HDTManager.getInstance();
    final RDFNotation guessedNotation = RDFNotation.guess(filename);
    final CompressionType guessedCompression = CompressionType.guess(filename);
    return manager.doGenerateHDTDisk(fileStream, baseURI, guessedNotation, guessedCompression, hdtFormat, listener);
}
/**
 * Create an HDT from an RDF stream by sorting the triples on disk, which reduces the memory
 * required at the cost of more IO usage. Notation and compression are given by the caller.
 * @param fileStream RDF stream to parse.
 * @param baseURI Base URI for the dataset.
 * @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...)
 * @param compressionType Compression type of the RDF stream. (GZIP, BZIP, XZ...)
 * @param hdtFormat Parameters to tune the generated HDT.
 * @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
 * @return HDT
 * @throws IOException when the stream cannot be used
 * @throws ParserException when the RDF stream can't be parsed
 */
public static HDT generateHDTDisk(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
    final HDTManager manager = HDTManager.getInstance();
    return manager.doGenerateHDTDisk(fileStream, baseURI, rdfNotation, compressionType, hdtFormat, listener);
}
/**
 * Create an HDT from an uncompressed RDF stream by sorting the triples on disk, which reduces the
 * memory required at the cost of more IO usage. The stream is read with {@link CompressionType#NONE}.
 * @param fileStream RDF stream to parse.
 * @param baseURI Base URI for the dataset.
 * @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...)
 * @param hdtFormat Parameters to tune the generated HDT.
 * @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
 * @return HDT
 * @throws IOException when the stream cannot be used
 * @throws ParserException when the RDF stream can't be parsed
 */
public static HDT generateHDTDisk(InputStream fileStream, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
    final HDTManager manager = HDTManager.getInstance();
    final CompressionType uncompressed = CompressionType.NONE;
    return manager.doGenerateHDTDisk(fileStream, baseURI, rdfNotation, uncompressed, hdtFormat, listener);
}
/**
 * Create an HDT from an iterator of triples by sorting the triples on disk, which reduces the
 * memory required at the cost of more IO usage.
 * @param iterator the triples to put in the HDT.
 * @param baseURI Base URI for the dataset.
 * @param hdtFormat Parameters to tune the generated HDT.
 * @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
 * @return HDT
 * @throws IOException when the triples cannot be read or the HDT cannot be written
 * @throws ParserException when the implementation reports a parsing error
 */
public static HDT generateHDTDisk(Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
    final HDTManager manager = HDTManager.getInstance();
    return manager.doGenerateHDTDisk(iterator, baseURI, hdtFormat, listener);
}

public static TripleWriter getHDTWriter(OutputStream out, String baseURI, HDTOptions hdtFormat) throws IOException {
return HDTManager.getInstance().doGetHDTWriter(out, baseURI, hdtFormat);
Expand Down Expand Up @@ -405,7 +553,11 @@ public static HDT catTree(RDFFluxStop fluxStop, HDTSupplier supplier, Iterator<T
protected abstract HDT doMapIndexedHDT(String hdtFileName, ProgressListener listener, HDTOptions spec) throws IOException;
protected abstract HDT doIndexedHDT(HDT hdt, ProgressListener listener) throws IOException;
protected abstract HDT doGenerateHDT(String rdfFileName, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
// implementation behind the static generateHDT(InputStream, ...) overloads
protected abstract HDT doGenerateHDT(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
// implementation behind the static generateHDT(Iterator, ...) method
protected abstract HDT doGenerateHDT(Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException;
// implementations behind the static generateHDTDisk(...) overloads, one per kind of input:
// RDF file name, RDF stream and iterator of triples
protected abstract HDT doGenerateHDTDisk(String rdfFileName, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
protected abstract HDT doGenerateHDTDisk(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
protected abstract HDT doGenerateHDTDisk(Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
// implementation behind the static getHDTWriter(OutputStream, ...) method
protected abstract TripleWriter doGetHDTWriter(OutputStream out, String baseURI, HDTOptions hdtFormat) throws IOException;
protected abstract TripleWriter doGetHDTWriter(String outFile, String baseURI, HDTOptions hdtFormat) throws IOException;
protected abstract HDT doHDTCat(String location, String hdtFileName1, String hdtFileName2, HDTOptions hdtFormat, ProgressListener listener) throws IOException;
Expand Down
13 changes: 12 additions & 1 deletion hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTSupplier.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import org.rdfhdt.hdt.exceptions.ParserException;
import org.rdfhdt.hdt.listener.ProgressListener;
import org.rdfhdt.hdt.options.HDTOptions;
import org.rdfhdt.hdt.options.HDTOptionsKeys;
import org.rdfhdt.hdt.triples.TripleString;

import java.io.IOException;
Expand All @@ -19,14 +20,24 @@ public interface HDTSupplier {
/**
 * @return implementation using in-memory hdt: the HDT is generated with
 * {@code HDTManager.generateHDT}, saved to the requested location, then closed
 */
static HDTSupplier memory() {
static org.rdfhdt.hdt.hdt.HDTSupplier memory() {
return (iterator, baseURI, hdtFormat, listener, location) -> {
// try-with-resources: the in-memory HDT is closed once it has been saved
try (HDT hdt = HDTManager.generateHDT(iterator, baseURI, hdtFormat, listener)) {
hdt.saveToHDT(location.toAbsolutePath().toString(), listener);
}
};
}

/**
 * @return implementation using the disk generation: the absolute target location is stored in the
 * options under {@link HDTOptionsKeys#LOADER_DISK_FUTURE_HDT_LOCATION_KEY} before calling
 * {@code HDTManager.generateHDTDisk}, and the returned HDT is closed right away
 */
static org.rdfhdt.hdt.hdt.HDTSupplier disk() {
    return (iterator, baseURI, hdtFormat, listener, location) -> {
        final String futureLocation = location.toAbsolutePath().toString();
        // note: this writes into the options object handed over by the caller
        hdtFormat.set(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY, futureLocation);
        final HDT generated = HDTManager.generateHDTDisk(iterator, baseURI, hdtFormat, listener);
        generated.close();
    };
}

/**
* Generate the HDT
*
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package org.rdfhdt.hdt.listener;

/**
 * {@link org.rdfhdt.hdt.listener.ProgressListener} variant for multi-thread logging: every
 * notification carries the name of the thread reporting it
 */
@FunctionalInterface
public interface MultiThreadListener extends ProgressListener {

    /**
     * Send a progress notification on behalf of a thread
     * @param thread name of the reporting thread
     * @param level percent of the task accomplished
     * @param message Description of the operation
     */
    void notifyProgress(String thread, float level, String message);

    /**
     * Send a progress notification for the calling thread, forwarded to
     * {@link #notifyProgress(String, float, String)} with the name of the current thread
     * @param level percent of the task accomplished
     * @param message Description of the operation
     */
    default void notifyProgress(float level, String message) {
        final String currentThreadName = Thread.currentThread().getName();
        notifyProgress(currentThreadName, level, message);
    }

    /**
     * Unregister every thread, does nothing unless the implementation overrides it
     */
    default void unregisterAllThreads() {
        // no-op by default, to be overridden by implementations that need it
    }

    /**
     * Register a thread, does nothing unless the implementation overrides it
     * @param threadName the thread name
     */
    default void registerThread(String threadName) {
        // no-op by default, to be overridden by implementations that need it
    }

    /**
     * Unregister a thread, does nothing unless the implementation overrides it
     * @param threadName the thread name
     */
    default void unregisterThread(String threadName) {
        // no-op by default, to be overridden by implementations that need it
    }
}
Loading