Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

generateHDTDisk : a merge sort on disk to create HDTs #162

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
add HDTGenerateDisk method with tests
  • Loading branch information
ate47 committed Sep 15, 2022
commit 5f011f9dc6628d049a1118563ea871b24cd78e41
52 changes: 52 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/enums/CompressionType.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package org.rdfhdt.hdt.enums;

/**
* A compression type
* @author Antoine Willerval
*/
public enum CompressionType {

/**
* gzip compression (.gz .tgz)
*/
GZIP("gz", "tgz"),
/**
* bzip compression (.bz2 .bz)
*/
BZIP("bz2", "bz"),
/**
* bzip compression (.xz)
*/
XZ("xz"),
/**
* no compression
*/
NONE;

/**
* try to guess a compression of a file with its name
* @param fileName the file name to guess
* @return the compression type or none if it can't be guessed
*/
public static CompressionType guess(String fileName) {
String str = fileName.toLowerCase();

int idx = str.lastIndexOf('.');
if(idx!=-1) {
String ext = str.substring(idx + 1);
for (CompressionType type: values()) {
for (String typeExt : type.ext) {
if (typeExt.equals(ext)) {
return type;
}
}
}
}
return NONE;
}

private final String[] ext;
CompressionType(String... ext) {
this.ext = ext;
}
}
152 changes: 152 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import java.util.Iterator;

import org.rdfhdt.hdt.compact.bitmap.Bitmap;
import org.rdfhdt.hdt.enums.CompressionType;
import org.rdfhdt.hdt.enums.RDFNotation;
import org.rdfhdt.hdt.exceptions.ParserException;
import org.rdfhdt.hdt.listener.ProgressListener;
Expand Down Expand Up @@ -290,6 +291,153 @@ public static HDT generateHDT(String rdfFileName, String baseURI, RDFNotation rd
public static HDT generateHDT(Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDT(iterator, baseURI, hdtFormat, listener);
}
/**
* Create an HDT file from a RDF stream.
* @param fileStream RDF stream to parse.
* @param baseURI Base URI for the dataset.
* @param filename the RDF file name to guess the stream format and compresion.
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
* @return HDT
* @throws IOException when the stream cannot be used
* @throws ParserException when the RDF stream can't be parsed
*/
public static HDT generateHDT(InputStream fileStream, String baseURI, String filename, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDT(fileStream, baseURI, RDFNotation.guess(filename), CompressionType.guess(filename), hdtFormat, listener);
}
/**
* Create an HDT file from a RDF stream.
* @param fileStream RDF stream to parse.
* @param baseURI Base URI for the dataset.
* @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...)
* @param compressionType Compression type of the RDF stream. (GZIP, ZIP...)
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
* @return HDT
* @throws IOException when the stream cannot be used
* @throws ParserException when the RDF stream can't be parsed
*/
public static HDT generateHDT(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDT(fileStream, baseURI, rdfNotation, compressionType, hdtFormat, listener);
}
/**
* Create an HDT file from a RDF stream.
* @param fileStream RDF stream to parse.
* @param baseURI Base URI for the dataset.
* @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...)
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
* @return HDT
* @throws IOException when the stream cannot be used
* @throws ParserException when the RDF stream can't be parsed
*/
public static HDT generateHDT(InputStream fileStream, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDT(fileStream, baseURI, rdfNotation, CompressionType.NONE, hdtFormat, listener);
}

/**
* Create an HDT file from an RDF file by sorting the triples on disk, reduce the memory required by increasing the
* IO usage.
* @param rdfFileName RDF file to parse.
* @param baseURI Base URI for the dataset.
* @param rdfNotation Format of the source RDF File (NTriples, N3, RDF-XML...)
* @param compressionType Compression type of the RDF file. (GZIP, ZIP...)
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
* @return HDT
* @throws IOException when the file cannot be found
* @throws ParserException when the RDF file can't be parsed
*/
public static HDT generateHDTDisk(String rdfFileName, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDTDisk(rdfFileName, baseURI, rdfNotation, compressionType, hdtFormat, listener);
}
/**
* Create an HDT file from an RDF file without compression by sorting the triples on disk, reduce the memory
* required by increasing the IO usage.
* @param rdfFileName RDF file to parse.
* @param baseURI Base URI for the dataset.
* @param rdfNotation Format of the source RDF File (NTriples, N3, RDF-XML...)
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
* @return HDT
* @throws IOException when the file cannot be found
* @throws ParserException when the RDF file can't be parsed
*/
public static HDT generateHDTDisk(String rdfFileName, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDTDisk(rdfFileName, baseURI, rdfNotation, CompressionType.NONE, hdtFormat, listener);
}
/**
* Create an HDT file from an RDF file by sorting the triples on disk, reduce the memory required by increasing the
* IO usage. Will guess the RDF file compression/format with the file name.
* @param rdfFileName RDF file to parse.
* @param baseURI Base URI for the dataset.
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
* @return HDT
* @throws IOException when the file cannot be found
* @throws ParserException when the RDF file can't be parsed
*/
public static HDT generateHDTDisk(String rdfFileName, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDTDisk(rdfFileName, baseURI, RDFNotation.guess(rdfFileName), CompressionType.guess(rdfFileName), hdtFormat, listener);
}
/**
* Create an HDT file from an RDF stream by sorting the triples on disk, reduce the memory required by increasing
* the IO usage.
* @param fileStream RDF stream to parse.
* @param baseURI Base URI for the dataset.
* @param filename the RDF file name to guess the stream format and compresion.
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
* @return HDT
* @throws IOException when the stream cannot be used
* @throws ParserException when the RDF stream can't be parsed
*/
public static HDT generateHDTDisk(InputStream fileStream, String baseURI, String filename, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDTDisk(fileStream, baseURI, RDFNotation.guess(filename), CompressionType.guess(filename), hdtFormat, listener);
}
/**
* Create an HDT file from an RDF stream by sorting the triples on disk, reduce the memory required by increasing
* the IO usage.
* @param fileStream RDF stream to parse.
* @param baseURI Base URI for the dataset.
* @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...)
* @param compressionType Compression type of the RDF stream. (GZIP, ZIP...)
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
* @return HDT
* @throws IOException when the stream cannot be used
* @throws ParserException when the RDF stream can't be parsed
*/
public static HDT generateHDTDisk(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDTDisk(fileStream, baseURI, rdfNotation, compressionType, hdtFormat, listener);
}
/**
* Create an HDT file from an RDF stream by sorting the triples on disk, reduce the memory required by increasing
* the IO usage.
* @param fileStream RDF stream to parse.
* @param baseURI Base URI for the dataset.
* @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...)
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
* @return HDT
* @throws IOException when the stream cannot be used
* @throws ParserException when the RDF stream can't be parsed
*/
public static HDT generateHDTDisk(InputStream fileStream, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDTDisk(fileStream, baseURI, rdfNotation, CompressionType.NONE, hdtFormat, listener);
}
/**
* Create an HDT file from an RDF stream by sorting the triples on disk, reduce the memory required by increasing
* the IO usage.
* @param baseURI Base URI for the dataset.
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
* @return HDT
* @throws IOException when the stream cannot be used
*/
public static HDT generateHDTDisk(Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDTDisk(iterator, baseURI, hdtFormat, listener);
}

public static TripleWriter getHDTWriter(OutputStream out, String baseURI, HDTOptions hdtFormat) throws IOException {
return HDTManager.getInstance().doGetHDTWriter(out, baseURI, hdtFormat);
Expand Down Expand Up @@ -349,7 +497,11 @@ public static HDT diffHDTBit(String location, String hdtFileName, Bitmap deleteB
protected abstract HDT doMapIndexedHDT(String hdtFileName, ProgressListener listener, HDTOptions spec) throws IOException;
protected abstract HDT doIndexedHDT(HDT hdt, ProgressListener listener) throws IOException;
protected abstract HDT doGenerateHDT(String rdfFileName, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
protected abstract HDT doGenerateHDT(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
protected abstract HDT doGenerateHDT(Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException;
protected abstract HDT doGenerateHDTDisk(String rdfFileName, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
protected abstract HDT doGenerateHDTDisk(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
protected abstract HDT doGenerateHDTDisk(Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
protected abstract TripleWriter doGetHDTWriter(OutputStream out, String baseURI, HDTOptions hdtFormat) throws IOException;
protected abstract TripleWriter doGetHDTWriter(String outFile, String baseURI, HDTOptions hdtFormat) throws IOException;
protected abstract HDT doHDTCat(String location, String hdtFileName1, String hdtFileName2, HDTOptions hdtFormat, ProgressListener listener) throws IOException;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package org.rdfhdt.hdt.listener;

import org.rdfhdt.hdt.listener.ProgressListener;

/**
* version of {@link org.rdfhdt.hdt.listener.ProgressListener} for multi-thread logging
*/
@FunctionalInterface
public interface MultiThreadListener extends ProgressListener {

/**
* Send progress notification
* @param thread thread name
* @param level percent of the task accomplished
* @param message Description of the operation
*/
void notifyProgress(String thread, float level, String message);

/**
* Send progress notification, should call {@link #notifyProgress(String, float, String)}
* @param level percent of the task accomplished
* @param message Description of the operation
*/
default void notifyProgress(float level, String message) {
notifyProgress(Thread.currentThread().getName(), level, message);
}

/**
* unregister all the thread
*/
default void unregisterAllThreads() {
// should be filled by implementation if required
}

/**
* register a thread
* @param threadName the thread name
*/
default void registerThread(String threadName) {
// should be filled by implementation if required
}

/**
* unregister a thread
* @param threadName the thread name
*/
default void unregisterThread(String threadName) {
// should be filled by implementation if required
}
}
77 changes: 77 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package org.rdfhdt.hdt.options;

/**
* keys usable with {@link org.rdfhdt.hdt.options.HDTOptions#set(String, String)}
* @author Antoine Willerval
*/
public class HDTOptionsKeys {
/**
* Key for the compression mode for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods.
* Value can be {@link #LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE} or
* {@link #LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE}
*/
public static final String LOADER_DISK_COMPRESSION_MODE_KEY = "loader.disk.compressMode";
/**
* Value for {@link #LOADER_DISK_COMPRESSION_MODE_KEY}, sort all the file before going to the next step, slower
* but decrease the RAM usage. default config.
*/
public static final String LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE = "compressionComplete";
/**
* Value for {@link #LOADER_DISK_COMPRESSION_MODE_KEY}, sort while reading all the file before going to the next
* step, faster but increase the RAM usage.
*/
public static final String LOADER_DISK_COMPRESSION_MODE_VALUE_PARTIAL = "compressionPartial";
/**
* Key for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods,
* say the number of workers to merge the data. default to the number of processor. long value.
*/
public static final String LOADER_DISK_COMPRESSION_WORKER_KEY = "loader.disk.compressWorker";
/**
* Key for the maximum size of a chunk on disk for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk
* methods, the chunk should be in RAM before writing it on disk and should be sorted. long value.
*/
public static final String LOADER_DISK_CHUNK_SIZE_KEY = "loader.disk.chunkSize";
/**
* Key for the location of the working directory {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods,
* this directory will be deleted after the HDT generation. by default, the value is random, it is recommended to
* set this option to delete the directory in case of an interruption of the process. file value.
*/
public static final String LOADER_DISK_LOCATION_KEY = "loader.disk.location";
/**
* Key for the location of the future HDT for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods,
* this option will create a hdt file after the HDT generation, the returned HDT will be a mapped HDT of the HDT
* file. slower, increase the disk usage, but drastically reduce the RAM usage. file value.
*/
public static final String LOADER_DISK_FUTURE_HDT_LOCATION_KEY = "loader.disk.futureHDTLocation";
/**
* Key for the maximum number of file opened at the same time, should be greater than {@link #LOADER_DISK_KWAY_KEY},
* 1024 by default
*/
public static final String LOADER_DISK_MAX_FILE_OPEN_KEY = "loader.disk.maxFileOpen";
/**
* Key for the number of chunk layers opened at the same time, by default
* <p>min(log2(maxFileOpen), chunkSize / (fileBufferSize * compressWorker))</p>
*/
public static final String LOADER_DISK_KWAY_KEY = "loader.disk.kway";
/**
* Key for the size of the buffers when opening a file
*/
public static final String LOADER_DISK_BUFFER_SIZE_KEY = "loader.disk.fileBufferSize";
/**
* Key for the loading mode of a RDF file for the
* {@link org.rdfhdt.hdt.hdt.HDTManager#generateHDT(String, String, org.rdfhdt.hdt.enums.RDFNotation, HDTOptions, org.rdfhdt.hdt.listener.ProgressListener)}
* method, this key isn't working with the other methods.
* Value can be {@link #LOADER_TYPE_VALUE_ONE_PASS} or {@link #LOADER_TYPE_VALUE_TWO_PASS}.
*/
public static final String LOADER_TYPE_KEY = "loader.type";
/**
* Value for {@link #LOADER_TYPE_KEY}, read twice the RDF file, reduce the RAM usage
*/
public static final String LOADER_TYPE_VALUE_TWO_PASS = "two-pass";
/**
* Value for {@link #LOADER_TYPE_KEY}, read only once the RDF file, default value
*/
public static final String LOADER_TYPE_VALUE_ONE_PASS = "one-pass";

private HDTOptionsKeys() {}
}
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
*
*/
public interface RDFParserCallback {
@FunctionalInterface
interface RDFCallback {
void processTriple(TripleString triple, long pos);
}
Expand Down
4 changes: 4 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/util/UnicodeEscape.java
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,10 @@ public static void escapeString(String label, Appendable appendable)
}
}
}

if (last == label.length()) {
last--;
}

for (int i = first; i <= last; i++) {
char c = label.charAt(i);
Expand Down
Loading