Skip to content

Commit

Permalink
add HDTGenerateDisk method with tests
Browse files Browse the repository at this point in the history
  • Loading branch information
ate47 committed Jul 19, 2022
1 parent 7a313d0 commit 7c9ba69
Show file tree
Hide file tree
Showing 94 changed files with 9,034 additions and 341 deletions.
52 changes: 52 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/enums/CompressionType.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package org.rdfhdt.hdt.enums;

/**
* A compression type
* @author Antoine Willerval
*/
public enum CompressionType {

/**
* gzip compression (.gz .tgz)
*/
GZIP("gz", "tgz"),
/**
* bzip compression (.bz2 .bz)
*/
BZIP("bz2", "bz"),
/**
* bzip compression (.xz)
*/
XZ("xz"),
/**
* no compression
*/
NONE;

/**
* try to guess a compression of a file with its name
* @param fileName the file name to guess
* @return the compression type or none if it can't be guessed
*/
public static CompressionType guess(String fileName) {
String str = fileName.toLowerCase();

int idx = str.lastIndexOf('.');
if(idx!=-1) {
String ext = str.substring(idx + 1);
for (CompressionType type: values()) {
for (String typeExt : type.ext) {
if (typeExt.equals(ext)) {
return type;
}
}
}
}
return NONE;
}

private final String[] ext;
CompressionType(String... ext) {
this.ext = ext;
}
}
152 changes: 152 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import java.util.Iterator;

import org.rdfhdt.hdt.compact.bitmap.Bitmap;
import org.rdfhdt.hdt.enums.CompressionType;
import org.rdfhdt.hdt.enums.RDFNotation;
import org.rdfhdt.hdt.exceptions.ParserException;
import org.rdfhdt.hdt.listener.ProgressListener;
Expand Down Expand Up @@ -290,6 +291,153 @@ public static HDT generateHDT(String rdfFileName, String baseURI, RDFNotation rd
public static HDT generateHDT(Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDT(iterator, baseURI, hdtFormat, listener);
}
/**
* Create an HDT file from a RDF stream.
* @param fileStream RDF stream to parse.
* @param baseURI Base URI for the dataset.
* @param filename the RDF file name to guess the stream format and compresion.
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
* @return HDT
* @throws IOException when the stream cannot be used
* @throws ParserException when the RDF stream can't be parsed
*/
public static HDT generateHDT(InputStream fileStream, String baseURI, String filename, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDT(fileStream, baseURI, RDFNotation.guess(filename), CompressionType.guess(filename), hdtFormat, listener);
}
/**
* Create an HDT file from a RDF stream.
* @param fileStream RDF stream to parse.
* @param baseURI Base URI for the dataset.
* @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...)
* @param compressionType Compression type of the RDF stream. (GZIP, ZIP...)
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
* @return HDT
* @throws IOException when the stream cannot be used
* @throws ParserException when the RDF stream can't be parsed
*/
public static HDT generateHDT(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDT(fileStream, baseURI, rdfNotation, compressionType, hdtFormat, listener);
}
/**
* Create an HDT file from a RDF stream.
* @param fileStream RDF stream to parse.
* @param baseURI Base URI for the dataset.
* @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...)
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
* @return HDT
* @throws IOException when the stream cannot be used
* @throws ParserException when the RDF stream can't be parsed
*/
public static HDT generateHDT(InputStream fileStream, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDT(fileStream, baseURI, rdfNotation, CompressionType.NONE, hdtFormat, listener);
}

/**
* Create an HDT file from an RDF file by sorting the triples on disk, reduce the memory required by increasing the
* IO usage.
* @param rdfFileName RDF file to parse.
* @param baseURI Base URI for the dataset.
* @param rdfNotation Format of the source RDF File (NTriples, N3, RDF-XML...)
* @param compressionType Compression type of the RDF file. (GZIP, ZIP...)
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
* @return HDT
* @throws IOException when the file cannot be found
* @throws ParserException when the RDF file can't be parsed
*/
public static HDT generateHDTDisk(String rdfFileName, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDTDisk(rdfFileName, baseURI, rdfNotation, compressionType, hdtFormat, listener);
}
/**
* Create an HDT file from an RDF file without compression by sorting the triples on disk, reduce the memory
* required by increasing the IO usage.
* @param rdfFileName RDF file to parse.
* @param baseURI Base URI for the dataset.
* @param rdfNotation Format of the source RDF File (NTriples, N3, RDF-XML...)
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
* @return HDT
* @throws IOException when the file cannot be found
* @throws ParserException when the RDF file can't be parsed
*/
public static HDT generateHDTDisk(String rdfFileName, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDTDisk(rdfFileName, baseURI, rdfNotation, CompressionType.NONE, hdtFormat, listener);
}
/**
* Create an HDT file from an RDF file by sorting the triples on disk, reduce the memory required by increasing the
* IO usage. Will guess the RDF file compression/format with the file name.
* @param rdfFileName RDF file to parse.
* @param baseURI Base URI for the dataset.
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
* @return HDT
* @throws IOException when the file cannot be found
* @throws ParserException when the RDF file can't be parsed
*/
public static HDT generateHDTDisk(String rdfFileName, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDTDisk(rdfFileName, baseURI, RDFNotation.guess(rdfFileName), CompressionType.guess(rdfFileName), hdtFormat, listener);
}
/**
* Create an HDT file from an RDF stream by sorting the triples on disk, reduce the memory required by increasing
* the IO usage.
* @param fileStream RDF stream to parse.
* @param baseURI Base URI for the dataset.
* @param filename the RDF file name to guess the stream format and compresion.
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
* @return HDT
* @throws IOException when the stream cannot be used
* @throws ParserException when the RDF stream can't be parsed
*/
public static HDT generateHDTDisk(InputStream fileStream, String baseURI, String filename, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDTDisk(fileStream, baseURI, RDFNotation.guess(filename), CompressionType.guess(filename), hdtFormat, listener);
}
/**
* Create an HDT file from an RDF stream by sorting the triples on disk, reduce the memory required by increasing
* the IO usage.
* @param fileStream RDF stream to parse.
* @param baseURI Base URI for the dataset.
* @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...)
* @param compressionType Compression type of the RDF stream. (GZIP, ZIP...)
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
* @return HDT
* @throws IOException when the stream cannot be used
* @throws ParserException when the RDF stream can't be parsed
*/
public static HDT generateHDTDisk(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDTDisk(fileStream, baseURI, rdfNotation, compressionType, hdtFormat, listener);
}
/**
* Create an HDT file from an RDF stream by sorting the triples on disk, reduce the memory required by increasing
* the IO usage.
* @param fileStream RDF stream to parse.
* @param baseURI Base URI for the dataset.
* @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...)
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
* @return HDT
* @throws IOException when the stream cannot be used
* @throws ParserException when the RDF stream can't be parsed
*/
public static HDT generateHDTDisk(InputStream fileStream, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDTDisk(fileStream, baseURI, rdfNotation, CompressionType.NONE, hdtFormat, listener);
}
/**
* Create an HDT file from an RDF stream by sorting the triples on disk, reduce the memory required by increasing
* the IO usage.
* @param baseURI Base URI for the dataset.
* @param hdtFormat Parameters to tune the generated HDT.
* @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
* @return HDT
* @throws IOException when the stream cannot be used
*/
public static HDT generateHDTDisk(Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDTDisk(iterator, baseURI, hdtFormat, listener);
}

public static TripleWriter getHDTWriter(OutputStream out, String baseURI, HDTOptions hdtFormat) throws IOException {
return HDTManager.getInstance().doGetHDTWriter(out, baseURI, hdtFormat);
Expand Down Expand Up @@ -349,7 +497,11 @@ public static HDT diffHDTBit(String location, String hdtFileName, Bitmap deleteB
protected abstract HDT doMapIndexedHDT(String hdtFileName, ProgressListener listener, HDTOptions spec) throws IOException;
protected abstract HDT doIndexedHDT(HDT hdt, ProgressListener listener) throws IOException;
protected abstract HDT doGenerateHDT(String rdfFileName, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
protected abstract HDT doGenerateHDT(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
protected abstract HDT doGenerateHDT(Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException;
protected abstract HDT doGenerateHDTDisk(String rdfFileName, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
protected abstract HDT doGenerateHDTDisk(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
protected abstract HDT doGenerateHDTDisk(Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
protected abstract TripleWriter doGetHDTWriter(OutputStream out, String baseURI, HDTOptions hdtFormat) throws IOException;
protected abstract TripleWriter doGetHDTWriter(String outFile, String baseURI, HDTOptions hdtFormat) throws IOException;
protected abstract HDT doHDTCat(String location, String hdtFileName1, String hdtFileName2, HDTOptions hdtFormat, ProgressListener listener) throws IOException;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package org.rdfhdt.hdt.listener;

import org.rdfhdt.hdt.listener.ProgressListener;

/**
* version of {@link org.rdfhdt.hdt.listener.ProgressListener} for multi-thread logging
*/
@FunctionalInterface
public interface MultiThreadListener extends ProgressListener {

/**
* Send progress notification
* @param thread thread name
* @param level percent of the task accomplished
* @param message Description of the operation
*/
void notifyProgress(String thread, float level, String message);

/**
* Send progress notification, should call {@link #notifyProgress(String, float, String)}
* @param level percent of the task accomplished
* @param message Description of the operation
*/
default void notifyProgress(float level, String message) {
notifyProgress(Thread.currentThread().getName(), level, message);
}

/**
* unregister all the thread
*/
default void unregisterAllThreads() {
// should be filled by implementation if required
}

/**
* register a thread
* @param threadName the thread name
*/
default void registerThread(String threadName) {
// should be filled by implementation if required
}

/**
* unregister a thread
* @param threadName the thread name
*/
default void unregisterThread(String threadName) {
// should be filled by implementation if required
}
}
77 changes: 77 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package org.rdfhdt.hdt.options;

/**
* keys usable with {@link org.rdfhdt.hdt.options.HDTOptions#set(String, String)}
* @author Antoine Willerval
*/
public class HDTOptionsKeys {
/**
* Key for the compression mode for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods.
* Value can be {@link #LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE} or
* {@link #LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE}
*/
public static final String LOADER_DISK_COMPRESSION_MODE_KEY = "loader.disk.compressMode";
/**
* Value for {@link #LOADER_DISK_COMPRESSION_MODE_KEY}, sort all the file before going to the next step, slower
* but decrease the RAM usage. default config.
*/
public static final String LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE = "compressionComplete";
/**
* Value for {@link #LOADER_DISK_COMPRESSION_MODE_KEY}, sort while reading all the file before going to the next
* step, faster but increase the RAM usage.
*/
public static final String LOADER_DISK_COMPRESSION_MODE_VALUE_PARTIAL = "compressionPartial";
/**
* Key for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods,
* say the number of workers to merge the data. default to the number of processor. long value.
*/
public static final String LOADER_DISK_COMPRESSION_WORKER_KEY = "loader.disk.compressWorker";
/**
* Key for the maximum size of a chunk on disk for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk
* methods, the chunk should be in RAM before writing it on disk and should be sorted. long value.
*/
public static final String LOADER_DISK_CHUNK_SIZE_KEY = "loader.disk.chunkSize";
/**
* Key for the location of the working directory {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods,
* this directory will be deleted after the HDT generation. by default, the value is random, it is recommended to
* set this option to delete the directory in case of an interruption of the process. file value.
*/
public static final String LOADER_DISK_LOCATION_KEY = "loader.disk.location";
/**
* Key for the location of the future HDT for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods,
* this option will create a hdt file after the HDT generation, the returned HDT will be a mapped HDT of the HDT
* file. slower, increase the disk usage, but drastically reduce the RAM usage. file value.
*/
public static final String LOADER_DISK_FUTURE_HDT_LOCATION_KEY = "loader.disk.futureHDTLocation";
/**
* Key for the maximum number of file opened at the same time, should be greater than {@link #LOADER_DISK_KWAY_KEY},
* 1024 by default
*/
public static final String LOADER_DISK_MAX_FILE_OPEN_KEY = "loader.disk.maxFileOpen";
/**
* Key for the number of chunk layers opened at the same time, by default
* <p>min(log2(maxFileOpen), chunkSize / (fileBufferSize * compressWorker))</p>
*/
public static final String LOADER_DISK_KWAY_KEY = "loader.disk.kway";
/**
* Key for the size of the buffers when opening a file
*/
public static final String LOADER_DISK_BUFFER_SIZE_KEY = "loader.disk.fileBufferSize";
/**
* Key for the loading mode of a RDF file for the
* {@link org.rdfhdt.hdt.hdt.HDTManager#generateHDT(String, String, org.rdfhdt.hdt.enums.RDFNotation, HDTOptions, org.rdfhdt.hdt.listener.ProgressListener)}
* method, this key isn't working with the other methods.
* Value can be {@link #LOADER_TYPE_VALUE_ONE_PASS} or {@link #LOADER_TYPE_VALUE_TWO_PASS}.
*/
public static final String LOADER_TYPE_KEY = "loader.type";
/**
* Value for {@link #LOADER_TYPE_KEY}, read twice the RDF file, reduce the RAM usage
*/
public static final String LOADER_TYPE_VALUE_TWO_PASS = "two-pass";
/**
* Value for {@link #LOADER_TYPE_KEY}, read only once the RDF file, default value
*/
public static final String LOADER_TYPE_VALUE_ONE_PASS = "one-pass";

private HDTOptionsKeys() {}
}
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
*
*/
public interface RDFParserCallback {
@FunctionalInterface
interface RDFCallback {
void processTriple(TripleString triple, long pos);
}
Expand Down
4 changes: 4 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/util/UnicodeEscape.java
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,10 @@ public static void escapeString(String label, Appendable appendable)
}
}
}

if (last == label.length()) {
last--;
}

for (int i = first; i <= last; i++) {
char c = label.charAt(i);
Expand Down
Loading

0 comments on commit 7c9ba69

Please sign in to comment.