Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

generateHDTDisk : a merge sort on disk to create HDTs #162

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/enums/CompressionType.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package org.rdfhdt.hdt.enums;

import java.util.Locale;

/**
 * A compression type, identified by its usual file extensions.
 * @author Antoine Willerval
 */
public enum CompressionType {

	/**
	 * gzip compression (.gz .tgz)
	 */
	GZIP("gz", "tgz"),
	/**
	 * bzip compression (.bz2 .bz)
	 */
	BZIP("bz2", "bz"),
	/**
	 * xz compression (.xz)
	 */
	XZ("xz"),
	/**
	 * no compression
	 */
	NONE;

	/**
	 * Try to guess the compression of a file from its name by matching the
	 * last extension (case-insensitive).
	 * @param fileName the file name to guess, may be null
	 * @return the compression type, or {@link #NONE} if it can't be guessed
	 */
	public static CompressionType guess(String fileName) {
		if (fileName == null) {
			// no name to inspect, assume an uncompressed stream
			return NONE;
		}
		// Locale.ROOT: extension matching must not depend on the default
		// locale (e.g. the Turkish dotless-i lowercasing of 'I')
		String str = fileName.toLowerCase(Locale.ROOT);

		int idx = str.lastIndexOf('.');
		if (idx != -1) {
			String ext = str.substring(idx + 1);
			for (CompressionType type : values()) {
				for (String typeExt : type.ext) {
					if (typeExt.equals(ext)) {
						return type;
					}
				}
			}
		}
		return NONE;
	}

	// known file extensions for this compression type, without the dot
	private final String[] ext;

	CompressionType(String... ext) {
		this.ext = ext;
	}
}
152 changes: 152 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import java.util.Iterator;

import org.rdfhdt.hdt.compact.bitmap.Bitmap;
import org.rdfhdt.hdt.enums.CompressionType;
import org.rdfhdt.hdt.enums.RDFNotation;
import org.rdfhdt.hdt.exceptions.ParserException;
import org.rdfhdt.hdt.listener.ProgressListener;
Expand Down Expand Up @@ -290,6 +291,153 @@ public static HDT generateHDT(String rdfFileName, String baseURI, RDFNotation rd
public static HDT generateHDT(Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
	// delegate to the concrete HDTManager implementation bound at runtime
	HDTManager instance = HDTManager.getInstance();
	return instance.doGenerateHDT(iterator, baseURI, hdtFormat, listener);
}
/**
 * Create an HDT file from a RDF stream.
 * @param fileStream RDF stream to parse.
 * @param baseURI Base URI for the dataset.
 * @param filename the RDF file name used to guess the stream format and compression.
 * @param hdtFormat Parameters to tune the generated HDT.
 * @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
 * @return HDT
 * @throws IOException when the stream cannot be used
 * @throws ParserException when the RDF stream can't be parsed
 */
public static HDT generateHDT(InputStream fileStream, String baseURI, String filename, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDT(fileStream, baseURI, RDFNotation.guess(filename), CompressionType.guess(filename), hdtFormat, listener);
}
/**
 * Create an HDT file from a RDF stream with an explicit format and compression.
 * @param fileStream RDF stream to parse.
 * @param baseURI Base URI for the dataset.
 * @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...)
 * @param compressionType Compression type of the RDF stream. (GZIP, ZIP...)
 * @param hdtFormat Parameters to tune the generated HDT.
 * @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
 * @return HDT
 * @throws IOException when the stream cannot be used
 * @throws ParserException when the RDF stream can't be parsed
 */
public static HDT generateHDT(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
	HDTManager instance = HDTManager.getInstance();
	return instance.doGenerateHDT(fileStream, baseURI, rdfNotation, compressionType, hdtFormat, listener);
}
/**
 * Create an HDT file from an uncompressed RDF stream.
 * @param fileStream RDF stream to parse.
 * @param baseURI Base URI for the dataset.
 * @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...)
 * @param hdtFormat Parameters to tune the generated HDT.
 * @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
 * @return HDT
 * @throws IOException when the stream cannot be used
 * @throws ParserException when the RDF stream can't be parsed
 */
public static HDT generateHDT(InputStream fileStream, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
	// no compression marker: the stream is read as-is
	HDTManager instance = HDTManager.getInstance();
	return instance.doGenerateHDT(fileStream, baseURI, rdfNotation, CompressionType.NONE, hdtFormat, listener);
}

/**
 * Create an HDT file from an RDF file by sorting the triples on disk, reducing the memory
 * required at the cost of more IO usage.
 * @param rdfFileName RDF file to parse.
 * @param baseURI Base URI for the dataset.
 * @param rdfNotation Format of the source RDF File (NTriples, N3, RDF-XML...)
 * @param compressionType Compression type of the RDF file. (GZIP, ZIP...)
 * @param hdtFormat Parameters to tune the generated HDT.
 * @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
 * @return HDT
 * @throws IOException when the file cannot be found
 * @throws ParserException when the RDF file can't be parsed
 */
public static HDT generateHDTDisk(String rdfFileName, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
	HDTManager instance = HDTManager.getInstance();
	return instance.doGenerateHDTDisk(rdfFileName, baseURI, rdfNotation, compressionType, hdtFormat, listener);
}
/**
 * Create an HDT file from an uncompressed RDF file by sorting the triples on disk,
 * reducing the memory required at the cost of more IO usage.
 * @param rdfFileName RDF file to parse.
 * @param baseURI Base URI for the dataset.
 * @param rdfNotation Format of the source RDF File (NTriples, N3, RDF-XML...)
 * @param hdtFormat Parameters to tune the generated HDT.
 * @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
 * @return HDT
 * @throws IOException when the file cannot be found
 * @throws ParserException when the RDF file can't be parsed
 */
public static HDT generateHDTDisk(String rdfFileName, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
	// no compression marker: the file is read as-is
	HDTManager instance = HDTManager.getInstance();
	return instance.doGenerateHDTDisk(rdfFileName, baseURI, rdfNotation, CompressionType.NONE, hdtFormat, listener);
}
/**
 * Create an HDT file from an RDF file by sorting the triples on disk, reducing the memory
 * required at the cost of more IO usage. The RDF format and compression are guessed from
 * the file name.
 * @param rdfFileName RDF file to parse.
 * @param baseURI Base URI for the dataset.
 * @param hdtFormat Parameters to tune the generated HDT.
 * @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
 * @return HDT
 * @throws IOException when the file cannot be found
 * @throws ParserException when the RDF file can't be parsed
 */
public static HDT generateHDTDisk(String rdfFileName, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
	// both the notation and the compression are deduced from the file name
	RDFNotation notation = RDFNotation.guess(rdfFileName);
	CompressionType compression = CompressionType.guess(rdfFileName);
	return HDTManager.getInstance().doGenerateHDTDisk(rdfFileName, baseURI, notation, compression, hdtFormat, listener);
}
/**
 * Create an HDT file from an RDF stream by sorting the triples on disk, reduce the memory required by increasing
 * the IO usage.
 * @param fileStream RDF stream to parse.
 * @param baseURI Base URI for the dataset.
 * @param filename the RDF file name used to guess the stream format and compression.
 * @param hdtFormat Parameters to tune the generated HDT.
 * @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
 * @return HDT
 * @throws IOException when the stream cannot be used
 * @throws ParserException when the RDF stream can't be parsed
 */
public static HDT generateHDTDisk(InputStream fileStream, String baseURI, String filename, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDTDisk(fileStream, baseURI, RDFNotation.guess(filename), CompressionType.guess(filename), hdtFormat, listener);
}
/**
 * Create an HDT file from an RDF stream by sorting the triples on disk, reducing the memory
 * required at the cost of more IO usage.
 * @param fileStream RDF stream to parse.
 * @param baseURI Base URI for the dataset.
 * @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...)
 * @param compressionType Compression type of the RDF stream. (GZIP, ZIP...)
 * @param hdtFormat Parameters to tune the generated HDT.
 * @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
 * @return HDT
 * @throws IOException when the stream cannot be used
 * @throws ParserException when the RDF stream can't be parsed
 */
public static HDT generateHDTDisk(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
	HDTManager instance = HDTManager.getInstance();
	return instance.doGenerateHDTDisk(fileStream, baseURI, rdfNotation, compressionType, hdtFormat, listener);
}
/**
 * Create an HDT file from an uncompressed RDF stream by sorting the triples on disk,
 * reducing the memory required at the cost of more IO usage.
 * @param fileStream RDF stream to parse.
 * @param baseURI Base URI for the dataset.
 * @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...)
 * @param hdtFormat Parameters to tune the generated HDT.
 * @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
 * @return HDT
 * @throws IOException when the stream cannot be used
 * @throws ParserException when the RDF stream can't be parsed
 */
public static HDT generateHDTDisk(InputStream fileStream, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
	// no compression marker: the stream is read as-is
	HDTManager instance = HDTManager.getInstance();
	return instance.doGenerateHDTDisk(fileStream, baseURI, rdfNotation, CompressionType.NONE, hdtFormat, listener);
}
/**
 * Create an HDT file from a triple iterator by sorting the triples on disk, reduce the memory required by increasing
 * the IO usage.
 * @param iterator iterator over the triples to index
 * @param baseURI Base URI for the dataset.
 * @param hdtFormat Parameters to tune the generated HDT.
 * @param listener Listener to get notified of loading progress. Can be null if no notifications needed.
 * @return HDT
 * @throws IOException when the triples cannot be written to disk
 * @throws ParserException when the triples can't be parsed
 */
public static HDT generateHDTDisk(Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException {
return HDTManager.getInstance().doGenerateHDTDisk(iterator, baseURI, hdtFormat, listener);
}

public static TripleWriter getHDTWriter(OutputStream out, String baseURI, HDTOptions hdtFormat) throws IOException {
return HDTManager.getInstance().doGetHDTWriter(out, baseURI, hdtFormat);
Expand Down Expand Up @@ -349,7 +497,11 @@ public static HDT diffHDTBit(String location, String hdtFileName, Bitmap deleteB
// Implementation hooks provided by the concrete HDTManager binding returned by
// HDTManager.getInstance(). The public static generateHDT/generateHDTDisk/getHDTWriter
// entry points above delegate to the matching do* method; the remaining hooks
// (doMapIndexedHDT, doIndexedHDT, doHDTCat) presumably back the corresponding
// public entry points elsewhere in this class — confirm against the full file.
protected abstract HDT doMapIndexedHDT(String hdtFileName, ProgressListener listener, HDTOptions spec) throws IOException;
protected abstract HDT doIndexedHDT(HDT hdt, ProgressListener listener) throws IOException;
protected abstract HDT doGenerateHDT(String rdfFileName, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
protected abstract HDT doGenerateHDT(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
protected abstract HDT doGenerateHDT(Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException;
protected abstract HDT doGenerateHDTDisk(String rdfFileName, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
protected abstract HDT doGenerateHDTDisk(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
protected abstract HDT doGenerateHDTDisk(Iterator<TripleString> iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException;
protected abstract TripleWriter doGetHDTWriter(OutputStream out, String baseURI, HDTOptions hdtFormat) throws IOException;
protected abstract TripleWriter doGetHDTWriter(String outFile, String baseURI, HDTOptions hdtFormat) throws IOException;
protected abstract HDT doHDTCat(String location, String hdtFileName1, String hdtFileName2, HDTOptions hdtFormat, ProgressListener listener) throws IOException;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package org.rdfhdt.hdt.listener;

import org.rdfhdt.hdt.listener.ProgressListener;

/**
 * Version of {@link org.rdfhdt.hdt.listener.ProgressListener} for multi-thread logging:
 * each notification carries the name of the thread reporting it.
 */
@FunctionalInterface
public interface MultiThreadListener extends ProgressListener {

/**
 * Send progress notification.
 * @param thread name of the thread reporting the progress
 * @param level percent of the task accomplished
 * @param message description of the operation
 */
void notifyProgress(String thread, float level, String message);

/**
 * Send progress notification tagged with the current thread's name;
 * should call {@link #notifyProgress(String, float, String)}.
 * @param level percent of the task accomplished
 * @param message description of the operation
 */
default void notifyProgress(float level, String message) {
notifyProgress(Thread.currentThread().getName(), level, message);
}

/**
 * Unregister all the threads previously registered with {@link #registerThread(String)}.
 */
default void unregisterAllThreads() {
// should be filled by implementation if required
}

/**
 * Register a thread so its notifications can be tracked.
 * @param threadName the thread name
 */
default void registerThread(String threadName) {
// should be filled by implementation if required
}

/**
 * Unregister a single thread.
 * @param threadName the thread name
 */
default void unregisterThread(String threadName) {
// should be filled by implementation if required
}
}
77 changes: 77 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package org.rdfhdt.hdt.options;

/**
 * Keys usable with {@link org.rdfhdt.hdt.options.HDTOptions#set(String, String)}.
 * @author Antoine Willerval
 */
public class HDTOptionsKeys {
/**
 * Key for the compression mode for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods.
 * Value can be {@link #LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE} or
 * {@link #LOADER_DISK_COMPRESSION_MODE_VALUE_PARTIAL}
 */
public static final String LOADER_DISK_COMPRESSION_MODE_KEY = "loader.disk.compressMode";
/**
 * Value for {@link #LOADER_DISK_COMPRESSION_MODE_KEY}, sort all the file before going to the next step, slower
 * but decreases the RAM usage. Default config.
 */
public static final String LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE = "compressionComplete";
/**
 * Value for {@link #LOADER_DISK_COMPRESSION_MODE_KEY}, sort while reading all the file before going to the next
 * step, faster but increases the RAM usage.
 */
public static final String LOADER_DISK_COMPRESSION_MODE_VALUE_PARTIAL = "compressionPartial";
/**
 * Key for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods,
 * sets the number of workers to merge the data. Defaults to the number of processors. long value.
 */
public static final String LOADER_DISK_COMPRESSION_WORKER_KEY = "loader.disk.compressWorker";
/**
 * Key for the maximum size of a chunk on disk for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk
 * methods, the chunk should be in RAM before writing it on disk and should be sorted. long value.
 */
public static final String LOADER_DISK_CHUNK_SIZE_KEY = "loader.disk.chunkSize";
/**
 * Key for the location of the working directory for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk
 * methods, this directory will be deleted after the HDT generation. By default, the value is random; it is
 * recommended to set this option to be able to delete the directory in case of an interruption of the process.
 * file value.
 */
public static final String LOADER_DISK_LOCATION_KEY = "loader.disk.location";
/**
 * Key for the location of the future HDT for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods,
 * this option will create a hdt file after the HDT generation, the returned HDT will be a mapped HDT of the HDT
 * file. Slower and increases the disk usage, but drastically reduces the RAM usage. file value.
 */
public static final String LOADER_DISK_FUTURE_HDT_LOCATION_KEY = "loader.disk.futureHDTLocation";
/**
 * Key for the maximum number of files opened at the same time, should be greater than {@link #LOADER_DISK_KWAY_KEY},
 * 1024 by default
 */
public static final String LOADER_DISK_MAX_FILE_OPEN_KEY = "loader.disk.maxFileOpen";
/**
 * Key for the number of chunk layers opened at the same time, by default
 * <p>min(log2(maxFileOpen), chunkSize / (fileBufferSize * compressWorker))</p>
 */
public static final String LOADER_DISK_KWAY_KEY = "loader.disk.kway";
/**
 * Key for the size of the buffers when opening a file
 */
public static final String LOADER_DISK_BUFFER_SIZE_KEY = "loader.disk.fileBufferSize";
/**
 * Key for the loading mode of a RDF file for the
 * {@link org.rdfhdt.hdt.hdt.HDTManager#generateHDT(String, String, org.rdfhdt.hdt.enums.RDFNotation, HDTOptions, org.rdfhdt.hdt.listener.ProgressListener)}
 * method, this key isn't working with the other methods.
 * Value can be {@link #LOADER_TYPE_VALUE_ONE_PASS} or {@link #LOADER_TYPE_VALUE_TWO_PASS}.
 */
public static final String LOADER_TYPE_KEY = "loader.type";
/**
 * Value for {@link #LOADER_TYPE_KEY}, read the RDF file twice, reduces the RAM usage
 */
public static final String LOADER_TYPE_VALUE_TWO_PASS = "two-pass";
/**
 * Value for {@link #LOADER_TYPE_KEY}, read the RDF file only once, default value
 */
public static final String LOADER_TYPE_VALUE_ONE_PASS = "one-pass";

// constant holder, not instantiable
private HDTOptionsKeys() {}
}
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
*
*/
public interface RDFParserCallback {
/**
 * Callback invoked for each triple produced by the parser.
 */
@FunctionalInterface
interface RDFCallback {
/**
 * Process a parsed triple.
 * @param triple the parsed triple
 * @param pos position of the triple in the source — NOTE(review): exact unit (byte offset vs. triple index) not visible here, confirm against parser implementations
 */
void processTriple(TripleString triple, long pos);
}
Expand Down
4 changes: 4 additions & 0 deletions hdt-api/src/main/java/org/rdfhdt/hdt/util/UnicodeEscape.java
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,10 @@ public static void escapeString(String label, Appendable appendable)
}
}
}

if (last == label.length()) {
last--;
}

for (int i = first; i <= last; i++) {
char c = label.charAt(i);
Expand Down
Loading