diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/enums/CompressionType.java b/hdt-api/src/main/java/org/rdfhdt/hdt/enums/CompressionType.java new file mode 100644 index 00000000..0b3dc6e9 --- /dev/null +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/enums/CompressionType.java @@ -0,0 +1,52 @@ +package org.rdfhdt.hdt.enums; + +/** + * A compression type + * @author Antoine Willerval + */ +public enum CompressionType { + + /** + * gzip compression (.gz .tgz) + */ + GZIP("gz", "tgz"), + /** + * bzip compression (.bz2 .bz) + */ + BZIP("bz2", "bz"), + /** + * bzip compression (.xz) + */ + XZ("xz"), + /** + * no compression + */ + NONE; + + /** + * try to guess a compression of a file with its name + * @param fileName the file name to guess + * @return the compression type or none if it can't be guessed + */ + public static CompressionType guess(String fileName) { + String str = fileName.toLowerCase(); + + int idx = str.lastIndexOf('.'); + if(idx!=-1) { + String ext = str.substring(idx + 1); + for (CompressionType type: values()) { + for (String typeExt : type.ext) { + if (typeExt.equals(ext)) { + return type; + } + } + } + } + return NONE; + } + + private final String[] ext; + CompressionType(String... ext) { + this.ext = ext; + } +} diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTManager.java b/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTManager.java index d3f33f70..7d798b74 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTManager.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/hdt/HDTManager.java @@ -6,6 +6,7 @@ import java.util.Iterator; import org.rdfhdt.hdt.compact.bitmap.Bitmap; +import org.rdfhdt.hdt.enums.CompressionType; import org.rdfhdt.hdt.enums.RDFNotation; import org.rdfhdt.hdt.exceptions.ParserException; import org.rdfhdt.hdt.listener.ProgressListener; @@ -290,6 +291,153 @@ public static HDT generateHDT(String rdfFileName, String baseURI, RDFNotation rd public static HDT generateHDT(Iterator iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { return HDTManager.getInstance().doGenerateHDT(iterator, baseURI, hdtFormat, listener); } + /** + * Create an HDT file from a RDF stream. + * @param fileStream RDF stream to parse. + * @param baseURI Base URI for the dataset. + * @param filename the RDF file name to guess the stream format and compresion. + * @param hdtFormat Parameters to tune the generated HDT. + * @param listener Listener to get notified of loading progress. Can be null if no notifications needed. + * @return HDT + * @throws IOException when the stream cannot be used + * @throws ParserException when the RDF stream can't be parsed + */ + public static HDT generateHDT(InputStream fileStream, String baseURI, String filename, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + return HDTManager.getInstance().doGenerateHDT(fileStream, baseURI, RDFNotation.guess(filename), CompressionType.guess(filename), hdtFormat, listener); + } + /** + * Create an HDT file from a RDF stream. + * @param fileStream RDF stream to parse. + * @param baseURI Base URI for the dataset. + * @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...) + * @param compressionType Compression type of the RDF stream. (GZIP, ZIP...) + * @param hdtFormat Parameters to tune the generated HDT. + * @param listener Listener to get notified of loading progress. Can be null if no notifications needed. 
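A quick illustration of the extension-based guessing implemented in CompressionType.guess above (the file names are made up); only the last extension is inspected, so anything unknown falls back to NONE:

    CompressionType.guess("dump.nt.gz");  // GZIP ("gz" is a registered extension)
    CompressionType.guess("dump.nt.bz2"); // BZIP
    CompressionType.guess("dump.nt");     // NONE ("nt" is not a compression extension)
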
+ * @return HDT + * @throws IOException when the stream cannot be used + * @throws ParserException when the RDF stream can't be parsed + */ + public static HDT generateHDT(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + return HDTManager.getInstance().doGenerateHDT(fileStream, baseURI, rdfNotation, compressionType, hdtFormat, listener); + } + /** + * Create an HDT file from a RDF stream. + * @param fileStream RDF stream to parse. + * @param baseURI Base URI for the dataset. + * @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...) + * @param hdtFormat Parameters to tune the generated HDT. + * @param listener Listener to get notified of loading progress. Can be null if no notifications needed. + * @return HDT + * @throws IOException when the stream cannot be used + * @throws ParserException when the RDF stream can't be parsed + */ + public static HDT generateHDT(InputStream fileStream, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + return HDTManager.getInstance().doGenerateHDT(fileStream, baseURI, rdfNotation, CompressionType.NONE, hdtFormat, listener); + } + + /** + * Create an HDT file from an RDF file by sorting the triples on disk, reduce the memory required by increasing the + * IO usage. + * @param rdfFileName RDF file to parse. + * @param baseURI Base URI for the dataset. + * @param rdfNotation Format of the source RDF File (NTriples, N3, RDF-XML...) + * @param compressionType Compression type of the RDF file. (GZIP, ZIP...) + * @param hdtFormat Parameters to tune the generated HDT. + * @param listener Listener to get notified of loading progress. Can be null if no notifications needed. + * @return HDT + * @throws IOException when the file cannot be found + * @throws ParserException when the RDF file can't be parsed + */ + public static HDT generateHDTDisk(String rdfFileName, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + return HDTManager.getInstance().doGenerateHDTDisk(rdfFileName, baseURI, rdfNotation, compressionType, hdtFormat, listener); + } + /** + * Create an HDT file from an RDF file without compression by sorting the triples on disk, reduce the memory + * required by increasing the IO usage. + * @param rdfFileName RDF file to parse. + * @param baseURI Base URI for the dataset. + * @param rdfNotation Format of the source RDF File (NTriples, N3, RDF-XML...) + * @param hdtFormat Parameters to tune the generated HDT. + * @param listener Listener to get notified of loading progress. Can be null if no notifications needed. + * @return HDT + * @throws IOException when the file cannot be found + * @throws ParserException when the RDF file can't be parsed + */ + public static HDT generateHDTDisk(String rdfFileName, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + return HDTManager.getInstance().doGenerateHDTDisk(rdfFileName, baseURI, rdfNotation, CompressionType.NONE, hdtFormat, listener); + } + /** + * Create an HDT file from an RDF file by sorting the triples on disk, reduce the memory required by increasing the + * IO usage. Will guess the RDF file compression/format with the file name. + * @param rdfFileName RDF file to parse. 
+ * @param baseURI Base URI for the dataset. + * @param hdtFormat Parameters to tune the generated HDT. + * @param listener Listener to get notified of loading progress. Can be null if no notifications needed. + * @return HDT + * @throws IOException when the file cannot be found + * @throws ParserException when the RDF file can't be parsed + */ + public static HDT generateHDTDisk(String rdfFileName, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + return HDTManager.getInstance().doGenerateHDTDisk(rdfFileName, baseURI, RDFNotation.guess(rdfFileName), CompressionType.guess(rdfFileName), hdtFormat, listener); + } + /** + * Create an HDT file from an RDF stream by sorting the triples on disk, reduce the memory required by increasing + * the IO usage. + * @param fileStream RDF stream to parse. + * @param baseURI Base URI for the dataset. + * @param filename the RDF file name to guess the stream format and compresion. + * @param hdtFormat Parameters to tune the generated HDT. + * @param listener Listener to get notified of loading progress. Can be null if no notifications needed. + * @return HDT + * @throws IOException when the stream cannot be used + * @throws ParserException when the RDF stream can't be parsed + */ + public static HDT generateHDTDisk(InputStream fileStream, String baseURI, String filename, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + return HDTManager.getInstance().doGenerateHDTDisk(fileStream, baseURI, RDFNotation.guess(filename), CompressionType.guess(filename), hdtFormat, listener); + } + /** + * Create an HDT file from an RDF stream by sorting the triples on disk, reduce the memory required by increasing + * the IO usage. + * @param fileStream RDF stream to parse. + * @param baseURI Base URI for the dataset. + * @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...) + * @param compressionType Compression type of the RDF stream. (GZIP, ZIP...) + * @param hdtFormat Parameters to tune the generated HDT. + * @param listener Listener to get notified of loading progress. Can be null if no notifications needed. + * @return HDT + * @throws IOException when the stream cannot be used + * @throws ParserException when the RDF stream can't be parsed + */ + public static HDT generateHDTDisk(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + return HDTManager.getInstance().doGenerateHDTDisk(fileStream, baseURI, rdfNotation, compressionType, hdtFormat, listener); + } + /** + * Create an HDT file from an RDF stream by sorting the triples on disk, reduce the memory required by increasing + * the IO usage. + * @param fileStream RDF stream to parse. + * @param baseURI Base URI for the dataset. + * @param rdfNotation Format of the source RDF stream (NTriples, N3, RDF-XML...) + * @param hdtFormat Parameters to tune the generated HDT. + * @param listener Listener to get notified of loading progress. Can be null if no notifications needed. 
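As a usage sketch for the generateHDTDisk overloads above (file name, base URI and output path are hypothetical, and handling of the checked IOException/ParserException is left to the caller), passing the notation and compression explicitly avoids relying on the file-name guessing overload:

    HDTOptions spec = new HDTSpecification();
    try (HDT hdt = HDTManager.generateHDTDisk("dump.nt.gz", "http://example.org/#",
            RDFNotation.NTRIPLES, CompressionType.GZIP, spec, null)) {
        hdt.saveToHDT("dump.hdt", null); // persist the result
    }
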
+ * @return HDT + * @throws IOException when the stream cannot be used + * @throws ParserException when the RDF stream can't be parsed + */ + public static HDT generateHDTDisk(InputStream fileStream, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + return HDTManager.getInstance().doGenerateHDTDisk(fileStream, baseURI, rdfNotation, CompressionType.NONE, hdtFormat, listener); + } + /** + * Create an HDT file from an RDF stream by sorting the triples on disk, reduce the memory required by increasing + * the IO usage. + * @param baseURI Base URI for the dataset. + * @param hdtFormat Parameters to tune the generated HDT. + * @param listener Listener to get notified of loading progress. Can be null if no notifications needed. + * @return HDT + * @throws IOException when the stream cannot be used + */ + public static HDT generateHDTDisk(Iterator iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + return HDTManager.getInstance().doGenerateHDTDisk(iterator, baseURI, hdtFormat, listener); + } public static TripleWriter getHDTWriter(OutputStream out, String baseURI, HDTOptions hdtFormat) throws IOException { return HDTManager.getInstance().doGetHDTWriter(out, baseURI, hdtFormat); @@ -349,7 +497,11 @@ public static HDT diffHDTBit(String location, String hdtFileName, Bitmap deleteB protected abstract HDT doMapIndexedHDT(String hdtFileName, ProgressListener listener, HDTOptions spec) throws IOException; protected abstract HDT doIndexedHDT(HDT hdt, ProgressListener listener) throws IOException; protected abstract HDT doGenerateHDT(String rdfFileName, String baseURI, RDFNotation rdfNotation, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException; + protected abstract HDT doGenerateHDT(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException; protected abstract HDT doGenerateHDT(Iterator iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException; + protected abstract HDT doGenerateHDTDisk(String rdfFileName, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException; + protected abstract HDT doGenerateHDTDisk(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException; + protected abstract HDT doGenerateHDTDisk(Iterator iterator, String baseURI, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException; protected abstract TripleWriter doGetHDTWriter(OutputStream out, String baseURI, HDTOptions hdtFormat) throws IOException; protected abstract TripleWriter doGetHDTWriter(String outFile, String baseURI, HDTOptions hdtFormat) throws IOException; protected abstract HDT doHDTCat(String location, String hdtFileName1, String hdtFileName2, HDTOptions hdtFormat, ProgressListener listener) throws IOException; diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/listener/MultiThreadListener.java b/hdt-api/src/main/java/org/rdfhdt/hdt/listener/MultiThreadListener.java new file mode 100644 index 00000000..abf44045 --- /dev/null +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/listener/MultiThreadListener.java @@ -0,0 +1,50 @@ +package 
org.rdfhdt.hdt.listener; + +import org.rdfhdt.hdt.listener.ProgressListener; + +/** + * version of {@link org.rdfhdt.hdt.listener.ProgressListener} for multi-thread logging + */ +@FunctionalInterface +public interface MultiThreadListener extends ProgressListener { + + /** + * Send progress notification + * @param thread thread name + * @param level percent of the task accomplished + * @param message Description of the operation + */ + void notifyProgress(String thread, float level, String message); + + /** + * Send progress notification, should call {@link #notifyProgress(String, float, String)} + * @param level percent of the task accomplished + * @param message Description of the operation + */ + default void notifyProgress(float level, String message) { + notifyProgress(Thread.currentThread().getName(), level, message); + } + + /** + * unregister all the thread + */ + default void unregisterAllThreads() { + // should be filled by implementation if required + } + + /** + * register a thread + * @param threadName the thread name + */ + default void registerThread(String threadName) { + // should be filled by implementation if required + } + + /** + * unregister a thread + * @param threadName the thread name + */ + default void unregisterThread(String threadName) { + // should be filled by implementation if required + } +} diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java new file mode 100644 index 00000000..74d0e3e2 --- /dev/null +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/options/HDTOptionsKeys.java @@ -0,0 +1,77 @@ +package org.rdfhdt.hdt.options; + +/** + * keys usable with {@link org.rdfhdt.hdt.options.HDTOptions#set(String, String)} + * @author Antoine Willerval + */ +public class HDTOptionsKeys { + /** + * Key for the compression mode for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods. + * Value can be {@link #LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE} or + * {@link #LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE} + */ + public static final String LOADER_DISK_COMPRESSION_MODE_KEY = "loader.disk.compressMode"; + /** + * Value for {@link #LOADER_DISK_COMPRESSION_MODE_KEY}, sort all the file before going to the next step, slower + * but decrease the RAM usage. default config. + */ + public static final String LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE = "compressionComplete"; + /** + * Value for {@link #LOADER_DISK_COMPRESSION_MODE_KEY}, sort while reading all the file before going to the next + * step, faster but increase the RAM usage. + */ + public static final String LOADER_DISK_COMPRESSION_MODE_VALUE_PARTIAL = "compressionPartial"; + /** + * Key for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods, + * say the number of workers to merge the data. default to the number of processor. long value. + */ + public static final String LOADER_DISK_COMPRESSION_WORKER_KEY = "loader.disk.compressWorker"; + /** + * Key for the maximum size of a chunk on disk for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk + * methods, the chunk should be in RAM before writing it on disk and should be sorted. long value. + */ + public static final String LOADER_DISK_CHUNK_SIZE_KEY = "loader.disk.chunkSize"; + /** + * Key for the location of the working directory {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods, + * this directory will be deleted after the HDT generation. 
by default, the value is random, it is recommended to + * set this option to delete the directory in case of an interruption of the process. file value. + */ + public static final String LOADER_DISK_LOCATION_KEY = "loader.disk.location"; + /** + * Key for the location of the future HDT for the {@link org.rdfhdt.hdt.hdt.HDTManager} generateHDTDisk methods, + * this option will create a hdt file after the HDT generation, the returned HDT will be a mapped HDT of the HDT + * file. slower, increase the disk usage, but drastically reduce the RAM usage. file value. + */ + public static final String LOADER_DISK_FUTURE_HDT_LOCATION_KEY = "loader.disk.futureHDTLocation"; + /** + * Key for the maximum number of file opened at the same time, should be greater than {@link #LOADER_DISK_KWAY_KEY}, + * 1024 by default + */ + public static final String LOADER_DISK_MAX_FILE_OPEN_KEY = "loader.disk.maxFileOpen"; + /** + * Key for the number of chunk layers opened at the same time, by default + *

min(log2(maxFileOpen), chunkSize / (fileBufferSize * compressWorker))
+ */ + public static final String LOADER_DISK_KWAY_KEY = "loader.disk.kway"; + /** + * Key for the size of the buffers when opening a file + */ + public static final String LOADER_DISK_BUFFER_SIZE_KEY = "loader.disk.fileBufferSize"; + /** + * Key for the loading mode of a RDF file for the + * {@link org.rdfhdt.hdt.hdt.HDTManager#generateHDT(String, String, org.rdfhdt.hdt.enums.RDFNotation, HDTOptions, org.rdfhdt.hdt.listener.ProgressListener)} + * method, this key isn't working with the other methods. + * Value can be {@link #LOADER_TYPE_VALUE_ONE_PASS} or {@link #LOADER_TYPE_VALUE_TWO_PASS}. + */ + public static final String LOADER_TYPE_KEY = "loader.type"; + /** + * Value for {@link #LOADER_TYPE_KEY}, read twice the RDF file, reduce the RAM usage + */ + public static final String LOADER_TYPE_VALUE_TWO_PASS = "two-pass"; + /** + * Value for {@link #LOADER_TYPE_KEY}, read only once the RDF file, default value + */ + public static final String LOADER_TYPE_VALUE_ONE_PASS = "one-pass"; + + private HDTOptionsKeys() {} +} diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFParserCallback.java b/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFParserCallback.java index 99bfae3d..098f2d0f 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFParserCallback.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/rdf/RDFParserCallback.java @@ -40,6 +40,7 @@ * */ public interface RDFParserCallback { + @FunctionalInterface interface RDFCallback { void processTriple(TripleString triple, long pos); } diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/util/UnicodeEscape.java b/hdt-api/src/main/java/org/rdfhdt/hdt/util/UnicodeEscape.java index 88015230..42552eb8 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/util/UnicodeEscape.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/util/UnicodeEscape.java @@ -100,6 +100,10 @@ public static void escapeString(String label, Appendable appendable) } } } + + if (last == label.length()) { + last--; + } for (int i = first; i <= last; i++) { char c = label.charAt(i); diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTVerify.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTVerify.java index de34a168..15900c1c 100644 --- a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTVerify.java +++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/HDTVerify.java @@ -15,9 +15,8 @@ public class HDTVerify { private HDTVerify() {} private static void print(byte[] arr) { - for (int i = 0; i < arr.length; i++) { - byte b = arr[i]; - System.out.print(String.format("%02X ", b)); + for (byte b : arr) { + System.out.printf("%02X ", b); } System.out.println(); } @@ -42,11 +41,11 @@ public static void checkDictionarySectionOrder(Iterator CharSequence charSeq = it.next(); String str = charSeq.toString(); - if(lastCharseq!=null && ((cmp=comparator.compare(lastCharseq, charSeq))>0 )) { + if(lastCharseq!=null && ((cmp=comparator.compare(lastCharseq, charSeq))>=0 )) { System.out.println("ERRA: "+lastCharseq+" / "+charSeq); } - if(lastStr!=null && ((cmp2=lastStr.compareTo(str))>0)) { + if(lastStr!=null && ((cmp2=lastStr.compareTo(str))>=0)) { System.out.println("ERRB: "+lastStr+" / "+str); } @@ -66,11 +65,15 @@ public static void main(String[] args) throws Throwable { System.out.println("hdtVerify "); System.exit(-1); } - HDT hdt = HDTManager.mapHDT(args[0], null); - - checkDictionarySectionOrder(hdt.getDictionary().getSubjects().getSortedEntries()); - checkDictionarySectionOrder(hdt.getDictionary().getPredicates().getSortedEntries()); - 
checkDictionarySectionOrder(hdt.getDictionary().getObjects().getSortedEntries()); - checkDictionarySectionOrder(hdt.getDictionary().getShared().getSortedEntries()); + try (HDT hdt = HDTManager.mapHDT(args[0], null)) { + System.out.println("Checking subject entries"); + checkDictionarySectionOrder(hdt.getDictionary().getSubjects().getSortedEntries()); + System.out.println("Checking predicate entries"); + checkDictionarySectionOrder(hdt.getDictionary().getPredicates().getSortedEntries()); + System.out.println("Checking object entries"); + checkDictionarySectionOrder(hdt.getDictionary().getObjects().getSortedEntries()); + System.out.println("Checking shared entries"); + checkDictionarySectionOrder(hdt.getDictionary().getShared().getSortedEntries()); + } } } diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java index 494235f9..96d8ce38 100644 --- a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java +++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java @@ -27,20 +27,24 @@ package org.rdfhdt.hdt.tools; import java.io.IOException; +import java.nio.file.Path; import java.util.List; +import org.rdfhdt.hdt.enums.CompressionType; import org.rdfhdt.hdt.enums.RDFNotation; import org.rdfhdt.hdt.exceptions.ParserException; import org.rdfhdt.hdt.hdt.HDT; import org.rdfhdt.hdt.hdt.HDTManager; import org.rdfhdt.hdt.hdt.HDTVersion; import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.options.HDTSpecification; import org.rdfhdt.hdt.util.StopWatch; import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; import com.beust.jcommander.internal.Lists; +import org.rdfhdt.hdt.util.listener.MultiThreadListenerConsole; /** * @author mario.arias @@ -75,37 +79,43 @@ public class RDF2HDT implements ProgressListener { @Parameter(names = "-quiet", description = "Do not show progress of the conversion") public boolean quiet; + @Parameter(names = "-disk", description = "Generate the HDT on disk to reduce memory usage") + public boolean disk; + + @Parameter(names = "-disklocation", description = "Location to run the generate disk, by default in a temporary directory, will be deleted after") + public String diskLocation; + @Parameter(names = "-canonicalntfile", description = "Only for NTriples input. Use a Fast NT file parser the input should be in a canonical form. 
See https://www.w3.org/TR/n-triples/#h2_canonical-ntriples") public boolean ntSimpleLoading; public void execute() throws ParserException, IOException { HDTSpecification spec; - if(configFile!=null) { + if (configFile != null) { spec = new HDTSpecification(configFile); } else { spec = new HDTSpecification(); } - if(options!=null) { + if (options != null) { spec.setOptions(options); } - if(baseURI==null) { - baseURI = "file://"+rdfInput; + if (baseURI == null) { + baseURI = "file://" + rdfInput; } - RDFNotation notation=null; - if(rdfType!=null) { + RDFNotation notation = null; + if (rdfType != null) { try { notation = RDFNotation.parse(rdfType); } catch (IllegalArgumentException e) { - System.out.println("Notation "+rdfType+" not recognised."); + System.out.println("Notation " + rdfType + " not recognised."); } } - - if(notation==null) { + + if (notation == null) { try { - notation = RDFNotation.guess(rdfInput); + notation = RDFNotation.guess(rdfInput); } catch (IllegalArgumentException e) { - System.out.println("Could not guess notation for "+rdfInput+" Trying NTriples"); + System.out.println("Could not guess notation for " + rdfInput + " Trying NTriples"); notation = RDFNotation.NTRIPLES; } } @@ -115,7 +125,27 @@ public void execute() throws ParserException, IOException { } StopWatch sw = new StopWatch(); - HDT hdt = HDTManager.generateHDT(rdfInput, baseURI,notation , spec, this); + HDT hdt; + + if (disk) { + if (!quiet) { + System.out.println("Generating using generateHDTDisk"); + } + spec.set(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY, hdtOutput); + if (diskLocation != null) { + spec.set(HDTOptionsKeys.LOADER_DISK_LOCATION_KEY, diskLocation); + if (!quiet) { + System.out.println("Using temp directory " + diskLocation); + } + } + MultiThreadListenerConsole listenerConsole = !quiet ? 
new MultiThreadListenerConsole() : null; + hdt = HDTManager.generateHDTDisk(rdfInput, baseURI, notation, CompressionType.guess(rdfInput), spec, listenerConsole); + if (listenerConsole != null) { + listenerConsole.notifyProgress(100, "done"); + } + } else { + hdt = HDTManager.generateHDT(rdfInput, baseURI, notation, spec, this); + } System.out.println("File converted in: "+sw.stopAndShow()); try { @@ -129,9 +159,11 @@ public void execute() throws ParserException, IOException { } // Dump to HDT file - sw = new StopWatch(); - hdt.saveToHDT(hdtOutput, this); - System.out.println("HDT saved to file in: "+sw.stopAndShow()); + if (!disk) { + sw = new StopWatch(); + hdt.saveToHDT(hdtOutput, this); + System.out.println("HDT saved to file in: "+sw.stopAndShow()); + } // Generate index and dump it to .hdt.index file sw.reset(); diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/MultiThreadListenerConsole.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/MultiThreadListenerConsole.java new file mode 100644 index 00000000..915b1e8f --- /dev/null +++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/util/listener/MultiThreadListenerConsole.java @@ -0,0 +1,104 @@ +package org.rdfhdt.hdt.util.listener; + +import java.util.Map; +import java.util.TreeMap; + +import org.rdfhdt.hdt.listener.MultiThreadListener; + +public class MultiThreadListenerConsole implements MultiThreadListener { + private static final String ERASE_LINE = "\r\033[K"; + + private static String goBackNLine(int line) { + return "\033[" + line + "A"; + } + + /** + * true if the system allow ascii sequence, false otherwise + */ + private static final boolean ALLOW_ASCII_SEQUENCE; + + static { + String env; + try { + env = System.getenv("TERM"); + } catch (SecurityException e) { + env = null; + } + + ALLOW_ASCII_SEQUENCE = System.console() != null && !(env == null || env.isEmpty()); + } + + private final Map threadMessages; + private int previous; + + public MultiThreadListenerConsole() { + this(ALLOW_ASCII_SEQUENCE); + } + + public MultiThreadListenerConsole(boolean asciiListener) { + if (asciiListener) { + threadMessages = new TreeMap<>(); + } else { + threadMessages = null; + } + } + + @Override + public synchronized void unregisterAllThreads() { + if (threadMessages == null) { + return; + } + threadMessages.clear(); + notifyProgress(0, "-"); + } + + @Override + public synchronized void registerThread(String threadName) { + notifyProgress(threadName, 0, "-"); + } + + @Override + public synchronized void unregisterThread(String threadName) { + if (threadMessages == null) { + return; + } + threadMessages.remove(threadName); + render(); + } + + @Override + public synchronized void notifyProgress(String thread, float level, String message) { + String msg = "[" + level + "] " + message; + if (threadMessages != null) { + threadMessages.put(thread, msg); + render(); + } else { + System.out.println("[" + thread + "]" + msg); + } + } + + private void render() { + if (threadMessages == null) { + return; + } + StringBuilder message = new StringBuilder(); + int lines = threadMessages.size(); + message.append("\r"); + // go back each line of the thread message + if (previous != 0) { + message.append(goBackNLine(previous)); + } + // write each thread logs + threadMessages.forEach((thread, msg) -> { + message.append(ERASE_LINE).append("[").append(thread).append("]").append(msg).append("\n"); + }); + // remove previous printing + int toRemove = previous - lines; + if (toRemove > 0) { + 
message.append((ERASE_LINE+"\n").repeat(toRemove)).append(goBackNLine(toRemove)); + } + previous = lines; + + System.out.print(message); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/bitmap/AppendableWriteBitmap.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/bitmap/AppendableWriteBitmap.java new file mode 100644 index 00000000..799aaca4 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/bitmap/AppendableWriteBitmap.java @@ -0,0 +1,175 @@ +package org.rdfhdt.hdt.compact.bitmap; + +import org.rdfhdt.hdt.compact.integer.VByte; +import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.hdt.HDTVocabulary; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.util.BitUtil; +import org.rdfhdt.hdt.util.crc.CRC32; +import org.rdfhdt.hdt.util.crc.CRC8; +import org.rdfhdt.hdt.util.crc.CRCOutputStream; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.io.IOUtil; + +import java.io.Closeable; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; + +/** + * {@link org.rdfhdt.hdt.compact.bitmap.ModifiableBitmap} implementation for only appending/one read saving into a + * buffer file + * @author Antoine Willerval + */ +public class AppendableWriteBitmap implements ModifiableBitmap, Closeable { + private long countZeros; + private long countOnes; + private long numbits; + private final CloseSuppressPath file; + private final CRCOutputStream stream; + private long currentElement; + private int bit; + private boolean saved; + + public AppendableWriteBitmap(CloseSuppressPath storage, int bufferSize) throws IOException { + file = storage; + stream = new CRCOutputStream(storage.openOutputStream(bufferSize), new CRC32()); + } + + @Override + public void set(long position, boolean value) { + throw new NotImplementedException(); + } + + @Override + public void append(boolean value) { + // count for stats + if (value) { + countOnes++; + } else { + countZeros++; + } + // increase the numbits + numbits++; + + // set the value + if (value) { + currentElement |= 1L << bit; + } + bit++; + + // write the value if required + try { + pushByte(false); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private void pushByte(boolean force) throws IOException { + if (bit == 64 || force) { + BitUtil.writeLowerBitsByteAligned(currentElement, bit, stream); + // reset the current element writing + bit = 0; + currentElement = 0L; + } + } + + @Override + public boolean access(long position) { + throw new NotImplementedException(); + } + + @Override + public long rank1(long position) { + throw new NotImplementedException(); + } + + @Override + public long rank0(long position) { + throw new NotImplementedException(); + } + + @Override + public long selectPrev1(long start) { + throw new NotImplementedException(); + } + + @Override + public long selectNext1(long start) { + throw new NotImplementedException(); + } + + @Override + public long select0(long n) { + throw new NotImplementedException(); + } + + @Override + public long select1(long n) { + throw new NotImplementedException(); + } + + @Override + public long getNumBits() { + return numbits; + } + + @Override + public long countOnes() { + return countOnes; + } + + @Override + public long countZeros() { + return countZeros; + } + + @Override + public long getSizeBytes() { + return (numbits - 1) / 8 + 1; + } + + @Override + public void save(OutputStream output, 
ProgressListener listener) throws IOException { + saved = true; + // maybe a bit was already reading + pushByte(true); + // complete the file + stream.writeCRC(); + stream.close(); + + CRCOutputStream out = new CRCOutputStream(output, new CRC8()); + + // Write Type and Numbits + out.write(BitmapFactory.TYPE_BITMAP_PLAIN); + VByte.encode(out, numbits); + + // Write CRC + out.writeCRC(); + + // write the storage file, already contains the CRC + Files.copy(file.getJavaPath(), output); + + // delete the file + file.close(); + } + + @Override + public void load(InputStream input, ProgressListener listener) throws IOException { + throw new NotImplementedException(); + } + + @Override + public String getType() { + return HDTVocabulary.BITMAP_TYPE_PLAIN; + } + + @Override + public void close() throws IOException { + if (!saved) { + IOUtil.closeAll(stream, file); + } + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/DynamicSequence.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/DynamicSequence.java index 637e471f..6ebd9289 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/DynamicSequence.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/DynamicSequence.java @@ -27,21 +27,17 @@ package org.rdfhdt.hdt.compact.sequence; +import org.rdfhdt.hdt.util.disk.LongArray; + /** * @author mario.arias * */ -public interface DynamicSequence extends Sequence { - /** - * Set a new value at the specified position. - * @param index - * @param value - */ - void set(long index, long value); - +public interface DynamicSequence extends Sequence, LongArray { + /** * Append a new value after the last position, increasing the number of elements by one. - * @param value + * @param value the value to append */ void append(long value); @@ -55,4 +51,9 @@ public interface DynamicSequence extends Sequence { * Use advanced algorithm to reduce the size to the minimum, even if it is costly. 
*/ void aggressiveTrimToSize(); + + @Override + default long length() { + return getNumberOfElements(); + } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/SequenceLog64BigDisk.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/SequenceLog64BigDisk.java index 0cbb2173..b2f4362d 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/SequenceLog64BigDisk.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/sequence/SequenceLog64BigDisk.java @@ -161,9 +161,9 @@ public long get(long position) { @Override public void set(long position, long value) { - //if(value<0 || value>maxvalue) { - //throw new IllegalArgumentException("Value exceeds the maximum for this data structure"); - //} + if (value<0 || value>maxvalue) { + throw new IllegalArgumentException("Value exceeds the maximum for this data structure"); + } //System.out.println("numbits "+this.numbits); setField(data, numbits, position, value); } @@ -296,4 +296,4 @@ public void close() throws IOException { } data=null; } -} \ No newline at end of file +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryPrivate.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryPrivate.java index ca11f961..c3b40ea9 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryPrivate.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/DictionaryPrivate.java @@ -25,7 +25,11 @@ public interface DictionaryPrivate extends Dictionary { * Loads all information from another dictionary into this dictionary. */ void load(TempDictionary other, ProgressListener listener); - + /** + * same as {@link #load(TempDictionary, org.rdfhdt.hdt.listener.ProgressListener)} but read all the section at the same time + */ + void loadAsync(TempDictionary other, ProgressListener listener) throws InterruptedException; + /** * Saves the dictionary to a OutputStream */ diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/BaseDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/BaseDictionary.java index 1c2903c9..9786f03e 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/BaseDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/BaseDictionary.java @@ -30,8 +30,11 @@ import org.rdfhdt.hdt.dictionary.DictionaryPrivate; import org.rdfhdt.hdt.dictionary.DictionarySection; import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate; +import org.rdfhdt.hdt.dictionary.TempDictionary; import org.rdfhdt.hdt.enums.DictionarySectionRole; import org.rdfhdt.hdt.enums.TripleComponentRole; +import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.util.string.CompactString; import org.rdfhdt.hdt.util.string.DelayedString; @@ -232,5 +235,9 @@ public TreeMap getAllObjects() { public long getNAllObjects() { throw new IllegalArgumentException("Method is not applicable on this dictionary"); } - + + @Override + public void loadAsync(TempDictionary other, ProgressListener listener) throws InterruptedException { + throw new NotImplementedException(); + } } \ No newline at end of file diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionary.java new file mode 100644 index 00000000..6a3265a6 --- /dev/null +++ 
b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionary.java @@ -0,0 +1,247 @@ +package org.rdfhdt.hdt.dictionary.impl; + +import org.rdfhdt.hdt.compact.integer.VByte; +import org.rdfhdt.hdt.dictionary.TempDictionary; +import org.rdfhdt.hdt.dictionary.TempDictionarySection; +import org.rdfhdt.hdt.dictionary.impl.section.OneReadDictionarySection; +import org.rdfhdt.hdt.enums.TripleComponentRole; +import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult; +import org.rdfhdt.hdt.iterator.utils.MapIterator; +import org.rdfhdt.hdt.iterator.utils.NotificationExceptionIterator; +import org.rdfhdt.hdt.iterator.utils.PipedCopyIterator; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.triples.TempTriples; +import org.rdfhdt.hdt.util.concurrent.ExceptionThread; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.io.compress.CompressUtil; +import org.rdfhdt.hdt.util.string.ByteStringUtil; +import org.rdfhdt.hdt.util.string.CharSequenceComparator; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.Comparator; + +/** + * Version of temp dictionary create the four sections from the SPO compressed sections result, should be loaded in a + * async way with {@link org.rdfhdt.hdt.dictionary.DictionaryPrivate#loadAsync(org.rdfhdt.hdt.dictionary.TempDictionary, org.rdfhdt.hdt.listener.ProgressListener)} + * @author Antoine Willerval + */ +public class CompressFourSectionDictionary implements TempDictionary { + private final ExceptionThread cfsdThread; + private final TempDictionarySection subject; + private final TempDictionarySection predicate; + private final TempDictionarySection object; + private final TempDictionarySection shared; + + private static void sendPiped(IndexedNode node, long index, PipedCopyIterator pipe, CompressUtil.DuplicatedIterator it, NodeConsumerMethod method) { + it.setLastHeader(index); + method.consume(node.getIndex(), index); + pipe.addElement(node.getNode()); + } + + public CompressFourSectionDictionary(CompressionResult compressionResult, NodeConsumer nodeConsumer, ProgressListener listener) { + long splits = Math.max(20, compressionResult.getTripleCount() / 10_000); + // send duplicate to the consumer while reading the nodes + CompressUtil.DuplicatedIterator sortedSubject = + CompressUtil.asNoDupeCharSequenceIterator( + new NotificationExceptionIterator<>( + compressionResult.getSubjects(), + compressionResult.getTripleCount(), + splits, + "Subject section filling", + listener + ), + (originalIndex, duplicatedIndex, lastHeader) -> nodeConsumer.onSubject(duplicatedIndex, lastHeader) + ); + CompressUtil.DuplicatedIterator sortedPredicate = + CompressUtil.asNoDupeCharSequenceIterator( + new NotificationExceptionIterator<>( + compressionResult.getPredicates(), + compressionResult.getTripleCount(), + splits, + "Predicate section filling", + listener + ), + (originalIndex, duplicatedIndex, lastHeader) -> nodeConsumer.onPredicate(duplicatedIndex, lastHeader) + ); + CompressUtil.DuplicatedIterator sortedObject = + CompressUtil.asNoDupeCharSequenceIterator( + new NotificationExceptionIterator<>( + compressionResult.getObjects(), + compressionResult.getTripleCount(), + splits, + "Object section filling", + listener + ), + (originalIndex, duplicatedIndex, lastHeader) -> nodeConsumer.onObject(duplicatedIndex, lastHeader) + ); + long subjects = 
compressionResult.getSubjectsCount(); + long predicates = compressionResult.getPredicatesCount(); + long objects = compressionResult.getObjectsCount(); + long shareds = compressionResult.getSharedCount(); + + // iterator to pipe to the s p o sh + PipedCopyIterator subject = new PipedCopyIterator<>(new StringParser()); + PipedCopyIterator object = new PipedCopyIterator<>(new StringParser()); + PipedCopyIterator shared = new PipedCopyIterator<>(new StringParser()); + Comparator comparator = CharSequenceComparator.getInstance(); + cfsdThread = new ExceptionThread(() -> { + long sharedId = 1; + long subjectId = 1; + long objectId = 1; + sharedLoop: + while (sortedObject.hasNext() && sortedSubject.hasNext()) { + // last was a shared node + IndexedNode newSubject = sortedSubject.next(); + IndexedNode newObject = sortedObject.next(); + int comp = comparator.compare(newSubject.getNode(), newObject.getNode()); + while (comp != 0) { + if (comp < 0) { + sendPiped(newSubject, CompressUtil.getHeaderId(subjectId++), subject, sortedSubject, nodeConsumer::onSubject); + if (!sortedSubject.hasNext()) { + // no more subjects, send the current object and break the shared loop + sendPiped(newObject, CompressUtil.getHeaderId(objectId++), object, sortedObject, nodeConsumer::onObject); + break sharedLoop; + } + newSubject = sortedSubject.next(); + } else { + sendPiped(newObject, CompressUtil.getHeaderId(objectId++), object, sortedObject, nodeConsumer::onObject); + if (!sortedObject.hasNext()) { + // no more objects, send the current subject and break the shared loop + sendPiped(newSubject, CompressUtil.getHeaderId(subjectId++), subject, sortedSubject, nodeConsumer::onSubject); + break sharedLoop; + } + newObject = sortedObject.next(); + } + comp = comparator.compare(newSubject.getNode(), newObject.getNode()); + } + // shared element + long shid = CompressUtil.asShared(sharedId++); + sortedSubject.setLastHeader(shid); + sortedObject.setLastHeader(shid); + nodeConsumer.onSubject(newSubject.getIndex(), shid); + nodeConsumer.onObject(newObject.getIndex(), shid); + shared.addElement(newSubject.getNode()); + } + // at least one iterator is empty, closing the shared pipe + shared.closePipe(); + // do we have subjects? + while (sortedSubject.hasNext()) { + sendPiped(sortedSubject.next(), CompressUtil.getHeaderId(subjectId++), subject, sortedSubject, nodeConsumer::onSubject); + } + subject.closePipe(); + // do we have objects? 
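+ // the shared pipe is closed and the subject iterator is drained at this point, so every node left in the sorted object iterator can no longer be shared: give it a fresh object id and flush it into the object pipe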
+ while (sortedObject.hasNext()) { + sendPiped(sortedObject.next(), CompressUtil.getHeaderId(objectId++), object, sortedObject, nodeConsumer::onObject); + } + object.closePipe(); + }, "CFSDPipeBuilder").startAll(); + + // send to the consumer the element while parsing them + this.subject = new OneReadDictionarySection(subject, subjects); + this.predicate = new OneReadDictionarySection(new MapIterator<>(sortedPredicate, (node, index) -> { + long header = CompressUtil.getHeaderId(index + 1); + sortedPredicate.setLastHeader(header); + nodeConsumer.onPredicate(node.getIndex(), header); + // force duplication because it's not made in a pipe like with the others + return node.getNode().toString(); + }), predicates); + this.object = new OneReadDictionarySection(object, objects); + this.shared = new OneReadDictionarySection(shared, shareds); + } + + @Override + public TempDictionarySection getSubjects() { + return subject; + } + + @Override + public TempDictionarySection getPredicates() { + return predicate; + } + + @Override + public TempDictionarySection getObjects() { + return object; + } + + @Override + public TempDictionarySection getShared() { + return shared; + } + + @Override + public void startProcessing() { + } + + @Override + public void endProcessing() { + } + + @Override + public long insert(CharSequence str, TripleComponentRole position) { + throw new NotImplementedException(); + } + + @Override + public void reorganize() { + // already organized + } + + @Override + public void reorganize(TempTriples triples) { + // already organized + } + + @Override + public boolean isOrganized() { + return true; + } + + @Override + public void clear() { + } + + @Override + public long stringToId(CharSequence subject, TripleComponentRole role) { + throw new NotImplementedException(); + } + + @Override + public void close() throws IOException { + try { + cfsdThread.interrupt(); + cfsdThread.joinAndCrashIfRequired(); + } catch (InterruptedException e) { + // normal + } + } + + public interface NodeConsumer { + void onSubject(long preMapId, long newMapId); + void onPredicate(long preMapId, long newMapId); + void onObject(long preMapId, long newMapId); + } + + private static class StringParser implements PipedCopyIterator.Parser { + @Override + public void write(CharSequence node, OutputStream out) throws IOException { + byte[] bytes = node.toString().getBytes(ByteStringUtil.STRING_ENCODING); + VByte.encode(out, bytes.length); + out.write(bytes); + } + + @Override + public String read(InputStream in) throws IOException { + int size = (int) VByte.decode(in); + byte[] bytes = IOUtil.readBuffer(in, size, null); + return new String(bytes, ByteStringUtil.STRING_ENCODING); + } + } + private interface NodeConsumerMethod { + void consume(long id, long header); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionary.java index 3fc8312d..3617fa7e 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionary.java @@ -31,6 +31,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.util.concurrent.atomic.AtomicReference; import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate; import org.rdfhdt.hdt.dictionary.TempDictionary; @@ -44,6 +45,7 @@ import org.rdfhdt.hdt.options.ControlInfo.Type; import 
org.rdfhdt.hdt.options.ControlInformation; import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.util.concurrent.ExceptionThread; import org.rdfhdt.hdt.util.io.CountInputStream; import org.rdfhdt.hdt.util.io.IOUtil; import org.rdfhdt.hdt.util.listener.IntermediateListener; @@ -55,7 +57,7 @@ */ public class FourSectionDictionary extends BaseDictionary { - public FourSectionDictionary(HDTOptions spec, + public FourSectionDictionary(HDTOptions spec, DictionarySectionPrivate s, DictionarySectionPrivate p, DictionarySectionPrivate o, DictionarySectionPrivate sh) { super(spec); this.subjects = s; @@ -63,7 +65,7 @@ public FourSectionDictionary(HDTOptions spec, this.objects = o; this.shared = sh; } - + public FourSectionDictionary(HDTOptions spec) { super(spec); // FIXME: Read type from spec. @@ -85,6 +87,19 @@ public void load(TempDictionary other, ProgressListener listener) { shared.load(other.getShared(), iListener); } + @Override + public void loadAsync(TempDictionary other, ProgressListener listener) throws InterruptedException { + IntermediateListener iListener = new IntermediateListener(null); + new ExceptionThread(() -> predicates.load(other.getPredicates(), iListener), "FourSecSAsyncReaderP") + .attach( + new ExceptionThread(() -> subjects.load(other.getSubjects(), iListener), "FourSecSAsyncReaderS"), + new ExceptionThread(() -> shared.load(other.getShared(), iListener), "FourSecSAsyncReaderSh"), + new ExceptionThread(() -> objects.load(other.getObjects(), iListener), "FourSecSAsyncReaderO") + ) + .startAll() + .joinAndCrashIfRequired(); + } + /* (non-Javadoc) * @see hdt.dictionary.Dictionary#save(java.io.OutputStream, hdt.ControlInformation, hdt.ProgressListener) */ @@ -111,7 +126,7 @@ public void load(InputStream input, ControlInfo ci, ProgressListener listener) t if(ci.getType()!=ControlInfo.Type.DICTIONARY) { throw new IllegalFormatException("Trying to read a dictionary section, but was not dictionary."); } - + IntermediateListener iListener = new IntermediateListener(listener); shared = DictionarySectionFactory.loadFrom(input, iListener); @@ -119,7 +134,7 @@ public void load(InputStream input, ControlInfo ci, ProgressListener listener) t predicates = DictionarySectionFactory.loadFrom(input, iListener); objects = DictionarySectionFactory.loadFrom(input, iListener); } - + @Override public void mapFromFile(CountInputStream in, File f, ProgressListener listener) throws IOException { ControlInformation ci = new ControlInformation(); @@ -127,13 +142,13 @@ public void mapFromFile(CountInputStream in, File f, ProgressListener listener) if(ci.getType()!=ControlInfo.Type.DICTIONARY) { throw new IllegalFormatException("Trying to read a dictionary section, but was not dictionary."); } - + IntermediateListener iListener = new IntermediateListener(listener); shared = DictionarySectionFactory.loadFrom(in, f, iListener); subjects = DictionarySectionFactory.loadFrom(in, f, iListener); predicates = DictionarySectionFactory.loadFrom(in, f, iListener); objects = DictionarySectionFactory.loadFrom(in, f, iListener); - + // Use cache only for predicates. Preload only up to 100K predicates. 
// FIXME: DISABLED // predicates = new DictionarySectionCacheAll(predicates, predicates.getNumberOfElements()<100000); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryBig.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryBig.java index e08bc169..c8f79fad 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryBig.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryBig.java @@ -44,6 +44,7 @@ import org.rdfhdt.hdt.options.ControlInfo.Type; import org.rdfhdt.hdt.options.ControlInformation; import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.util.concurrent.ExceptionThread; import org.rdfhdt.hdt.util.io.CountInputStream; import org.rdfhdt.hdt.util.listener.IntermediateListener; @@ -84,6 +85,19 @@ public void load(TempDictionary other, ProgressListener listener) { shared.load(other.getShared(), iListener); } + @Override + public void loadAsync(TempDictionary other, ProgressListener listener) throws InterruptedException { + IntermediateListener iListener = new IntermediateListener(null); + new ExceptionThread(() -> predicates.load(other.getPredicates(), iListener), "FourSecSAsyncReaderP") + .attach( + new ExceptionThread(() -> subjects.load(other.getSubjects(), iListener), "FourSecSAsyncReaderS"), + new ExceptionThread(() -> shared.load(other.getShared(), iListener), "FourSecSAsyncReaderSh"), + new ExceptionThread(() -> objects.load(other.getObjects(), iListener), "FourSecSAsyncReaderO") + ) + .startAll() + .joinAndCrashIfRequired(); + } + /* (non-Javadoc) * @see hdt.dictionary.Dictionary#save(java.io.OutputStream, hdt.ControlInformation, hdt.ProgressListener) */ diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java index 9976d9c5..25175178 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleBaseDictionary.java @@ -4,10 +4,12 @@ import org.rdfhdt.hdt.dictionary.DictionaryPrivate; import org.rdfhdt.hdt.dictionary.DictionarySection; import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate; +import org.rdfhdt.hdt.dictionary.TempDictionary; import org.rdfhdt.hdt.dictionary.impl.section.PFCOptimizedExtractor; import org.rdfhdt.hdt.enums.DictionarySectionRole; import org.rdfhdt.hdt.enums.TripleComponentRole; import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.util.LiteralsUtils; import org.rdfhdt.hdt.util.string.CompactString; @@ -323,4 +325,9 @@ public AbstractMap.SimpleEntry getDataTypeRange(String dataType){ } return new AbstractMap.SimpleEntry<>(0L,0L); } + + @Override + public void loadAsync(TempDictionary other, ProgressListener listener) { + throw new NotImplementedException(); + } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java index 8199aace..67f4bf69 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionary.java @@ -7,6 +7,7 @@ import 
org.rdfhdt.hdt.dictionary.impl.section.HashDictionarySection; import org.rdfhdt.hdt.dictionary.impl.section.PFCDictionarySection; import org.rdfhdt.hdt.exceptions.IllegalFormatException; +import org.rdfhdt.hdt.exceptions.NotImplementedException; import org.rdfhdt.hdt.hdt.HDTVocabulary; import org.rdfhdt.hdt.header.Header; import org.rdfhdt.hdt.listener.ProgressListener; @@ -49,6 +50,7 @@ public void load(TempDictionary other, ProgressListener listener) { predicates.load(other.getPredicates(), iListener); Iterator iter = other.getObjects().getEntries(); + // TODO: allow the usage of OneReadDictionarySection HashMap literalsCounts = ((HashDictionarySection)other.getObjects()).getLiteralsCounts(); if(literalsCounts.containsKey("NO_DATATYPE")) literalsCounts.put("NO_DATATYPE",literalsCounts.get("NO_DATATYPE") - other.getShared().getNumberOfElements()); @@ -229,4 +231,9 @@ public void close() throws IOException { } } + + @Override + public void loadAsync(TempDictionary other, ProgressListener listener) { + throw new NotImplementedException(); + } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteFourSectionDictionary.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteFourSectionDictionary.java new file mode 100644 index 00000000..73549bac --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/WriteFourSectionDictionary.java @@ -0,0 +1,106 @@ +package org.rdfhdt.hdt.dictionary.impl; + +import org.rdfhdt.hdt.dictionary.TempDictionary; +import org.rdfhdt.hdt.dictionary.impl.section.WriteDictionarySection; +import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.hdt.HDTVocabulary; +import org.rdfhdt.hdt.header.Header; +import org.rdfhdt.hdt.listener.MultiThreadListener; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.options.ControlInfo; +import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.util.concurrent.ExceptionThread; +import org.rdfhdt.hdt.util.io.CountInputStream; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.listener.IntermediateListener; +import org.rdfhdt.hdt.util.listener.ListenerUtil; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Path; + +/** + * Version of four section dictionary with {@link org.rdfhdt.hdt.dictionary.impl.section.WriteDictionarySection} + * @author Antoine Willerval + */ +public class WriteFourSectionDictionary extends BaseDictionary { + public WriteFourSectionDictionary(HDTOptions spec, Path filename, int bufferSize) { + super(spec); + String name = filename.getFileName().toString(); + subjects = new WriteDictionarySection(spec, filename.resolveSibling(name + "SU"), bufferSize); + predicates = new WriteDictionarySection(spec, filename.resolveSibling(name + "PR"), bufferSize); + objects = new WriteDictionarySection(spec, filename.resolveSibling(name + "OB"), bufferSize); + shared = new WriteDictionarySection(spec, filename.resolveSibling(name + "SH"), bufferSize); + } + + @Override + public void loadAsync(TempDictionary other, ProgressListener listener) throws InterruptedException { + MultiThreadListener ml = ListenerUtil.multiThreadListener(listener); + ml.unregisterAllThreads(); + ExceptionThread.async("FourSecSAsyncReader", + () -> predicates.load(other.getPredicates(), new IntermediateListener(ml, "Predicate: ")), + () -> subjects.load(other.getSubjects(), new IntermediateListener(ml, "Subjects: ")), + () -> 
shared.load(other.getShared(), new IntermediateListener(ml, "Shared: ")), + () -> objects.load(other.getObjects(), new IntermediateListener(ml, "Object: ")) + ) + .startAll() + .joinAndCrashIfRequired(); + ml.unregisterAllThreads(); + } + + @Override + public void load(InputStream input, ControlInfo ci, ProgressListener listener) throws IOException { + throw new NotImplementedException(); + } + + @Override + public void mapFromFile(CountInputStream in, File f, ProgressListener listener) throws IOException { + throw new NotImplementedException(); + } + + @Override + public void load(TempDictionary other, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void save(OutputStream output, ControlInfo ci, ProgressListener listener) throws IOException { + ci.setType(ControlInfo.Type.DICTIONARY); + ci.setFormat(getType()); + ci.setInt("elements", this.getNumberOfElements()); + ci.save(output); + + IntermediateListener iListener = new IntermediateListener(listener); + iListener.setRange(0, 25); + iListener.setPrefix("Save shared: "); + shared.save(output, iListener); + iListener.setRange(25, 50); + iListener.setPrefix("Save subjects: "); + subjects.save(output, iListener); + iListener.setRange(50, 75); + iListener.setPrefix("Save predicates: "); + predicates.save(output, iListener); + iListener.setRange(75, 100); + iListener.setPrefix("Save objects: "); + objects.save(output, iListener); + } + + @Override + public void populateHeader(Header header, String rootNode) { + header.insert(rootNode, HDTVocabulary.DICTIONARY_TYPE, getType()); + header.insert(rootNode, HDTVocabulary.DICTIONARY_NUMSHARED, getNshared()); + header.insert(rootNode, HDTVocabulary.DICTIONARY_SIZE_STRINGS, size()); + } + + @Override + public String getType() { + return HDTVocabulary.DICTIONARY_TYPE_FOUR_SECTION; + } + + @Override + public void close() throws IOException { + IOUtil.closeAll(shared, subjects, predicates, objects); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySection.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySection.java new file mode 100644 index 00000000..1af57850 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySection.java @@ -0,0 +1,76 @@ +package org.rdfhdt.hdt.dictionary.impl.section; + +import org.rdfhdt.hdt.dictionary.TempDictionarySection; +import org.rdfhdt.hdt.exceptions.NotImplementedException; + +import java.io.IOException; +import java.util.Iterator; + +public class OneReadDictionarySection implements TempDictionarySection { + private final Iterator reader; + private final long size; + + public OneReadDictionarySection(Iterator reader, long size) { + this.reader = reader; + this.size = size; + } + + @Override + public long add(CharSequence str) { + throw new NotImplementedException(); + } + + @Override + public void remove(CharSequence str) { + throw new NotImplementedException(); + } + + @Override + public void sort() { + throw new NotImplementedException(); + } + + @Override + public void clear() { + throw new NotImplementedException(); + } + + @Override + public boolean isSorted() { + return true; + } + + @Override + public Iterator getEntries() { + return reader; + } + + @Override + public long locate(CharSequence s) { + throw new NotImplementedException(); + } + + @Override + public CharSequence extract(long pos) { + throw new NotImplementedException(); + } + + @Override + public long size() { 
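// --- Editor's illustrative sketch (not part of this patch) ---
// The loadAsync implementations above load the four dictionary sections concurrently
// through ExceptionThread. A minimal usage sketch, assuming only the API already used
// in this patch (async(...), startAll(), joinAndCrashIfRequired()); the task bodies
// below are placeholders:
ExceptionThread.async("ExampleLoader",
        () -> System.out.println("load subjects"),
        () -> System.out.println("load predicates"),
        () -> System.out.println("load objects"),
        () -> System.out.println("load shared")
).startAll()
 .joinAndCrashIfRequired(); // waits for every task and rethrows the first failure, if any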
+ return size; + } + + @Override + public long getNumberOfElements() { + return size; + } + + @Override + public Iterator getSortedEntries() { + return reader; + } + + @Override + public void close() throws IOException { + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/WriteDictionarySection.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/WriteDictionarySection.java new file mode 100644 index 00000000..a4b197ef --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/WriteDictionarySection.java @@ -0,0 +1,155 @@ +package org.rdfhdt.hdt.dictionary.impl.section; + +import org.rdfhdt.hdt.compact.integer.VByte; +import org.rdfhdt.hdt.compact.sequence.SequenceLog64BigDisk; +import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate; +import org.rdfhdt.hdt.dictionary.TempDictionarySection; +import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.listener.MultiThreadListener; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.util.crc.CRC32; +import org.rdfhdt.hdt.util.crc.CRC8; +import org.rdfhdt.hdt.util.crc.CRCOutputStream; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.io.CountOutputStream; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.listener.ListenerUtil; +import org.rdfhdt.hdt.util.string.ByteStringUtil; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Iterator; + +/** + * Implementation of {@link org.rdfhdt.hdt.dictionary.DictionarySectionPrivate} that write loaded + * {@link org.rdfhdt.hdt.dictionary.TempDictionarySection} on disk before saving, reducing the size in ram + * + * @author Antoine Willerval + */ +public class WriteDictionarySection implements DictionarySectionPrivate { + private final CloseSuppressPath tempFilename; + private final CloseSuppressPath blockTempFilename; + private SequenceLog64BigDisk blocks; + private final long blockSize; + private final int bufferSize; + private long numberElements = 0; + private long byteoutSize; + + public WriteDictionarySection(HDTOptions spec, Path filename, int bufferSize) { + this.bufferSize = bufferSize; + String fn = filename.getFileName().toString(); + tempFilename = CloseSuppressPath.of(filename.resolveSibling(fn + "_temp")); + blockTempFilename = CloseSuppressPath.of(filename.resolveSibling(fn + "_tempblock")); + long blockSize = spec.getInt("pfc.blocksize"); + if (blockSize < 0) { + throw new IllegalArgumentException("negative pfc.blocksize"); + } else if (blockSize == 0) { + this.blockSize = PFCDictionarySection.DEFAULT_BLOCK_SIZE; + } else { + this.blockSize = blockSize; + } + } + + @Override + public void load(TempDictionarySection other, ProgressListener plistener) { + MultiThreadListener listener = ListenerUtil.multiThreadListener(plistener); + long otherN = other.getNumberOfElements(); + long block = otherN < 10 ? 
1 : otherN / 10; + long currentCount = 0; + blocks = new SequenceLog64BigDisk(blockTempFilename.toAbsolutePath().toString(), 64, otherN / blockSize); + + listener.notifyProgress(0, "Filling section"); + try (CountOutputStream out = new CountOutputStream(tempFilename.openOutputStream(bufferSize))) { + CRCOutputStream crcout = new CRCOutputStream(out, new CRC32()); + String previousStr = null; + for (Iterator it = other.getSortedEntries(); it.hasNext(); currentCount++) { + CharSequence sec = it.next(); + String str = sec.toString(); + if (numberElements % blockSize == 0) { + blocks.append(out.getTotalBytes()); + + // Copy full string + ByteStringUtil.append(out, str, 0); + } else { + // Find common part. + int delta = ByteStringUtil.longestCommonPrefix(previousStr, str); + // Write Delta in VByte + VByte.encode(out, delta); + // Write remaining + ByteStringUtil.append(out, str, delta); + } + out.write(0); + previousStr = str; + numberElements++; + if (currentCount % block == 0) { + listener.notifyProgress((float) (currentCount * 100 / otherN), "Filling section"); + } + } + + byteoutSize = out.getTotalBytes(); + crcout.writeCRC(); + } catch (IOException e) { + throw new RuntimeException("can't load section", e); + } + blocks.append(byteoutSize); + // Trim text/blocks + blocks.aggressiveTrimToSize(); + if (numberElements % 100_000 == 0) { + listener.notifyProgress(100, "Completed section filling"); + } + } + + @Override + public void save(OutputStream output, ProgressListener listener) throws IOException { + CRCOutputStream out = new CRCOutputStream(output, new CRC8()); + out.write(PFCDictionarySection.TYPE_INDEX); + VByte.encode(out, numberElements); + + VByte.encode(out, byteoutSize); + VByte.encode(out, blockSize); + out.writeCRC(); + // Write blocks directly to output, they have their own CRC check. + blocks.save(output, listener); + // Write blocks data directly to output, the load was writing using a CRC check. 
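// --- Editor's illustrative sketch (not part of this patch) ---
// The load(...) above front-codes each block: the first string of a block is written in
// full, and every following string stores only the VByte-encoded length of the prefix it
// shares with its predecessor plus the remaining suffix. A small sketch with made-up
// values, reusing only helpers already referenced in this patch:
OutputStream out = new ByteArrayOutputStream();
String previous = "http://example.org/alice";
String current = "http://example.org/bob";
int delta = ByteStringUtil.longestCommonPrefix(previous, current); // 19, "http://example.org/"
VByte.encode(out, delta);                   // shared prefix length
ByteStringUtil.append(out, current, delta); // remaining suffix "bob"
out.write(0);                               // strings are NUL-terminated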
+ Files.copy(tempFilename.getJavaPath(), output); + } + + @Override + public void load(InputStream input, ProgressListener listener) throws IOException { + throw new NotImplementedException(); + } + + @Override + public long locate(CharSequence s) { + throw new NotImplementedException(); + } + + @Override + public CharSequence extract(long pos) { + throw new NotImplementedException(); + } + + @Override + public long size() { + return numberElements; + } + + @Override + public long getNumberOfElements() { + return numberElements; + } + + @Override + public Iterator getSortedEntries() { + throw new NotImplementedException(); + } + + @Override + public void close() throws IOException { + IOUtil.closeAll(blocks, tempFilename, blockTempFilename); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java index ec06a859..06286355 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java @@ -1,26 +1,60 @@ package org.rdfhdt.hdt.hdt; import org.rdfhdt.hdt.compact.bitmap.Bitmap; +import org.rdfhdt.hdt.dictionary.DictionaryPrivate; +import org.rdfhdt.hdt.dictionary.impl.CompressFourSectionDictionary; import org.rdfhdt.hdt.dictionary.impl.MultipleSectionDictionary; +import org.rdfhdt.hdt.enums.CompressionType; import org.rdfhdt.hdt.enums.RDFNotation; +import org.rdfhdt.hdt.enums.TripleComponentOrder; import org.rdfhdt.hdt.exceptions.NotFoundException; import org.rdfhdt.hdt.exceptions.ParserException; +import org.rdfhdt.hdt.hdt.impl.HDTBase; import org.rdfhdt.hdt.hdt.impl.HDTImpl; import org.rdfhdt.hdt.hdt.impl.TempHDTImporterOnePass; import org.rdfhdt.hdt.hdt.impl.TempHDTImporterTwoPass; +import org.rdfhdt.hdt.hdt.impl.WriteHDTImpl; +import org.rdfhdt.hdt.hdt.impl.diskimport.CompressTripleMapper; +import org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult; +import org.rdfhdt.hdt.hdt.impl.diskimport.SectionCompressor; +import org.rdfhdt.hdt.hdt.impl.diskimport.TripleCompressionResult; import org.rdfhdt.hdt.hdt.writer.TripleWriterHDT; +import org.rdfhdt.hdt.header.HeaderPrivate; import org.rdfhdt.hdt.header.HeaderUtil; +import org.rdfhdt.hdt.iterator.utils.FileTripleIDIterator; +import org.rdfhdt.hdt.iterator.utils.FileTripleIterator; +import org.rdfhdt.hdt.listener.MultiThreadListener; import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.options.HDTOptionsKeys; import org.rdfhdt.hdt.options.HDTSpecification; +import org.rdfhdt.hdt.rdf.RDFParserCallback; +import org.rdfhdt.hdt.rdf.RDFParserFactory; import org.rdfhdt.hdt.rdf.TripleWriter; +import org.rdfhdt.hdt.triples.TempTriples; import org.rdfhdt.hdt.triples.TripleString; +import org.rdfhdt.hdt.triples.TriplesPrivate; +import org.rdfhdt.hdt.util.BitUtil; +import org.rdfhdt.hdt.util.Profiler; +import org.rdfhdt.hdt.util.StopWatch; +import org.rdfhdt.hdt.util.StringUtil; +import org.rdfhdt.hdt.util.concurrent.TreeWorker; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.io.compress.MapCompressTripleMerger; +import org.rdfhdt.hdt.util.io.compress.TripleGenerator; +import org.rdfhdt.hdt.util.listener.IntermediateListener; +import org.rdfhdt.hdt.util.listener.ListenerUtil; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.nio.file.Files; import java.util.Iterator; +import 
java.util.UUID; +import org.rdfhdt.hdt.util.StopWatch; +import org.rdfhdt.hdt.util.io.IOUtil; public class HDTManagerImpl extends HDTManager { @@ -40,7 +74,7 @@ public HDT doLoadHDT(String hdtFileName, ProgressListener listener, HDTOptions s hdt.loadFromHDT(hdtFileName, listener); return hdt; } - + @Override protected HDT doMapHDT(String hdtFileName, ProgressListener listener, HDTOptions spec) throws IOException { HDTPrivate hdt = new HDTImpl(spec); @@ -63,7 +97,6 @@ public HDT doLoadIndexedHDT(String hdtFileName, ProgressListener listener, HDTOp hdt.loadOrCreateIndex(listener); return hdt; } - @Override @@ -84,21 +117,21 @@ public HDT doLoadIndexedHDT(InputStream hdtFile, ProgressListener listener, HDTO @Override public HDT doIndexedHDT(HDT hdt, ProgressListener listener) throws IOException { - ((HDTPrivate)hdt).loadOrCreateIndex(listener); + ((HDTPrivate) hdt).loadOrCreateIndex(listener); return hdt; } @Override public HDT doGenerateHDT(String rdfFileName, String baseURI, RDFNotation rdfNotation, HDTOptions spec, ProgressListener listener) throws IOException, ParserException { //choose the importer - String loaderType = spec.get("loader.type"); + String loaderType = spec.get(HDTOptionsKeys.LOADER_TYPE_KEY); TempHDTImporter loader; - if ("two-pass".equals(loaderType)) { + if (HDTOptionsKeys.LOADER_TYPE_VALUE_TWO_PASS.equals(loaderType)) { loader = new TempHDTImporterTwoPass(useSimple(spec)); } else { loader = new TempHDTImporterOnePass(useSimple(spec)); } - + // Create TempHDT try (TempHDT modHdt = loader.loadFromRDF(spec, rdfFileName, baseURI, rdfNotation, listener)) { @@ -119,6 +152,18 @@ public HDT doGenerateHDT(String rdfFileName, String baseURI, RDFNotation rdfNota } } + @Override + public HDT doGenerateHDT(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException { + // uncompress the stream if required + fileStream = IOUtil.asUncompressed(fileStream, compressionType); + // create a parser for this rdf stream + RDFParserCallback parser = RDFParserFactory.getParserCallback(rdfNotation); + // read the stream as triples + Iterator iterator = RDFParserFactory.readAsIterator(parser, fileStream, baseURI, true, rdfNotation); + + return doGenerateHDT(iterator, baseURI, hdtFormat, listener); + } + @Override public HDT doGenerateHDT(Iterator triples, String baseURI, HDTOptions spec, ProgressListener listener) throws IOException { //choose the importer @@ -144,7 +189,224 @@ public HDT doGenerateHDT(Iterator triples, String baseURI, HDTOpti } @Override - protected TripleWriter doGetHDTWriter(OutputStream out, String baseURI, HDTOptions hdtFormat) throws IOException { + public HDT doGenerateHDTDisk(String rdfFileName, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + // read this file as stream, do not compress to allow the compressionType to be different from the file extension + try (InputStream stream = IOUtil.getFileInputStream(rdfFileName, false)) { + return doGenerateHDTDisk(stream, baseURI, rdfNotation, compressionType, hdtFormat, listener); + } + } + + @Override + public HDT doGenerateHDTDisk(InputStream fileStream, String baseURI, RDFNotation rdfNotation, CompressionType compressionType, HDTOptions hdtFormat, ProgressListener listener) throws IOException, ParserException { + // uncompress the stream if required + fileStream = IOUtil.asUncompressed(fileStream, 
compressionType); + // create a parser for this rdf stream + RDFParserCallback parser = RDFParserFactory.getParserCallback(rdfNotation, useSimple(hdtFormat)); + // read the stream as triples + Iterator iterator = RDFParserFactory.readAsIterator(parser, fileStream, baseURI, true, rdfNotation); + + return doGenerateHDTDisk(iterator, baseURI, hdtFormat, listener); + } + + /** + * @return a theoretical maximum amount of memory the JVM will attempt to use + */ + static long getMaxChunkSize(int workers) { + Runtime runtime = Runtime.getRuntime(); + return (long) ((runtime.maxMemory() - (runtime.totalMemory() - runtime.freeMemory())) * (0.85 * 1.5 * 3 * workers)); + } + + @Override + public HDT doGenerateHDTDisk(Iterator iterator, String baseURI, HDTOptions hdtFormat, ProgressListener progressListener) throws IOException, ParserException { + MultiThreadListener listener = ListenerUtil.multiThreadListener(progressListener); + // load config + // compression mode + String compressMode = hdtFormat.get(HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_KEY); // see CompressionResult + // worker for compression tasks + int workers = (int) hdtFormat.getInt(HDTOptionsKeys.LOADER_DISK_COMPRESSION_WORKER_KEY); + // maximum size of a chunk + long chunkSize = hdtFormat.getInt(HDTOptionsKeys.LOADER_DISK_CHUNK_SIZE_KEY); + + long maxFileOpenedLong = hdtFormat.getInt(HDTOptionsKeys.LOADER_DISK_MAX_FILE_OPEN_KEY); + long kwayLong = hdtFormat.getInt(HDTOptionsKeys.LOADER_DISK_KWAY_KEY); + long bufferSizeLong = hdtFormat.getInt(HDTOptionsKeys.LOADER_DISK_BUFFER_SIZE_KEY); + int maxFileOpened; + int ways; + int bufferSize; + // location of the working directory, will be deleted after generation + String baseNameOpt = hdtFormat.get(HDTOptionsKeys.LOADER_DISK_LOCATION_KEY); + CloseSuppressPath basePath; + // location of the future HDT file, do not set to create the HDT in memory while mergin + String futureHDTLocation = hdtFormat.get(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY); + + Profiler profiler = new Profiler("doGenerateHDTDisk"); + String profilerString = hdtFormat.get("profiler"); + profiler.setDisabled(profilerString == null || !profilerString.equalsIgnoreCase("true")); + // check and set default values if required + if (workers == 0) { + workers = Runtime.getRuntime().availableProcessors(); + } else if (workers < 0) { + throw new IllegalArgumentException("Negative number of workers!"); + } + if (baseNameOpt == null || baseNameOpt.isEmpty()) { + basePath = CloseSuppressPath.of(Files.createTempDirectory("hdt-java-generate-disk")); + } else { + basePath = CloseSuppressPath.of(baseNameOpt); + } + basePath.closeWithDeleteRecurse(); + if (chunkSize == 0) { + chunkSize = getMaxChunkSize(workers); + } else if (chunkSize < 0) { + throw new IllegalArgumentException("Negative chunk size!"); + } + if (bufferSizeLong > Integer.MAX_VALUE - 5L || bufferSizeLong < 0) { + throw new IllegalArgumentException("Buffer size can't be negative or bigger than the size of an array!"); + } else if (bufferSizeLong == 0) { + bufferSize = CloseSuppressPath.BUFFER_SIZE; + } else { + bufferSize = (int) bufferSizeLong; + } + if (maxFileOpenedLong < 0 || maxFileOpenedLong > Integer.MAX_VALUE) { + throw new IllegalArgumentException("maxFileOpened can't be negative!"); + } else if (maxFileOpenedLong == 0) { + maxFileOpened = 1024; + } else { + maxFileOpened = (int) maxFileOpenedLong; + } + if (kwayLong < 0 || kwayLong > Integer.MAX_VALUE) { + throw new IllegalArgumentException("kway can't be negative!"); + } else if (kwayLong == 0) { + ways 
= 1; // Math.max(1, BitUtil.log2(Math.min(maxFileOpened, chunkSize / ((long) bufferSize * workers)))); + } else { + ways = (int) kwayLong; + } + boolean mapHDT = futureHDTLocation != null && !futureHDTLocation.isEmpty(); + + // create working directory + basePath.mkdirs(); + try { + // compress the triples into sections and compressed triples + listener.notifyProgress(0, "Sorting sections with chunk of size: " + StringUtil.humanReadableByteCount(chunkSize, true) + "B with " + ways + "ways"); + + FileTripleIterator triplesFile = new FileTripleIterator(iterator, chunkSize); + + profiler.pushSection("section compression"); + CompressionResult compressionResult; + try { + compressionResult = new SectionCompressor(basePath, triplesFile, listener, bufferSize) + .compress(workers, ways, compressMode); + } catch (TreeWorker.TreeWorkerException | InterruptedException e) { + throw new ParserException(e); + } + profiler.popSection(); + + HDTBase hdt; + if (!mapHDT) { + // using default implementation + hdt = new HDTImpl(hdtFormat); + } else { + // using map implementation + hdt = new WriteHDTImpl(hdtFormat, basePath.resolve("maphdt"), bufferSize); + } + hdt.setBaseUri(baseURI); + + listener.unregisterAllThreads(); + listener.notifyProgress(20, "Create sections and triple mapping"); + + profiler.pushSection("dictionary write"); + // create sections and triple mapping + DictionaryPrivate dictionary = hdt.getDictionary(); + CompressTripleMapper mapper = new CompressTripleMapper(basePath, compressionResult.getTripleCount(), chunkSize); + CompressFourSectionDictionary modifiableDictionary = new CompressFourSectionDictionary(compressionResult, mapper, listener); + try { + dictionary.loadAsync(modifiableDictionary, listener); + } catch (InterruptedException e) { + throw new ParserException(e); + } + profiler.popSection(); + + // complete the mapper with the shared count and delete compression data + compressionResult.delete(); + mapper.setShared(dictionary.getNshared()); + + listener.notifyProgress(40, "Create mapped and sort triple file"); + // create mapped triples file + TripleCompressionResult tripleCompressionResult; + TriplesPrivate triples = hdt.getTriples(); + TripleComponentOrder order = triples.getOrder(); + profiler.pushSection("triple compression/map"); + try { + MapCompressTripleMerger tripleMapper = new MapCompressTripleMerger( + basePath, + new FileTripleIDIterator(new TripleGenerator(compressionResult.getTripleCount()), chunkSize), + mapper, + listener, + order, + bufferSize); + tripleCompressionResult = tripleMapper.merge(workers, ways, compressMode); + } catch (TreeWorker.TreeWorkerException | InterruptedException e) { + throw new ParserException(e); + } + profiler.popSection(); + listener.unregisterAllThreads(); + + profiler.pushSection("bit triple creation"); + try { + // create bit triples and load the triples + TempTriples tempTriples = tripleCompressionResult.getTriples(); + IntermediateListener il = new IntermediateListener(listener); + il.setRange(80, 90); + il.setPrefix("Create bit triples: "); + il.notifyProgress(0, "create triples"); + triples.load(tempTriples, il); + tempTriples.close(); + + // completed the triples, delete the mapper + mapper.delete(); + } finally { + tripleCompressionResult.close(); + } + profiler.popSection(); + profiler.pushSection("header creation"); + + listener.notifyProgress(90, "Create HDT header"); + // header + hdt.populateHeaderStructure(hdt.getBaseURI()); + hdt.getHeader().insert("_:statistics", HDTVocabulary.ORIGINAL_SIZE, 
triplesFile.getTotalSize()); + + profiler.popSection(); + // return the HDT + if (mapHDT) { + profiler.pushSection("map to hdt"); + // write the HDT and map it + try { + hdt.saveToHDT(futureHDTLocation, listener); + } finally { + hdt.close(); + } + IntermediateListener il = new IntermediateListener(listener); + il.setPrefix("Map HDT: "); + il.setRange(95, 100); + il.notifyProgress(0, "start"); + try { + return doMapHDT(futureHDTLocation, il, hdtFormat); + } finally { + profiler.popSection(); + } + } else { + listener.notifyProgress(100, "HDT completed"); + return hdt; + } + } finally { + profiler.stop(); + profiler.writeProfiling(); + listener.notifyProgress(100, "Clearing disk"); + basePath.close(); + } + } + + @Override + protected TripleWriter doGetHDTWriter(OutputStream out, String baseURI, HDTOptions hdtFormat) { return new TripleWriterHDT(baseURI, hdtFormat, out); } @@ -187,4 +449,4 @@ protected HDT doHDTDiffBit(String location, String hdtFileName, Bitmap deleteBit return hdt; } } -} \ No newline at end of file +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTBase.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTBase.java new file mode 100644 index 00000000..fe1baeab --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTBase.java @@ -0,0 +1,166 @@ +package org.rdfhdt.hdt.hdt.impl; + +import org.rdfhdt.hdt.dictionary.DictionaryPrivate; +import org.rdfhdt.hdt.hdt.HDTPrivate; +import org.rdfhdt.hdt.hdt.HDTVocabulary; +import org.rdfhdt.hdt.header.Header; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.options.ControlInfo; +import org.rdfhdt.hdt.options.ControlInformation; +import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.options.HDTSpecification; +import org.rdfhdt.hdt.triples.TriplesPrivate; +import org.rdfhdt.hdt.util.StringUtil; +import org.rdfhdt.hdt.util.listener.IntermediateListener; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.Date; + +/** + * Abstract hdt base for {@link org.rdfhdt.hdt.hdt.HDTPrivate} + * + * @param header type + * @param dictionary type + * @param triple type + */ +public abstract class HDTBase implements HDTPrivate { + protected final HDTOptions spec; + protected H header; + protected D dictionary; + protected T triples; + + protected HDTBase(HDTOptions spec) { + if (spec == null) { + this.spec = new HDTSpecification(); + } else { + this.spec = spec; + } + } + + /** + * set the base URI of the hdt + * + * @param baseURI base uri + */ + public abstract void setBaseUri(String baseURI); + + /** + * @return if the HDT is closed + */ + public abstract boolean isClosed(); + + /* + * (non-Javadoc) + * + * @see hdt.HDT#getHeader() + */ + @Override + public H getHeader() { + return header; + } + + /* + * (non-Javadoc) + * + * @see hdt.HDT#getDictionary() + */ + @Override + public D getDictionary() { + return dictionary; + } + + /* + * (non-Javadoc) + * + * @see hdt.HDT#getTriples() + */ + @Override + public T getTriples() { + return triples; + } + + /* (non-Javadoc) + * @see hdt.hdt.HDT#getSize() + */ + @Override + public long size() { + if (isClosed()) + return 0; + + return dictionary.size() + triples.size(); + } + + /* + * (non-Javadoc) + * + * @see hdt.HDT#saveToHDT(java.io.OutputStream) + */ + @Override + public void saveToHDT(OutputStream output, ProgressListener listener) throws IOException { + ControlInfo ci = new ControlInformation(); + IntermediateListener iListener = new IntermediateListener(listener); + + ci.clear(); + 
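// --- Editor's illustrative sketch (not part of this patch) ---
// doGenerateHDTDisk above is driven entirely by HDTOptions. A possible caller-side
// configuration, using the HDTOptionsKeys constants read by that method (file names
// and values below are hypothetical):
HDTSpecification spec = new HDTSpecification();
spec.set(HDTOptionsKeys.LOADER_DISK_COMPRESSION_WORKER_KEY, "4");        // compression workers
spec.set(HDTOptionsKeys.LOADER_DISK_LOCATION_KEY, "/tmp/hdt-work");      // working directory
spec.set(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY, "out.hdt"); // write and map the result
try (HDT hdt = HDTManager.generateHDTDisk("dataset.nt.gz", "http://example.org/#",
        RDFNotation.NTRIPLES, CompressionType.GZIP, spec, null)) {
    System.out.println(hdt.getTriples().getNumberOfElements() + " triples");
}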
ci.setType(ControlInfo.Type.GLOBAL); + ci.setFormat(HDTVocabulary.HDT_CONTAINER); + ci.save(output); + + ci.clear(); + ci.setType(ControlInfo.Type.HEADER); + header.save(output, ci, iListener); + + ci.clear(); + ci.setType(ControlInfo.Type.DICTIONARY); + dictionary.save(output, ci, iListener); + + ci.clear(); + ci.setType(ControlInfo.Type.TRIPLES); + triples.save(output, ci, iListener); + } + + @Override + public void populateHeaderStructure(String baseUri) { + if (baseUri == null || baseUri.length() == 0) { + throw new IllegalArgumentException("baseURI cannot be empty"); + } + + if (isClosed()) { + throw new IllegalStateException("Cannot add header to a closed HDT."); + } + + H header = getHeader(); + D dictionary = getDictionary(); + T triples = getTriples(); + header.insert(baseUri, HDTVocabulary.RDF_TYPE, HDTVocabulary.HDT_DATASET); + header.insert(baseUri, HDTVocabulary.RDF_TYPE, HDTVocabulary.VOID_DATASET); + + // VOID + header.insert(baseUri, HDTVocabulary.VOID_TRIPLES, triples.getNumberOfElements()); + header.insert(baseUri, HDTVocabulary.VOID_PROPERTIES, dictionary.getNpredicates()); + header.insert(baseUri, HDTVocabulary.VOID_DISTINCT_SUBJECTS, dictionary.getNsubjects()); + header.insert(baseUri, HDTVocabulary.VOID_DISTINCT_OBJECTS, dictionary.getNobjects()); + + // Structure + String formatNode = "_:format"; + String dictNode = "_:dictionary"; + String triplesNode = "_:triples"; + String statisticsNode = "_:statistics"; + String publicationInfoNode = "_:publicationInformation"; + + header.insert(baseUri, HDTVocabulary.HDT_FORMAT_INFORMATION, formatNode); + header.insert(formatNode, HDTVocabulary.HDT_DICTIONARY, dictNode); + header.insert(formatNode, HDTVocabulary.HDT_TRIPLES, triplesNode); + header.insert(baseUri, HDTVocabulary.HDT_STATISTICAL_INFORMATION, statisticsNode); + header.insert(baseUri, HDTVocabulary.HDT_PUBLICATION_INFORMATION, publicationInfoNode); + + dictionary.populateHeader(header, dictNode); + triples.populateHeader(header, triplesNode); + + header.insert(statisticsNode, HDTVocabulary.HDT_SIZE, getDictionary().size() + getTriples().size()); + + // Current time + header.insert(publicationInfoNode, HDTVocabulary.DUBLIN_CORE_ISSUED, StringUtil.formatDate(new Date())); + } + +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java index 7c5e829d..c62eac6a 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java @@ -31,7 +31,6 @@ import org.rdfhdt.hdt.compact.bitmap.Bitmap; import org.rdfhdt.hdt.compact.bitmap.BitmapFactory; import org.rdfhdt.hdt.compact.bitmap.ModifiableBitmap; -import org.rdfhdt.hdt.dictionary.Dictionary; import org.rdfhdt.hdt.dictionary.DictionaryCat; import org.rdfhdt.hdt.dictionary.DictionaryDiff; import org.rdfhdt.hdt.dictionary.DictionaryFactory; @@ -50,11 +49,9 @@ import org.rdfhdt.hdt.exceptions.NotFoundException; import org.rdfhdt.hdt.exceptions.NotImplementedException; import org.rdfhdt.hdt.hdt.HDT; -import org.rdfhdt.hdt.hdt.HDTPrivate; import org.rdfhdt.hdt.hdt.HDTVersion; import org.rdfhdt.hdt.hdt.HDTVocabulary; import org.rdfhdt.hdt.hdt.TempHDT; -import org.rdfhdt.hdt.header.Header; import org.rdfhdt.hdt.header.HeaderFactory; import org.rdfhdt.hdt.header.HeaderPrivate; import org.rdfhdt.hdt.iterator.DictionaryTranslateIterator; @@ -70,7 +67,6 @@ import org.rdfhdt.hdt.triples.TempTriples; import org.rdfhdt.hdt.triples.TripleID; import 
org.rdfhdt.hdt.triples.TripleString; -import org.rdfhdt.hdt.triples.Triples; import org.rdfhdt.hdt.triples.TriplesFactory; import org.rdfhdt.hdt.triples.TriplesPrivate; import org.rdfhdt.hdt.triples.impl.BitmapTriples; @@ -79,7 +75,6 @@ import org.rdfhdt.hdt.triples.impl.BitmapTriplesIteratorDiff; import org.rdfhdt.hdt.triples.impl.BitmapTriplesIteratorMapDiff; import org.rdfhdt.hdt.util.StopWatch; -import org.rdfhdt.hdt.util.StringUtil; import org.rdfhdt.hdt.util.io.CountInputStream; import org.rdfhdt.hdt.util.io.IOUtil; import org.rdfhdt.hdt.util.listener.IntermediateListener; @@ -97,7 +92,6 @@ import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Paths; -import java.util.Date; import java.util.Iterator; import java.util.Map; import java.util.zip.GZIPInputStream; @@ -106,75 +100,20 @@ * Basic implementation of HDT interface * */ -public class HDTImpl implements HDTPrivate { +public class HDTImpl extends HDTBase { private static final Logger log = LoggerFactory.getLogger(HDTImpl.class); - private final HDTOptions spec; - - protected HeaderPrivate header; - protected DictionaryPrivate dictionary; - protected TriplesPrivate triples; - private String hdtFileName; private String baseUri; private boolean isMapped; private boolean isClosed=false; - private void createComponents() { - header = HeaderFactory.createHeader(spec); - dictionary = DictionaryFactory.createDictionary(spec); - triples = TriplesFactory.createTriples(spec); - } - - @Override - public void populateHeaderStructure(String baseUri) { - if(baseUri==null || baseUri.length()==0) { - throw new IllegalArgumentException("baseURI cannot be empty"); - } - - if(isClosed) { - throw new IllegalStateException("Cannot add header to a closed HDT."); - } - - header.insert(baseUri, HDTVocabulary.RDF_TYPE, HDTVocabulary.HDT_DATASET); - header.insert(baseUri, HDTVocabulary.RDF_TYPE, HDTVocabulary.VOID_DATASET); - - // VOID - header.insert(baseUri, HDTVocabulary.VOID_TRIPLES, triples.getNumberOfElements()); - header.insert(baseUri, HDTVocabulary.VOID_PROPERTIES, dictionary.getNpredicates()); - header.insert(baseUri, HDTVocabulary.VOID_DISTINCT_SUBJECTS, dictionary.getNsubjects()); - header.insert(baseUri, HDTVocabulary.VOID_DISTINCT_OBJECTS, dictionary.getNobjects()); - - // Structure - String formatNode = "_:format"; - String dictNode = "_:dictionary"; - String triplesNode = "_:triples"; - String statisticsNode = "_:statistics"; - String publicationInfoNode = "_:publicationInformation"; - - header.insert(baseUri, HDTVocabulary.HDT_FORMAT_INFORMATION, formatNode); - header.insert(formatNode, HDTVocabulary.HDT_DICTIONARY, dictNode); - header.insert(formatNode, HDTVocabulary.HDT_TRIPLES, triplesNode); - header.insert(baseUri, HDTVocabulary.HDT_STATISTICAL_INFORMATION, statisticsNode); - header.insert(baseUri, HDTVocabulary.HDT_PUBLICATION_INFORMATION, publicationInfoNode); - - dictionary.populateHeader(header, dictNode); - triples.populateHeader(header, triplesNode); - - header.insert(statisticsNode, HDTVocabulary.HDT_SIZE, getDictionary().size()+getTriples().size()); - - // Current time - header.insert(publicationInfoNode, HDTVocabulary.DUBLIN_CORE_ISSUED, StringUtil.formatDate(new Date())); - } - public HDTImpl(HDTOptions spec) { - if (spec == null) { - this.spec = new HDTSpecification(); - } else { - this.spec = spec; - } + super(spec); - createComponents(); + header = HeaderFactory.createHeader(this.spec); + dictionary = DictionaryFactory.createDictionary(this.spec); + triples = TriplesFactory.createTriples(this.spec); 
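// --- Editor's illustrative sketch (not part of this patch) ---
// doGenerateHDTDisk above wraps each phase in the Profiler helper. The pattern, using
// only the calls that appear in this patch (the assumption being that nothing is
// written unless the "profiler" option enabled it via setDisabled):
Profiler profiler = new Profiler("example");
profiler.pushSection("phase 1");
// ... do some work ...
profiler.popSection();
profiler.stop();
profiler.writeProfiling();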
} @Override @@ -198,14 +137,7 @@ public void loadFromHDT(InputStream input, ProgressListener listener) throws IOE header.load(input, ci, iListener); // Set base URI. - try { - IteratorTripleString it = header.search("", HDTVocabulary.RDF_TYPE, HDTVocabulary.HDT_DATASET); - if(it.hasNext()) { - this.baseUri = it.next().getSubject().toString(); - } - } catch (NotFoundException e) { - log.error("Unexpected exception.", e); - } + this.baseUri = header.getBaseURI().toString(); // Load dictionary ci.clear(); @@ -281,14 +213,7 @@ public void mapFromHDT(File f, long offset, ProgressListener listener) throws IO header.load(input, ci, iListener); // Set base URI. - try { - IteratorTripleString it = header.search("", HDTVocabulary.RDF_TYPE, HDTVocabulary.HDT_DATASET); - if(it.hasNext()) { - this.baseUri = it.next().getSubject().toString(); - } - } catch (NotFoundException e) { - log.error("Unexpected exception.", e); - } + this.baseUri = header.getBaseURI().toString(); // Load dictionary ci.clear(); @@ -314,34 +239,6 @@ public void mapFromHDT(File f, long offset, ProgressListener listener) throws IO isClosed=false; } - /* - * (non-Javadoc) - * - * @see hdt.HDT#saveToHDT(java.io.OutputStream) - */ - @Override - public void saveToHDT(OutputStream output, ProgressListener listener) throws IOException { - ControlInfo ci = new ControlInformation(); - IntermediateListener iListener = new IntermediateListener(listener); - - ci.clear(); - ci.setType(ControlInfo.Type.GLOBAL); - ci.setFormat(HDTVocabulary.HDT_CONTAINER); - ci.save(output); - - ci.clear(); - ci.setType(ControlInfo.Type.HEADER); - header.save(output, ci, iListener); - - ci.clear(); - ci.setType(ControlInfo.Type.DICTIONARY); - dictionary.save(output, ci, iListener); - - ci.clear(); - ci.setType(ControlInfo.Type.TRIPLES); - triples.save(output, ci, iListener); - } - /* * (non-Javadoc) * @@ -419,52 +316,16 @@ public long getLastTriplePosition() { } } - /* - * (non-Javadoc) - * - * @see hdt.HDT#getHeader() - */ - @Override - public Header getHeader() { - return header; - } - - /* - * (non-Javadoc) - * - * @see hdt.HDT#getDictionary() - */ - @Override - public Dictionary getDictionary() { - return dictionary; - } - - /* - * (non-Javadoc) - * - * @see hdt.HDT#getTriples() - */ - @Override - public Triples getTriples() { - return triples; - } - - /* (non-Javadoc) - * @see hdt.hdt.HDT#getSize() - */ - @Override - public long size() { - if(isClosed) - return 0; - - return dictionary.size()+triples.size(); - } - public void loadFromParts(HeaderPrivate h, DictionaryPrivate d, TriplesPrivate t) { this.header = h; this.dictionary = d; this.triples = t; - isClosed=false; + isClosed=false; + } + + @Override + public void setBaseUri(String baseUri) { + this.baseUri = baseUri; } public void loadFromModifiableHDT(TempHDT modHdt, ProgressListener listener) { @@ -473,8 +334,8 @@ public void loadFromModifiableHDT(TempHDT modHdt, ProgressListener listener) { modHdt.reorganizeTriples(listener); // Get parts - TempTriples modifiableTriples = (TempTriples) modHdt.getTriples(); - TempDictionary modifiableDictionary = (TempDictionary) modHdt.getDictionary(); + TempTriples modifiableTriples = modHdt.getTriples(); + TempDictionary modifiableDictionary = modHdt.getDictionary(); // Convert triples to final format if(triples.getClass().equals(modifiableTriples.getClass())) { @@ -595,6 +456,7 @@ public String getHDTFileName() { return hdtFileName; } + @Override public boolean isClosed() { return isClosed; } diff --git 
a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/WriteHDTImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/WriteHDTImpl.java new file mode 100644 index 00000000..a1f13170 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/WriteHDTImpl.java @@ -0,0 +1,111 @@ +package org.rdfhdt.hdt.hdt.impl; + +import org.rdfhdt.hdt.dictionary.impl.WriteFourSectionDictionary; +import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.header.HeaderFactory; +import org.rdfhdt.hdt.header.HeaderPrivate; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.triples.IteratorTripleString; +import org.rdfhdt.hdt.triples.impl.WriteBitmapTriples; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.io.IOUtil; + +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; + +/** + * HDT implementation to write on disk the components + * + * @author Antoine Willerval + */ +public class WriteHDTImpl extends HDTBase { + private String baseURI; + private final CloseSuppressPath workingLocation; + private boolean isClosed; + + public WriteHDTImpl(HDTOptions spec, CloseSuppressPath workingLocation, int bufferSize) throws IOException { + super(spec); + this.workingLocation = workingLocation; + workingLocation.mkdirs(); + + dictionary = new WriteFourSectionDictionary(this.spec, workingLocation.resolve("section"), bufferSize); + // we need to have the bitmaps in memory, so we can't bypass the implementation + triples = new WriteBitmapTriples(this.spec, workingLocation.resolve("tripleBitmap"), bufferSize); + // small, can use default implementation + header = HeaderFactory.createHeader(this.spec); + } + + @Override + public void setBaseUri(String baseURI) { + this.baseURI = baseURI; + } + + @Override + public void loadFromHDT(InputStream input, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void loadFromHDT(String fileName, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void mapFromHDT(File f, long offset, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void loadOrCreateIndex(ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void saveToHDT(String fileName, ProgressListener listener) throws IOException { + try (OutputStream out = new BufferedOutputStream(Files.newOutputStream(Path.of(fileName)))) { + saveToHDT(out, listener); + } + } + + @Override + public long size() { + if (isClosed) + return 0; + + return getDictionary().size() + getTriples().size(); + } + + @Override + public String getBaseURI() { + return baseURI; + } + + @Override + public boolean isClosed() { + return isClosed; + } + + @Override + public void close() throws IOException { + if (isClosed()) { + return; + } + isClosed = true; + IOUtil.closeAll( + dictionary, + triples, + workingLocation + ); + } + + @Override + public IteratorTripleString search(CharSequence subject, CharSequence predicate, CharSequence object) { + throw new NotImplementedException(); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressTripleMapper.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressTripleMapper.java new file mode 100644 index 
00000000..d8c4506d --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressTripleMapper.java @@ -0,0 +1,134 @@ +package org.rdfhdt.hdt.hdt.impl.diskimport; + +import org.rdfhdt.hdt.compact.sequence.SequenceLog64BigDisk; +import org.rdfhdt.hdt.dictionary.impl.CompressFourSectionDictionary; +import org.rdfhdt.hdt.util.BitUtil; +import org.rdfhdt.hdt.util.disk.LongArray; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.io.compress.CompressUtil; +import org.rdfhdt.hdt.util.io.compress.WriteLongArrayBuffer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; + +/** + * Map a compress triple file to long array map files + * + * @author Antoine Willerval + */ +public class CompressTripleMapper implements CompressFourSectionDictionary.NodeConsumer { + private static final Logger log = LoggerFactory.getLogger(CompressTripleMapper.class); + private final WriteLongArrayBuffer subjects; + private final WriteLongArrayBuffer predicates; + private final WriteLongArrayBuffer objects; + private final CloseSuppressPath locationSubjects; + private final CloseSuppressPath locationPredicates; + private final CloseSuppressPath locationObjects; + private long shared = -1; + + public CompressTripleMapper(CloseSuppressPath location, long tripleCount, long chunkSize) { + locationSubjects = location.resolve("map_subjects"); + locationPredicates = location.resolve("map_predicates"); + locationObjects = location.resolve("map_objects"); + int numbits = BitUtil.log2(tripleCount + 2) + CompressUtil.INDEX_SHIFT; + int maxElement = (int) Math.min(chunkSize / Long.BYTES / 3, Integer.MAX_VALUE - 5); + subjects = + new WriteLongArrayBuffer( + new SequenceLog64BigDisk(locationSubjects.toAbsolutePath().toString(), numbits, tripleCount + 2, true), + tripleCount, maxElement); + predicates = + new WriteLongArrayBuffer(new SequenceLog64BigDisk(locationPredicates.toAbsolutePath().toString(), numbits, tripleCount + 2, true), + tripleCount, maxElement); + objects = + new WriteLongArrayBuffer(new SequenceLog64BigDisk(locationObjects.toAbsolutePath().toString(), numbits, tripleCount + 2, true), + tripleCount, maxElement); + } + + /** + * delete the map files and the location files + */ + public void delete() { + try { + IOUtil.closeAll(subjects, predicates, objects); + } catch (IOException e) { + log.warn("Can't close triple map array", e); + } + try { + IOUtil.closeAll(locationSubjects, locationPredicates, locationObjects); + } catch (IOException e) { + log.warn("Can't delete triple map array files", e); + } + } + + @Override + public void onSubject(long preMapId, long newMapId) { + assert preMapId > 0; + assert newMapId >= CompressUtil.getHeaderId(1); + subjects.set(preMapId, newMapId); + } + + @Override + public void onPredicate(long preMapId, long newMapId) { + assert preMapId > 0; + assert newMapId >= CompressUtil.getHeaderId(1); + predicates.set(preMapId, newMapId); + } + + @Override + public void onObject(long preMapId, long newMapId) { + assert preMapId > 0; + assert newMapId >= CompressUtil.getHeaderId(1); + objects.set(preMapId, newMapId); + } + + public void setShared(long shared) { + this.shared = shared; + subjects.free(); + predicates.free(); + objects.free(); + } + + private void checkShared() { + if (this.shared < 0) { + throw new IllegalArgumentException("Shared not set!"); + } + } + + /** + * extract the map id of a subject + * + * @param id id + * @return new id + */ + public 
long extractSubject(long id) { + return extract(subjects, id); + } + + /** + * extract the map id of a predicate + * + * @param id id + * @return new id + */ + public long extractPredicate(long id) { + return extract(predicates, id) - shared; + } + + /** + * extract the map id of a object + * + * @param id id + * @return new id + */ + public long extractObjects(long id) { + return extract(objects, id); + } + + private long extract(LongArray array, long id) { + checkShared(); + // compute shared if required + return CompressUtil.computeSharedNode(array.get(id), shared); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResult.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResult.java new file mode 100644 index 00000000..cd47c64f --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResult.java @@ -0,0 +1,64 @@ +package org.rdfhdt.hdt.hdt.impl.diskimport; + +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.options.HDTOptionsKeys; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; + +import java.io.Closeable; +import java.io.IOException; + +/** + * Result for the {@link org.rdfhdt.hdt.hdt.impl.diskimport.SectionCompressor} + * @author Antoine Willerval + */ +public interface CompressionResult extends Closeable { + /** + * partial mode for config + * @see org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResultPartial + */ + String COMPRESSION_MODE_PARTIAL = HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_PARTIAL; + /** + * complete mode for config + * @see org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResultFile + */ + String COMPRESSION_MODE_COMPLETE = HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE; + + /** + * @return the number of triple + */ + long getTripleCount(); + /** + * @return a sorted iterator of subject + */ + ExceptionIterator getSubjects(); + /** + * @return a sorted iterator of predicates + */ + ExceptionIterator getPredicates(); + /** + * @return a sorted iterator of objects + */ + ExceptionIterator getObjects(); + /** + * @return the count of subjects + */ + long getSubjectsCount(); + /** + * @return the count of predicates + */ + long getPredicatesCount(); + /** + * @return the count of objects + */ + long getObjectsCount(); + /** + * @return the count of shared + */ + long getSharedCount(); + + /** + * delete data associated with this result + */ + void delete() throws IOException; +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResultFile.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResultFile.java new file mode 100644 index 00000000..29c99471 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResultFile.java @@ -0,0 +1,82 @@ +package org.rdfhdt.hdt.hdt.impl.diskimport; + +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.io.compress.CompressNodeReader; + +import java.io.IOException; + +/** + * Implementation of {@link org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult} for full file reading + * @author Antoine Willerval + */ +public class CompressionResultFile implements CompressionResult { + private final long tripleCount; + private final CompressNodeReader subjects; + private final CompressNodeReader predicates; + private final 
CompressNodeReader objects; + private final SectionCompressor.TripleFile sections; + + public CompressionResultFile(long tripleCount, SectionCompressor.TripleFile sections) throws IOException { + this.tripleCount = tripleCount; + this.subjects = new CompressNodeReader(sections.openRSubject()); + this.predicates = new CompressNodeReader(sections.openRPredicate()); + this.objects = new CompressNodeReader(sections.openRObject()); + this.sections = sections; + } + + @Override + public long getTripleCount() { + return tripleCount; + } + + @Override + public ExceptionIterator getSubjects() { + return subjects; + } + + @Override + public ExceptionIterator getPredicates() { + return predicates; + } + + @Override + public ExceptionIterator getObjects() { + return objects; + } + + @Override + public void delete() throws IOException { + sections.close(); + } + + @Override + public long getSubjectsCount() { + return subjects.getSize(); + } + + @Override + public long getPredicatesCount() { + return predicates.getSize(); + } + + @Override + public long getObjectsCount() { + return objects.getSize(); + } + + @Override + public long getSharedCount() { + return tripleCount; + } + + @Override + public void close() throws IOException { + IOUtil.closeAll( + objects, + predicates, + subjects + ); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResultPartial.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResultPartial.java new file mode 100644 index 00000000..ba72198a --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/CompressionResultPartial.java @@ -0,0 +1,135 @@ +package org.rdfhdt.hdt.hdt.impl.diskimport; + +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.io.compress.CompressNodeMergeIterator; +import org.rdfhdt.hdt.util.io.compress.CompressNodeReader; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.function.Function; + +/** + * Implementation of {@link org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult} for partial file reading + * + * @author Antoine Willerval + */ +public class CompressionResultPartial implements CompressionResult { + private final List files; + private final long triplesCount; + private final ExceptionIterator subject; + private final ExceptionIterator predicate; + private final ExceptionIterator object; + + public CompressionResultPartial(List files, long triplesCount) throws IOException { + this.files = new ArrayList<>(files.size()); + for (SectionCompressor.TripleFile file : files) { + this.files.add(new CompressNodeReaderTriple(file)); + } + this.triplesCount = triplesCount; + + // building iterator trees + this.subject = createBTree(0, files.size(), CompressNodeReaderTriple::getS); + this.predicate = createBTree(0, files.size(), CompressNodeReaderTriple::getP); + this.object = createBTree(0, files.size(), CompressNodeReaderTriple::getO); + } + + private ExceptionIterator createBTree(int start, int end, Function fetcher) { + int size = end - start; + if (size <= 0) { + return ExceptionIterator.empty(); + } + if (size == 1) { + return fetcher.apply(files.get(start)); + } + int mid = (start + end) / 2; + ExceptionIterator left = createBTree(start, mid, fetcher); + ExceptionIterator right = createBTree(mid, end, fetcher); + return new CompressNodeMergeIterator(left, right); + 
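// --- Editor's illustrative sketch (not part of this patch) ---
// createBTree above pairs the per-chunk readers into a balanced binary tree of
// CompressNodeMergeIterators, so n pre-sorted chunk files are consumed as one globally
// sorted stream. A sketch merging the subject streams of two chunk files, reusing only
// types from this patch (fileA and fileB are hypothetical TripleFile instances):
ExceptionIterator left = new CompressNodeReader(fileA.openRSubject());
ExceptionIterator right = new CompressNodeReader(fileB.openRSubject());
ExceptionIterator merged = new CompressNodeMergeIterator(left, right);
while (merged.hasNext()) {
    IndexedNode node = merged.next(); // nodes arrive in global sorted order
}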
} + + @Override + public long getTripleCount() { + return triplesCount; + } + + @Override + public ExceptionIterator getSubjects() { + return subject; + } + + @Override + public ExceptionIterator getPredicates() { + return predicate; + } + + @Override + public ExceptionIterator getObjects() { + return object; + } + + @Override + public void delete() throws IOException { + IOUtil.closeAll(files); + } + + @Override + public void close() throws IOException { + IOUtil.closeAll(files); + } + + /* + * use the count of triples because we don't know the number of subjects + */ + @Override + public long getSubjectsCount() { + return triplesCount; + } + + @Override + public long getPredicatesCount() { + return triplesCount; + } + + @Override + public long getObjectsCount() { + return triplesCount; + } + + @Override + public long getSharedCount() { + return triplesCount; + } + + private static class CompressNodeReaderTriple implements Closeable { + final CompressNodeReader s, p, o; + final SectionCompressor.TripleFile file; + + public CompressNodeReaderTriple(SectionCompressor.TripleFile file) throws IOException { + this.s = new CompressNodeReader(file.openRSubject()); + this.p = new CompressNodeReader(file.openRPredicate()); + this.o = new CompressNodeReader(file.openRObject()); + this.file = file; + } + + @Override + public void close() throws IOException { + IOUtil.closeAll(s, p, o); + } + + public CompressNodeReader getS() { + return s; + } + + public CompressNodeReader getP() { + return p; + } + + public CompressNodeReader getO() { + return o; + } + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/SectionCompressor.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/SectionCompressor.java new file mode 100644 index 00000000..da716080 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/SectionCompressor.java @@ -0,0 +1,485 @@ +package org.rdfhdt.hdt.hdt.impl.diskimport; + +import org.rdfhdt.hdt.iterator.utils.FileTripleIterator; +import org.rdfhdt.hdt.listener.MultiThreadListener; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.triples.TripleString; +import org.rdfhdt.hdt.util.ParallelSortableArrayList; +import org.rdfhdt.hdt.util.concurrent.ExceptionFunction; +import org.rdfhdt.hdt.util.concurrent.ExceptionSupplier; +import org.rdfhdt.hdt.util.concurrent.ExceptionThread; +import org.rdfhdt.hdt.util.concurrent.TreeWorker; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.io.compress.CompressNodeMergeIterator; +import org.rdfhdt.hdt.util.io.compress.CompressNodeReader; +import org.rdfhdt.hdt.util.io.compress.CompressUtil; +import org.rdfhdt.hdt.util.listener.IntermediateListener; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Closeable; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Function; + +/** + * Tree worker object to compress the section of a triple stream into 3 sections (SPO) and a compress triple file + * + * @author Antoine Willerval + */ +public class SectionCompressor implements TreeWorker.TreeWorkerObject { + private static final AtomicInteger ID_INC = new AtomicInteger(); + private static final Logger log = LoggerFactory.getLogger(SectionCompressor.class); + + private final CloseSuppressPath baseFileName; + private 
final FileTripleIterator source; + private boolean done; + private final MultiThreadListener listener; + private long triples = 0; + private final int bufferSize; + private final IdFetcher subjectIdFetcher = new IdFetcher(); + private final IdFetcher predicateIdFetcher = new IdFetcher(); + private final IdFetcher objectIdFetcher = new IdFetcher(); + + public SectionCompressor(CloseSuppressPath baseFileName, FileTripleIterator source, MultiThreadListener listener, int bufferSize) { + this.source = source; + this.listener = listener; + this.baseFileName = baseFileName; + this.bufferSize = bufferSize; + } + + /** + * @return the next file to merge + */ + @Override + public SectionCompressor.BufferedSection get() { + if (done || !source.hasNewFile()) { + done = true; + return null; + } + + listener.notifyProgress(0, "start reading triples"); + + BufferedSection buffer = new BufferedSection(); + ParallelSortableArrayList subjects = buffer.subjects; + ParallelSortableArrayList predicates = buffer.predicates; + ParallelSortableArrayList objects = buffer.objects; + + listener.notifyProgress(10, "reading triples " + triples); + while (source.hasNext()) { + // too much ram allowed? + if (subjects.size() == Integer.MAX_VALUE - 5) { + source.forceNewFile(); + continue; + } + TripleString next = source.next(); + + // get indexed mapped char sequence + IndexedNode subjectNode = new IndexedNode( + convertSubject(next.getSubject()), + subjectIdFetcher.getNodeId() + ); + subjects.add(subjectNode); + + // get indexed mapped char sequence + IndexedNode predicateNode = new IndexedNode( + convertPredicate(next.getPredicate()), + predicateIdFetcher.getNodeId() + ); + predicates.add(predicateNode); + + // get indexed mapped char sequence + IndexedNode objectNode = new IndexedNode( + convertObject(next.getObject()), + objectIdFetcher.getNodeId() + ); + objects.add(objectNode); + + // load the map triple and write it in the writer + triples++; + + if (triples % 100_000 == 0) { + listener.notifyProgress(10, "reading triples " + triples); + } + } + + return buffer; + } + + @Override + public TripleFile map(BufferedSection buffer) { + ParallelSortableArrayList subjects = buffer.subjects; + ParallelSortableArrayList predicates = buffer.predicates; + ParallelSortableArrayList objects = buffer.objects; + try { + int fid = ID_INC.incrementAndGet(); + TripleFile sections = new TripleFile(baseFileName.resolve("section" + fid + ".raw")); + try { + IntermediateListener il = new IntermediateListener(listener); + il.setRange(70, 80); + il.setPrefix("creating subjects section " + sections.root.getFileName() + ": "); + il.notifyProgress(0, "sorting"); + try (OutputStream stream = sections.openWSubject()) { + subjects.parallelSort(IndexedNode::compareTo); + CompressUtil.writeCompressedSection(subjects, stream, il); + } + il.setRange(80, 90); + il.setPrefix("creating predicates section " + sections.root.getFileName() + ": "); + il.notifyProgress(0, "sorting"); + try (OutputStream stream = sections.openWPredicate()) { + predicates.parallelSort(IndexedNode::compareTo); + CompressUtil.writeCompressedSection(predicates, stream, il); + } + il.setRange(90, 100); + il.setPrefix("creating objects section " + sections.root.getFileName() + ": "); + il.notifyProgress(0, "sorting"); + try (OutputStream stream = sections.openWObject()) { + objects.parallelSort(IndexedNode::compareTo); + CompressUtil.writeCompressedSection(objects, stream, il); + } + } finally { + subjects.clear(); + predicates.clear(); + objects.clear(); + 
listener.notifyProgress(100, "section completed" + sections.root.getFileName().toString()); + } + return sections; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Override + public SectionCompressor.TripleFile construct(SectionCompressor.TripleFile[] triples, int count) { + int fid = ID_INC.incrementAndGet(); + TripleFile sections; + try { + sections = new TripleFile(baseFileName.resolve("section" + fid + ".raw")); + sections.compute(triples, count, false); + listener.notifyProgress(100, "sections merged " + sections.root.getFileName()); + } catch (IOException | InterruptedException e) { + throw new RuntimeException(e); + } + // delete old sections + for (int i = 0; i < count; i++) { + delete(triples[i]); + } + return sections; + } + + /** + * delete the file f if it exists or warn + * + * @param f the file to delete + */ + @Override + public void delete(SectionCompressor.TripleFile f) { + f.close(); + } + + /* + * FIXME: create a factory and override these methods with the hdt spec + */ + + /** + * mapping method for the subject of the triple, this method should copy the sequence! + * + * @param seq the subject (before) + * @return the subject mapped + */ + protected CharSequence convertSubject(CharSequence seq) { + return seq.toString(); + } + + /** + * mapping method for the predicate of the triple, this method should copy the sequence! + * + * @param seq the predicate (before) + * @return the predicate mapped + */ + protected CharSequence convertPredicate(CharSequence seq) { + return seq.toString(); + } + + /** + * mapping method for the object of the triple, this method should copy the sequence! + * + * @param seq the object (before) + * @return the object mapped + */ + protected CharSequence convertObject(CharSequence seq) { + return seq.toString(); + } + + /** + * Compress the stream into complete pre-sections files + * + * @param workers the number of workers + * @param nodePerMerge the number of node layer per merge + * @return compression result + * @throws IOException io exception + * @throws InterruptedException if the thread is interrupted + * @throws TreeWorker.TreeWorkerException exception with the tree working + * @see #compressPartial() + * @see #compress(int, int, String) + */ + public CompressionResult compressToFile(int workers, int nodePerMerge) throws IOException, InterruptedException, TreeWorker.TreeWorkerException { + // force to create the first file + TreeWorker treeWorker = new TreeWorker<>(this, TripleFile[]::new, workers, nodePerMerge); + treeWorker.setListener(listener); + treeWorker.start(); + // wait for the workers to merge the sections and create the triples + TripleFile sections = treeWorker.waitToComplete(); + return new CompressionResultFile(triples, sections); + } + + /** + * Compress the stream into multiple pre-sections files and merge them on the fly + * + * @return compression result + * @throws IOException io exception + * @see #compressToFile(int, int) + * @see #compress(int, int, String) + */ + public CompressionResult compressPartial() throws IOException { + BufferedSection section; + List files = new ArrayList<>(); + try { + while ((section = get()) != null) { + files.add(map(section)); + } + } catch (RuntimeException e) { + IOUtil.closeAll(files); + throw e; + } + return new CompressionResultPartial(files, triples); + } + + /** + * compress the sections/triples with a particular mode + * + * @param workers the worker required + * @param mode the mode to compress, can be {@link 
org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult#COMPRESSION_MODE_COMPLETE} (default), {@link org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult#COMPRESSION_MODE_PARTIAL} or null/"" for default + * @param nodePerMerge the number of node layer per merge + * @return the compression result + * @throws TreeWorker.TreeWorkerException tree working exception + * @throws IOException io exception + * @throws InterruptedException thread interruption + * @see #compressToFile(int, int) + * @see #compressPartial() + */ + public CompressionResult compress(int workers, int nodePerMerge, String mode) throws TreeWorker.TreeWorkerException, IOException, InterruptedException { + if (mode == null) { + mode = ""; + } + switch (mode) { + case "": + case CompressionResult.COMPRESSION_MODE_COMPLETE: + return compressToFile(workers, nodePerMerge); + case CompressionResult.COMPRESSION_MODE_PARTIAL: + return compressPartial(); + default: + throw new IllegalArgumentException("Unknown compression mode: " + mode); + } + } + + /** + * A triple directory, contains 3 files, subject, predicate and object + * + * @author Antoine Willerval + */ + public class TripleFile implements Closeable { + private final CloseSuppressPath root; + private final CloseSuppressPath s; + private final CloseSuppressPath p; + private final CloseSuppressPath o; + + private TripleFile(CloseSuppressPath root) throws IOException { + this.root = root; + this.s = root.resolve("subject"); + this.p = root.resolve("predicate"); + this.o = root.resolve("object"); + + root.closeWithDeleteRecurse(); + root.mkdirs(); + } + + /** + * delete the directory + */ + @Override + public void close() { + try { + root.close(); + } catch (IOException e) { + log.warn("Can't delete sections {}: {}", root, e); + } + } + + /** + * @return open a write stream to the subject file + * @throws IOException can't open the stream + */ + public OutputStream openWSubject() throws IOException { + return s.openOutputStream(bufferSize); + } + + /** + * @return open a write stream to the predicate file + * @throws IOException can't open the stream + */ + public OutputStream openWPredicate() throws IOException { + return p.openOutputStream(bufferSize); + } + + /** + * @return open a write stream to the object file + * @throws IOException can't open the stream + */ + public OutputStream openWObject() throws IOException { + return o.openOutputStream(bufferSize); + } + + /** + * @return open a read stream to the subject file + * @throws IOException can't open the stream + */ + public InputStream openRSubject() throws IOException { + return s.openInputStream(bufferSize); + } + + /** + * @return open a read stream to the predicate file + * @throws IOException can't open the stream + */ + public InputStream openRPredicate() throws IOException { + return p.openInputStream(bufferSize); + } + + /** + * @return open a read stream to the object file + * @throws IOException can't open the stream + */ + public InputStream openRObject() throws IOException { + return o.openInputStream(bufferSize); + } + + /** + * @return the path to the subject file + */ + public CloseSuppressPath getSubjectPath() { + return s; + } + + /** + * @return the path to the predicate file + */ + public CloseSuppressPath getPredicatePath() { + return p; + } + + /** + * @return the path to the object file + */ + public CloseSuppressPath getObjectPath() { + return o; + } + + /** + * compute this triple file from multiple triples files + * + * @param triples triples files container + * @param count length of the container 
(index start at 0) + * @param async if the method should load all the files asynchronously or not + * @throws IOException io exception while reading/writing + * @throws InterruptedException interruption while waiting for the async thread + */ + public void compute(SectionCompressor.TripleFile[] triples, int count, boolean async) throws IOException, InterruptedException { + if (!async) { + computeSubject(triples, count, false); + computePredicate(triples, count, false); + computeObject(triples, count, false); + } else { + ExceptionThread.async("SectionMerger" + root.getFileName(), + () -> computeSubject(triples, count, true), + () -> computePredicate(triples, count, true), + () -> computeObject(triples, count, true) + ).joinAndCrashIfRequired(); + } + } + + private void computeSubject(SectionCompressor.TripleFile[] triples, int count, boolean async) throws IOException { + computeSection(triples, count, "subject", 0, 33, this::openWSubject, TripleFile::openRSubject, TripleFile::getSubjectPath, async); + } + + private void computePredicate(SectionCompressor.TripleFile[] triples, int count, boolean async) throws IOException { + computeSection(triples, count, "predicate", 33, 66, this::openWPredicate, TripleFile::openRPredicate, TripleFile::getPredicatePath, async); + } + + private void computeObject(SectionCompressor.TripleFile[] triples, int count, boolean async) throws IOException { + computeSection(triples, count, "object", 66, 100, this::openWObject, TripleFile::openRObject, TripleFile::getObjectPath, async); + } + + private void computeSection(SectionCompressor.TripleFile[] triples, int count, String section, int start, int end, ExceptionSupplier openW, ExceptionFunction openR, Function fileDelete, boolean async) throws IOException { + IntermediateListener il = new IntermediateListener(listener); + if (async) { + listener.registerThread(Thread.currentThread().getName()); + } else { + il.setRange(start, end); + } + il.setPrefix("merging " + section + " section " + root.getFileName() + ": "); + il.notifyProgress(0, "merging section"); + CompressNodeReader[] readers = new CompressNodeReader[count]; + Closeable[] fileDeletes = new Closeable[count]; + try { + long size = 0L; + for (int i = 0; i < count; i++) { + CompressNodeReader reader = new CompressNodeReader(openR.apply(triples[i])); + size += reader.getSize(); + readers[i] = reader; + fileDeletes[i] = fileDelete.apply(triples[i]); + } + + // section + try (OutputStream output = openW.get()) { + CompressUtil.writeCompressedSection(CompressNodeMergeIterator.buildOfTree(readers), size, output, il); + } + } finally { + if (async) { + listener.unregisterThread(Thread.currentThread().getName()); + } + try { + IOUtil.closeAll(readers); + } finally { + IOUtil.closeAll(fileDeletes); + } + } + } + } + + private static class IdFetcher { + private long id = 0; + + public long getNodeId() { + return ++id; + } + + public long getCount() { + return id; + } + } + + public static class BufferedSection { + private final ParallelSortableArrayList subjects = new ParallelSortableArrayList<>(IndexedNode[].class); + private final ParallelSortableArrayList predicates = new ParallelSortableArrayList<>(IndexedNode[].class); + private final ParallelSortableArrayList objects = new ParallelSortableArrayList<>(IndexedNode[].class); + + private BufferedSection() { + } + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResult.java 
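As an aside, not part of the patch: a minimal sketch of how the SectionCompressor above could be driven. The helper name compressSections, its parameter list and the nodePerMerge value of 4 are hypothetical; only the SectionCompressor constructor and the compress(workers, nodePerMerge, mode) call follow the signatures introduced above.

import org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult;
import org.rdfhdt.hdt.hdt.impl.diskimport.SectionCompressor;
import org.rdfhdt.hdt.iterator.utils.FileTripleIterator;
import org.rdfhdt.hdt.listener.MultiThreadListener;
import org.rdfhdt.hdt.triples.TripleString;
import org.rdfhdt.hdt.util.concurrent.TreeWorker;
import org.rdfhdt.hdt.util.io.CloseSuppressPath;

import java.io.IOException;
import java.util.Iterator;

public class SectionCompressorSketch {
    public static CompressionResult compressSections(Iterator<TripleString> triples, CloseSuppressPath workDir,
            MultiThreadListener listener, long chunkSize, int bufferSize, int workers)
            throws IOException, InterruptedException, TreeWorker.TreeWorkerException {
        // split the parsed triple stream into chunks of roughly chunkSize bytes each
        FileTripleIterator chunked = new FileTripleIterator(triples, chunkSize);
        SectionCompressor compressor = new SectionCompressor(workDir, chunked, listener, bufferSize);
        // "" selects the default COMPRESSION_MODE_COMPLETE (merge the sorted section files with the TreeWorker),
        // COMPRESSION_MODE_PARTIAL would merge them on the fly while reading; 4 is an arbitrary nodePerMerge value
        return compressor.compress(workers, 4, "");
    }
}

The CompressionResult then exposes the triple count and the sorted subject/predicate/object streams, matching the accessors (getTripleCount(), getSubjects(), getPredicates(), getObjects()) seen earlier in this diff.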
b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResult.java new file mode 100644 index 00000000..901d2617 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResult.java @@ -0,0 +1,20 @@ +package org.rdfhdt.hdt.hdt.impl.diskimport; + +import org.rdfhdt.hdt.triples.TempTriples; + +import java.io.Closeable; + +/** + * Result for the {@link org.rdfhdt.hdt.util.io.compress.MapCompressTripleMerger} + * @author Antoine Willerval + */ +public interface TripleCompressionResult extends Closeable { + /** + * @return the sorted triples + */ + TempTriples getTriples(); + /** + * @return the number of triples + */ + long getTripleCount(); +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResultFile.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResultFile.java new file mode 100644 index 00000000..cb51e55d --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResultFile.java @@ -0,0 +1,44 @@ +package org.rdfhdt.hdt.hdt.impl.diskimport; + +import org.rdfhdt.hdt.enums.TripleComponentOrder; +import org.rdfhdt.hdt.triples.TempTriples; +import org.rdfhdt.hdt.triples.impl.OneReadTempTriples; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.io.compress.CompressTripleReader; + +import java.io.IOException; + +/** + * Implementation of {@link org.rdfhdt.hdt.hdt.impl.diskimport.TripleCompressionResult} for full file reading + * + * @author Antoine Willerval + */ +public class TripleCompressionResultFile implements TripleCompressionResult { + private final long tripleCount; + private final CompressTripleReader reader; + private final TripleComponentOrder order; + private final CloseSuppressPath triples; + + public TripleCompressionResultFile(long tripleCount, CloseSuppressPath triples, TripleComponentOrder order, int bufferSize) throws IOException { + this.tripleCount = tripleCount; + this.reader = new CompressTripleReader(triples.openInputStream(bufferSize)); + this.order = order; + this.triples = triples; + } + + @Override + public TempTriples getTriples() { + return new OneReadTempTriples(reader.asIterator(), order, tripleCount); + } + + @Override + public long getTripleCount() { + return tripleCount; + } + + @Override + public void close() throws IOException { + IOUtil.closeAll(reader, triples); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResultPartial.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResultPartial.java new file mode 100644 index 00000000..ebb777a0 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/diskimport/TripleCompressionResultPartial.java @@ -0,0 +1,64 @@ +package org.rdfhdt.hdt.hdt.impl.diskimport; + +import org.rdfhdt.hdt.enums.TripleComponentOrder; +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.triples.TempTriples; +import org.rdfhdt.hdt.triples.TripleID; +import org.rdfhdt.hdt.triples.impl.OneReadTempTriples; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.io.compress.CompressTripleMergeIterator; +import org.rdfhdt.hdt.util.io.compress.CompressTripleReader; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * Implementation of {@link
org.rdfhdt.hdt.hdt.impl.diskimport.TripleCompressionResult} for partial file reading + * @author Antoine Willerval + */ +public class TripleCompressionResultPartial implements TripleCompressionResult { + private final List files; + private final TempTriples triples; + private final long tripleCount; + private final TripleComponentOrder order; + + public TripleCompressionResultPartial(List files, long tripleCount, TripleComponentOrder order, int bufferSize) throws IOException { + this.files = new ArrayList<>(files.size()); + this.tripleCount = tripleCount; + this.order = order; + this.triples = new OneReadTempTriples(createBTree(files, 0, files.size(), bufferSize).asIterator(), order, tripleCount); + } + + private ExceptionIterator createBTree(List files, int start, int end, int bufferSize) throws IOException { + int size = end - start; + if (size <= 0) { + return ExceptionIterator.empty(); + } + if (size == 1) { + CompressTripleReader r = new CompressTripleReader(files.get(start).openInputStream(bufferSize)); + this.files.add(r); + return r; + } + int mid = (start + end) / 2; + ExceptionIterator left = createBTree(files, start, mid, bufferSize); + ExceptionIterator right = createBTree(files, mid, end, bufferSize); + return new CompressTripleMergeIterator(left, right, order); + } + + @Override + public TempTriples getTriples() { + return triples; + } + + @Override + public long getTripleCount() { + return tripleCount; + } + + @Override + public void close() throws IOException { + IOUtil.closeAll(files); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/header/PlainHeader.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/header/PlainHeader.java index 7ea92113..2444a489 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/header/PlainHeader.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/header/PlainHeader.java @@ -75,12 +75,14 @@ public PlainHeader(HDTOptions spec) { */ @Override public void insert(CharSequence subject, CharSequence predicate, CharSequence object) { + TripleString tripleString; String objStr = object.toString(); if(objStr.charAt(0)=='<'|| objStr.charAt(0)=='"' || objStr.startsWith("http://")||objStr.startsWith("file://")) { - triples.add(new TripleString(HeaderUtil.cleanURI(subject), HeaderUtil.cleanURI(predicate), object)); + tripleString = new TripleString(HeaderUtil.cleanURI(subject), HeaderUtil.cleanURI(predicate), HeaderUtil.cleanURI(object)); } else { - triples.add(new TripleString(HeaderUtil.cleanURI(subject), HeaderUtil.cleanURI(predicate), '"'+objStr+'"')); + tripleString = new TripleString(HeaderUtil.cleanURI(subject), HeaderUtil.cleanURI(predicate), '"'+objStr+'"'); } + triples.add(tripleString); } /* (non-Javadoc) @@ -152,7 +154,13 @@ public int getNumberOfElements() { */ @Override public IteratorTripleString search(CharSequence subject, CharSequence predicate, CharSequence object) { - TripleString pattern = new TripleString(subject.toString(), predicate.toString(), object.toString()); + TripleString pattern; + String objStr = object.toString(); + if(objStr.isEmpty() || objStr.charAt(0)=='<'|| objStr.charAt(0)=='"' || objStr.startsWith("http://")||objStr.startsWith("file://")) { + pattern = new TripleString(HeaderUtil.cleanURI(subject), HeaderUtil.cleanURI(predicate), HeaderUtil.cleanURI(object)); + } else { + pattern = new TripleString(HeaderUtil.cleanURI(subject), HeaderUtil.cleanURI(predicate), '"'+objStr+'"'); + } return new PlainHeaderIterator(this, pattern); } @@ -164,13 +172,7 @@ public void processTriple(TripleString triple, long 
pos) { @Override public void remove(CharSequence subject, CharSequence predicate, CharSequence object) { TripleString pattern = new TripleString(subject.toString(), predicate.toString(), object.toString()); - Iterator iter = triples.iterator(); - while(iter.hasNext()) { - TripleString next = iter.next(); - if(next.match(pattern)) { - iter.remove(); - } - } + triples.removeIf(next -> next.match(pattern)); } @Override diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/ExceptionIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/ExceptionIterator.java new file mode 100644 index 00000000..bf59bdfb --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/ExceptionIterator.java @@ -0,0 +1,168 @@ +package org.rdfhdt.hdt.iterator.utils; + +import java.util.Iterator; +import java.util.Objects; +import java.util.function.Consumer; + +/** + * alternative iterator with exception throwing + * @param the iterator type + * @param the allowed exception + * @author Antoine Willerval + */ +public interface ExceptionIterator { + @FunctionalInterface + interface ExceptionConsumer { + void consume(T element) throws E; + } + /** + * create an exception iterator from a basic iterator + * @param it the iterator the wrap + * @param the iterator type + * @param the exception to allow + * @return exception iterator + */ + static ExceptionIterator of(final Iterator it) { + return new ExceptionIterator<>() { + @Override + public boolean hasNext() { + return it.hasNext(); + } + + @Override + public T next() { + return it.next(); + } + + @Override + public void remove() { + it.remove(); + } + }; + } /** + * create an empty iterator + * @param the iterator type + * @param the exception to allow + * @return exception iterator + */ + static ExceptionIterator empty() { + return of(new Iterator<>() { + @Override + public boolean hasNext() { + return false; + } + + @Override + public T next() { + return null; + } + }); + } + + + /** + * @return if the iterator has a next element + * @throws E exception triggered by the implementation + */ + boolean hasNext() throws E; + + /** + * @return the next iterator element + * @throws E exception triggered by the implementation + */ + T next() throws E; + + /** + * remove the last element returned by the iterator + * @throws E exception triggered by the implementation + */ + default void remove() throws E { + throw new UnsupportedOperationException("remove"); + } + + /** + * loop over all the elements + * @param action the action to handle the element + * @throws E exception triggered by the implementation + */ + default void forEachRemaining(ExceptionConsumer action) throws E { + Objects.requireNonNull(action); + while (hasNext()) + action.consume(next()); + } + + /** + * map this iterator with a function + * @param mappingFunc the mapping function + * @param the new iterator type + * @return iterator + */ + default ExceptionIterator map(MapExceptionIterator.ExceptionFunction mappingFunc) { + return new MapExceptionIterator<>(this, mappingFunc); + } + /** + * map this iterator with a function + * @param mappingFunc the mapping function + * @param the new iterator type + * @return iterator + */ + default ExceptionIterator map(MapExceptionIterator.MapWithIdFunction mappingFunc) { + return new MapExceptionIterator<>(this, mappingFunc); + } + + /** + * convert this exception iterator to a base iterator and convert the exception to RuntimeException + * @return iterator + */ + default Iterator asIterator() { + return new 
Iterator<>() { + @Override + public boolean hasNext() { + try { + return ExceptionIterator.this.hasNext(); + } catch (Exception e) { + if (e instanceof RuntimeException) { + throw (RuntimeException) e; + } + throw new RuntimeException(e); + } + } + + @Override + public T next() { + try { + return ExceptionIterator.this.next(); + } catch (Exception e) { + if (e instanceof RuntimeException) { + throw (RuntimeException) e; + } + throw new RuntimeException(e); + } + } + + @Override + public void forEachRemaining(Consumer action) { + try { + ExceptionIterator.this.forEachRemaining(action::accept); + } catch (Exception e) { + if (e instanceof RuntimeException) { + throw (RuntimeException) e; + } + throw new RuntimeException(e); + } + } + + @Override + public void remove() { + try { + ExceptionIterator.this.remove(); + } catch (Exception e) { + if (e instanceof RuntimeException) { + throw (RuntimeException) e; + } + throw new RuntimeException(e); + } + } + }; + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileChunkIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileChunkIterator.java new file mode 100644 index 00000000..5ce8d019 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileChunkIterator.java @@ -0,0 +1,105 @@ +package org.rdfhdt.hdt.iterator.utils; + +import java.util.Iterator; +import java.util.function.Consumer; +import java.util.function.ToLongFunction; + +/** + * Iterator to split an iterator stream into multiple files, the iterator return {@link #hasNext()} == true once the + * first file is returned, then the {@link #hasNewFile()} should be called to check if another file can be created and + * re-allow {@link #hasNext()} to return true + * @author Antoine Willerval + */ +public class FileChunkIterator implements Iterator { + private final ToLongFunction estimationFunction; + private final Iterator it; + private final long maxSize; + private long totalSize = 0L; + private long currentSize = 0L; + private E next; + private boolean stop = false; + + /** + * create a file iterator from a stream and a max size + * @param it the iterator + * @param maxSize the maximum size of each file, this size is estimated, so files can be bigger. 
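Stepping back to the ExceptionIterator interface that closes just above: as an illustration (not part of the patch), a small usage sketch. It assumes the interface and its of()/map()/asIterator() helpers are generic over the element type and the thrown exception type, as their bodies suggest; the class and variable names are hypothetical.

import org.rdfhdt.hdt.iterator.utils.ExceptionIterator;

import java.io.IOException;
import java.util.Iterator;
import java.util.List;

public class ExceptionIteratorSketch {
    public static void main(String[] args) {
        // wrap a plain java.util.Iterator; the declared exception type is only a capability,
        // nothing in this example actually throws an IOException
        ExceptionIterator<String, IOException> it =
                ExceptionIterator.of(List.of("s1", "s2", "s3").iterator());

        // map() transforms the elements and keeps the declared exception type
        ExceptionIterator<Integer, IOException> lengths = it.map(String::length);

        // asIterator() converts back to java.util.Iterator, rethrowing any checked exception as a RuntimeException,
        // the same conversion the patch uses in TripleCompressionResultFile (reader.asIterator())
        Iterator<Integer> plain = lengths.asIterator();
        plain.forEachRemaining(len -> System.out.println("length: " + len));
    }
}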
+ * @param estimationFunction the element estimation function + */ + public FileChunkIterator(Iterator it, long maxSize, ToLongFunction estimationFunction) { + this.it = it; + this.maxSize = maxSize; + this.estimationFunction = estimationFunction; + } + + @Override + public boolean hasNext() { + if (stop) + return false; + + if (next != null) + return true; + + if (it.hasNext()) { + next = it.next(); + long estimation = estimationFunction.applyAsLong(next); + + totalSize += estimation; + + if (currentSize + estimation >= maxSize) { + stop = true; + currentSize = estimation; + return false; + } + + currentSize += estimation; + return true; + } + return false; + } + + @Override + public E next() { + if (!hasNext()) { + return null; + } + E t = next; + next = null; + return t; + } + + @Override + public void remove() { + it.remove(); + } + + @Override + public void forEachRemaining(Consumer action) { + it.forEachRemaining(action); + } + + /** + * force the iterator to create a new file after the next hasNext() + */ + public void forceNewFile() { + long estimation; + if (next != null) { + estimation = estimationFunction.applyAsLong(next); + } else { + estimation = 0; + } + currentSize = estimation; + stop = true; + } + + /** + * @return if we need to open a new file + */ + public boolean hasNewFile() { + stop = false; + return hasNext(); + } + + public long getTotalSize() { + return totalSize; + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileTripleIDIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileTripleIDIterator.java new file mode 100644 index 00000000..22890e3f --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileTripleIDIterator.java @@ -0,0 +1,17 @@ +package org.rdfhdt.hdt.iterator.utils; + +import org.rdfhdt.hdt.triples.TripleID; + +import java.util.Iterator; + +public class FileTripleIDIterator extends FileChunkIterator { + /** + * create a file iterator from a stream and a max size + * + * @param it the iterator + * @param maxSize the maximum size of each file, this size is estimated, so files can be bigger. 
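Similarly, an illustrative sketch (not part of the patch) of the chunking contract that FileChunkIterator above and its FileTripleIDIterator/FileTripleIterator subclasses share: hasNext() stops returning true once the estimated size of the current chunk reaches maxSize, and hasNewFile() re-arms it for the next chunk. The element type parameter E is assumed from the class fields; the names below are hypothetical.

import org.rdfhdt.hdt.iterator.utils.FileChunkIterator;

import java.util.List;

public class FileChunkIteratorSketch {
    public static void main(String[] args) {
        // estimate each element by its string length, start a new "file" every ~10 bytes
        FileChunkIterator<String> it = new FileChunkIterator<>(
                List.of("abcd", "efgh", "ijkl", "mnop", "qrst").iterator(),
                10,
                s -> s.length());

        int chunk = 0;
        do {
            chunk++;
            while (it.hasNext()) {
                System.out.println("chunk " + chunk + ": " + it.next());
            }
        } while (it.hasNewFile()); // re-allows hasNext() for the next chunk, false once the source is exhausted
    }
}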
+ */ + public FileTripleIDIterator(Iterator it, long maxSize) { + super(it, maxSize, tripleID -> 4L * Long.BYTES); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileTripleIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileTripleIterator.java new file mode 100644 index 00000000..30bf6e99 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/FileTripleIterator.java @@ -0,0 +1,37 @@ +package org.rdfhdt.hdt.iterator.utils; + +import org.rdfhdt.hdt.triples.TripleString; +import org.rdfhdt.hdt.util.string.ByteStringUtil; + +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.Iterator; +import java.util.function.Consumer; +import java.util.function.ToIntFunction; + +/** + * Iterator to split an iterator stream into multiple files, the iterator return {@link #hasNext()} == true once the + * first file is returned, then the {@link #hasNewFile()} should be called to check if another file can be created and + * re-allow {@link #hasNext()} to return true + * @author Antoine Willerval + */ +public class FileTripleIterator extends FileChunkIterator { + private static long estimateSize(TripleString tripleString) { + try { + return tripleString.asNtriple().toString().getBytes(ByteStringUtil.STRING_ENCODING).length; + } catch (IOException e) { + throw new RuntimeException("Can't estimate the size of the triple " + tripleString, e); + } + } + + /** + * create a file iterator from a stream and a max size + * + * @param it the iterator + * @param maxSize the maximum size of each file, this size is estimated, so files can be bigger. + */ + public FileTripleIterator(Iterator it, long maxSize) { + super(it, maxSize, FileTripleIterator::estimateSize); + } + +} \ No newline at end of file diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MapExceptionIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MapExceptionIterator.java new file mode 100644 index 00000000..5eef7e71 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MapExceptionIterator.java @@ -0,0 +1,47 @@ +package org.rdfhdt.hdt.iterator.utils; + +/** + * Exception Iterator to map a value to another + * @param origin type + * @param return type + * @param the allowed exception + * @author Antoine Willerval + */ +public class MapExceptionIterator implements ExceptionIterator { + private final MapWithIdFunction mappingFunction; + private final ExceptionIterator base; + private long index; + + public MapExceptionIterator(ExceptionIterator base, ExceptionFunction mappingFunction) { + this(base, (m, i) -> mappingFunction.apply(m)); + } + public MapExceptionIterator(ExceptionIterator base, MapWithIdFunction mappingFunction) { + this.base = base; + this.mappingFunction = mappingFunction; + } + + @Override + public boolean hasNext() throws E { + return base.hasNext(); + } + + @Override + public N next() throws E { + return mappingFunction.apply(base.next(), index++); + } + + @Override + public void remove() throws E{ + base.remove(); + } + + @FunctionalInterface + public interface MapWithIdFunction { + N apply(M element, long index) throws E; + } + + @FunctionalInterface + public interface ExceptionFunction { + N apply(M element) throws E; + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MapIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MapIterator.java new file mode 100644 index 00000000..ca933bc9 --- /dev/null +++ 
b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MapIterator.java @@ -0,0 +1,44 @@ +package org.rdfhdt.hdt.iterator.utils; + +import java.util.Iterator; +import java.util.function.Function; + +/** + * Iterator to map a value to another + * @param origin type + * @param return type + * @author Antoine Willerval + */ +public class MapIterator implements Iterator { + private final MapWithIdFunction mappingFunction; + private final Iterator base; + private long index; + + public MapIterator(Iterator base, Function mappingFunction) { + this(base, (m, i) -> mappingFunction.apply(m)); + } + public MapIterator(Iterator base, MapWithIdFunction mappingFunction) { + this.base = base; + this.mappingFunction = mappingFunction; + } + + @Override + public boolean hasNext() { + return base.hasNext(); + } + + @Override + public N next() { + return mappingFunction.apply(base.next(), index++); + } + + @Override + public void remove() { + base.remove(); + } + + @FunctionalInterface + public interface MapWithIdFunction { + E apply(T element, long index); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MergeExceptionIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MergeExceptionIterator.java new file mode 100644 index 00000000..a24d9e81 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/MergeExceptionIterator.java @@ -0,0 +1,136 @@ +package org.rdfhdt.hdt.iterator.utils; + +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.function.Function; + +public class MergeExceptionIterator implements ExceptionIterator { + + /** + * Create a tree of merge iterators from an array of element + * @param itFunction a function to create an iterator from an element + * @param comp comparator for the merge iterator + * @param array the elements + * @param length the number of elements + * @param input of the element + * @param type of the element in the iterator + * @param exception returned by the iterator + * @return the iterator + */ + public static ExceptionIterator buildOfTree( + Function> itFunction, Comparator comp, I[] array, int length) { + return buildOfTree(itFunction, comp, array, 0, length); + } + + /** + * Create a tree of merge iterators from an array of element + * @param itFunction a function to create an iterator from an element + * @param comp comparator for the merge iterator + * @param array the elements + * @param start the start of the array (inclusive) + * @param end the end of the array (exclusive) + * @param type of the element + * @param exception returned by the iterator + * @return the iterator + */ + public static ExceptionIterator buildOfTree( + Function> itFunction, Comparator comp, I[] array, int start, int end) { + return buildOfTree(itFunction, comp, Arrays.asList(array), start, end); + } + + /** + * Create a tree of merge iterators from an array of element + * @param itFunction a function to create an iterator from an element + * @param comp comparator for the merge iterator + * @param array the elements + * @param start the start of the array (inclusive) + * @param end the end of the array (exclusive) + * @param type of the element + * @param exception returned by the iterator + * @return the iterator + */ + public static ExceptionIterator buildOfTree( + Function> itFunction, Comparator comp, List array, int start, int end) { + int length = end - start; + if (length <= 0) { + return ExceptionIterator.empty(); + } + if (length == 1) { + return 
itFunction.apply(array.get(start)); + } + int mid = (start + end) / 2; + return new MergeExceptionIterator<>( + buildOfTree(itFunction, comp, array, start, mid), + buildOfTree(itFunction, comp, array, mid, end), + comp + ); + } + + private final ExceptionIterator in1, in2; + private final Comparator comp; + private T next; + private T prevE1; + private T prevE2; + + public MergeExceptionIterator(ExceptionIterator in1, ExceptionIterator in2, Comparator comp) { + this.in1 = in1; + this.in2 = in2; + this.comp = comp; + } + + @Override + public boolean hasNext() throws E { + if (next != null) { + return true; + } + + // read next element 1 if required + if (prevE1 == null && in1.hasNext()) { + prevE1 = in1.next(); + } + // read next element 2 if required + if (prevE2 == null && in2.hasNext()) { + prevE2 = in2.next(); + } + + if (prevE1 != null && prevE2 != null) { + // we have an element from both stream, compare them + if (comp.compare(prevE1, prevE2) < 0) { + // element 1 lower, return it + next = prevE1; + prevE1 = null; + } else { + // element 2 lower, return it + next = prevE2; + prevE2 = null; + } + return true; + } + // we have at most one element + if (prevE1 != null) { + // return element 1 + next = prevE1; + prevE1 = null; + return true; + } + if (prevE2 != null) { + // return element 2 + next = prevE2; + prevE2 = null; + return true; + } + // nothing else + return false; + } + + @Override + public T next() throws E { + if (!hasNext()) { + return null; + } + T next = this.next; + this.next = null; + return next; + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/NotificationExceptionIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/NotificationExceptionIterator.java new file mode 100644 index 00000000..dd372185 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/NotificationExceptionIterator.java @@ -0,0 +1,57 @@ +package org.rdfhdt.hdt.iterator.utils; + +import org.rdfhdt.hdt.listener.ProgressListener; + +import java.util.Objects; + +/** + * ExceptionIterator Wrapper to notify a progress + * + * @param iterator type + * @param iterator exception + * @author Antoine WillervalF + */ +public class NotificationExceptionIterator implements ExceptionIterator { + private final ExceptionIterator it; + private final long size; + private final long split; + private final String message; + private final ProgressListener listener; + private long current = 0L; + + public NotificationExceptionIterator(ExceptionIterator it, long size, long split, String message, ProgressListener listener) { + this.it = Objects.requireNonNull(it, "it can't be null!"); + if (size < 0) { + throw new IllegalArgumentException("size can't be negative!"); + } + if (split < 0) { + throw new IllegalArgumentException("split can't be negative! 
" + split); + } + // set size to be at least 1 to allow empty next() error + this.size = Math.max(1, size); + // minimize split by size to avoid dividing by 0 + this.split = Math.min(split, size); + this.message = Objects.requireNonNull(message, "message can't be null!"); + this.listener = Objects.requireNonNullElseGet(listener, () -> (perc, msg) -> { + }); + } + + @Override + public boolean hasNext() throws E { + return it.hasNext(); + } + + @Override + public T next() throws E { + current++; + if (current % (size / split) == 0) { + listener.notifyProgress((float) current / size, message + " " + current + "/" + size); + } + return it.next(); + } + + @Override + public void remove() throws E { + it.remove(); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/PipedCopyIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/PipedCopyIterator.java new file mode 100644 index 00000000..2c279575 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/iterator/utils/PipedCopyIterator.java @@ -0,0 +1,198 @@ +package org.rdfhdt.hdt.iterator.utils; + +import org.rdfhdt.hdt.compact.integer.VByte; +import org.rdfhdt.hdt.util.io.IOUtil; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.PipedInputStream; +import java.io.PipedOutputStream; +import java.nio.charset.StandardCharsets; +import java.util.Iterator; +import java.util.function.Function; + +/** + * a utility class to create an iterator from the value returned by another Thread + * + * @param the iterator type + * @author Antoine Willerval + */ + +public class PipedCopyIterator implements Iterator { + /** + * RuntimeException generated by the PipedCopyIterator + * + * @author Antoine Willerval + */ + public static class PipedIteratorException extends RuntimeException { + public PipedIteratorException(String message, Throwable t) { + super(message, t); + } + } + + + /** + * Callback for the {@link #createOfCallback(PipedCopyIterator.Parser, PipedCopyIterator.PipeCallBack)} method + * + * @param the iterator type + * @author Antoine Willerval + */ + @FunctionalInterface + public interface PipeCallBack { + /** + * method called from the new thread to generate the new data, at the end of the callback, the pipe is closed + * with or without exception + * + * @param pipe the pipe to fill + * @throws Exception any exception returned by the generator + */ + void createPipe(PipedCopyIterator pipe) throws Exception; + } + + /** + * create a piped iterator from a callback runner, the call to the callback should be made in the callbackRunner + * + * @param serializer serializer to pass the data + * @param callbackRunner the callback runner + * @param type of the iterator + * @return the iterator + */ + public static PipedCopyIterator createOfCallback(Parser serializer, PipeCallBack callbackRunner) { + PipedCopyIterator pipe = new PipedCopyIterator<>(serializer); + + Thread thread = new Thread(() -> { + try { + callbackRunner.createPipe(pipe); + pipe.closePipe(); + } catch (Throwable e) { + pipe.closePipe(e); + } + }, "PipeIterator"); + thread.start(); + + return pipe; + } + public interface Parser { + static void writeString(CharSequence s, OutputStream out) throws IOException { + byte[] bytes = s.toString().getBytes(StandardCharsets.UTF_8); + VByte.encode(out, bytes.length); + out.write(bytes); + } + static String readString(InputStream in) throws IOException { + int size = (int) VByte.decode(in); + byte[] bytes = IOUtil.readBuffer(in, size, null); + 
return new String(bytes, StandardCharsets.UTF_8); + } + void write(T t, OutputStream stream) throws IOException; + T read(InputStream stream) throws IOException; + } + + private final PipedInputStream in; + private final PipedOutputStream out; + private final Parser serializer; + private T next; + private boolean end; + private PipedIteratorException exception; + + public PipedCopyIterator(Parser serializer) { + this.serializer = serializer; + try { + in = new PipedInputStream(); + out = new PipedOutputStream(); + in.connect(out); + } catch (IOException e) { + throw new PipedIteratorException("can't connect pipe", e); + } + } + private int readByte() { + try { + return in.read(); + } catch (IOException e) { + throw new PipedIteratorException("Can't read byte", e); + } + } + + @Override + public boolean hasNext() { + if (end) { + return false; + } + if (next != null) { + return true; + } + + int b = readByte(); + if (b == 0) { + end = true; + if (exception != null) { + throw exception; + } + return false; + } + try { + next = serializer.read(in); + } catch (IOException e) { + throw new PipedIteratorException("Can't read pipe", e); + } + return true; + } + + @Override + public T next() { + if (!hasNext()) { + return null; + } + T next = this.next; + this.next = null; + return next; + } + + public void closePipe() { + closePipe(null); + } + public void closePipe(Throwable e) { + if (e != null) { + if (e instanceof PipedIteratorException) { + this.exception = (PipedIteratorException) e; + } else { + this.exception = new PipedIteratorException("closing exception", e); + } + } + try { + // end byte + out.write(0); + } catch (IOException ee) { + throw new PipedIteratorException("Can't close pipe", ee); + } + } + + /** + * map this iterator to another type + * @param mappingFunction the mapping function + * @param the future type + * @return mapped iterator + */ + public Iterator map(Function mappingFunction) { + return new MapIterator<>(this, mappingFunction); + } + /** + * map this iterator to another type + * @param mappingFunction the mapping function + * @param the future type + * @return mapped iterator + */ + public Iterator mapWithId(MapIterator.MapWithIdFunction mappingFunction) { + return new MapIterator<>(this, mappingFunction); + } + + public void addElement(T node) { + try { + // not end byte + out.write(1); + serializer.write(node, out); + } catch (IOException ee) { + throw new PipedIteratorException("Can't add element to pipe", ee); + } + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/RDFParserFactory.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/RDFParserFactory.java index 246396c9..40bde70d 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/RDFParserFactory.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/RDFParserFactory.java @@ -29,6 +29,7 @@ import org.rdfhdt.hdt.enums.RDFNotation; import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.iterator.utils.PipedCopyIterator; import org.rdfhdt.hdt.rdf.parsers.RDFParserDir; import org.rdfhdt.hdt.rdf.parsers.RDFParserHDT; import org.rdfhdt.hdt.rdf.parsers.RDFParserList; @@ -37,6 +38,12 @@ import org.rdfhdt.hdt.rdf.parsers.RDFParserSimple; import org.rdfhdt.hdt.rdf.parsers.RDFParserTar; import org.rdfhdt.hdt.rdf.parsers.RDFParserZip; +import org.rdfhdt.hdt.triples.TripleString; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.Iterator; /** * @author mario.arias @@ -78,4 +85,34 @@ public static RDFParserCallback 
getParserCallback(RDFNotation notation, boolean throw new NotImplementedException("Parser not found for notation: "+notation); } + + /** + * convert a stream to a triple iterator + * @param parser the parser to convert the stream + * @param stream the stream to parse + * @param baseUri the base uri to parse + * @param notation the rdf notation to parse + * @return iterator + */ + public static Iterator readAsIterator(RDFParserCallback parser, InputStream stream, String baseUri, boolean keepBNode, RDFNotation notation) { + return PipedCopyIterator.createOfCallback(TripleStringParser.INSTANCE, pipe -> parser.doParse(stream, baseUri, notation, keepBNode, (triple, pos) -> pipe.addElement(triple))); + } + + private static class TripleStringParser implements PipedCopyIterator.Parser { + private static final TripleStringParser INSTANCE = new TripleStringParser(); + @Override + public void write(TripleString tripleString, OutputStream stream) throws IOException { + PipedCopyIterator.Parser.writeString(tripleString.getSubject(), stream); + PipedCopyIterator.Parser.writeString(tripleString.getPredicate(), stream); + PipedCopyIterator.Parser.writeString(tripleString.getObject(), stream); + } + + @Override + public TripleString read(InputStream stream) throws IOException { + String s = PipedCopyIterator.Parser.readString(stream); + String p = PipedCopyIterator.Parser.readString(stream); + String o = PipedCopyIterator.Parser.readString(stream); + return new TripleString(s, p, o); + } + } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedNode.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedNode.java new file mode 100644 index 00000000..59da86f4 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedNode.java @@ -0,0 +1,39 @@ +package org.rdfhdt.hdt.triples; + +import org.rdfhdt.hdt.util.string.CharSequenceComparator; + +import java.util.Comparator; + +public class IndexedNode implements Comparable { + private static final Comparator NODE_COMPARATOR = CharSequenceComparator.getInstance(); + private CharSequence node; + private long index; + + public IndexedNode(CharSequence node, long index) { + this.node = node; + this.index = index; + } + public IndexedNode() { + } + + public CharSequence getNode() { + return node; + } + + public long getIndex() { + return index; + } + + public void setIndex(long index) { + this.index = index; + } + + public void setNode(CharSequence node) { + this.node = node; + } + + @Override + public int compareTo(IndexedNode o) { + return NODE_COMPARATOR.compare(node, o.getNode()); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedTriple.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedTriple.java new file mode 100644 index 00000000..07927398 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/IndexedTriple.java @@ -0,0 +1,43 @@ +package org.rdfhdt.hdt.triples; + +/** + * A triple of {@link org.rdfhdt.hdt.triples.IndexedNode} + * @author Antoine Willerval + */ +public class IndexedTriple { + private IndexedNode subject; + private IndexedNode predicate; + private IndexedNode object; + + public IndexedTriple() { + } + + public IndexedTriple(IndexedNode subject, IndexedNode predicate, IndexedNode object) { + load(subject, predicate, object); + } + + public IndexedNode getSubject() { + return subject; + } + + public IndexedNode getPredicate() { + return predicate; + } + + public IndexedNode getObject() { + return object; + } + + /** + * load a new s p o inside 
this triple + * @param subject the subject + * @param predicate the predicate + * @param object the object + */ + public void load(IndexedNode subject, IndexedNode predicate, IndexedNode object) { + this.subject = subject; + this.predicate = predicate; + this.object = object; + } + +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/TempTriples.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/TempTriples.java index 94af28ca..cdfd1106 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/TempTriples.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/TempTriples.java @@ -84,11 +84,6 @@ public interface TempTriples extends TriplesPrivate, Closeable { */ void setOrder(TripleComponentOrder order); - /** - * Gets the currently set order(TripleComponentOrder) - */ - TripleComponentOrder getOrder(); - /** * Clear all triples, resulting in an empty triples section. */ diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/TriplesPrivate.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/TriplesPrivate.java index 728a423b..2d570020 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/TriplesPrivate.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/TriplesPrivate.java @@ -5,6 +5,7 @@ import java.io.InputStream; import java.io.OutputStream; +import org.rdfhdt.hdt.enums.TripleComponentOrder; import org.rdfhdt.hdt.iterator.SuppliableIteratorTripleID; import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.ControlInfo; @@ -80,4 +81,9 @@ public interface TriplesPrivate extends Triples { * The TempTriples input to load from */ void load(TempTriples input, ProgressListener listener); + + /** + * Gets the currently set order(TripleComponentOrder) + */ + TripleComponentOrder getOrder(); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriples.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriples.java index 957cdbb7..c8379c76 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriples.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/BitmapTriples.java @@ -839,6 +839,7 @@ public void close() throws IOException { } } + @Override public TripleComponentOrder getOrder() { return this.order; } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/OneReadTempTriples.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/OneReadTempTriples.java new file mode 100644 index 00000000..f477bd8c --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/OneReadTempTriples.java @@ -0,0 +1,243 @@ +package org.rdfhdt.hdt.triples.impl; + +import org.rdfhdt.hdt.dictionary.impl.DictionaryIDMapping; +import org.rdfhdt.hdt.enums.ResultEstimationType; +import org.rdfhdt.hdt.enums.TripleComponentOrder; +import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.header.Header; +import org.rdfhdt.hdt.iterator.SuppliableIteratorTripleID; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.options.ControlInfo; +import org.rdfhdt.hdt.triples.IteratorTripleID; +import org.rdfhdt.hdt.triples.TempTriples; +import org.rdfhdt.hdt.triples.TripleID; +import org.rdfhdt.hdt.triples.Triples; +import org.rdfhdt.hdt.util.io.CountInputStream; +import org.rdfhdt.hdt.util.io.compress.NoDuplicateTripleIDIterator; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.Iterator; + +/** + * {@link 
org.rdfhdt.hdt.triples.TempTriples} only readable once with the {@link #searchAll()} method with a predefined + * order, trying to set another order will lead to an exception, trying to use any other method can lead to a + * {@link org.rdfhdt.hdt.exceptions.NotImplementedException}. + * @author Antoine Willerval + */ +public class OneReadTempTriples implements TempTriples { + private IteratorTripleID iterator; + private TripleComponentOrder order; + + public OneReadTempTriples(Iterator iterator, TripleComponentOrder order, long triples) { + this.iterator = new SimpleIteratorTripleID(iterator, order, triples); + this.order = order; + } + + @Override + public boolean insert(long subject, long predicate, long object) { + throw new NotImplementedException(); + } + + @Override + public boolean insert(TripleID... triples) { + throw new NotImplementedException(); + } + + @Override + public boolean remove(TripleID... pattern) { + throw new NotImplementedException(); + } + + @Override + public void sort(ProgressListener listener) { + // already sorted + } + + @Override + public void removeDuplicates(ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void setOrder(TripleComponentOrder order) { + if (order != this.order) { + throw new IllegalArgumentException("order asked by isn't the same as the set one!"); + } + } + + @Override + public void clear() { + throw new NotImplementedException(); + } + + @Override + public void load(Triples triples, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void replaceAllIds(DictionaryIDMapping mapSubj, DictionaryIDMapping mapPred, DictionaryIDMapping mapObj) { + throw new NotImplementedException(); + } + + @Override + public void save(OutputStream output, ControlInfo ci, ProgressListener listener) throws IOException { + throw new NotImplementedException(); + } + + @Override + public SuppliableIteratorTripleID search(TripleID pattern) { + throw new NotImplementedException(); + } + + @Override + public void load(InputStream input, ControlInfo ci, ProgressListener listener) throws IOException { + throw new NotImplementedException(); + } + + @Override + public void mapFromFile(CountInputStream in, File f, ProgressListener listener) throws IOException { + throw new NotImplementedException(); + } + + @Override + public void generateIndex(ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void loadIndex(InputStream input, ControlInfo ci, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void mapIndex(CountInputStream input, File f, ControlInfo ci, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void saveIndex(OutputStream output, ControlInfo ci, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void load(TempTriples input, ProgressListener listener) { + if (input instanceof OneReadTempTriples) { + OneReadTempTriples input2 = (OneReadTempTriples) input; + this.iterator = input2.iterator; + this.order = input2.order; + } else { + throw new NotImplementedException(); + } + } + + @Override + public TripleComponentOrder getOrder() { + return order; + } + + @Override + public IteratorTripleID searchAll() { + return new NoDuplicateTripleIDIterator(iterator); + } + + @Override + public long getNumberOfElements() { + return iterator.estimatedNumResults(); + } + + @Override + public long size() { + return 
iterator.estimatedNumResults(); + } + + @Override + public void populateHeader(Header head, String rootNode) { + throw new NotImplementedException(); + } + + @Override + public String getType() { + throw new NotImplementedException(); + } + + @Override + public TripleID findTriple(long position) { + throw new NotImplementedException(); + } + + @Override + public void close() throws IOException { + // nothing to do + } + + private static class SimpleIteratorTripleID implements IteratorTripleID { + private final Iterator it; + private final TripleComponentOrder order; + private final long tripleCount; + + public SimpleIteratorTripleID(Iterator it, TripleComponentOrder order, long tripleCount) { + this.it = it; + this.order = order; + this.tripleCount = tripleCount; + } + + @Override + public boolean hasPrevious() { + throw new NotImplementedException(); + } + + @Override + public TripleID previous() { + throw new NotImplementedException(); + } + + @Override + public void goToStart() { + throw new NotImplementedException(); + } + + @Override + public boolean canGoTo() { + throw new NotImplementedException(); + } + + @Override + public void goTo(long pos) { + throw new NotImplementedException(); + } + + @Override + public long estimatedNumResults() { + return tripleCount; + } + + @Override + public ResultEstimationType numResultEstimation() { + return ResultEstimationType.UP_TO; + } + + @Override + public TripleComponentOrder getOrder() { + return order; + } + + @Override + public long getLastTriplePosition() { + return tripleCount; + } + + @Override + public boolean hasNext() { + return it.hasNext(); + } + + @Override + public TripleID next() { + return it.next(); + } + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/WriteBitmapTriples.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/WriteBitmapTriples.java new file mode 100644 index 00000000..55fd4a2f --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/triples/impl/WriteBitmapTriples.java @@ -0,0 +1,249 @@ +package org.rdfhdt.hdt.triples.impl; + +import org.rdfhdt.hdt.compact.bitmap.AppendableWriteBitmap; +import org.rdfhdt.hdt.compact.sequence.SequenceLog64BigDisk; +import org.rdfhdt.hdt.enums.TripleComponentOrder; +import org.rdfhdt.hdt.exceptions.IllegalFormatException; +import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.hdt.HDTVocabulary; +import org.rdfhdt.hdt.header.Header; +import org.rdfhdt.hdt.iterator.SuppliableIteratorTripleID; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.options.ControlInfo; +import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.triples.IteratorTripleID; +import org.rdfhdt.hdt.triples.TempTriples; +import org.rdfhdt.hdt.triples.TripleID; +import org.rdfhdt.hdt.triples.TriplesPrivate; +import org.rdfhdt.hdt.util.BitUtil; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.io.CountInputStream; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.listener.IntermediateListener; +import org.rdfhdt.hdt.util.listener.ListenerUtil; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; + +/** + * Appendable write {@link org.rdfhdt.hdt.triples.impl.BitmapTriples} version + * + * @author Antoine Willerval + */ +public class WriteBitmapTriples implements TriplesPrivate { + protected TripleComponentOrder order = TripleComponentOrder.SPO; + private long numTriples; + private final 
AppendableWriteBitmap bitY, bitZ; + private final CloseSuppressPath seqY, seqZ, triples; + private SequenceLog64BigDisk vectorY, vectorZ; + + public WriteBitmapTriples(HDTOptions spec, CloseSuppressPath triples, int bufferSize) throws IOException { + String orderStr = spec.get("triplesOrder"); + if (orderStr != null) { + order = TripleComponentOrder.valueOf(orderStr); + } + triples.mkdirs(); + triples.closeWithDeleteRecurse(); + this.triples = triples; + bitY = new AppendableWriteBitmap(triples.resolve("bitmapY"), bufferSize); + bitZ = new AppendableWriteBitmap(triples.resolve("bitmapZ"), bufferSize); + seqY = triples.resolve("seqY"); + seqZ = triples.resolve("seqZ"); + } + + @Override + public void save(OutputStream output, ControlInfo ci, ProgressListener listener) throws IOException { + ci.clear(); + ci.setFormat(getType()); + ci.setInt("order", order.ordinal()); + ci.setType(ControlInfo.Type.TRIPLES); + ci.save(output); + + IntermediateListener iListener = new IntermediateListener(listener); + bitY.save(output, iListener); + bitZ.save(output, iListener); + vectorY.save(output, iListener); + vectorZ.save(output, iListener); + } + + @Override + public IteratorTripleID searchAll() { + throw new NotImplementedException(); + } + + @Override + public SuppliableIteratorTripleID search(TripleID pattern) { + throw new NotImplementedException(); + } + + @Override + public long getNumberOfElements() { + return numTriples; + } + + @Override + public long size() { + return numTriples * 4; + } + + @Override + public void populateHeader(Header header, String rootNode) { + if (rootNode == null || rootNode.length() == 0) { + throw new IllegalArgumentException("Root node for the header cannot be null"); + } + + header.insert(rootNode, HDTVocabulary.TRIPLES_TYPE, getType()); + header.insert(rootNode, HDTVocabulary.TRIPLES_NUM_TRIPLES, getNumberOfElements()); + header.insert(rootNode, HDTVocabulary.TRIPLES_ORDER, order.toString()); +// header.insert(rootNode, HDTVocabulary.TRIPLES_SEQY_TYPE, seqY.getType() ); +// header.insert(rootNode, HDTVocabulary.TRIPLES_SEQZ_TYPE, seqZ.getType() ); +// header.insert(rootNode, HDTVocabulary.TRIPLES_SEQY_SIZE, seqY.size() ); +// header.insert(rootNode, HDTVocabulary.TRIPLES_SEQZ_SIZE, seqZ.size() ); +// if(bitmapY!=null) { +// header.insert(rootNode, HDTVocabulary.TRIPLES_BITMAPY_SIZE, bitmapY.getSizeBytes() ); +// } +// if(bitmapZ!=null) { +// header.insert(rootNode, HDTVocabulary.TRIPLES_BITMAPZ_SIZE, bitmapZ.getSizeBytes() ); +// } + } + + @Override + public String getType() { + return HDTVocabulary.TRIPLES_TYPE_BITMAP; + } + + @Override + public TripleID findTriple(long position) { + throw new NotImplementedException(); + } + + @Override + public void load(InputStream input, ControlInfo ci, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void mapFromFile(CountInputStream in, File f, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void generateIndex(ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void loadIndex(InputStream input, ControlInfo ci, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void mapIndex(CountInputStream input, File f, ControlInfo ci, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override + public void saveIndex(OutputStream output, ControlInfo ci, ProgressListener listener) { + throw new NotImplementedException(); + } + + @Override 
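+ // one pass over the sorted triples: predicate and object ids are appended to vectorY/vectorZ while bitY/bitZ mark the adjacency list boundaries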
+ public void load(TempTriples triples, ProgressListener listener) { + triples.setOrder(order); + triples.sort(listener); + + IteratorTripleID it = triples.searchAll(); + + long number = it.estimatedNumResults(); + + vectorY = new SequenceLog64BigDisk(seqY.toAbsolutePath().toString(), BitUtil.log2(number)); + vectorZ = new SequenceLog64BigDisk(seqZ.toAbsolutePath().toString(), BitUtil.log2(number)); + + long lastX = 0, lastY = 0, lastZ = 0; + long x, y, z; + numTriples = 0; + + while (it.hasNext()) { + TripleID triple = it.next(); + TripleOrderConvert.swapComponentOrder(triple, TripleComponentOrder.SPO, order); + + x = triple.getSubject(); + y = triple.getPredicate(); + z = triple.getObject(); + if (x == 0 || y == 0 || z == 0) { + throw new IllegalFormatException("None of the components of a triple can be null"); + } + + if (numTriples == 0) { + // First triple + vectorY.append(y); + vectorZ.append(z); + } else if (x != lastX) { + if (x != lastX + 1) { + throw new IllegalFormatException("Upper level must be increasing and correlative."); + } + // X changed + bitY.append(true); + vectorY.append(y); + + bitZ.append(true); + vectorZ.append(z); + } else if (y != lastY) { + if (y < lastY) { + throw new IllegalFormatException("Middle level must be increasing for each parent."); + } + + // Y changed + bitY.append(false); + vectorY.append(y); + + bitZ.append(true); + vectorZ.append(z); + } else { + if (z < lastZ) { + throw new IllegalFormatException("Lower level must be increasing for each parent."); + } + + // Z changed + bitZ.append(false); + vectorZ.append(z); + } + + lastX = x; + lastY = y; + lastZ = z; + + ListenerUtil.notifyCond(listener, "Converting to BitmapTriples", numTriples, numTriples, number); + numTriples++; + } + + if (numTriples > 0) { + bitY.append(true); + bitZ.append(true); + } + + vectorY.aggressiveTrimToSize(); + vectorZ.aggressiveTrimToSize(); + } + + @Override + public TripleComponentOrder getOrder() { + return order; + } + + @Override + public void close() throws IOException { + IOUtil.closeAll( + bitY, + bitZ, + vectorY, + seqY, + vectorZ, + seqZ, + triples + ); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/BitUtil.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/BitUtil.java index 8353d865..f7882be5 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/BitUtil.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/BitUtil.java @@ -44,7 +44,7 @@ public static int log2(long n) { } public static long maxVal(int numbits) { - return ~(~0L< list type + * @author Antoine Willerval + */ +public class ParallelSortableArrayList implements List { + public static final double GROW_FACTOR = 1.5f; + private int used; + private T[] array; + private final Class type; + + public ParallelSortableArrayList(Class type) { + this(type, 16); + } + + @SuppressWarnings("unchecked") + public ParallelSortableArrayList(Class type, int capacity) { + this.type = type; + array = (T[]) Array.newInstance(type.getComponentType(), capacity); + } + + private void checkSize(int newSize) { + if (newSize >= array.length) { + // don't allocate beyond the max size + int allocate = (int) Math.min(Integer.MAX_VALUE - 5L, (long) (newSize * GROW_FACTOR)); + array = Arrays.copyOf(array, allocate, type); + } + } + + @Override + public boolean add(T element) { + checkSize(used + 1); + array[used++] = element; + return true; + } + + @Override + public boolean remove(Object o) { + throw new NotImplementedException(); + } + + @Override + public boolean containsAll(Collection c) { + 
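/*
 * Minimal usage sketch for ParallelSortableArrayList, assuming only the constructor and the
 * add/get/parallelSort methods declared in this class (the values are illustrative):
 *
 *   ParallelSortableArrayList<String> list = new ParallelSortableArrayList<>(String[].class);
 *   list.add("b");
 *   list.add("a");
 *   list.parallelSort(Comparator.naturalOrder()); // multi-threaded Arrays.parallelSort
 *   String smallest = list.get(0); // "a"
 */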
throw new NotImplementedException(); + } + + @Override + public boolean addAll(Collection c) { + throw new NotImplementedException(); + } + + @Override + public boolean addAll(int index, Collection c) { + throw new NotImplementedException(); + } + + @Override + public boolean removeAll(Collection c) { + throw new NotImplementedException(); + } + + @Override + public boolean retainAll(Collection c) { + throw new NotImplementedException(); + } + + @Override + public int size() { + return used; + } + + @Override + public boolean isEmpty() { + return size() == 0; + } + + @Override + public boolean contains(Object o) { + throw new NotImplementedException(); + } + + @Override + public void clear() { + for (int i = 0; i < used; i++) { + array[i] = null; + } + used = 0; + } + + @Override + public T get(int index) { + return array[index]; + } + + @Override + public T set(int index, T element) { + return array[index] = element; + } + + @Override + public void add(int index, T element) { + throw new NotImplementedException(); + } + + @Override + public T remove(int index) { + throw new NotImplementedException(); + } + + @Override + public int indexOf(Object o) { + for (int i = 0; i < size(); i++) { + if (get(i).equals(o)) { + return i; + } + } + return -1; + } + + @Override + public int lastIndexOf(Object o) { + for (int i = size() - 1; i >= 0; i--) { + if (get(i).equals(o)) { + return i; + } + } + return -1; + } + + @Override + public ListIterator listIterator() { + throw new NotImplementedException(); + } + + @Override + public ListIterator listIterator(int index) { + throw new NotImplementedException(); + } + + @Override + public List subList(int fromIndex, int toIndex) { + throw new NotImplementedException(); + } + + public T[] getArray() { + return array; + } + + @Override + public Iterator iterator() { + return Arrays.asList(array).subList(0, used).iterator(); + } + + @Override + public Object[] toArray() { + return Arrays.copyOf(array, used, Object[].class); + } + + @Override + public T1[] toArray(T1[] a) { + throw new NotImplementedException(); + } + + @Override + public void sort(Comparator comparator) { + Arrays.sort(array, 0, used, comparator); + } + + /** + * sort this array in parallel (if available) + * @param comparator sort comparator + */ + public void parallelSort(Comparator comparator) { + Arrays.parallelSort(array, 0, used, comparator); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Profiler.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Profiler.java new file mode 100644 index 00000000..686b21c7 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Profiler.java @@ -0,0 +1,155 @@ +package org.rdfhdt.hdt.util; + +import java.util.ArrayList; +import java.util.List; + +/** + * tool to profile time + * @author Antoine Willerval + */ +public class Profiler { + private int maxSize = 0; + private final String name; + private Section mainSection; + private boolean disabled; + + /** + * create a profiler + * @param name the profiler name + */ + public Profiler(String name) { + this.name = name; + } + + /** + * disable the profiler methods + * @param disable if true, the methods will be callable, but won't do anything + */ + public void setDisabled(boolean disable) { + this.disabled = disable; + } + + /** + * start a section + * @param name the section name + */ + public void pushSection(String name) { + if (disabled) { + return; + } + getMainSection().pushSection(name, 0); + } + + /** + * complete a section + */ + public void popSection() { + if 
(disabled) { + return; + } + if (!getMainSection().isRunning()) { + throw new IllegalArgumentException("profiler not running!"); + } + getMainSection().popSection(); + } + + /** + * stop the profiler without popping sections + */ + public void stop() { + if (disabled) { + return; + } + getMainSection().stop(); + } + + /** + * write the profile into the console + */ + public void writeProfiling() { + if (disabled) { + return; + } + getMainSection().writeProfiling("", true); + } + + /** + * @return the main section of the profiler + */ + public Section getMainSection() { + if (this.mainSection == null) { + this.mainSection = new Section(name); + } + return this.mainSection; + } + + /** + * a section in the profiling + */ + public class Section { + private final String name; + private final long start = System.nanoTime(); + private long end = start; + private final List<Section>
subSections = new ArrayList<>(); + private Section currentSection; + + Section(String name) { + this.name = name; + } + + /** + * @return the subsections + */ + public List<Section>
getSubSections() { + return subSections; + } + + /** + * @return the section name + */ + public String getName() { + return name; + } + + boolean isRunning() { + return currentSection != null; + } + + void pushSection(String name, int deep) { + if (isRunning()) { + currentSection.pushSection(name, deep + 1); + return; + } + + subSections.add(currentSection = new Section(name)); + maxSize = Math.max(name.length() + deep * 2, maxSize); + } + + boolean popSection() { + if (isRunning()) { + if (currentSection.popSection()) { + currentSection = null; + } + return false; + } else { + end = System.nanoTime(); + return true; + } + } + + void stop() { + if (isRunning()) { + currentSection.stop(); + } + end = System.nanoTime(); + } + + void writeProfiling(String prefix, boolean isLast) { + System.out.println(prefix + (getSubSections().isEmpty() ? "+--" : "+-+") + " [" + getName() + "] " + "-".repeat(1 + maxSize - getName().length()) + " elapsed=" + (end - start) / 1_000_000L + "ms"); + for (int i = 0; i < subSections.size(); i++) { + Section s = subSections.get(i); + s.writeProfiling(prefix + (isLast ? " " : "| "), i == subSections.size() - 1); + } + } + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Reference.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Reference.java new file mode 100644 index 00000000..91111882 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/Reference.java @@ -0,0 +1,60 @@ +package org.rdfhdt.hdt.util; + +import java.util.function.Supplier; + +/** + * Simple object reference + * @param type of the object + */ +public class Reference { + private T object; + + /** + * create with an object + * @param object the object + */ + public Reference(T object) { + this.object = object; + } + + /** + * create with a null object + */ + public Reference() { + this(null); + } + + /** + * set the object + * @param object the object + */ + public void setObject(T object) { + this.object = object; + } + + /** + * @return the object + */ + public T getObject() { + return object; + } + + /** + * @return if the object is null + */ + public boolean isNull() { + return object == null; + } + + /** + * compute the object if it is null and return the objec + * @param compute the compute function + * @return the object + */ + public T computeIfAbsent(Supplier compute) { + if (isNull()) { + setObject(compute.get()); + } + return getObject(); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionFunction.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionFunction.java new file mode 100644 index 00000000..3983b756 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionFunction.java @@ -0,0 +1,6 @@ +package org.rdfhdt.hdt.util.concurrent; + +@FunctionalInterface +public interface ExceptionFunction { + O apply(I value) throws E; +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionSupplier.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionSupplier.java new file mode 100644 index 00000000..02ba145f --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionSupplier.java @@ -0,0 +1,6 @@ +package org.rdfhdt.hdt.util.concurrent; + +@FunctionalInterface +public interface ExceptionSupplier { + T get() throws E; +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionThread.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionThread.java new file 
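// Minimal usage sketch for the Profiler added above; the profiler and section names are
// illustrative, and only the constructor, pushSection/popSection, stop and writeProfiling
// methods shown in this file are assumed.
import org.rdfhdt.hdt.util.Profiler;

class ProfilerSketch {
    public static void main(String[] args) {
        Profiler profiler = new Profiler("gen");
        profiler.pushSection("parse input");
        // ... parsing work would run here ...
        profiler.popSection();
        profiler.pushSection("write output");
        // ... writing work would run here ...
        profiler.popSection();
        profiler.stop();
        // prints the section tree with the elapsed time of each section in milliseconds
        profiler.writeProfiling();
    }
}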
mode 100644 index 00000000..1689a6a0 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/ExceptionThread.java @@ -0,0 +1,198 @@ +package org.rdfhdt.hdt.util.concurrent; + +import java.util.Objects; + +/** + * Thread allowing exception and returning it when joining it with {@link #joinAndCrashIfRequired()} or by using + * {@link #getException()}, can be attached to other threads to crash the others if an exception occurs in one of + * them with {@link #attach(ExceptionThread...)}. + * + * @author Antoine Willerval + */ +public class ExceptionThread extends Thread { + /** + * create exception threads of multiple runnables + * + * @param name common name + * @param runnables the runnables list, can't be empty + * @return exception thread attached with other runnables + * @throws java.lang.IllegalArgumentException if the array is empty + * @throws java.lang.NullPointerException if an argument is null + */ + public static ExceptionThread async(String name, ExceptionRunnable... runnables) { + Objects.requireNonNull(name, "name can't be null!"); + Objects.requireNonNull(runnables, "runnables can't be null"); + for (int i = 0; i < runnables.length; i++) { + Objects.requireNonNull(runnables[i], "runnable#" + i + " is null!"); + } + if (runnables.length == 0) { + throw new IllegalArgumentException("empty runnable list"); + } + + ExceptionThread thread = new ExceptionThread(runnables[0], name + "#" + 0); + + for (int i = 1; i < runnables.length; i++) { + thread.attach(new ExceptionThread(runnables[i], name + "#" + i)); + } + + return thread; + } + + + /** + * Version of {@link java.lang.Runnable} with an exception + */ + @FunctionalInterface + public interface ExceptionRunnable { + /** + * Runnable used in an {@link org.rdfhdt.hdt.util.concurrent.ExceptionThread}, can throw an exception + * + * @see org.rdfhdt.hdt.util.concurrent.ExceptionThread#ExceptionThread(org.rdfhdt.hdt.util.concurrent.ExceptionThread.ExceptionRunnable, String) + * @throws java.lang.Exception if any + */ + void run() throws Exception; + } + + private Throwable exception = null; + private final ExceptionRunnable target; + private ExceptionThread next; + private ExceptionThread prev; + + public ExceptionThread(ExceptionRunnable target, String name) { + super(name); + this.target = target; + } + + /** + * attach another threads to wait with this one + * + * @param threads others + * @return this + */ + public ExceptionThread attach(ExceptionThread... 
threads) { + Objects.requireNonNull(threads, "can't attach null thread"); + for (ExceptionThread thread : threads) { + if (thread.prev != null) { + throw new IllegalArgumentException("Thread " + thread.getName() + " already attached"); + } + if (this.next != null) { + this.next.attach(thread); + continue; + } + this.next = thread; + thread.prev = this; + } + return this; + } + + /** + * start this thread and all attached thread + * + * @return this + */ + public ExceptionThread startAll() { + ExceptionThread prev = this.prev; + while (prev != null) { + prev.start(); + prev = prev.prev; + } + start(); + ExceptionThread next = this.next; + while (next != null) { + next.start(); + next = next.next; + } + return this; + } + + @Override + public final void run() { + try { + target.run(); + } catch (Throwable t) { + if (exception != null) { + exception.addSuppressed(t); + return; // another attached thread crashed, probably interruption exception + } + exception = t; + if (this.next != null) { + this.next.interruptForward(t); + } + if (this.prev != null) { + this.prev.interruptBackward(t); + } + } + } + + private void interruptBackward(Throwable t) { + exception = t; + if (this.prev != null) { + this.prev.interruptBackward(t); + } + interrupt(); + } + + private void interruptForward(Throwable t) { + exception = t; + if (this.next != null) { + this.next.interruptForward(t); + } + interrupt(); + } + + /** + * @return the exception returned by this thread, another attached thread or null if no exception occurred + */ + public Throwable getException() { + return exception; + } + + /** + * join this thread and create an exception if required, will convert it to a runtime exception if it can't be + * created. If the thread returned an exception while the current thread is interrupted, the exception will be + * suppressed in the {@link java.lang.InterruptedException}. 
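// Minimal usage sketch for ExceptionThread: run two tasks in parallel, interrupt the other
// one if either fails, and rethrow the failure when joining. Only the async/startAll/
// joinAndCrashIfRequired methods shown in this file are assumed; the task bodies are placeholders.
import org.rdfhdt.hdt.util.concurrent.ExceptionThread;

class ExceptionThreadSketch {
    public static void main(String[] args) throws InterruptedException {
        ExceptionThread.async("worker",
                () -> {
                    // first task, may throw any exception
                },
                () -> {
                    // second task, interrupted if the first one throws
                })
                .startAll()
                .joinAndCrashIfRequired(); // wraps a task failure into ExceptionThreadException
    }
}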
+ * + * @throws InterruptedException interruption while joining the thread + * @throws ExceptionThreadException if the thread or any attached thread returned an exception + */ + public void joinAndCrashIfRequired() throws InterruptedException { + try { + join(); + ExceptionThread next = this.next; + while (next != null) { + next.join(); + next = next.next; + } + ExceptionThread prev = this.prev; + while (prev != null) { + prev.join(); + prev = prev.prev; + } + } catch (InterruptedException ie) { + // we got an exception in the thread while this thread was interrupted + if (exception != null) { + ie.addSuppressed(exception); + } + throw ie; + } + if (exception == null) { + return; + } + if (exception instanceof ExceptionThreadException) { + throw (ExceptionThreadException) exception; + } + throw new ExceptionThreadException(exception); + } + + /** + * Exception returned by {@link #joinAndCrashIfRequired()}, will always have a cause + * + * @author Antoine Willerval + */ + public static class ExceptionThreadException extends RuntimeException { + public ExceptionThreadException(Throwable cause) { + super(cause); + } + } + +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/SyncListener.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/SyncListener.java new file mode 100644 index 00000000..1fa90b2c --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/SyncListener.java @@ -0,0 +1,28 @@ +package org.rdfhdt.hdt.util.concurrent; + +import org.rdfhdt.hdt.listener.ProgressListener; + +/** + * {@link org.rdfhdt.hdt.listener.ProgressListener} wrapper to allow multiple thread to notify a progression + * @author Antoine Willerval + */ +public class SyncListener implements ProgressListener { + /** + * create a sync listener from another progress listener + * @param listener listener to sync, if it is null, this method returns null + * @return sync version of listener, or null if listener is null + */ + public static ProgressListener of(ProgressListener listener) { + return listener instanceof SyncListener || listener == null ? 
listener : new SyncListener(listener); + } + private final ProgressListener wrapper; + + private SyncListener(ProgressListener wrapper) { + this.wrapper = wrapper; + } + + @Override + public synchronized void notifyProgress(float level, String message) { + wrapper.notifyProgress(level, message); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/TreeWorker.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/TreeWorker.java new file mode 100644 index 00000000..dcd8232c --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/concurrent/TreeWorker.java @@ -0,0 +1,699 @@ +package org.rdfhdt.hdt.util.concurrent; + + +import org.rdfhdt.hdt.listener.MultiThreadListener; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Objects; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.IntFunction; + +/** + * a worker to parse tree operation + * @param the type used in the tree to supply + * @param the type used in the tree + * @author Antoine Willerval + */ +public class TreeWorker { + /** + * ID fetcher for the workers + */ + private static final AtomicInteger JOB_ID_NAME = new AtomicInteger(); + + /** + * Sync object for the FETCH operation + */ + private final Object FETCH_SYNC = new Object() { + }; + /** + * Sync object for waiting for new job + */ + private final Object WAITING_SYNC = new Object() { + }; + /** + * Sync object to show the current count of workers in the ProgressListener + */ + private final Object WORKING_SYNC = new Object() { + }; + /** + * Cat function (T[]) -> T + */ + private final TreeWorkerCat catFunction; + /** + * Supplier Function () -> S + */ + private final TreeWorkerSupplier baseLevelSupplier; + /** + * Map Function (S) -> T + */ + private final TreeWorkerMap mapFunction; + /** + * Delete Function (T) -> void + */ + private final TreeWorkerDelete delete; + /** + * Function to create array of type T + */ + private final IntFunction arrayBuilder; + /** + * the current maximum level of the elements + */ + private int maxLevel = 0; + /** + * the count of workers waiting for a job + */ + private int workerWaiting = 0; + /** + * the count of working workers + */ + private int workerWorking; + /** + * the minimum number of elements to merge when the supplying phase isn't completed + */ + private final int treeCount; + /** + * the mapped elements waiting for a merge (T[]) + */ + private final List elements = new ArrayList<>(); + /** + * the supplied elements waiting for a map (S[]) + */ + private final List suppliedElements = new ArrayList<>(); + /** + * the worker threads + */ + private final List workers; + /** + * if the TreeWorker is started + */ + private boolean started = false; + /** + * if the fetch phase is completed + */ + private boolean fetchDone = false; + /** + * if the map phase is completed + */ + private boolean mapDone = false; + /** + * any throwable returned by the TreeWorker + */ + private TreeWorkerException throwable; + /** + * the progress listener + */ + private MultiThreadListener listener; + + /** + * create a tree worker + * @param catFunction the function to cat 2 nodes + * @param baseLevelSupplier the supplier to get base nodes + * @param delete the delete method to delete data in case of error, can be null if no delete is required + * @param arrayBuilder method to create an array of type T + * @throws TreeWorkerException if the tree worker can't be created + * @throws java.lang.NullPointerException if catFunction or 
baseLevelSupplier is null + */ + public TreeWorker(TreeWorkerCat catFunction, TreeWorkerSupplier baseLevelSupplier, TreeWorkerDelete delete, TreeWorkerMap mapFunction, IntFunction arrayBuilder) throws TreeWorkerException { + this(catFunction, baseLevelSupplier, delete, mapFunction, arrayBuilder, Runtime.getRuntime().availableProcessors(), 1); + } + + /** + * create a tree worker + * @param workerObject the worker object + * @param arrayBuilder method to create an array of type T + * @param workers the number of workers to use + * @param nodePerMerge number of simultaneous merge tree (at least 1) + * @throws TreeWorkerException if the tree worker can't be created + * @throws java.lang.NullPointerException if catFunction or baseLevelSupplier is null + */ + public & TreeWorkerSupplier & TreeWorkerDelete & TreeWorkerMap> TreeWorker(E workerObject, IntFunction arrayBuilder, int workers, int nodePerMerge) throws TreeWorkerException { + this(workerObject, workerObject, workerObject, workerObject, arrayBuilder, workers, nodePerMerge); + } + /** + * create a tree worker + * @param catFunction the function to cat 2 nodes + * @param baseLevelSupplier the supplier to get base nodes + * @param delete the delete method to delete data in case of error, can be null if no delete is required + * @param mapFunction the map function + * @param arrayBuilder method to create an array of type T + * @param workers the number of workers to use + * @param nodePerMerge number of simultaneous merge tree (at least 1) + * @throws TreeWorkerException if the tree worker can't be created + * @throws java.lang.NullPointerException if catFunction or baseLevelSupplier is null + */ + public TreeWorker(TreeWorkerCat catFunction, TreeWorkerSupplier baseLevelSupplier, TreeWorkerDelete delete, TreeWorkerMap mapFunction, IntFunction arrayBuilder, int workers, int nodePerMerge) throws TreeWorkerException { + this.catFunction = Objects.requireNonNull(catFunction, "catFunction can't be null!"); + this.mapFunction = Objects.requireNonNull(mapFunction, "mapFunction can't be null!"); + this.baseLevelSupplier = Objects.requireNonNull(baseLevelSupplier, "baseLevelSupplier can't be null!"); + this.arrayBuilder = Objects.requireNonNull(arrayBuilder, "arrayBuilder can't be null!"); + if (delete == null) { + this.delete = (t) -> {}; + } else { + this.delete = delete; + } + if (nodePerMerge <= 0) { + throw new TreeWorkerException("nodePerMerge count can't be <= 0!"); + } + treeCount = 1 << nodePerMerge; + if (workers <= 0) { + throw new TreeWorkerException("worker count can't be <= 0!"); + } + S s = baseLevelSupplier.get(); + if (s == null) { + throw new TreeWorkerException("no base element!"); + } + suppliedElements.add(s); + this.workers = new ArrayList<>(workers); + for (int i = 0; i < workers; i++) { + this.workers.add(new Worker()); + } + workerWorking = workers; + } + + /** + * create a generic array T[] of a size + * @param size the size + * @return the array + */ + private T[] createArray(int size) { + T[] array = arrayBuilder.apply(size); + assert array != null && array.length >= size : "array function should create an array with a size of a least size"; + return array; + } + + /** + * set a listener for each worker + * @param listener the listener + */ + public void setListener(MultiThreadListener listener) { + this.listener = listener; + } + + /** + * Start the workers + */ + public void start() { + synchronized (elements) { + if (started) { + throw new IllegalArgumentException("TreeWorker already started!"); + } + for (Worker 
worker : this.workers) { + worker.start(); + } + started = true; + } + } + + /** + * delete all the elements + */ + private void clearData() { + for (Element e: elements) { + delete.delete(e.mappedValue); + } + } + + /** + * wait for the tree worker to complete + * @return the last element + * @throws TreeWorkerException if an error occurred in a worker + * @throws InterruptedException in case of interruption + */ + public T waitToComplete() throws TreeWorkerException, InterruptedException { + try { + if (listener != null) { + synchronized (WORKING_SYNC) { + while (workerWorking > 0) { + listener.notifyProgress(100F * (workers.size() - workerWorking) / workers.size(), "waiting for workers to complete " + (workers.size() - workerWorking) + "/" + workers.size()); + WORKING_SYNC.wait(); + } + } + } + for (Worker w: workers) { + w.join(); + } + + if (listener != null) { + listener.notifyProgress(100, "tree completed"); + } + } catch (InterruptedException e) { + clearData(); + throw e; + } + + if (throwable != null) { + clearData(); + throw throwable; + } + + if (!fetchDone || !mapDone) { + clearData(); + // shouldn't be possible? + throw new TreeWorkerException("The worker isn't done!"); + } + if (elements.isEmpty()) { + return null; + } + return elements.get(0).mappedValue; + } + + private int countBase() { + return suppliedElements.size(); + } + + /** + * map function to map an element to another + * @param old type + * @param new type + * @author Antoine Willerval + */ + public interface TreeWorkerMap { + /** + * create an identity map function + * @param the type + * @return map function + */ + static TreeWorkerMap identity() { + return t -> t; + } + /** + * map the value + * @param prev the previous value + * @return the new value + */ + E map(T prev); + } + + /** + * cat function to merge two elements + * @param the elements type + * @author Antoine Willerval + */ + @FunctionalInterface + public interface TreeWorkerCat { + /** + * construct an element from elements + * @param element the array of elements. 
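// Minimal usage sketch for TreeWorker: sum the numbers 1..100 by merging them pairwise in a
// tree. This assumes the type parameters are <supplied type, tree type> (both Long here, so
// their order does not matter) and uses the TreeWorkerObjectNoMap shortcut declared in this file.
import org.rdfhdt.hdt.util.concurrent.TreeWorker;

class TreeWorkerSketch implements TreeWorker.TreeWorkerObjectNoMap<Long> {
    private long next = 1;

    @Override
    public Long get() {
        // supplier: base elements, null once the input is exhausted
        return next <= 100 ? next++ : null;
    }

    @Override
    public Long construct(Long[] elements, int count) {
        // cat: merge a group of nodes into a single node
        long sum = 0;
        for (int i = 0; i < count; i++) {
            sum += elements[i];
        }
        return sum;
    }

    @Override
    public void delete(Long e) {
        // nothing to clean up for plain numbers
    }

    public static void main(String[] args) throws Exception {
        TreeWorker<Long, Long> worker =
                new TreeWorker<>(new TreeWorkerSketch(), Long[]::new, 2, 1);
        worker.start();
        System.out.println(worker.waitToComplete()); // 5050
    }
}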
+ * @param count the number of elements in the array, from index 0 (inclusive) to count (exclusive) + * @return the cat of the 2 elements + */ + T construct(T[] element, int count); + } + /** + * delete function in case of error + * @param the elements type + * @author Antoine Willerval + */ + @FunctionalInterface + public interface TreeWorkerDelete { + /** + * delete an unused element + * @param e the element to delete + */ + void delete(T e); + } + /** + * supply function + * @param the elements type + * @author Antoine Willerval + */ + @FunctionalInterface + public interface TreeWorkerSupplier { + /** + * supply an element to merge + * @return the element to merge + */ + S get(); + } + + /** + * Interface containing all the TreeWorker function to implement + * @param Supplying type + * @param Mapped type + * @author Antoine Willerval + */ + public interface TreeWorkerObject extends TreeWorkerCat, TreeWorkerSupplier, TreeWorkerDelete, TreeWorkerMap { + } + /** + * Interface containing all the TreeWorker function to implement without the map operation + * @param type + * @author Antoine Willerval + */ + public interface TreeWorkerObjectNoMap extends TreeWorkerObject { + @Override + default T map(T prev) { + return prev; + } + } + + /** + * @return if the worker is completed + */ + public boolean isCompleted() { + synchronized (elements) { + return (fetchDone && mapDone && elements.size() <= 1) || throwable != null; + } + } + + private class Element { + T mappedValue; + int level; + + public Element(T mappedValue, int level) { + this.mappedValue = mappedValue; + this.level = level; + } + } + + private class Tuple { + Element first; + T[] elements; + int count; + int level; + Tuple() { + elements = createArray(treeCount); + clear(); + } + + /** + * add an element to this tuple + * @param e the element + */ + public void addElement(Element e) { + if (count == 0) { + first = e; + level = e.level; + } + elements[count++] = e.mappedValue; + assert level == e.level : "add from different level"; + } + + /** + * @return the first element added since the last tuple reset/creation + */ + public Element getFirstElement() { + return first; + } + + /** + * remove all the elements from the tree worker elements + * @throws TreeWorkerException if an element can't be removed + */ + public void remove() throws TreeWorkerException { + for (int i = 0; i < count; i++) { + removeFirst(elements[i]); + } + } + + private void removeFirst(T element) throws TreeWorkerException { + Iterator it = TreeWorker.this.elements.iterator(); + while (it.hasNext()) { + Element e = it.next(); + if (e.mappedValue == element && e.level == level) { + it.remove(); + return; + } + } + throw new TreeWorkerException("Can't remove an elements! 
" + element); + } + + /** + * @return the internal array inside, at least the size returned by {@link #size()} + */ + public T[] getArray() { + return elements; + } + + /** + * @return the count of elements + */ + public int size() { + return count; + } + + /** + * reset the tuple + */ + public void clear() { + this.count = 0; + } + + /** + * get a element in a particular index + * @param index the index + * @return the element + */ + public T get(int index) { + return elements[index]; + } + + private int searchDir(int start, int direction, int min) { + if (direction < 0) { + for (int i = start; i >= 0; i--) { + searchAtLevel(i); + if (size() >= min) { + return i; + } + } + } else { + for (int i = start; i <= maxLevel; i++) { + searchAtLevel(i); + if (size() >= min) { + return i; + } + } + } + return -1; + } + + private void searchAtLevel(int level) { + clear(); + synchronized (TreeWorker.this.elements) { + for (Element e: TreeWorker.this.elements) { + if (e.level == level) { + addElement(e); + if (count == treeCount) { + return; + } + } + } + } + } + } + + private abstract static class TreeWorkerJob { + abstract void runJob(); + void clear() { + } + } + private class Fetch extends TreeWorkerJob { + @Override + public void runJob() { + synchronized (FETCH_SYNC) { + if (fetchDone) { + return; // another fetch job won + } + S s = baseLevelSupplier.get(); + synchronized (elements) { + if (s == null) { + fetchDone = true; + // say if all the mapping is done, only after the fetch was done + if (suppliedElements.isEmpty()) { + mapDone = true; + } + } else { + suppliedElements.add(s); + } + elements.notifyAll(); + } + } + } + } + + private class Map extends TreeWorkerJob { + S old; + + public Map(S old) { + this.old = old; + } + + @Override + public void runJob() { + // map the supplied value + T mappedValue = mapFunction.map(old); + + synchronized (TreeWorker.this.elements) { + // add it to the element list + TreeWorker.this.elements.add(new Element(mappedValue, 0)); + + // say if all the mapping is done, only after the fetch was done + if (fetchDone && suppliedElements.isEmpty()) { + mapDone = true; + } + elements.notifyAll(); + } + } + } + + private class Merge extends TreeWorkerJob { + T[] elements; + int count; + int level; + + public Merge(T[] elements, int count, int level) { + this.elements = elements; + this.count = count; + this.level = level; + assert count > 0: "cat from empty element!"; + } + + @Override + public void runJob() { + T t = catFunction.construct(elements, count); + synchronized (TreeWorker.this.elements) { + TreeWorker.this.elements.add(new Element(t, level + 1)); + maxLevel = Math.max(maxLevel, level + 1); + } + } + @Override + void clear() { + for (int i = 0; i < count; i++) { + delete.delete(elements[i]); + } + } + } + + private class Worker extends Thread { + // array used to get merge object + private final Tuple tuple = new Tuple(); + public Worker() { + super("JobWorker#" + JOB_ID_NAME.incrementAndGet()); + } + + @Override + public void run() { + try { + while (!isCompleted()) { + if (listener != null) { + listener.notifyProgress(0, "waiting job"); + } + TreeWorkerJob job = null; + try { + synchronized (WAITING_SYNC) { + job = getJob(); + if (job == null) { + if (isCompleted()) { + return; + } + workerWaiting++; + WAITING_SYNC.wait(); + --workerWaiting; + continue; + } + } + job.runJob(); + synchronized (WAITING_SYNC) { + if (workerWaiting > 0) { + WAITING_SYNC.notify(); + } + } + } catch (Throwable t) { + if (job != null) { + job.clear(); + } + synchronized 
(elements) { + if (throwable != null) { + throwable.addSuppressed(t); + } + if (t instanceof TreeWorkerException) { + throwable = (TreeWorkerException) t; + } else { + throwable = new TreeWorkerException(t); + } + elements.notifyAll(); + } + synchronized (WAITING_SYNC) { + WAITING_SYNC.notifyAll(); + } + } + } + } finally { + if (listener != null) { + listener.notifyProgress(100, "completed"); + listener.unregisterThread(getName()); + } + synchronized (WORKING_SYNC) { + workerWorking--; + WORKING_SYNC.notify(); + } + } + } + + private TreeWorkerJob getJob() throws TreeWorkerException { + synchronized (elements) { + while (true) { + if (mapDone) { + if (elements.size() == 1) { + return null; // end, no ascend/merge required + } + int level = tuple.searchDir(0, 1, 1); + if (level == -1) { + return null; // size == 0 end + } + if (tuple.size() == 1) { + tuple.getFirstElement().level++; + } else { //size == 2 + tuple.remove(); + return new Merge(tuple.getArray(), tuple.size(), level); + } + } else { + if (fetchDone) { + if (suppliedElements.isEmpty()) { + // edge case if we are waiting for a map to complete, Fetch won't do anything + return new Fetch(); + } + return new Map(suppliedElements.remove(0)); + } + // count the number of supplied elements to know if we need to fetch another one + int level0 = countBase(); + if (workers.size() != 1 && level0 < workers.size() / 2) { + return new Fetch(); + } + // search for a merge candidate with the size treeCount + int level = tuple.searchDir(maxLevel, -1, treeCount); + + if (level != -1) { + // remove the component of the candidate and merge them + tuple.remove(); + return new Merge(tuple.getArray(), tuple.size(), level); + } + + if (suppliedElements.isEmpty()) { + // no supplied element to map, we fetch a new one + return new Fetch(); + } else { + // map the supplied element + return new Map(suppliedElements.remove(0)); + } + } + } + } + } + } + + /** + * An exception in the tree worker + * @author Antoine Willerval + */ + public static class TreeWorkerException extends Exception { + public TreeWorkerException(Throwable cause) { + super(cause); + } + + public TreeWorkerException(String message) { + super(message); + } + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/disk/LongArray.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/disk/LongArray.java new file mode 100644 index 00000000..6874d74e --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/disk/LongArray.java @@ -0,0 +1,24 @@ +package org.rdfhdt.hdt.util.disk; + +/** + * Describe a large array of longs + */ +public interface LongArray { + /** + * get an element at a particular index + * @param index the index + * @return the value + */ + long get(long index); + /** + * Set a new value at the specified position. + * @param index the index + * @param value the value + */ + void set(long index, long value); + + /** + * @return the length of the array + */ + long length(); +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/disk/LongArrayDisk.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/disk/LongArrayDisk.java index d49fc7bb..1651f465 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/disk/LongArrayDisk.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/disk/LongArrayDisk.java @@ -33,7 +33,7 @@ //Implementing an array of longs that is backed up on disk. 
Following this: http://vanillajava.blogspot.fr/2011/12/using-memory-mapped-file-for-huge.html -public class LongArrayDisk implements Closeable { +public class LongArrayDisk implements Closeable, LongArray { private static final long MAPPING_SIZE = 1 << 30; private FileChannel channel; private CloseMappedByteBuffer[] mappings; @@ -118,6 +118,7 @@ public void close() throws IOException { channel = null; } + @Override public long get(long x) { long p = x * 8; int block = (int) (p / MAPPING_SIZE); @@ -129,6 +130,7 @@ public long getLong(long x) { return this.get(x); } + @Override public void set(long x, long y) { long p = x * 8; int block = (int) (p / MAPPING_SIZE); @@ -136,6 +138,7 @@ public void set(long x, long y) { mappings[block].putLong(offset, y); } + @Override public long length() { return size; } @@ -188,4 +191,5 @@ public long getSizeBits() { return size * 8L; } -} \ No newline at end of file +} + diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseMappedByteBuffer.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseMappedByteBuffer.java index 2603aaa4..65c96246 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseMappedByteBuffer.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseMappedByteBuffer.java @@ -41,14 +41,18 @@ static void crashMapTest() { this.duplicated = duplicated; this.buffer = buffer; if (mapTest && !duplicated) { - MAP_TEST_MAP.put(id, new Throwable("MAP " + filename + "#" + id + "|"+ buffer)); + synchronized (MAP_TEST_MAP) { + MAP_TEST_MAP.put(id, new Throwable("MAP " + filename + "#" + id + "|" + buffer)); + } } } @Override public void close() { if (mapTest && !duplicated) { - MAP_TEST_MAP.remove(id); + synchronized (MAP_TEST_MAP) { + MAP_TEST_MAP.remove(id); + } } IOUtil.cleanBuffer(buffer); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseSuppressPath.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseSuppressPath.java new file mode 100644 index 00000000..10c0095a --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/CloseSuppressPath.java @@ -0,0 +1,247 @@ +package org.rdfhdt.hdt.util.io; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.Closeable; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.URI; +import java.nio.file.FileSystem; +import java.nio.file.Files; +import java.nio.file.LinkOption; +import java.nio.file.Path; +import java.nio.file.WatchEvent; +import java.nio.file.WatchKey; +import java.nio.file.WatchService; +import java.util.Iterator; +import java.util.Spliterator; +import java.util.function.Consumer; + +/** + * a file that delete itself when we close it + */ +public class CloseSuppressPath implements Path, Closeable { + public static final int BUFFER_SIZE = 1 << 13; + private final Path wrapper; + private boolean isDir; + + CloseSuppressPath(Path wrapper) { + this.wrapper = wrapper; + } + + public static CloseSuppressPath of(String first, String... more) { + return new CloseSuppressPath(Path.of(first, more)); + } + + public static CloseSuppressPath of(Path component) { + return component instanceof CloseSuppressPath ? 
(CloseSuppressPath) component : new CloseSuppressPath(component); + } + + @Override + public FileSystem getFileSystem() { + return wrapper.getFileSystem(); + } + + @Override + public boolean isAbsolute() { + return wrapper.isAbsolute(); + } + + @Override + public Path getRoot() { + return wrapper.getRoot(); + } + + @Override + public Path getFileName() { + return wrapper.getFileName(); + } + + @Override + public Path getParent() { + return wrapper.getParent(); + } + + @Override + public int getNameCount() { + return wrapper.getNameCount(); + } + + @Override + public Path getName(int index) { + return wrapper.getName(index); + } + + @Override + public Path subpath(int beginIndex, int endIndex) { + return wrapper.subpath(beginIndex, endIndex); + } + + @Override + public boolean startsWith(Path other) { + return wrapper.startsWith(other); + } + + @Override + public boolean startsWith(String other) { + return wrapper.startsWith(other); + } + + @Override + public boolean endsWith(Path other) { + return wrapper.endsWith(other); + } + + @Override + public boolean endsWith(String other) { + return wrapper.endsWith(other); + } + + @Override + public Path normalize() { + return wrapper.normalize(); + } + + @Override + public CloseSuppressPath resolve(Path other) { + return of(wrapper.resolve(other)); + } + + @Override + public CloseSuppressPath resolve(String other) { + return of(wrapper.resolve(other)); + } + + @Override + public CloseSuppressPath resolveSibling(Path other) { + return of(wrapper.resolveSibling(other)); + } + + @Override + public CloseSuppressPath resolveSibling(String other) { + return of(wrapper.resolveSibling(other)); + } + + @Override + public CloseSuppressPath relativize(Path other) { + return of(wrapper.relativize(other)); + } + + @Override + public URI toUri() { + return wrapper.toUri(); + } + + @Override + public Path toAbsolutePath() { + return wrapper.toAbsolutePath(); + } + + @Override + public Path toRealPath(LinkOption... options) throws IOException { + return wrapper.toRealPath(options); + } + + @Override + public File toFile() { + return wrapper.toFile(); + } + + @Override + public WatchKey register(WatchService watcher, WatchEvent.Kind[] events, WatchEvent.Modifier... modifiers) throws IOException { + return wrapper.register(watcher, events, modifiers); + } + + @Override + public WatchKey register(WatchService watcher, WatchEvent.Kind... 
events) throws IOException { + return wrapper.register(watcher, events); + } + + @Override + public Iterator iterator() { + return wrapper.iterator(); + } + + @Override + public int compareTo(Path other) { + return wrapper.compareTo(other); + } + + @Override + public boolean equals(Object other) { + if (other instanceof CloseSuppressPath) { + return wrapper.equals(((CloseSuppressPath) other).wrapper); + } + return wrapper.equals(other); + } + + @Override + public int hashCode() { + return wrapper.hashCode(); + } + + @Override + public String toString() { + return wrapper.toString(); + } + + @Override + public void forEach(Consumer action) { + wrapper.forEach(action); + } + + @Override + public Spliterator spliterator() { + return wrapper.spliterator(); + } + + private InputStream openInputStream(boolean buffered) throws IOException { + if (buffered) { + return openInputStream(BUFFER_SIZE); + } else { + return Files.newInputStream(wrapper); + } + } + + public InputStream openInputStream(int bufferSize) throws IOException { + return new BufferedInputStream(openInputStream(false), bufferSize); + } + + private OutputStream openOutputStream(boolean buffered) throws IOException { + if (buffered) { + return openOutputStream(BUFFER_SIZE); + } else { + return Files.newOutputStream(wrapper); + } + } + + public OutputStream openOutputStream(int bufferSize) throws IOException { + return new BufferedOutputStream(openOutputStream(false), bufferSize); + } + + /** + * close this path with a delete recurse instead of delete if exists + */ + public void closeWithDeleteRecurse() { + isDir = true; + } + + public void mkdirs() throws IOException { + Files.createDirectories(wrapper); + } + + public Path getJavaPath() { + return wrapper; + } + + @Override + public void close() throws IOException { + if (isDir) { + IOUtil.deleteDirRecurse(wrapper); + } else { + Files.deleteIfExists(wrapper); + } + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/IOUtil.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/IOUtil.java index f4275796..09d89b3c 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/IOUtil.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/IOUtil.java @@ -28,25 +28,54 @@ import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; import org.apache.commons.compress.compressors.xz.XZCompressorInputStream; +import org.rdfhdt.hdt.enums.CompressionType; import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.util.Reference; import org.rdfhdt.hdt.util.string.ByteStringUtil; -import pl.edu.icm.jlargearrays.LargeArrayUtils; -import java.io.*; +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.BufferedReader; +import java.io.ByteArrayOutputStream; +import java.io.Closeable; +import java.io.EOFException; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStream; import java.net.URL; import java.net.URLConnection; +import java.nio.file.FileVisitResult; +import java.nio.file.FileVisitor; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.attribute.BasicFileAttributes; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import java.util.zip.GZIPInputStream; + 
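// Minimal usage sketch for the CloseSuppressPath wrapper defined above: the temporary file
// (or directory, when closeWithDeleteRecurse() was called) is deleted when the path is closed.
// The file name is illustrative.
import org.rdfhdt.hdt.util.io.CloseSuppressPath;

import java.io.IOException;
import java.io.OutputStream;

class CloseSuppressPathSketch {
    public static void main(String[] args) throws IOException {
        try (CloseSuppressPath tmp = CloseSuppressPath.of("chunk.tmp")) {
            try (OutputStream out = tmp.openOutputStream(CloseSuppressPath.BUFFER_SIZE)) {
                out.write(42);
            }
            // ... read the chunk back, merge it with others, etc. ...
        } // chunk.tmp is deleted here
    }
}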
+import pl.edu.icm.jlargearrays.LargeArrayUtils; + import java.nio.ByteBuffer; import java.nio.channels.FileChannel; import java.util.*; -import java.util.zip.GZIPInputStream; /** * @author mario.arias - * */ public class IOUtil { private static int mappedBuffer; - private IOUtil() {} + + private IOUtil() { + } /** * clean direct allocated buffer @@ -165,25 +194,31 @@ private static void throwIOOrRuntime(Throwable t) throws IOException { } public static InputStream getFileInputStream(String fileName) throws IOException { + return getFileInputStream(fileName, true); + } + + public static InputStream getFileInputStream(String fileName, boolean uncompress) throws IOException { InputStream input; String name = fileName.toLowerCase(); - if(name.startsWith("http:/") || name.startsWith("ftp:/")) { + if (name.startsWith("http:/") || name.startsWith("ftp:/")) { URL url = new URL(fileName); URLConnection con = url.openConnection(); - con.connect(); - input = con.getInputStream(); - } else if(name.equals("-")) { + con.connect(); + input = con.getInputStream(); + } else if (name.equals("-")) { input = new BufferedInputStream(System.in); } else { input = new BufferedInputStream(new FileInputStream(fileName)); } - if(name.endsWith(".gz")||name.endsWith(".tgz")) { - input = new GZIPInputStream(input); - } else if(name.endsWith("bz2") || name.endsWith("bz")) { - input = new BZip2CompressorInputStream(input, true); - } else if(name.endsWith("xz")) { - input = new XZCompressorInputStream(input, true); + if (uncompress) { + if (name.endsWith(".gz") || name.endsWith(".tgz")) { + input = new GZIPInputStream(input); + } else if (name.endsWith("bz2") || name.endsWith("bz")) { + input = new BZip2CompressorInputStream(input, true); + } else if (name.endsWith("xz")) { + input = new XZCompressorInputStream(input, true); + } } return input; } @@ -194,12 +229,12 @@ public static BufferedReader getFileReader(String fileName) throws IOException { public static String readLine(InputStream in, char character) throws IOException { ByteArrayOutputStream buf = new ByteArrayOutputStream(); - while(true) { + while (true) { int value = in.read(); - if(value==-1) { + if (value == -1) { throw new EOFException(); } - if(value==character) { + if (value == character) { break; } buf.write(value); @@ -209,12 +244,12 @@ public static String readLine(InputStream in, char character) throws IOException public static String readChars(InputStream in, int numChars) throws IOException { StringBuilder out = new StringBuilder(); - for(int i=0;in ? n-total : buffer.length); + total += len; + len = (int) (total + buffer.length > n ? 
n - total : buffer.length); } } @@ -277,33 +312,35 @@ public static void decompressGzip(File src, File trgt) throws IOException { } finally { out.close(); } - }finally { + } finally { in.close(); } } /** * Write long, little endian + * * @param output * @param value * @throws IOException */ public static void writeLong(OutputStream output, long value) throws IOException { byte[] writeBuffer = new byte[8]; - writeBuffer[7] = (byte)(value >>> 56); - writeBuffer[6] = (byte)(value >>> 48); - writeBuffer[5] = (byte)(value >>> 40); - writeBuffer[4] = (byte)(value >>> 32); - writeBuffer[3] = (byte)(value >>> 24); - writeBuffer[2] = (byte)(value >>> 16); - writeBuffer[1] = (byte)(value >>> 8); - writeBuffer[0] = (byte)(value); + writeBuffer[7] = (byte) (value >>> 56); + writeBuffer[6] = (byte) (value >>> 48); + writeBuffer[5] = (byte) (value >>> 40); + writeBuffer[4] = (byte) (value >>> 32); + writeBuffer[3] = (byte) (value >>> 24); + writeBuffer[2] = (byte) (value >>> 16); + writeBuffer[1] = (byte) (value >>> 8); + writeBuffer[0] = (byte) (value); output.write(writeBuffer, 0, 8); } /** * Read long, little endian. + * * @param input * @throws IOException */ @@ -311,25 +348,26 @@ public static long readLong(InputStream input) throws IOException { int n = 0; byte[] readBuffer = new byte[8]; while (n < 8) { - int count = input.read(readBuffer, n , 8-n); + int count = input.read(readBuffer, n, 8 - n); if (count < 0) throw new EOFException(); n += count; } - return ((long)readBuffer[7] << 56) + - ((long)(readBuffer[6] & 255) << 48) + - ((long)(readBuffer[5] & 255) << 40) + - ((long)(readBuffer[4] & 255) << 32) + - ((long)(readBuffer[3] & 255) << 24) + - ((readBuffer[2] & 255) << 16) + - ((readBuffer[1] & 255) << 8) + - ((readBuffer[0] & 255) - ); + return ((long) readBuffer[7] << 56) + + ((long) (readBuffer[6] & 255) << 48) + + ((long) (readBuffer[5] & 255) << 40) + + ((long) (readBuffer[4] & 255) << 32) + + ((long) (readBuffer[3] & 255) << 24) + + ((readBuffer[2] & 255) << 16) + + ((readBuffer[1] & 255) << 8) + + ((readBuffer[0] & 255) + ); } /** * Write int, little endian + * * @param output * @param value * @throws IOException @@ -337,10 +375,10 @@ public static long readLong(InputStream input) throws IOException { public static void writeInt(OutputStream output, int value) throws IOException { byte[] writeBuffer = new byte[4]; writeBuffer[0] = (byte) (value & 0xFF); - writeBuffer[1] = (byte) ((value>>8) & 0xFF); - writeBuffer[2] = (byte) ((value>>16) & 0xFF); - writeBuffer[3] = (byte) ((value>>24) & 0xFF); - output.write(writeBuffer,0,4); + writeBuffer[1] = (byte) ((value >> 8) & 0xFF); + writeBuffer[2] = (byte) ((value >> 16) & 0xFF); + writeBuffer[3] = (byte) ((value >> 24) & 0xFF); + output.write(writeBuffer, 0, 4); } /** @@ -349,14 +387,15 @@ public static void writeInt(OutputStream output, int value) throws IOException { public static byte[] intToByteArray(int value) { byte[] writeBuffer = new byte[4]; writeBuffer[0] = (byte) (value & 0xFF); - writeBuffer[1] = (byte) ((value>>8) & 0xFF); - writeBuffer[2] = (byte) ((value>>16) & 0xFF); - writeBuffer[3] = (byte) ((value>>24) & 0xFF); + writeBuffer[1] = (byte) ((value >> 8) & 0xFF); + writeBuffer[2] = (byte) ((value >> 16) & 0xFF); + writeBuffer[3] = (byte) ((value >> 24) & 0xFF); return writeBuffer; } /** * Read int, little endian + * * @param in input * @return integer * @throws IOException @@ -373,29 +412,30 @@ public static int readInt(InputStream in) throws IOException { /** * Convert byte array to int, little endian + * * @param value */ - 
public static int byteArrayToInt(byte[] value){ + public static int byteArrayToInt(byte[] value) { return (value[3] << 24) + (value[2] << 16) + (value[1] << 8) + (value[0] << 0); } /** - * @param input din - * @param length bytes + * @param input din + * @param length bytes * @param listener */ public static byte[] readBuffer(InputStream input, int length, ProgressListener listener) throws IOException { int nRead; - int pos=0; + int pos = 0; byte[] data = new byte[length]; - while ((nRead = input.read(data, pos, length-pos)) >0) { + while ((nRead = input.read(data, pos, length - pos)) > 0) { // TODO: Notify progress listener pos += nRead; } - if(pos!=length) { - throw new IOException("EOF while reading array from InputStream"); + if (pos != length) { + throw new EOFException("EOF while reading array from InputStream"); } return data; @@ -404,8 +444,8 @@ public static byte[] readBuffer(InputStream input, int length, ProgressListener public static CharSequence toBinaryString(long val) { StringBuilder str = new StringBuilder(64); int bits = 64; - while(bits-- != 0) { - str.append(((val>>>bits) & 1) !=0 ? '1' : '0'); + while (bits-- != 0) { + str.append(((val >>> bits) & 1) != 0 ? '1' : '0'); } return str; } @@ -413,8 +453,8 @@ public static CharSequence toBinaryString(long val) { public static CharSequence toBinaryString(int val) { StringBuilder str = new StringBuilder(32); int bits = 32; - while(bits-- != 0) { - str.append(((val>>>bits) & 1) !=0 ? '1' : '0'); + while (bits-- != 0) { + str.append(((val >>> bits) & 1) != 0 ? '1' : '0'); } return str; } @@ -425,8 +465,8 @@ public static void printBitsln(long val, int bits) { } public static void printBits(long val, int bits) { - while(bits-- != 0) { - System.out.print( ((val>>>bits) & 1) !=0 ? '1' : '0'); + while (bits-- != 0) { + System.out.print(((val >>> bits) & 1) != 0 ? '1' : '0'); } } @@ -438,7 +478,7 @@ public static short readShort(InputStream in) throws IOException { throw new EOFException(); } - return (short)((ch2 << 8) + (ch1)); + return (short) ((ch2 << 8) + (ch1)); } public static void writeShort(OutputStream out, short value) throws IOException { @@ -451,7 +491,7 @@ public static byte readByte(InputStream in) throws IOException { if (b < 0) { throw new EOFException(); } - return (byte)(b&0xFF); + return (byte) (b & 0xFF); } public static void writeByte(OutputStream out, byte value) throws IOException { @@ -461,18 +501,18 @@ public static void writeByte(OutputStream out, byte value) throws IOException { // InputStream might not skip the specified number of bytes. This call makes multiple calls // if needed to ensure that the desired number of bytes is actually skipped. 
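// Minimal usage sketch for the getFileInputStream(fileName, uncompress) overload added earlier
// in this file: the boolean controls whether the stream is transparently decompressed based on
// the file extension. The file name is illustrative.
import org.rdfhdt.hdt.util.io.IOUtil;

import java.io.IOException;
import java.io.InputStream;

class FileInputSketch {
    public static void main(String[] args) throws IOException {
        // decompressed stream, same behaviour as the one-argument version
        try (InputStream plain = IOUtil.getFileInputStream("dataset.nt.gz")) {
            // ... parse the N-Triples ...
        }
        // raw gzip bytes, e.g. to let another component handle the compression
        try (InputStream raw = IOUtil.getFileInputStream("dataset.nt.gz", false)) {
            // ... copy the compressed bytes somewhere ...
        }
    }
}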
public static void skip(InputStream in, long n) throws IOException { - if(n==0) { + if (n == 0) { return; } long totalSkipped = in.skip(n); - while(totalSkipped() { + @Override + public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) { + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { + Files.delete(file); + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult visitFileFailed(Path file, IOException exc) { + return FileVisitResult.TERMINATE; + } + + @Override + public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException { + Files.delete(dir); + return FileVisitResult.CONTINUE; + } + }); + } + } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeMergeIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeMergeIterator.java new file mode 100644 index 00000000..816ed280 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeMergeIterator.java @@ -0,0 +1,26 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.iterator.utils.MergeExceptionIterator; +import org.rdfhdt.hdt.triples.IndexedNode; + +import java.io.IOException; +import java.util.Comparator; +import java.util.List; +import java.util.function.Function; + +/** + * Version of {@link org.rdfhdt.hdt.iterator.utils.MergeExceptionIterator} with {@link org.rdfhdt.hdt.triples.IndexedNode} + * @author Antoine Willerval + */ +public class CompressNodeMergeIterator extends MergeExceptionIterator { + + public CompressNodeMergeIterator(ExceptionIterator in1, ExceptionIterator in2) { + super(in1, in2, IndexedNode::compareTo); + } + + public static > ExceptionIterator buildOfTree( + T[] lst) { + return buildOfTree(it -> it, IndexedNode::compareTo, lst, 0, lst.length); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeReader.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeReader.java new file mode 100644 index 00000000..0b5f0916 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeReader.java @@ -0,0 +1,90 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.rdfhdt.hdt.compact.integer.VByte; +import org.rdfhdt.hdt.exceptions.CRCException; +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.util.crc.CRC32; +import org.rdfhdt.hdt.util.crc.CRC8; +import org.rdfhdt.hdt.util.crc.CRCInputStream; +import org.rdfhdt.hdt.util.string.ReplazableString; + +import java.io.Closeable; +import java.io.IOException; +import java.io.InputStream; + +/** + * Class to read a compress node file + * + * @author Antoine Willerval + */ +public class CompressNodeReader implements ExceptionIterator, Closeable { + private final CRCInputStream stream; + private final long size; + private long index; + private boolean waiting; + private final IndexedNode last; + private final ReplazableString tempString; + + public CompressNodeReader(InputStream stream) throws IOException { + this.stream = new CRCInputStream(stream, new CRC8()); + this.size = VByte.decode(this.stream); + if(!this.stream.readCRCAndCheck()) { + throw new CRCException("CRC Error while merging Section Plain Front Coding Header."); + } + this.stream.setCRC(new CRC32()); + this.tempString = 
new ReplazableString(); + this.last = new IndexedNode(tempString, -1); + } + + public long getSize() { + return size; + } + + public void checkComplete() throws IOException { + if(!this.stream.readCRCAndCheck()) { + throw new CRCException("CRC Error while merging Section Plain Front Coding Header."); + } + } + + /** + * @return the next element without passing to the next element + * @throws IOException reading exception + */ + public IndexedNode read() throws IOException { + if (waiting) { + return last; + } + int delta = (int) VByte.decode(stream); + tempString.replace2(stream, delta); + long index = VByte.decode(stream); + last.setIndex(index); + waiting = true; + return last; + } + + /** + * pass to the next element, mandatory with {@link #read()} + */ + public void pass() { + waiting = false; + index++; + } + + @Override + public IndexedNode next() throws IOException { + IndexedNode node = read(); + pass(); + return node; + } + @Override + public boolean hasNext() throws IOException { + return index < size; + } + + @Override + public void close() throws IOException { + stream.close(); + } + +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeWriter.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeWriter.java new file mode 100644 index 00000000..ebdcc534 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressNodeWriter.java @@ -0,0 +1,62 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.rdfhdt.hdt.compact.integer.VByte; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.util.crc.CRC32; +import org.rdfhdt.hdt.util.crc.CRC8; +import org.rdfhdt.hdt.util.crc.CRCOutputStream; +import org.rdfhdt.hdt.util.string.ByteStringUtil; +import org.rdfhdt.hdt.util.string.CompactString; +import org.rdfhdt.hdt.util.string.ReplazableString; + +import java.io.Closeable; +import java.io.IOException; +import java.io.OutputStream; + +/** + * Class to write a compress node file + * + * @author Antoine Willerval + */ +public class CompressNodeWriter implements Closeable { + private final CRCOutputStream out; + private final ReplazableString previousStr = new ReplazableString(); + + public CompressNodeWriter(OutputStream stream, long size) throws IOException { + this.out = new CRCOutputStream(stream, new CRC8()); + VByte.encode(this.out, size); + this.out.writeCRC(); + this.out.setCRC(new CRC32()); + } + + public void appendNode(IndexedNode node) throws IOException { + CharSequence str = node.getNode(); + long index = node.getIndex(); + + // to avoid bad longestCommonPrefix call + // cf: https://github.com/rdfhdt/hdt-java/issues/165 + if (str instanceof String) { + str = new CompactString(str); + } + + // Find common part. 
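        // Each node is front-coded against the previous one. The record layout produced
        // by the lines below is:
        //   [VByte delta = length of prefix shared with the previous node]
        //   [remaining suffix bytes][0x00 terminator][VByte original index of the node]
        // For example (illustrative values), writing "http://a" and then "http://ab"
        // stores the second node as delta 8 followed by the single byte 'b'.
        // CompressNodeReader.read() mirrors this: it decodes the delta and refills only
        // the suffix of its reusable buffer before reading the index.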
+ int delta = ByteStringUtil.longestCommonPrefix(previousStr, str); + // Write Delta in VByte + VByte.encode(out, delta); + // Write remaining + ByteStringUtil.append(out, str, delta); + out.write(0); // End of string + VByte.encode(out, index); // index of the node + previousStr.replace(str); + } + + public void writeCRC() throws IOException { + out.writeCRC(); + } + + @Override + public void close() throws IOException{ + writeCRC(); + out.close(); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressTripleMergeIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressTripleMergeIterator.java new file mode 100644 index 00000000..f9dcfa13 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressTripleMergeIterator.java @@ -0,0 +1,27 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.rdfhdt.hdt.enums.TripleComponentOrder; +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.iterator.utils.MergeExceptionIterator; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.triples.TripleID; +import org.rdfhdt.hdt.triples.TripleIDComparator; + +import java.io.IOException; +import java.util.List; + +/** + * Version of {@link org.rdfhdt.hdt.iterator.utils.MergeExceptionIterator} with {@link org.rdfhdt.hdt.triples.TripleID} + * @author Antoine Willerval + */ +public class CompressTripleMergeIterator extends MergeExceptionIterator { + + public CompressTripleMergeIterator(ExceptionIterator in1, ExceptionIterator in2, TripleComponentOrder order) { + super(in1, in2, TripleIDComparator.getComparator(order)); + } + + public static > ExceptionIterator buildOfTree( + T[] lst, TripleComponentOrder order) { + return buildOfTree(it -> it, TripleIDComparator.getComparator(order), lst, 0, lst.length); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressTripleReader.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressTripleReader.java new file mode 100644 index 00000000..2de3cbb9 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressTripleReader.java @@ -0,0 +1,87 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.rdfhdt.hdt.compact.integer.VByte; +import org.rdfhdt.hdt.exceptions.CRCException; +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.triples.TripleID; +import org.rdfhdt.hdt.util.crc.CRC32; +import org.rdfhdt.hdt.util.crc.CRCInputStream; + +import java.io.Closeable; +import java.io.IOException; +import java.io.InputStream; + +/** + * Class to read and map pre-mapped a triples file + * + * @author Antoine Willerval + */ +public class CompressTripleReader implements ExceptionIterator, Closeable { + private final CRCInputStream stream; + private final TripleID next = new TripleID(-1, -1, -1); + private boolean read = false, end = false; + + public CompressTripleReader(InputStream stream) { + this.stream = new CRCInputStream(stream, new CRC32()); + } + + @Override + public boolean hasNext() throws IOException { + if (read) { + return true; + } + + // the reader is empty, null end triple + if (end) { + return false; + } + + long s, p, o; + + do { + s = VByte.decode(stream); + p = VByte.decode(stream); + o = VByte.decode(stream); + // continue to read to avoid duplicated triples + } while (s == next.getSubject() && p == next.getPredicate() && o == next.getObject()); + + return !setAllOrEnd(s, p, o); + } + + private boolean setAllOrEnd(long s, long 
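            // Framing note (see CompressTripleWriter below): a triple stream is terminated
            // by an all-zero triple followed by a CRC32, so an all-zero read here means
            // end-of-section and triggers the CRC check, while a partially-zero triple is
            // rejected as corruption.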
p, long o) throws IOException { + if (end) { + // already completed + return true; + } + if (s == 0 || p == 0 || o == 0) { + // check triples validity + if (s != 0 || p != 0 || o != 0) { + throw new IOException("Triple got null node, but not all the nodes are 0! " + s + " " + p + " " + o); + } + if (!stream.readCRCAndCheck()) { + throw new CRCException("CRC Error while reading PreMapped triples."); + } + // set to true to avoid reading again the CRC + end = true; + return true; + } + // map the triples to the end id, compute the shared with the end shared size + next.setAll(s, p, o); + read = true; + return false; + } + + @Override + public TripleID next() throws IOException { + if (!hasNext()) { + return null; + } + read = false; + return next; + } + + @Override + public void close() throws IOException { + stream.close(); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressTripleWriter.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressTripleWriter.java new file mode 100644 index 00000000..837e1206 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressTripleWriter.java @@ -0,0 +1,61 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.rdfhdt.hdt.compact.integer.VByte; +import org.rdfhdt.hdt.triples.IndexedTriple; +import org.rdfhdt.hdt.triples.TripleID; +import org.rdfhdt.hdt.util.crc.CRC32; +import org.rdfhdt.hdt.util.crc.CRCOutputStream; + +import java.io.Closeable; +import java.io.IOException; +import java.io.OutputStream; + +/** + * Class to write pre-mapped triples file + * + * @author Antoine Willerval + */ +public class CompressTripleWriter implements Closeable { + private final CRCOutputStream out; + + public CompressTripleWriter(OutputStream writer) { + this.out = new CRCOutputStream(writer, new CRC32()); + } + /** + * write a indexed triple into an output + * @param triple the triple to write + * @throws java.io.IOException write exception + */ + public void appendTriple(IndexedTriple triple) throws IOException { + VByte.encode(out, triple.getSubject().getIndex()); + VByte.encode(out, triple.getPredicate().getIndex()); + VByte.encode(out, triple.getObject().getIndex()); + } + /** + * write a indexed triple into an output + * @param triple the triple to write + * @throws java.io.IOException write exception + */ + public void appendTriple(TripleID triple) throws IOException { + VByte.encode(out, triple.getSubject()); + VByte.encode(out, triple.getPredicate()); + VByte.encode(out, triple.getObject()); + } + + /** + * Write an end triple and a CRC to complete the writer + * @throws IOException write error + */ + public void writeCRC() throws IOException { + VByte.encode(out, 0); + VByte.encode(out, 0); + VByte.encode(out, 0); + out.writeCRC(); + } + + @Override + public void close() throws IOException { + writeCRC(); + out.close(); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressUtil.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressUtil.java new file mode 100644 index 00000000..4849a0a9 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/CompressUtil.java @@ -0,0 +1,202 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.util.string.CharSequenceComparator; +import org.rdfhdt.hdt.util.string.ReplazableString; + +import 
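// A worked example of the header-id scheme defined below (illustrative values):
// getHeaderId(3) == 6 (the plain id shifted left by INDEX_SHIFT) and asShared(3) == 7
// (the same with SHARED_MASK set). With sharedCount == 2, computeSharedNode(7, 2) == 3,
// so shared ids keep their position, while computeSharedNode(6, 2) == 3 + 2 == 5,
// presumably so non-shared ids land after the shared section.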
java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.Iterator; +import java.util.List; +import java.util.Objects; + +/** + * Utility class to manipulate compressed node + * + * @author Antoine Willerval + */ +public class CompressUtil { + /** + * the mask for shared computed compressed node + */ + public static final long SHARED_MASK = 1L; + /** + * shift after the SHARED/DUPLICATES + */ + public static final int INDEX_SHIFT = 1; + + /** + * write a sorted list of indexed node + * + * @param strings the nodes to write + * @param output the output + * @param listener the listener to see the progress + * @throws IOException writing exception + */ + public static void writeCompressedSection(List strings, OutputStream output, ProgressListener listener) throws IOException { + writeCompressedSection(ExceptionIterator.of(strings.iterator()), strings.size(), output, listener); + } + + /** + * write a sorted iterator of indexed node + * + * @param it iterator to write + * @param size size of the iterator + * @param output the output where to write + * @param listener the listener to see the progress + * @throws IOException writing exception + */ + public static void writeCompressedSection(ExceptionIterator it, long size, OutputStream output, ProgressListener listener) throws IOException { + CompressNodeWriter writer = new CompressNodeWriter(output, size); + long element = 0; + long block = size < 10 ? 1 : size / 10; + while (it.hasNext()) { + if (listener != null && element % block == 0) { + listener.notifyProgress((float) (10 * element / block), "write section " + element + "/" + size); + } + writer.appendNode(it.next()); + element++; + } + it.forEachRemaining(writer::appendNode); + writer.writeCRC(); + if (listener != null) { + listener.notifyProgress(100, "section completed " + size + " nodes"); + } + } + + /** + * merge two stream together into an output stream + * + * @param stream1 input stream 1 + * @param stream2 input stream 2 + * @param output output stream + * @param listener the listener to see the progress + * @throws IOException read/writing exception + */ + public static void mergeCompressedSection(InputStream stream1, InputStream stream2, OutputStream output, ProgressListener listener) throws IOException { + CompressNodeReader in1r = new CompressNodeReader(stream1); + CompressNodeReader in2r = new CompressNodeReader(stream2); + + long size1 = in1r.getSize(); + long size2 = in2r.getSize(); + + // merge the section + writeCompressedSection(new CompressNodeMergeIterator(in1r, in2r), size1 + size2, output, listener); + // check we have completed the 2 readers + in1r.checkComplete(); + in2r.checkComplete(); + } + + /** + * compute the shared-computed id from a shared-computable id + * + * @param id the shared-computable id + * @param sharedCount the count of shared elements + * @return the shared-computed element + */ + public static long computeSharedNode(long id, long sharedCount) { + if ((id & SHARED_MASK) != 0) { + // shared element + return CompressUtil.getId(id); + } + // not shared + return CompressUtil.getId(id) + sharedCount; + } + + /** + * convert this id to a shared-computable element + * + * @param id the id + * @return shared-computable element + */ + public static long asShared(long id) { + return getHeaderId(id) | SHARED_MASK; + } + + /** + * get the id from a header id + * @param headerId the header id + * @return the id + */ + public static long getId(long headerId) { + return headerId >>> INDEX_SHIFT; + } + + /** + * get a 
header id from an id + * @param id the id + * @return the header id + */ + public static long getHeaderId(long id) { + return id << INDEX_SHIFT; + } + + /** + * @return a char sequence base iterator view of this iterator + */ + public static DuplicatedIterator asNoDupeCharSequenceIterator(ExceptionIterator nodes, DuplicatedNodeConsumer duplicatedNodeConsumer) { + return new DuplicatedIterator(nodes.asIterator(), duplicatedNodeConsumer); + } + + @FunctionalInterface + public interface DuplicatedNodeConsumer { + void onDuplicated(long originalIndex, long duplicatedIndex, long originalHeader); + } + + public static class DuplicatedIterator implements Iterator { + private final Iterator it; + private final ReplazableString prev = new ReplazableString(); + private IndexedNode next; + private long id; + private final DuplicatedNodeConsumer duplicatedNodeConsumer; + private long lastHeader; + + DuplicatedIterator(Iterator it, DuplicatedNodeConsumer duplicatedNodeConsumer) { + this.it = it; + this.duplicatedNodeConsumer = Objects.requireNonNullElseGet(duplicatedNodeConsumer, () -> (i, j, k) -> { + }); + } + + @Override + public boolean hasNext() { + if (next != null) { + return true; + } + while (it.hasNext()) { + IndexedNode node = it.next(); + CharSequence next = node.getNode(); + if (CharSequenceComparator.getInstance().compare(prev, next) == 0) { + // same as previous, ignore + assert this.id != node.getIndex() : "same index and prevIndex"; + duplicatedNodeConsumer.onDuplicated(this.id, node.getIndex(), lastHeader); + continue; + } + this.next = node; + prev.replace(next); + this.id = node.getIndex(); + return true; + } + return false; + } + + @Override + public IndexedNode next() { + if (!hasNext()) { + return null; + } + IndexedNode old = next; + next = null; + return old; + } + + public void setLastHeader(long lastHeader) { + this.lastHeader = lastHeader; + } + } + + private CompressUtil() { + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/MapCompressTripleMerger.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/MapCompressTripleMerger.java new file mode 100644 index 00000000..118adf36 --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/MapCompressTripleMerger.java @@ -0,0 +1,252 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.rdfhdt.hdt.enums.TripleComponentOrder; +import org.rdfhdt.hdt.hdt.impl.diskimport.CompressTripleMapper; +import org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult; +import org.rdfhdt.hdt.hdt.impl.diskimport.TripleCompressionResult; +import org.rdfhdt.hdt.hdt.impl.diskimport.TripleCompressionResultFile; +import org.rdfhdt.hdt.hdt.impl.diskimport.TripleCompressionResultPartial; +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.iterator.utils.FileTripleIDIterator; +import org.rdfhdt.hdt.listener.MultiThreadListener; +import org.rdfhdt.hdt.triples.TripleID; +import org.rdfhdt.hdt.triples.TripleIDComparator; +import org.rdfhdt.hdt.util.ParallelSortableArrayList; +import org.rdfhdt.hdt.util.concurrent.TreeWorker; +import org.rdfhdt.hdt.util.io.CloseSuppressPath; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.listener.IntermediateListener; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * TreeWorkerObject implementation to map and merge tripleID from a 
compress triple file + * + * @author Antoine Willerval + */ +public class MapCompressTripleMerger implements TreeWorker.TreeWorkerObject { + private static final Logger log = LoggerFactory.getLogger(MapCompressTripleMerger.class); + private final AtomicInteger FID = new AtomicInteger(); + private final CloseSuppressPath baseFileName; + private final FileTripleIDIterator source; + private final CompressTripleMapper mapper; + private final MultiThreadListener listener; + private final TripleComponentOrder order; + private final int bufferSize; + private boolean done; + private long triplesCount = 0; + + public MapCompressTripleMerger(CloseSuppressPath baseFileName, FileTripleIDIterator it, CompressTripleMapper mapper, MultiThreadListener listener, TripleComponentOrder order, int bufferSize) { + this.baseFileName = baseFileName; + this.source = it; + this.mapper = mapper; + this.listener = listener; + this.order = order; + this.bufferSize = bufferSize; + } + + @Override + public TripleFile construct(TripleFile[] tripleFiles, int count) { + try { + int fid = FID.incrementAndGet(); + CloseSuppressPath triplesFiles = baseFileName.resolve("triples" + fid + ".raw"); + long triples = 0; + listener.notifyProgress(0, "merging triples " + triplesFiles.getFileName()); + CompressTripleReader[] readers = new CompressTripleReader[count]; + try { + for (int i = 0; i < count; i++) { + readers[i] = new CompressTripleReader(tripleFiles[i].path.openInputStream(bufferSize)); + } + + try (CompressTripleWriter w = new CompressTripleWriter(triplesFiles.openOutputStream(bufferSize))) { + ExceptionIterator it = CompressTripleMergeIterator.buildOfTree(readers, order); + while (it.hasNext()) { + w.appendTriple(it.next()); + triples++; + } + } + } finally { + IOUtil.closeAll(readers); + } + listener.notifyProgress(100, "triples merged " + triplesFiles.getFileName()); + // delete old triples + for (int i = 0; i < count; i++) { + delete(tripleFiles[i]); + } + return new TripleFile(triples, triplesFiles); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Override + public void delete(TripleFile f) { + try { + f.close(); + } catch (IOException e) { + log.warn("Can't delete triple file {}", f.path, e); + } + } + + + @Override + public BufferedTriples get() { + if (done || !source.hasNewFile()) { + done = true; + return null; + } + BufferedTriples buffer = new BufferedTriples(); + ParallelSortableArrayList tripleIDS = buffer.triples; + listener.notifyProgress(10, "reading triples part2 " + triplesCount); + while (source.hasNext()) { + if (tripleIDS.size() == Integer.MAX_VALUE - 5) { + source.forceNewFile(); + continue; + } + TripleID next = source.next(); + TripleID mappedTriple = new TripleID( + mapper.extractSubject(next.getSubject()), + mapper.extractPredicate(next.getPredicate()), + mapper.extractObjects(next.getObject()) + ); + assert mappedTriple.isValid(); + tripleIDS.add(mappedTriple); + triplesCount++; + if (triplesCount % 100_000 == 0) { + listener.notifyProgress(10, "reading triples part2 " + triplesCount); + } + } + + return buffer; + } + + @Override + public TripleFile map(BufferedTriples buffer) { + try { + ParallelSortableArrayList tripleIDS = buffer.triples; + tripleIDS.parallelSort(TripleIDComparator.getComparator(order)); + int fid = FID.incrementAndGet(); + CloseSuppressPath triplesFiles = baseFileName.resolve("triples" + fid + ".raw"); + long triples = 0; + int count = 0; + int block = tripleIDS.size() < 10 ? 
1 : tripleIDS.size() / 10; + IntermediateListener il = new IntermediateListener(listener); + il.setRange(70, 100); + il.setPrefix("writing triples " + triplesFiles.getFileName() + " "); + try (CompressTripleWriter w = new CompressTripleWriter(triplesFiles.openOutputStream(bufferSize))) { + il.notifyProgress(0, "creating file"); + TripleID prev = new TripleID(-1, -1, -1); + for (TripleID triple : tripleIDS) { + count++; + if (count % block == 0) { + il.notifyProgress(count / (block / 10f), "writing triples " + count + "/" + tripleIDS.size()); + } + if (prev.match(triple)) { + continue; + } + prev.setAll(triple.getSubject(), triple.getPredicate(), triple.getObject()); + w.appendTriple(triple); + triples++; + } + listener.notifyProgress(100, "writing completed " + triplesCount + " " + triplesFiles.getFileName()); + } + return new TripleFile(triples, triplesFiles); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + /** + * merge these triples into a file + * + * @param workers number of worker + * @param nodePerMerge the number of node layer per merge + * @return result + * @throws TreeWorker.TreeWorkerException TreeWorker error + * @throws InterruptedException thread interruption + * @throws IOException io error + */ + public TripleCompressionResult mergeToFile(int workers, int nodePerMerge) throws TreeWorker.TreeWorkerException, InterruptedException, IOException { + // force to create the first file + TreeWorker treeWorker = new TreeWorker<>(this, TripleFile[]::new, workers, nodePerMerge); + treeWorker.setListener(listener); + treeWorker.start(); + // wait for the workers to merge the sections and create the triples + CloseSuppressPath triples = treeWorker.waitToComplete().path; + return new TripleCompressionResultFile(triplesCount, triples, order, bufferSize); + } + + /** + * merge these triples while reading them, increase the memory usage + * + * @return result + * @throws IOException io error + */ + public TripleCompressionResult mergeToPartial() throws IOException { + BufferedTriples triples; + List files = new ArrayList<>(); + try { + while ((triples = get()) != null) { + files.add(map(triples).path); + } + } catch (RuntimeException e) { + IOUtil.closeAll(files); + throw e; + } + return new TripleCompressionResultPartial(files, triplesCount, order, bufferSize); + } + + /** + * merge the triples into a result + * + * @param workers number of workers (complete mode) + * @param mode the mode of merging + * @param nodePerMerge the number of node layer per merge + * @return result + * @throws TreeWorker.TreeWorkerException TreeWorker error (complete mode) + * @throws InterruptedException thread interruption (complete mode) + * @throws IOException io error + */ + public TripleCompressionResult merge(int workers, int nodePerMerge, String mode) throws TreeWorker.TreeWorkerException, InterruptedException, IOException { + if (mode == null) { + mode = ""; + } + switch (mode) { + case "": + case CompressionResult.COMPRESSION_MODE_COMPLETE: + return mergeToFile(workers, nodePerMerge); + case CompressionResult.COMPRESSION_MODE_PARTIAL: + return mergeToPartial(); + default: + throw new IllegalArgumentException("Unknown compression mode: " + mode); + } + } + + public static class TripleFile implements Closeable { + long triples; + CloseSuppressPath path; + + private TripleFile(long triples, CloseSuppressPath path) { + this.triples = triples; + this.path = path; + } + + @Override + public void close() throws IOException { + path.close(); + } + } + + public static class 
BufferedTriples { + ParallelSortableArrayList triples = new ParallelSortableArrayList<>(TripleID[].class); + + private BufferedTriples() { + } + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/NoDuplicateTripleIDIterator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/NoDuplicateTripleIDIterator.java new file mode 100644 index 00000000..bc4d003b --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/NoDuplicateTripleIDIterator.java @@ -0,0 +1,95 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.rdfhdt.hdt.enums.ResultEstimationType; +import org.rdfhdt.hdt.enums.TripleComponentOrder; +import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.triples.IteratorTripleID; +import org.rdfhdt.hdt.triples.TripleID; + +/** + * a iterator triple id implementation remove duplicated + * @author Antoine Willerval + */ +public class NoDuplicateTripleIDIterator implements IteratorTripleID { + private TripleID next; + private final TripleID prev = new TripleID(-1, -1, -1); + private final IteratorTripleID it; + + public NoDuplicateTripleIDIterator(IteratorTripleID it) { + this.it = it; + } + + @Override + public boolean hasNext() { + while (this.next == null) { + if (!it.hasNext()) { + return false; + } + + TripleID next = it.next(); + + if (next.match(prev)) { + continue; + } + prev.setAll(next.getSubject(), next.getPredicate(), next.getObject()); + + this.next = next; + } + return true; + } + + @Override + public TripleID next() { + if (!hasNext()) { + return null; + } + TripleID next = this.next; + this.next = null; + return next; + } + + @Override + public boolean hasPrevious() { + throw new NotImplementedException(); + } + + @Override + public TripleID previous() { + throw new NotImplementedException(); + } + + @Override + public void goToStart() { + throw new NotImplementedException(); + } + + @Override + public boolean canGoTo() { + return false; + } + + @Override + public void goTo(long pos) { + throw new NotImplementedException(); + } + + @Override + public long estimatedNumResults() { + return it.estimatedNumResults(); + } + + @Override + public ResultEstimationType numResultEstimation() { + return it.numResultEstimation(); + } + + @Override + public TripleComponentOrder getOrder() { + return it.getOrder(); + } + + @Override + public long getLastTriplePosition() { + throw new NotImplementedException(); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/TripleGenerator.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/TripleGenerator.java new file mode 100644 index 00000000..bea9096d --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/TripleGenerator.java @@ -0,0 +1,28 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.rdfhdt.hdt.triples.TripleID; + +import java.util.Iterator; + +/** + * Utility class to generate triples + */ +public class TripleGenerator implements Iterator { + private final long triples; + private long current = 1; + + public TripleGenerator(long triples) { + this.triples = triples; + } + + @Override + public boolean hasNext() { + return current <= triples; + } + + @Override + public TripleID next() { + long c = current++; + return new TripleID(c, c, c); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/WriteLongArrayBuffer.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/WriteLongArrayBuffer.java new file mode 100644 index 00000000..4da21cc3 --- 
/dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/compress/WriteLongArrayBuffer.java @@ -0,0 +1,224 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.rdfhdt.hdt.util.BitUtil; +import org.rdfhdt.hdt.util.disk.LongArray; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; + +/** + * A class to buffer write to a long array to chunk the sets and sort them by index before calling them + * @author Antoine Willerval + */ +public class WriteLongArrayBuffer implements LongArray, Closeable { + // debug field + private static final boolean DISABLE_BUFFER = true; + private final LongArray array; + private ArrayElementLong[] bufferLong; + private ArrayElementInt[] bufferInt; + private int index = 0; + private boolean lastOrder; + + /** + * create the buffer + * @param array the array to write + * @param maxValue the maximum value, to use int64 or int32 + * @param maxElement count of long elements to store + */ + public WriteLongArrayBuffer(LongArray array, long maxValue, int maxElement) { + this.array = array; + if (!DISABLE_BUFFER) { + int bits = BitUtil.log2(maxValue + 2) + CompressUtil.INDEX_SHIFT; // + 1 for shared + + if (bits > 31) { + bufferLong = new ArrayElementLong[maxElement / 3]; + } else { + // we can store twice as many elements, so we add * 2L + bufferInt = new ArrayElementInt[(int) (maxElement / 3)]; + } + } + } + + /** + * clear all the elements + */ + public void clear() { + index = 0; + } + + public void free() { + flush(); + bufferInt = null; + bufferLong = null; + System.gc(); + } + + private ArrayElement get(int index) { + if (bufferLong != null) { + return bufferLong[index]; + } else if (bufferInt != null) { + return bufferInt[index]; + } else { + throw new IllegalArgumentException("free buffer!"); + } + } + + private void checkConsistency() { + if (size() == maxCapacity()) { + flush(); + } + } + + /** + * write all the sets and clear the buffer + */ + public void flush() { + // ignore empty array + if (size() == 0) { + return; + } + + // sort the set calls + if (bufferLong != null) { + Arrays.sort(bufferLong, 0, size(), ArrayElement::compareTo); + } else if (bufferInt != null) { + Arrays.sort(bufferInt, 0, size(), ArrayElement::compareTo); + } else { + return; + } + + // reverse the order to write from the end to the start + if (lastOrder) { + for (int i = 0; i < index; i++) { + ArrayElement e = get(i); + array.set(e.getIndex(), e.getValue()); + } + } else { + for (int i = index - 1; i >= 0; i--) { + ArrayElement e = get(i); + array.set(e.getIndex(), e.getValue()); + } + } + // reverse for next run + lastOrder = !lastOrder; + // clear the buffer + clear(); + } + + /** + * get a value of the buffer, will flush all remaining sets before + * @param index the index + * @return the value + */ + @Override + public long get(long index) { + flush(); + return array.get(index); + } + + /** + * set a value in the array + * @param index the index + * @param value the value to set + */ + @Override + public void set(long index, long value) { + if (DISABLE_BUFFER) { + array.set(index, value); + return; + } + if (bufferLong != null) { + bufferLong[this.index++] = new ArrayElementLong(index, value); + } else { + bufferInt[this.index++] = new ArrayElementInt(index, value); + } + // check for flush + checkConsistency(); + } + + /** + * get the length of the array, will flush remaining sets before + * @return the length of the array + */ + @Override + public long length() { + flush(); + return array.length(); + } + + /** + * @return the used 
size of the buffer + */ + public int size() { + return index; + } + + /** + * @return the max capacity of the buffer + */ + public int maxCapacity() { + if (bufferLong != null) { + return bufferLong.length; + } else { + return bufferInt.length; + } + } + + @Override + public void close() throws IOException { + flush(); + if (array instanceof Closeable) { + ((Closeable) array).close(); + } + } + + private interface ArrayElement extends Comparable { + long getIndex(); + + long getValue(); + + @Override + default int compareTo(ArrayElement o) { + return Long.compare(getIndex(), o.getIndex()); + } + } + + private static class ArrayElementLong implements ArrayElement { + private final long index, value; + + public ArrayElementLong(long index, long value) { + this.index = index; + this.value = value; + } + + @Override + public long getIndex() { + return index; + } + + @Override + public long getValue() { + return value; + } + } + + private static class ArrayElementInt implements ArrayElement { + private final int index, value; + + public ArrayElementInt(long index, long value) { + this.index = (int) index; + this.value = (int) value; + } + + @Override + public long getIndex() { + return index; + } + + @Override + public long getValue() { + return value; + } + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/IntermediateListener.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/IntermediateListener.java index 8ff76bf0..ff47e6a6 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/IntermediateListener.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/IntermediateListener.java @@ -45,16 +45,48 @@ public class IntermediateListener implements ProgressListener { private final ProgressListener child; private float min, max; - + private String prefix; + /** * Create an IntermediateListener that translates notifications of a * child into a broader range. - * @param child + * @param child child listener */ public IntermediateListener(ProgressListener child) { + this(child, 0, 100); + } + /** + * Create an IntermediateListener that translates notifications of a + * child into a broader range. + * @param child child listener + * @param min minimum value + * @param max maximum value + */ + public IntermediateListener(ProgressListener child, float min, float max) { + this(child, min, max, ""); + } + /** + * Create an IntermediateListener that translates notifications of a + * child into a broader range. + * @param child child listener + * @param prefix prefix of this listener + */ + public IntermediateListener(ProgressListener child, String prefix) { + this(child, 0, 100, prefix); + } + /** + * Create an IntermediateListener that translates notifications of a + * child into a broader range. 
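 * For example, with min 20 and max 40 (illustrative values), a child progress of 50 is forwarded to the parent as 30.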
+ * @param child child listener + * @param min minimum value + * @param max maximum value + * @param prefix prefix of this listener + */ + public IntermediateListener(ProgressListener child, float min, float max, String prefix) { this.child = child; - this.min = 0; - this.max = 100; + this.min = min; + this.max = max; + this.prefix = prefix; } /** @@ -67,7 +99,7 @@ public IntermediateListener(ProgressListener child) { public void notifyProgress(float level, String message) { if(child!=null) { float newlevel = min + level*(max-min)/100; - child.notifyProgress(newlevel,message); + child.notifyProgress(newlevel, prefix + message); } } @@ -76,12 +108,19 @@ public void notifyProgress(float level, String message) { * when the child notifies 0, this IntermediateListener notifies the parent with 20%, and when * the child notifies 100, the IntermediateListener notifies 40. Any intermediate values are * linearly interpolated. - * @param min - * @param max + * @param min minimum value + * @param max maximum value */ public void setRange(float min, float max) { this.min = min; this.max = max; } + /** + * Set the prefix for this listener, will be put before the messages of this listener + * @param prefix the prefix + */ + public void setPrefix(String prefix) { + this.prefix = prefix; + } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/ListenerUtil.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/ListenerUtil.java index a8d8edaf..82575686 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/ListenerUtil.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/ListenerUtil.java @@ -27,7 +27,9 @@ package org.rdfhdt.hdt.util.listener; +import org.rdfhdt.hdt.listener.MultiThreadListener; import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.util.concurrent.SyncListener; /** * @author mario.arias @@ -45,7 +47,7 @@ public static void notify(ProgressListener listener, String message, float value public static void notifyCond(ProgressListener listener, String message, long value, long total) { if(listener!=null && (value%5000==0)) { - listener.notifyProgress( ((value)*100/total), message); + listener.notifyProgress( (float) ((value)*100/total), message); } } @@ -54,4 +56,23 @@ public static void notifyCond(ProgressListener listener, String message, long co listener.notifyProgress( ((value)*100/total), message); } } + + /** + * convert a progress listener to a {@link org.rdfhdt.hdt.listener.MultiThreadListener} + * @param listener the listener + * @return a new multi thread listener, or the listener if it was multi + */ + public static MultiThreadListener multiThreadListener(ProgressListener listener) { + // null, create an empty one + if (listener == null) { + return new PrefixMultiThreadListener((a, b) -> { + }); + } + // already a multi thread listener + if (listener instanceof MultiThreadListener) { + return (MultiThreadListener) listener; + } + // create a sync version of a prefix one + return new PrefixMultiThreadListener(SyncListener.of(listener)); + } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/PrefixMultiThreadListener.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/PrefixMultiThreadListener.java new file mode 100644 index 00000000..b424213c --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/listener/PrefixMultiThreadListener.java @@ -0,0 +1,24 @@ +package org.rdfhdt.hdt.util.listener; + +import org.rdfhdt.hdt.listener.MultiThreadListener; +import 
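// For illustration: a call such as listener.notifyProgress("worker-1", 50, "sorting chunk")
// reaches the wrapped ProgressListener as notifyProgress(50, "[worker-1]sorting chunk"),
// keeping per-thread progress distinguishable on a single listener ("worker-1" and the
// message are hypothetical values).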
org.rdfhdt.hdt.listener.ProgressListener; + +/** + * Simple implementation of {@link org.rdfhdt.hdt.listener.MultiThreadListener} redirecting all the progression to + * a progression listener with a prefix + * + * @author Antoine Willerval + */ +public class PrefixMultiThreadListener implements MultiThreadListener { + + private final ProgressListener progressListener; + + public PrefixMultiThreadListener(ProgressListener progressListener) { + this.progressListener = progressListener; + } + + @Override + public void notifyProgress(String thread, float level, String message) { + progressListener.notifyProgress(level, "[" + thread + "]" + message); + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteStringUtil.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteStringUtil.java index 8e71a7c7..6ba9ecc4 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteStringUtil.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteStringUtil.java @@ -26,7 +26,9 @@ package org.rdfhdt.hdt.util.string; +import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.io.InputStream; import java.io.OutputStream; import java.nio.ByteBuffer; import java.nio.charset.Charset; diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ReplazableString.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ReplazableString.java index 0d63c9d0..48865bef 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ReplazableString.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ReplazableString.java @@ -29,6 +29,7 @@ import java.io.IOException; import java.io.InputStream; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import org.rdfhdt.hdt.exceptions.NotImplementedException; @@ -57,7 +58,7 @@ public ReplazableString(int initialCapacity) { used=0; } - private ReplazableString(byte [] buffer) { + public ReplazableString(byte [] buffer) { this.buffer = buffer; this.used = buffer.length; } @@ -71,7 +72,7 @@ private void ensureSize(int size) { buffer = Arrays.copyOf(buffer, Math.max(size, buffer.length * 2)); } } - + public void append(byte [] data, int offset, int len) { this.replace(used, data, offset, len); } @@ -79,7 +80,7 @@ public void append(byte [] data, int offset, int len) { public void append(BigByteBuffer data, long offset, int len) { this.replace(used, data, offset, len); } - + public void append(CharSequence other) { ensureSize(this.used+other.length()); for(int i=0;i params() { + return Arrays.asList( + new SequenceGenerator( + "SequenceLog64BigDisk", + SequenceLog64BigDisk::new + ), + new SequenceGenerator( + "SequenceLog64", + ((workFile, bits, elements) -> new SequenceLog64(bits, elements)) + ), + new SequenceGenerator( + "SequenceLog64Big", + ((workFile, bits, elements) -> new SequenceLog64Big(bits, elements)) + ) + ); + } + + @Parameterized.Parameter + public SequenceGenerator sequenceGenerator; + + @Rule + public TemporaryFolder tempDir = new TemporaryFolder(); + + private void sequenceTest(int bits, long elements, boolean trim) throws IOException { + long maxMask = (~0L) >>> (Long.SIZE - bits); + + Path p = tempDir.newFolder().toPath(); + try (DynamicSequence actual = sequenceGenerator.bld.generate( + p.resolve("test.seq").toString(), + trim ? 
64 : bits, + elements) + ) { + { + Random rnd = new Random(32); + for (long i = 0; i < elements; i++) { + long v = rnd.nextLong() & maxMask; + if (v < 0) { + v = -v; + } + actual.append(v); + } + } + { + Random rnd = new Random(32); + for (long i = 0; i < elements; i++) { + long v = rnd.nextLong() & maxMask; + if (v < 0) { + v = -v; + } + Assert.assertEquals(actual.get(i), v); + } + } + if (trim) { + actual.aggressiveTrimToSize(); + } + { + Random rnd = new Random(32); + for (long i = 0; i < elements; i++) { + long v = rnd.nextLong() & maxMask; + if (v < 0) { + v = -v; + } + Assert.assertEquals("actual fail", actual.get(i), v); + } + } + } + } + + @Test + public void littleTest() throws IOException { + sequenceTest(64, 100L, false); + } + + @Test + public void bit64Test() throws IOException { + sequenceTest(64, 10_000L, false); + } + + @Test + public void bit32Test() throws IOException { + sequenceTest(32, 10_000L, false); + } + + @Test + public void bit64TrimTest() throws IOException { + sequenceTest(64, 10_000L, true); + } + + @Test + public void bit32TrimTest() throws IOException { + sequenceTest(32, 10_000L, true); + } + + private static class SequenceGenerator { + final String name; + final SequenceGeneratorBuilder bld; + + public SequenceGenerator(String name, SequenceGeneratorBuilder bld) { + this.name = name; + this.bld = bld; + } + + @Override + public String toString() { + return name; + } + } + + @FunctionalInterface + private interface SequenceGeneratorBuilder { + DynamicSequence generate(String workFile, int bits, long elements); + } +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionaryTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionaryTest.java new file mode 100644 index 00000000..9da5b6af --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/CompressFourSectionDictionaryTest.java @@ -0,0 +1,164 @@ +package org.rdfhdt.hdt.dictionary.impl; + +import org.junit.Assert; +import org.junit.Test; +import org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult; +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.iterator.utils.MapIterator; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.util.concurrent.ExceptionThread; +import org.rdfhdt.hdt.util.io.compress.CompressTest; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +public class CompressFourSectionDictionaryTest { + @Test + public void compressDictTest() throws Exception { + TestCompressionResult result = new TestCompressionResult( + new CharSequence[]{ + "2222", "4444", "5555", "7777", "9999", "9999" + }, + new CharSequence[]{ + "1111", "1111", "2222", "3333", "3333", "4444" + }, + new CharSequence[]{ + "1111", "3333", "3333", "4444", "6666", "7777", "8888" + } + ); + List exceptedSubjects = Arrays.asList( + "2222", "5555", "9999" + ); + List exceptedPredicates = Arrays.asList( + "1111", "2222", "3333", "4444" + ); + List exceptedObjects = Arrays.asList( + "1111", "3333", "6666", "8888" + ); + List exceptedShared = Arrays.asList( + "4444", "7777" + ); + CompressFourSectionDictionary dictionary = new CompressFourSectionDictionary(result, new FakeNodeConsumer(), (p, m) -> { + }); + Iterator su = dictionary.getSubjects().getSortedEntries(); + Iterator pr = dictionary.getPredicates().getSortedEntries(); + Iterator ob = dictionary.getObjects().getSortedEntries(); + Iterator sh = 
dictionary.getShared().getSortedEntries(); + ExceptionThread subjectReader = new ExceptionThread(() -> { + for (CharSequence e : exceptedSubjects) { + Assert.assertTrue(su.hasNext()); + CharSequence a = su.next(); + Thread.sleep(40); + CompressTest.assertCharSequenceEquals("Subject", e, a); + } + }, "compressDictTestS"); + ExceptionThread predicateReader = new ExceptionThread(() -> { + for (CharSequence e : exceptedPredicates) { + Assert.assertTrue(pr.hasNext()); + CharSequence a = pr.next(); + Thread.sleep(40); + CompressTest.assertCharSequenceEquals("Predicate", e, a); + } + }, "compressDictTestP"); + ExceptionThread objectReader = new ExceptionThread(() -> { + for (CharSequence e : exceptedObjects) { + Assert.assertTrue(ob.hasNext()); + CharSequence a = ob.next(); + Thread.sleep(40); + CompressTest.assertCharSequenceEquals("Object", e, a); + } + }, "compressDictTestO"); + ExceptionThread sharedReader = new ExceptionThread(() -> { + for (CharSequence e : exceptedShared) { + Assert.assertTrue(sh.hasNext()); + CharSequence a = sh.next(); + Thread.sleep(40); + CompressTest.assertCharSequenceEquals("Shared", e, a); + } + }, "compressDictTestSh"); + + sharedReader.attach( + predicateReader, + objectReader, + subjectReader + ).startAll().joinAndCrashIfRequired(); + } + + static class TestCompressionResult implements CompressionResult { + private final CharSequence[] subjects; + private final CharSequence[] predicates; + private final CharSequence[] objects; + // used to create fake id to avoid duplicate assert error + private int sid, pid, oid; + + public TestCompressionResult(CharSequence[] subjects, CharSequence[] predicates, CharSequence[] objects) { + this.subjects = subjects; + this.predicates = predicates; + this.objects = objects; + } + + @Override + public long getTripleCount() { + return Math.max(subjects.length, Math.max(predicates.length, objects.length)); + } + + @Override + public ExceptionIterator getSubjects() { + return ExceptionIterator.of(new MapIterator<>(Arrays.asList(subjects).iterator(), s -> new IndexedNode(s, sid++))); + } + + @Override + public ExceptionIterator getPredicates() { + return ExceptionIterator.of(new MapIterator<>(Arrays.asList(predicates).iterator(), s -> new IndexedNode(s, pid++))); + } + + @Override + public ExceptionIterator getObjects() { + return ExceptionIterator.of(new MapIterator<>(Arrays.asList(objects).iterator(), s -> new IndexedNode(s, oid++))); + } + + @Override + public long getSubjectsCount() { + return subjects.length; + } + + @Override + public long getPredicatesCount() { + return predicates.length; + } + + @Override + public long getObjectsCount() { + return objects.length; + } + + @Override + public long getSharedCount() { + return Math.min(subjects.length, objects.length); + } + + @Override + public void delete() { + } + + @Override + public void close() { + } + } + + static class FakeNodeConsumer implements CompressFourSectionDictionary.NodeConsumer { + @Override + public void onSubject(long preMapId, long newMapId) { + } + + @Override + public void onPredicate(long preMapId, long newMapId) { + } + + @Override + public void onObject(long preMapId, long newMapId) { + } + } +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySectionTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySectionTest.java new file mode 100644 index 00000000..693131c4 --- /dev/null +++ 
b/hdt-java-core/src/test/java/org/rdfhdt/hdt/dictionary/impl/section/OneReadDictionarySectionTest.java @@ -0,0 +1,69 @@ +package org.rdfhdt.hdt.dictionary.impl.section; + +import org.junit.Assert; +import org.junit.Test; +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.iterator.utils.MapIterator; +import org.rdfhdt.hdt.options.HDTSpecification; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.util.io.compress.CompressUtil; + +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +public class OneReadDictionarySectionTest { + + @Test + public void sectionTest() { + List aa = Arrays.asList( + new IndexedNode("1", 1), + new IndexedNode("2", 2), + new IndexedNode("2", 3), + new IndexedNode("3", 4), + new IndexedNode("4", 5), + new IndexedNode("5", 6), + new IndexedNode("5", 7), + new IndexedNode("5", 8), + new IndexedNode("6", 9), + new IndexedNode("7", 10), + new IndexedNode("8", 11), + new IndexedNode("9", 12) + ); + + OneReadDictionarySection sec1 = new OneReadDictionarySection( + removeDupe(aa), + aa.size() + ); + assertIteratorEquals(removeDupe(aa), sec1.getSortedEntries()); + + OneReadDictionarySection sec2 = new OneReadDictionarySection( + removeDupe(aa), + aa.size() + ); + + PFCDictionarySection section = new PFCDictionarySection(new HDTSpecification()); + section.load(sec2, null); + + assertIteratorEquals(removeDupe(aa), section.getSortedEntries()); + } + + private void assertIteratorEquals(Iteratorit1, Iterator it2) { + while (it1.hasNext()) { + Assert.assertTrue(it2.hasNext()); + Assert.assertEquals(it1.next().toString(), it2.next().toString()); + } + Assert.assertFalse(it2.hasNext()); + } + + private Iterator removeDupe(List nodes) { + return + new MapIterator<>( + CompressUtil.asNoDupeCharSequenceIterator( + ExceptionIterator.of(nodes.iterator()), + (i, j, k) -> { + } + ), IndexedNode::getNode + ); + } +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java new file mode 100644 index 00000000..98c74896 --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/hdt/HDTManagerTest.java @@ -0,0 +1,407 @@ +package org.rdfhdt.hdt.hdt; + +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Suite; +import org.rdfhdt.hdt.dictionary.Dictionary; +import org.rdfhdt.hdt.dictionary.DictionarySection; +import org.rdfhdt.hdt.enums.CompressionType; +import org.rdfhdt.hdt.enums.RDFNotation; +import org.rdfhdt.hdt.exceptions.NotFoundException; +import org.rdfhdt.hdt.exceptions.ParserException; +import org.rdfhdt.hdt.hdt.impl.diskimport.CompressionResult; +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.options.HDTOptionsKeys; +import org.rdfhdt.hdt.options.HDTSpecification; +import org.rdfhdt.hdt.rdf.RDFParserFactory; +import org.rdfhdt.hdt.triples.IteratorTripleString; +import org.rdfhdt.hdt.triples.TripleString; +import org.rdfhdt.hdt.triples.impl.utils.HDTTestUtils; +import org.rdfhdt.hdt.util.LargeFakeDataSetStreamSupplier; +import org.rdfhdt.hdt.util.StopWatch; +import org.rdfhdt.hdt.util.io.AbstractMapMemoryTest; +import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.io.compress.CompressTest; + +import java.io.File; +import java.io.IOException; +import 
java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.Objects; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +@RunWith(Suite.class) +@Suite.SuiteClasses({ + HDTManagerTest.DynamicTest.class, + HDTManagerTest.StaticTest.class +}) +public class HDTManagerTest { + private static class HDTManagerTestBase extends AbstractMapMemoryTest implements ProgressListener { + protected static final long SIZE = 1L << 16; + @Rule + public TemporaryFolder tempDir = new TemporaryFolder(); + protected HDTSpecification spec; + + @Before + public void setupManager() throws IOException { + spec = new HDTSpecification(); + spec.set("loader.disk.location", tempDir.newFolder().getAbsolutePath()); + } + + @Override + public void notifyProgress(float level, String message) { + // System.out.println("[" + level + "] " + message); + } + + protected void assertEqualsHDT(HDT expected, HDT actual) throws NotFoundException { + + // test dictionary + Dictionary ed = expected.getDictionary(); + Dictionary ad = actual.getDictionary(); + assertEqualsHDT("Subjects", ed.getSubjects(), ad.getSubjects()); + assertEqualsHDT("Predicates", ed.getPredicates(), ad.getPredicates()); + assertEqualsHDT("Objects", ed.getObjects(), ad.getObjects()); + assertEqualsHDT("Shared", ed.getShared(), ad.getShared()); + assertEquals(ed.getType(), ad.getType()); + + // test triples + IteratorTripleString actualIt = actual.search("", "", ""); + IteratorTripleString expectedIt = expected.search("", "", ""); + + while (expectedIt.hasNext()) { + assertTrue(actualIt.hasNext()); + + TripleString expectedTriple = expectedIt.next(); + TripleString actualTriple = actualIt.next(); + assertEquals(expectedIt.getLastTriplePosition(), actualIt.getLastTriplePosition()); + assertEquals(expectedTriple, actualTriple); + } + assertFalse(actualIt.hasNext()); + + // test header + assertEquals(actual.getHeader().getBaseURI(), expected.getHeader().getBaseURI()); + assertEquals(actual.getHeader().getNumberOfElements(), expected.getHeader().getNumberOfElements()); + } + + protected void assertEqualsHDT(String section, DictionarySection excepted, DictionarySection actual) { + Iterator itEx = excepted.getSortedEntries(); + Iterator itAc = actual.getSortedEntries(); + + while (itEx.hasNext()) { + assertTrue(itAc.hasNext()); + CharSequence expectedTriple = itEx.next(); + CharSequence actualTriple = itAc.next(); + CompressTest.assertCharSequenceEquals(section + " section strings", expectedTriple, actualTriple); + } + assertFalse(itAc.hasNext()); + assertEquals(excepted.getNumberOfElements(), actual.getNumberOfElements()); + } + } + + @RunWith(Parameterized.class) + public static class DynamicTest extends HDTManagerTestBase { + + @Parameterized.Parameters(name = "{0}") + public static Collection params() { + List params = new ArrayList<>(); + for (int threads : new int[]{ + // sync + 1, + // async, low thread count + 2, + // async, large thread count + 8 + }) { + List modes; + if (threads > 1) { + // async, no need for partial + modes = List.of( + HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE + ); + } else { + modes = List.of( + HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_PARTIAL, + HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE + ); + } + for (String mode : 
modes) { + params.addAll(List.of( + new Object[]{"base-w" + threads + "-" + mode, SIZE * 8, 20, 50, threads, mode, false}, + new Object[]{"duplicates-w" + threads + "-" + mode, SIZE * 8, 10, 50, threads, mode, false}, + new Object[]{"large-literals-w" + threads + "-" + mode, SIZE * 2, 20, 250, threads, mode, false}, + new Object[]{"quiet-w" + threads + "-" + mode, SIZE * 8, 10, 50, threads, mode, false} + )); + } + } + return params; + } + + @Parameterized.Parameter + public String name; + @Parameterized.Parameter(1) + public long maxSize; + @Parameterized.Parameter(2) + public int maxElementSplit; + @Parameterized.Parameter(3) + public int maxLiteralSize; + @Parameterized.Parameter(4) + public int threads; + @Parameterized.Parameter(5) + public String compressMode; + @Parameterized.Parameter(6) + public boolean quiet; + + @Before + public void setupSpecs() { + spec.set(HDTOptionsKeys.LOADER_DISK_COMPRESSION_WORKER_KEY, String.valueOf(threads)); + spec.set("loader.disk.compressMode", compressMode); + } + + private void generateDiskTest() throws IOException, ParserException, NotFoundException, InterruptedException { + LargeFakeDataSetStreamSupplier supplier = + LargeFakeDataSetStreamSupplier + .createSupplierWithMaxSize(maxSize, 34) + .withMaxElementSplit(maxElementSplit) + .withMaxLiteralSize(maxLiteralSize); + + // create DISK HDT + LargeFakeDataSetStreamSupplier.ThreadedStream genActual = supplier.createNTInputStream(CompressionType.GZIP); + HDT actual = null; + try { + actual = HDTManager.generateHDTDisk( + genActual.getStream(), + HDTTestUtils.BASE_URI, + RDFNotation.NTRIPLES, + CompressionType.GZIP, + spec, + quiet ? null : this + ); + } finally { + if (actual == null) { + genActual.getThread().interrupt(); + } + } + genActual.getThread().joinAndCrashIfRequired(); + + supplier.reset(); + + LargeFakeDataSetStreamSupplier.ThreadedStream genExpected = supplier.createNTInputStream(CompressionType.GZIP); + // create MEMORY HDT + HDT expected = null; + try { + expected = HDTManager.generateHDT( + genExpected.getStream(), + HDTTestUtils.BASE_URI, + RDFNotation.NTRIPLES, + CompressionType.GZIP, + spec, + null + ); + } finally { + if (expected == null) { + genExpected.getThread().interrupt(); + } + } + genExpected.getThread().joinAndCrashIfRequired(); + + // happy compiler, should throw before + assertNotNull(expected); + assertNotNull(actual); + try { + assertEqualsHDT(expected, actual); + } finally { + IOUtil.closeAll(expected, actual); + } + } + + @Test + public void generateSaveLoadMapTest() throws IOException, ParserException, NotFoundException { + LargeFakeDataSetStreamSupplier supplier = + LargeFakeDataSetStreamSupplier + .createSupplierWithMaxSize(maxSize, 34) + .withMaxElementSplit(maxElementSplit) + .withMaxLiteralSize(maxLiteralSize); + + // create MEMORY HDT + + try (HDT expected = HDTManager.generateHDT( + supplier.createTripleStringStream(), + HDTTestUtils.BASE_URI, + spec, + quiet ? null : this + )) { + String tmp = tempDir.newFile().getAbsolutePath(); + expected.saveToHDT(tmp, null); + + try (HDT mapExcepted = HDTManager.mapHDT(tmp, quiet ? null : this)) { + assertEqualsHDT(expected, mapExcepted); + } + + try (HDT loadExcepted = HDTManager.loadHDT(tmp, quiet ? 
null : this)) { + assertEqualsHDT(expected, loadExcepted); + } + } + + } + + @Test + public void generateDiskMemTest() throws IOException, ParserException, NotFoundException, InterruptedException { + spec.setInt("loader.disk.chunkSize", SIZE); + generateDiskTest(); + } + + @Test + public void generateDiskMapTest() throws IOException, ParserException, NotFoundException, InterruptedException { + spec.setInt("loader.disk.chunkSize", SIZE); + File mapHDT = tempDir.newFile("mapHDTTest.hdt"); + spec.set(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY, mapHDT.getAbsolutePath()); + generateDiskTest(); + Files.deleteIfExists(mapHDT.toPath()); + } + } + + @RunWith(Parameterized.class) + public static class StaticTest extends HDTManagerTestBase { + @Parameterized.Parameters(name = "{0}") + public static Collection params() { + return List.of( + new Object[]{"hdtGenDisk/unicode_disk_encode.nt", true} + ); + } + + @Parameterized.Parameter + public String file; + @Parameterized.Parameter(1) + public boolean quiet; + + + private void generateDiskTest() throws IOException, ParserException, NotFoundException { + String ntFile = Objects.requireNonNull(getClass().getClassLoader().getResource(file), "Can't find " + file).getFile(); + // create DISK HDT + HDT actual = HDTManager.generateHDTDisk( + ntFile, + HDTTestUtils.BASE_URI, + RDFNotation.NTRIPLES, + spec, + quiet ? null : this + ); + + // create MEMORY HDT + HDT expected = HDTManager.generateHDT( + ntFile, + HDTTestUtils.BASE_URI, + RDFNotation.NTRIPLES, + spec, + null + ); + + try { + assertEqualsHDT(expected, actual); + } finally { + IOUtil.closeAll(expected, actual); + } + } + + @Test + public void generateDiskCompleteTest() throws IOException, ParserException, NotFoundException { + spec.set("loader.disk.compressMode", CompressionResult.COMPRESSION_MODE_COMPLETE); + spec.setInt("loader.disk.chunkSize", SIZE); + generateDiskTest(); + } + + @Test + public void generateDiskPartialTest() throws IOException, ParserException, NotFoundException { + spec.set("loader.disk.compressMode", CompressionResult.COMPRESSION_MODE_PARTIAL); + spec.setInt("loader.disk.chunkSize", SIZE); + generateDiskTest(); + } + + @Test + public void generateDiskCompleteMapTest() throws IOException, ParserException, NotFoundException { + spec.set("loader.disk.compressMode", CompressionResult.COMPRESSION_MODE_COMPLETE); + spec.setInt("loader.disk.chunkSize", SIZE); + File mapHDT = tempDir.newFile("mapHDTTest.hdt"); + spec.set(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY, mapHDT.getAbsolutePath()); + generateDiskTest(); + Files.deleteIfExists(mapHDT.toPath()); + } + + @Test + public void generateDiskPartialMapTest() throws IOException, ParserException, NotFoundException { + spec.set("loader.disk.compressMode", CompressionResult.COMPRESSION_MODE_PARTIAL); + spec.setInt("loader.disk.chunkSize", SIZE); + File mapHDT = tempDir.newFile("mapHDTTest.hdt"); + spec.set(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY, mapHDT.getAbsolutePath()); + generateDiskTest(); + Files.deleteIfExists(mapHDT.toPath()); + } + + @Test + public void generateTest() throws IOException, ParserException, NotFoundException { + String ntFile = Objects.requireNonNull(getClass().getClassLoader().getResource(file), "Can't find " + file).getFile(); + // create DISK HDT + try (InputStream in = IOUtil.getFileInputStream(ntFile)) { + Iterator it = RDFParserFactory.readAsIterator( + RDFParserFactory.getParserCallback(RDFNotation.NTRIPLES, true), + in, HDTTestUtils.BASE_URI, true, RDFNotation.NTRIPLES + ); + HDT 
expected = HDTManager.generateHDT( + it, + HDTTestUtils.BASE_URI, + spec, + quiet ? null : this + ); + + String testCopy = tempDir.newFile().getAbsolutePath(); + expected.saveToHDT(testCopy, null); + + // create MEMORY HDT + HDT actual = HDTManager.loadHDT(testCopy); + + try { + assertEqualsHDT(expected, actual); + } finally { + IOUtil.closeAll(expected, actual); + } + } + } + } + + @Ignore("handTests") + public static class HandTest extends HDTManagerTestBase { + @Test + public void bigTest() throws ParserException, IOException { + LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier + .createSupplierWithMaxSize(10_000_000_000L, 94); + + Path output = tempDir.newFolder().toPath(); + + HDTOptions spec = new HDTSpecification(); + spec.set(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY, output.resolve("future.hdt").toAbsolutePath().toString()); + spec.set(HDTOptionsKeys.LOADER_DISK_LOCATION_KEY, output.resolve("gen_dir").toAbsolutePath().toString()); + StopWatch watch = new StopWatch(); + watch.reset(); + try (HDT hdt = HDTManager.generateHDTDisk(supplier.createTripleStringStream(), "http://ex.ogr/#", spec, + (level, message) -> System.out.println("[" + level + "] " + message) + )) { + System.out.println(watch.stopAndShow()); + System.out.println(hdt.getTriples().getNumberOfElements()); + } + } + } +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/iterator/utils/MergeExceptionIteratorTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/iterator/utils/MergeExceptionIteratorTest.java new file mode 100644 index 00000000..115ba86e --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/iterator/utils/MergeExceptionIteratorTest.java @@ -0,0 +1,28 @@ +package org.rdfhdt.hdt.iterator.utils; + +import org.junit.Test; + +import java.util.Arrays; +import java.util.List; +import java.util.function.Function; + +import static org.junit.Assert.*; + +public class MergeExceptionIteratorTest { + + @Test + public void mergeTest() { + ExceptionIterator it1 = ExceptionIterator.of(Arrays.asList("1", "3", "5", "7").iterator()); + ExceptionIterator it2 = ExceptionIterator.of(Arrays.asList("2", "4", "6", "6").iterator()); + + ExceptionIterator it = MergeExceptionIterator.buildOfTree(Function.identity(), String::compareTo, List.of(it1, it2), 0, 2); + + ExceptionIterator itExcepted = ExceptionIterator.of(Arrays.asList("1", "2", "3", "4", "5", "6", "6", "7").iterator()); + + while (itExcepted.hasNext()) { + assertTrue(it.hasNext()); + assertEquals(itExcepted.next(), it.next()); + } + assertFalse(it.hasNext()); + } +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java index 738c9e96..4ec5523b 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java @@ -1,24 +1,35 @@ package org.rdfhdt.hdt.util; +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; +import org.apache.commons.compress.compressors.xz.XZCompressorOutputStream; +import org.rdfhdt.hdt.enums.CompressionType; import org.rdfhdt.hdt.enums.RDFNotation; +import org.rdfhdt.hdt.exceptions.NotImplementedException; import org.rdfhdt.hdt.exceptions.ParserException; import org.rdfhdt.hdt.hdt.HDT; import org.rdfhdt.hdt.hdt.HDTManager; import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.triples.TripleString; +import 
org.rdfhdt.hdt.util.concurrent.ExceptionThread; +import org.rdfhdt.hdt.util.string.ByteStringUtil; import java.io.FileWriter; import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.PipedInputStream; +import java.io.PipedOutputStream; +import java.io.PrintStream; import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.Paths; import java.util.Iterator; import java.util.Random; +import java.util.zip.GZIPOutputStream; public class LargeFakeDataSetStreamSupplier { - private static final Charset DEFAULT_CHARSET = Charset.defaultCharset(); + private static final Charset DEFAULT_CHARSET = ByteStringUtil.STRING_ENCODING; /** * create a lowercase name from a number, to create string without any number in it @@ -64,6 +75,7 @@ public static LargeFakeDataSetStreamSupplier createSupplierWithMaxTriples(long m private final long maxSize; private final long maxTriples; public int maxFakeType = 10; + public int maxLiteralSize = 2; public int maxElementSplit = Integer.MAX_VALUE; private LargeFakeDataSetStreamSupplier(long maxSize, long maxTriples, long seed) { @@ -89,8 +101,48 @@ public void createNTFile(String file) throws IOException { } } + public ThreadedStream createNTInputStream(CompressionType compressionType) throws IOException { + PipedOutputStream pout = new PipedOutputStream(); + InputStream is = new PipedInputStream(pout); + OutputStream out; + + if (compressionType != null) { + switch (compressionType) { + case NONE: + out = pout; + break; + case XZ: + out = new XZCompressorOutputStream(pout); + break; + case BZIP: + out = new BZip2CompressorOutputStream(pout); + break; + case GZIP: + out = new GZIPOutputStream(pout); + break; + default: + throw new NotImplementedException(compressionType.name()); + } + } else { + out = pout; + } + + ExceptionThread run = new ExceptionThread(() -> { + try (PrintStream ps = new PrintStream(out, true)) { + Iterator it = createTripleStringStream(); + while (it.hasNext()) { + it.next().dumpNtriple(ps); + } + } + }, + "ThreadedFakedStream"); + run.start(); + + return new ThreadedStream(run, is); + } + public HDT createFakeHDTTwoPass(HDTOptions spec) throws ParserException, IOException { - Path f = Paths.get("tempNtFile.nt").toAbsolutePath(); + Path f = Path.of("tempNtFile.nt").toAbsolutePath(); try { createNTFile(f.toString()); spec.set("loader.type", "two-pass"); @@ -130,14 +182,22 @@ private CharSequence createValue() { if (random.nextBoolean()) { return createPredicate(); } - - String text = "\"" + stringNameOfInt(random.nextInt(maxElementSplit)) + "\""; - if (random.nextBoolean()) { + int size = random.nextInt(maxLiteralSize); + StringBuilder litText = new StringBuilder(); + for (int i = 0; i < size; i++) { + litText.append(stringNameOfInt(random.nextInt(maxElementSplit))).append(" "); + } + String text = "\"" + litText + "\""; + int litType = random.nextInt(3); + if (litType == 1) { // language node return text + "@" + stringNameOfInt(random.nextInt(maxElementSplit)); - } else { + } else if (litType == 2) { // typed node return text + "^^<" + createType() + ">"; + } else { + // no type/language node + return text; } } @@ -185,4 +245,37 @@ public TripleString next() { } } -} \ No newline at end of file + public LargeFakeDataSetStreamSupplier withMaxFakeType(int maxFakeType) { + this.maxFakeType = maxFakeType; + return this; + } + + public LargeFakeDataSetStreamSupplier withMaxElementSplit(int maxElementSplit) { + this.maxElementSplit = maxElementSplit; + return 
this; + } + + public LargeFakeDataSetStreamSupplier withMaxLiteralSize(int maxLiteralSize) { + this.maxLiteralSize = maxLiteralSize; + return this; + + } + + public static class ThreadedStream { + private final ExceptionThread thread; + private final InputStream stream; + + public ThreadedStream(ExceptionThread thread, InputStream stream) { + this.thread = thread; + this.stream = stream; + } + + public ExceptionThread getThread() { + return thread; + } + + public InputStream getStream() { + return stream; + } + } +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/concurrent/TreeWorkerTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/concurrent/TreeWorkerTest.java new file mode 100644 index 00000000..dd5bcc32 --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/concurrent/TreeWorkerTest.java @@ -0,0 +1,308 @@ +package org.rdfhdt.hdt.util.concurrent; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.iterator.utils.MergeExceptionIterator; +import org.rdfhdt.hdt.util.BitUtil; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Comparator; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Random; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +@RunWith(Parameterized.class) +public class TreeWorkerTest { + @Parameterized.Parameters(name = "test {0} worker(s) {1} way(s)") + public static Collection params() { + return Arrays.asList( + new Object[]{1, 1}, + new Object[]{8, 1}, + new Object[]{1, 4}, + new Object[]{8, 4} + ); + } + + private static class SyncSupplierTest implements TreeWorker.TreeWorkerSupplier { + private final int max; + private final long sleep; + private int val; + private boolean inUse = false; + + public SyncSupplierTest(int max, long sleep) { + this.max = max; + this.sleep = sleep; + } + + @Override + public Integer get() { + synchronized (this) { + assertFalse(inUse); + inUse = true; + } + sleepOrThrow(sleep); + synchronized (this) { + assertTrue(inUse); + inUse = false; + } + if (val == max) { + return null; + } + return ++val; + } + } + + private static Integer sum(Integer[] array, int count) { + int s = 0; + for (int i = 0; i < count; i++) { + s += array[i]; + } + return s; + } + + private static class CountCatTest implements TreeWorker.TreeWorkerCat { + int call = 0; + + @Override + public Integer construct(Integer[] array, int count) { + synchronized (this) { + call++; + } + return sum(array, count); + } + } + + private static class CountComparator implements Comparator { + int call = 0; + + @Override + public int compare(Integer o1, Integer o2) { + synchronized (this) { + call++; + } + return Integer.compare(o1, o2); + } + } + + private static class IntegerArrayList extends ArrayList { + } + + @Parameterized.Parameter + public int workers; + @Parameterized.Parameter(1) + public int ways; + + @Test + public void syncSupplierTest() throws InterruptedException, TreeWorker.TreeWorkerException { + TreeWorker.TreeWorkerCat cat = TreeWorkerTest::sum; + int max = 10; + TreeWorker.TreeWorkerSupplier supplier = new SyncSupplierTest(max, 20L); + + TreeWorker worker = new 
TreeWorker<>(cat, supplier, null, TreeWorker.TreeWorkerMap.identity(), Integer[]::new, workers, ways); + worker.start(); + Integer result = worker.waitToComplete(); + assertTrue(worker.isCompleted()); + assertNotNull(result); + assertEquals(max * (max + 1) / 2, result.intValue()); + } + + @Test(expected = TreeWorker.TreeWorkerException.class) + public void noElementSupplierTest() throws TreeWorker.TreeWorkerException { + TreeWorker.TreeWorkerCat cat = TreeWorkerTest::sum; + int max = 0; + TreeWorker.TreeWorkerSupplier supplier = new SyncSupplierTest(max, 20L); + + // should crash because the supplier won't return any value to merge + new TreeWorker<>(cat, supplier, null, TreeWorker.TreeWorkerMap.identity(), Integer[]::new, workers, ways); + } + + @Test + public void oneElementSupplierTest() throws InterruptedException, TreeWorker.TreeWorkerException { + TreeWorker.TreeWorkerCat cat = TreeWorkerTest::sum; + int max = 1; + TreeWorker.TreeWorkerSupplier supplier = new SyncSupplierTest(max, 20L); + + TreeWorker worker = new TreeWorker<>(cat, supplier, null, TreeWorker.TreeWorkerMap.identity(), Integer[]::new, workers, ways); + worker.start(); + Integer result = worker.waitToComplete(); + assertTrue(worker.isCompleted()); + assertNotNull(result); + assertEquals(1, result.intValue()); + } + + @Test + public void catExceptionTest() throws InterruptedException, TreeWorker.TreeWorkerException { + final String error = "I like HDT"; + TreeWorker.TreeWorkerCat cat = (a, b) -> { + throw new RuntimeException(error); + }; + int max = 1; + TreeWorker.TreeWorkerSupplier supplier = new SyncSupplierTest(max, 20L); + + TreeWorker worker = new TreeWorker<>(cat, supplier, null, TreeWorker.TreeWorkerMap.identity(), Integer[]::new, workers, ways); + worker.start(); + try { + worker.waitToComplete(); + } catch (TreeWorker.TreeWorkerException e) { + assertEquals(error, e.getCause().getMessage()); + } + assertTrue(worker.isCompleted()); + } + + @Test + public void countTest() throws InterruptedException, TreeWorker.TreeWorkerException { + CountCatTest cat = new CountCatTest(); + int max = 1 << 5; + TreeWorker.TreeWorkerSupplier supplier = new SyncSupplierTest(max, 2L); + + TreeWorker worker = new TreeWorker<>(cat, supplier, null, TreeWorker.TreeWorkerMap.identity(), Integer[]::new, workers, ways); + worker.start(); + Integer result = worker.waitToComplete(); + assertTrue(worker.isCompleted()); + assertNotNull(result); + assertEquals(max * (max + 1) / 2, result.intValue()); + } + + @Test + public void countAscendTest() throws InterruptedException, TreeWorker.TreeWorkerException { + CountCatTest cat = new CountCatTest(); + int max = 1 << 5 - 1; + TreeWorker.TreeWorkerSupplier supplier = new SyncSupplierTest(max, 2L); + + TreeWorker worker = new TreeWorker<>(cat, supplier, null, TreeWorker.TreeWorkerMap.identity(), Integer[]::new, workers, ways); + worker.start(); + Integer result = worker.waitToComplete(); + assertTrue(worker.isCompleted()); + assertNotNull(result); + assertEquals(max * (max + 1) / 2, result.intValue()); + } + + @Test + public void deleteTest() throws TreeWorker.TreeWorkerException, InterruptedException { + int max = 10; + Set elements = new HashSet<>(); + TreeWorker.TreeWorkerCat cat = (array, count) -> { + synchronized (elements) { + for (int i = 0; i < count; i++) { + elements.remove(array[i] * max); + } + int next = sum(array, count); + elements.add(next * max); + return next; + } + }; + TreeWorker.TreeWorkerSupplier supplier = new TreeWorker.TreeWorkerSupplier<>() { + int value = 0; + + @Override 
+ public Integer get() { + if (value == max) { + return null; + } + int v = ++value; + synchronized (elements) { + elements.add(v * max); + } + return v; + } + }; + + TreeWorker.TreeWorkerDelete delete = elements::remove; + + TreeWorker worker = new TreeWorker<>(cat, supplier, delete, TreeWorker.TreeWorkerMap.identity(), Integer[]::new, workers, ways); + worker.start(); + Integer result = worker.waitToComplete(); + assertTrue(worker.isCompleted()); + assertNotNull(result); + assertEquals(1, elements.size()); + assertEquals(result * max, elements.iterator().next().intValue()); + assertEquals(max * (max + 1) / 2, result.intValue()); + } + + @Test + public void mergeSortTest() throws TreeWorker.TreeWorkerException, InterruptedException { + Random rnd = new Random(42); + int count = 20; + int maxValue = Integer.MAX_VALUE / 4; + List values = new ArrayList<>(); + List lst = new ArrayList<>(); + for (int i = 0; i < count; i++) { + int v = rnd.nextInt(maxValue); + values.add(v); + lst.add(v); + } + assertEquals(lst, values); + List sorted = lst.stream() + .map(i -> i * 3) + .sorted(Comparator.comparingInt(a -> a)) + .collect(Collectors.toList()); + assertNotEquals(sorted, values); + CountComparator com = new CountComparator(); + assertTrue(com.compare(1325939940, -1360544799) > 0); + assertTrue(com.compare(2, 1) > 0); + assertTrue(com.compare(-3, -2) < 0); + assertTrue(com.compare(-2, -3) > 0); + com.call = 0; + TreeWorker worker = new TreeWorker<>( + (IntegerArrayList[] array, int length) -> { + Iterator it = MergeExceptionIterator.buildOfTree( + l -> ExceptionIterator.of(l.iterator()), + com, + array, length).asIterator(); + IntegerArrayList l = new IntegerArrayList(); + while (it.hasNext()) { + l.add(it.next()); + } + IntegerArrayList tst = new IntegerArrayList(); + tst.addAll(l); + tst.sort(Integer::compareTo); + sleepOrThrow(25); + assertEquals(tst, l); + return l; + }, + new TreeWorker.TreeWorkerSupplier<>() { + int index; + + @Override + public IntegerArrayList get() { + if (index == values.size()) { + return null; + } + IntegerArrayList l = new IntegerArrayList(); + l.add(values.get(index++)); + sleepOrThrow(25); + return l; + } + }, + null, v -> v.stream() + .map(i -> i * 3) + .collect(Collectors.toCollection(IntegerArrayList::new)), IntegerArrayList[]::new, workers, ways + ); + worker.start(); + List result = worker.waitToComplete(); + // test O(n log(n)) + assertTrue("calls: " + com.call + ", n logn : " + count * BitUtil.log2(count), com.call <= count * BitUtil.log2(count)); + assertEquals(sorted, result); + } + + private static void sleepOrThrow(long time) { + try { + Thread.sleep(time); + } catch (InterruptedException e) { + throw new AssertionError("Interruption", e); + } + } +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/IOUtilTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/IOUtilTest.java index 00f3a662..791f3a3a 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/IOUtilTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/IOUtilTest.java @@ -5,13 +5,22 @@ import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; +import java.io.Closeable; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import org.junit.Assert; import org.junit.Before; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.TemporaryFolder; public class IOUtilTest { + @Rule + public TemporaryFolder tempDir = new TemporaryFolder(); + @Before public void setUp() throws Exception { } @@ 
-20,28 +29,28 @@ public void setUp() throws Exception { public void testWriteLong() { try { ByteArrayOutputStream bout = new ByteArrayOutputStream(); - + IOUtil.writeLong(bout, 3); IOUtil.writeLong(bout, 4); IOUtil.writeLong(bout, 0xFF000000000000AAL); IOUtil.writeLong(bout, 0x33AABBCCDDEEFF11L); - + ByteArrayInputStream bin = new ByteArrayInputStream(bout.toByteArray()); - + long a = IOUtil.readLong(bin); assertEquals(a, 3); - + long b = IOUtil.readLong(bin); assertEquals(b, 4); - + long c = IOUtil.readLong(bin); assertEquals(c, 0xFF000000000000AAL); - + long d = IOUtil.readLong(bin); assertEquals(d, 0x33AABBCCDDEEFF11L); - + } catch (IOException e) { - fail("Exception thrown: "+e); + fail("Exception thrown: " + e); } } @@ -49,28 +58,119 @@ public void testWriteLong() { public void testWriteInt() { try { ByteArrayOutputStream bout = new ByteArrayOutputStream(); - + IOUtil.writeInt(bout, 3); IOUtil.writeInt(bout, 4); IOUtil.writeInt(bout, 0xFF0000AA); IOUtil.writeInt(bout, 0xAABBCCDD); - + ByteArrayInputStream bin = new ByteArrayInputStream(bout.toByteArray()); - + long a = IOUtil.readInt(bin); assertEquals(a, 3); - + long b = IOUtil.readInt(bin); assertEquals(b, 4); - + long c = IOUtil.readInt(bin); assertEquals(c, 0xFF0000AA); - + long d = IOUtil.readInt(bin); assertEquals(d, 0xAABBCCDD); - + } catch (IOException e) { - fail("Exception thrown: "+e); + fail("Exception thrown: " + e); + } + } + + @Test(expected = IOException.class) + public void closeAllSeverity11Test() throws IOException { + IOUtil.closeAll( + () -> { + throw new IOException(); + }, + () -> { + throw new IOException(); + }, + () -> { + throw new IOException(); + } + ); + } + + @Test(expected = IOException.class) + public void closeAllSeverity12Test() throws IOException { + IOUtil.closeAll( + (Closeable) () -> { + throw new IOException(); + } + ); + } + + @Test(expected = IOException.class) + public void closeAllSeverity13Test() throws IOException { + IOUtil.closeAll( + () -> { + throw new IOException(); + }, + () -> { + throw new IOException(); + } + ); + } + + @Test(expected = RuntimeException.class) + public void closeAllSeverity2Test() throws IOException { + IOUtil.closeAll( + () -> { + throw new IOException(); + }, + () -> { + throw new RuntimeException(); + }, + () -> { + throw new IOException(); + } + ); + } + + @Test(expected = Error.class) + public void closeAllSeverity3Test() throws IOException { + IOUtil.closeAll( + () -> { + throw new Error(); + }, + () -> { + throw new RuntimeException(); + }, + () -> { + throw new IOException(); + } + ); + } + + @Test + public void closeablePathTest() throws IOException { + Path p = tempDir.newFolder().toPath(); + + Path p1 = p.resolve("test1"); + try (CloseSuppressPath csp = CloseSuppressPath.of(p1)) { + Files.writeString(csp.getJavaPath(), "test"); + Assert.assertTrue(Files.exists(p1)); } + Assert.assertFalse(Files.exists(p1)); + + + Path p2 = p.resolve("test2"); + try (CloseSuppressPath csp = CloseSuppressPath.of(p2)) { + csp.closeWithDeleteRecurse(); + Path p3 = csp.getJavaPath().resolve("test3/test4/test5"); + Path f4 = p3.resolve("child.txt"); + Files.createDirectories(p3); + Files.writeString(f4, "hello world"); + Assert.assertTrue(Files.exists(f4)); + } + Assert.assertFalse(Files.exists(p2)); + } } diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressNodeTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressNodeTest.java new file mode 100644 index 00000000..b6a85e98 --- /dev/null +++ 
b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressNodeTest.java @@ -0,0 +1,257 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.junit.Assert; +import org.junit.Test; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.util.concurrent.ExceptionThread; + +import java.io.IOException; +import java.io.PipedInputStream; +import java.io.PipedOutputStream; +import java.util.Arrays; +import java.util.List; + +public class CompressNodeTest { + + @Test + public void writeReadTest() throws InterruptedException, IOException { + PipedOutputStream out = new PipedOutputStream(); + PipedInputStream in = new PipedInputStream(); + out.connect(in); + List nodes = Arrays.asList( + new IndexedNode("bob", 1), + new IndexedNode("michel", 3), + new IndexedNode("jack", 2), + new IndexedNode("charles", 6) + ); + new ExceptionThread(() -> { + CompressNodeReader reader = new CompressNodeReader(in); + Assert.assertEquals(nodes.size(), reader.getSize()); + try { + for (IndexedNode excepted : nodes) { + Assert.assertTrue(reader.hasNext()); + IndexedNode actual = reader.next(); + Assert.assertEquals(excepted.getIndex(), actual.getIndex()); + CompressTest.assertCharSequenceEquals("indexed node", excepted.getNode(), actual.getNode()); + } + reader.checkComplete(); + Assert.assertEquals(34, in.read()); + Assert.assertEquals(12, in.read()); + Assert.assertEquals(27, in.read()); + } finally { + in.close(); + } + }, "ReadTest").attach( + new ExceptionThread(() -> { + CompressNodeWriter writer = new CompressNodeWriter(out, nodes.size()); + try { + for (IndexedNode node : nodes) { + writer.appendNode(node); + } + writer.writeCRC(); + // raw data to check if we didn't read too/not enough data + out.write(34); + out.write(12); + out.write(27); + } finally { + out.close(); + } + }, "WriteTest") + ).startAll().joinAndCrashIfRequired(); + } + + @Test + public void writeReadUtilTest() throws InterruptedException, IOException { + PipedOutputStream out = new PipedOutputStream(); + PipedInputStream in = new PipedInputStream(); + out.connect(in); + List nodes = Arrays.asList( + new IndexedNode("bob", 1), + new IndexedNode("michel", 3), + new IndexedNode("jack", 2), + new IndexedNode("charles", 6) + ); + new ExceptionThread(() -> { + CompressNodeReader reader = new CompressNodeReader(in); + Assert.assertEquals(nodes.size(), reader.getSize()); + try { + for (IndexedNode excepted : nodes) { + Assert.assertTrue(reader.hasNext()); + IndexedNode actual = reader.next(); + Assert.assertEquals(excepted.getIndex(), actual.getIndex()); + CompressTest.assertCharSequenceEquals("indexed node", excepted.getNode(), actual.getNode()); + } + reader.checkComplete(); + Assert.assertEquals(34, in.read()); + Assert.assertEquals(12, in.read()); + Assert.assertEquals(27, in.read()); + } finally { + in.close(); + } + }, "ReadTest").attach( + new ExceptionThread(() -> { + try { + CompressUtil.writeCompressedSection(nodes, out, null); + out.write(34); + out.write(12); + out.write(27); + } finally { + out.close(); + } + }, "WriteTest") + ).startAll().joinAndCrashIfRequired(); + } + + @Test + public void writeReadPassTest() throws InterruptedException, IOException { + PipedOutputStream out = new PipedOutputStream(); + PipedInputStream in = new PipedInputStream(); + out.connect(in); + List nodes = Arrays.asList( + new IndexedNode("bob", 1), + new IndexedNode("michel", 3), + new IndexedNode("jack", 2), + new IndexedNode("charles", 6) + ); + new ExceptionThread(() -> { + CompressNodeReader reader = new CompressNodeReader(in); 
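+ // writeReadPassTest: read() is expected to return the current head node without consuming it (two consecutive read() calls below yield the same node), while pass() then skips it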
+ Assert.assertEquals(nodes.size(), reader.getSize()); + try { + for (IndexedNode excepted : nodes) { + Assert.assertTrue(reader.hasNext()); + IndexedNode actual = reader.read(); + Assert.assertEquals(excepted.getIndex(), actual.getIndex()); + CompressTest.assertCharSequenceEquals("indexed node", excepted.getNode(), actual.getNode()); + String actual1Node = actual.getNode().toString(); + IndexedNode actual2 = reader.read(); + Assert.assertEquals(actual.getIndex(), actual2.getIndex()); + CompressTest.assertCharSequenceEquals("post pass indexed node", actual1Node, actual2.getNode()); + Assert.assertTrue(reader.hasNext()); + reader.pass(); + } + reader.checkComplete(); + Assert.assertEquals(34, in.read()); + Assert.assertEquals(12, in.read()); + Assert.assertEquals(27, in.read()); + } finally { + in.close(); + } + }, "ReadTest").attach( + new ExceptionThread(() -> { + CompressNodeWriter writer = new CompressNodeWriter(out, nodes.size()); + try { + for (IndexedNode node : nodes) { + writer.appendNode(node); + } + writer.writeCRC(); + out.write(34); + out.write(12); + out.write(27); + } finally { + out.close(); + } + }, "WriteTest") + ).startAll().joinAndCrashIfRequired(); + } + + @Test + public void writeReadMergeTest() throws InterruptedException, IOException { + PipedOutputStream node1Out = new PipedOutputStream(); + PipedInputStream node1In = new PipedInputStream(); + node1Out.connect(node1In); + + PipedOutputStream node2Out = new PipedOutputStream(); + PipedInputStream node2In = new PipedInputStream(); + node2Out.connect(node2In); + + PipedOutputStream finalOut = new PipedOutputStream(); + PipedInputStream finalIn = new PipedInputStream(); + finalOut.connect(finalIn); + + List nodes1 = Arrays.asList( + new IndexedNode("zzzaaa", 1), + new IndexedNode("zzzccc", 2), + new IndexedNode("zzzddd", 6) + ); + List nodes2 = Arrays.asList( + new IndexedNode("zzzbbb", 3), + new IndexedNode("zzzeee", 4), + new IndexedNode("zzzfff", 5), + new IndexedNode("zzzggg", 7) + ); + List finalExcepted = Arrays.asList( + new IndexedNode("zzzaaa", 1), + new IndexedNode("zzzbbb", 3), + new IndexedNode("zzzccc", 2), + new IndexedNode("zzzddd", 6), + new IndexedNode("zzzeee", 4), + new IndexedNode("zzzfff", 5), + new IndexedNode("zzzggg", 7) + ); + new ExceptionThread(() -> { + CompressNodeReader reader = new CompressNodeReader(finalIn); + Assert.assertEquals(finalExcepted.size(), reader.getSize()); + try { + for (IndexedNode excepted : finalExcepted) { + Assert.assertTrue(reader.hasNext()); + IndexedNode actual = reader.next(); + Assert.assertEquals(excepted.getIndex(), actual.getIndex()); + CompressTest.assertCharSequenceEquals("merged node", excepted.getNode(), actual.getNode()); + } + reader.checkComplete(); + Assert.assertEquals(98, finalIn.read()); + Assert.assertEquals(18, finalIn.read()); + Assert.assertEquals(22, finalIn.read()); + } finally { + finalIn.close(); + } + }, "ReadTest").attach( + new ExceptionThread(() -> { + try { + CompressUtil.writeCompressedSection(nodes1, node1Out, null); + node1Out.write(34); + node1Out.write(12); + node1Out.write(27); + } finally { + node1Out.close(); + } + }, "Write1Test"), + new ExceptionThread(() -> { + try { + CompressUtil.writeCompressedSection(nodes2, node2Out, null); + node2Out.write(42); + node2Out.write(19); + node2Out.write(1); + } finally { + node2Out.close(); + } + }, "Write2Test"), + new ExceptionThread(() -> { + try { + CompressUtil.mergeCompressedSection(node1In, node2In, finalOut, null); + finalOut.write(98); + finalOut.write(18); + finalOut.write(22); + + 
Assert.assertEquals(34, node1In.read()); + Assert.assertEquals(12, node1In.read()); + Assert.assertEquals(27, node1In.read()); + + Assert.assertEquals(42, node2In.read()); + Assert.assertEquals(19, node2In.read()); + Assert.assertEquals(1, node2In.read()); + } finally { + try { + node1In.close(); + } finally { + try { + node2In.close(); + } finally { + finalOut.close(); + } + } + } + }, "MergeTest") + ).startAll().joinAndCrashIfRequired(); + } +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressTest.java new file mode 100644 index 00000000..1ba68115 --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressTest.java @@ -0,0 +1,75 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.junit.Assert; +import org.junit.Test; +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.util.string.CharSequenceComparator; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; + +public class CompressTest { + public static void assertCharSequenceEquals(String location, CharSequence s1, CharSequence s2) { + if (CharSequenceComparator.getInstance().compare(s1, s2) != 0) { + throw new AssertionError(location + + "\nexcepted: " + s1 + + "\nactual: " + s2 + ); + } + } + + @Test + public void noDupeTest() { + List duplicatedList = Arrays.asList( + new IndexedNode("a", 0), + new IndexedNode("b", 1), + new IndexedNode("b", 2), + new IndexedNode("c", 3), + new IndexedNode("c", 4), + new IndexedNode("c", 5), + new IndexedNode("d", 6), + new IndexedNode("e", 7), + new IndexedNode("f", 8) + ); + List noDuplicatedList = Arrays.asList( + "a", + "b", + "c", + "d", + "e", + "f" + ); + + Set duplicates = new HashSet<>(); + duplicates.add(2L); + duplicates.add(4L); + duplicates.add(5L); + + Iterator actual = CompressUtil.asNoDupeCharSequenceIterator( + ExceptionIterator.of(duplicatedList.iterator()), + (originalIndex, duplicatedIndex, oldIndex) -> + Assert.assertTrue(duplicates.remove(duplicatedIndex)) + ); + for (CharSequence e : noDuplicatedList) { + Assert.assertTrue(actual.hasNext()); + CharSequence a = actual.next().getNode(); + + assertCharSequenceEquals("noDupeTest", e, a); + } + } + + @Test + public void bitMappingTest() { + long sharedCount = 1000L; + long index1 = 888L; + + long sharedIndex1 = CompressUtil.asShared(index1); + + Assert.assertEquals(index1, CompressUtil.computeSharedNode(sharedIndex1, sharedCount)); + Assert.assertEquals(sharedCount + index1, CompressUtil.computeSharedNode(CompressUtil.getHeaderId(index1), sharedCount)); + } +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressTripleTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressTripleTest.java new file mode 100644 index 00000000..360eba68 --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/compress/CompressTripleTest.java @@ -0,0 +1,206 @@ +package org.rdfhdt.hdt.util.io.compress; + +import org.junit.Assert; +import org.junit.Test; +import org.rdfhdt.hdt.enums.TripleComponentOrder; +import org.rdfhdt.hdt.iterator.utils.ExceptionIterator; +import org.rdfhdt.hdt.triples.IndexedNode; +import org.rdfhdt.hdt.triples.IndexedTriple; +import org.rdfhdt.hdt.triples.TripleID; +import org.rdfhdt.hdt.util.concurrent.ExceptionThread; + +import java.io.IOException; +import 
java.io.PipedInputStream; +import java.io.PipedOutputStream; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +public class CompressTripleTest { + @Test + public void writeReadTest() throws InterruptedException, IOException { + PipedOutputStream out = new PipedOutputStream(); + PipedInputStream in = new PipedInputStream(); + out.connect(in); + List triples = Arrays.asList( + new IndexedTriple( + new IndexedNode("", 1), + new IndexedNode("", 9), + new IndexedNode("", 11) + ), + new IndexedTriple( + new IndexedNode("", 1), + new IndexedNode("", 9), + new IndexedNode("", 11) + ), + new IndexedTriple( + new IndexedNode("", 3), + new IndexedNode("", 10), + new IndexedNode("", 11) + ), + new IndexedTriple( + new IndexedNode("", 2), + new IndexedNode("", 12), + new IndexedNode("", 15) + ), + new IndexedTriple( + new IndexedNode("", 2), + new IndexedNode("", 12), + new IndexedNode("", 15) + ), + new IndexedTriple( + new IndexedNode("", 6), + new IndexedNode("", 14), + new IndexedNode("", 13) + ) + ); + List noDupeTriples = Arrays.asList( + new IndexedTriple( + new IndexedNode("", 1), + new IndexedNode("", 9), + new IndexedNode("", 11) + ), + new IndexedTriple( + new IndexedNode("", 3), + new IndexedNode("", 10), + new IndexedNode("", 11) + ), + new IndexedTriple( + new IndexedNode("", 2), + new IndexedNode("", 12), + new IndexedNode("", 15) + ), + new IndexedTriple( + new IndexedNode("", 6), + new IndexedNode("", 14), + new IndexedNode("", 13) + ) + ); + new ExceptionThread(() -> { + CompressTripleReader reader = new CompressTripleReader(in); + try { + for (IndexedTriple exceptedIndex : noDupeTriples) { + Assert.assertTrue(reader.hasNext()); + TripleID actual = reader.next(); + TripleID excepted = new TripleID( + exceptedIndex.getSubject().getIndex(), + exceptedIndex.getPredicate().getIndex(), + exceptedIndex.getObject().getIndex() + ); + Assert.assertEquals(excepted, actual); + } + Assert.assertFalse(reader.hasNext()); + Assert.assertEquals(34, in.read()); + Assert.assertEquals(12, in.read()); + Assert.assertEquals(27, in.read()); + } finally { + in.close(); + } + }, "ReadTest").attach( + new ExceptionThread(() -> { + CompressTripleWriter writer = new CompressTripleWriter(out); + try { + for (IndexedTriple triple : triples) { + writer.appendTriple(triple); + } + writer.writeCRC(); + // raw data to check if we didn't read too/not enough data + out.write(34); + out.write(12); + out.write(27); + } finally { + out.close(); + } + }, "WriteTest") + ).startAll().joinAndCrashIfRequired(); + } + + @Test + public void writeReadTripleIDTest() throws InterruptedException, IOException { + PipedOutputStream out = new PipedOutputStream(); + PipedInputStream in = new PipedInputStream(); + out.connect(in); + List triples = Arrays.asList( + new TripleID(1, 9, 11), + new TripleID(1, 9, 11), + new TripleID(3, 10, 11), + new TripleID(2, 12, 15), + new TripleID(2, 12, 15), + new TripleID(6, 14, 13) + ); + List noDupeTriples = Arrays.asList( + new TripleID(1, 9, 11), + new TripleID(3, 10, 11), + new TripleID(2, 12, 15), + new TripleID(6, 14, 13) + ); + new ExceptionThread(() -> { + CompressTripleReader reader = new CompressTripleReader(in); + try { + for (TripleID excepted : noDupeTriples) { + Assert.assertTrue(reader.hasNext()); + TripleID actual = reader.next(); + Assert.assertEquals(excepted, actual); + } + Assert.assertFalse(reader.hasNext()); + Assert.assertEquals(34, in.read()); + Assert.assertEquals(12, in.read()); + Assert.assertEquals(27, in.read()); + } finally { + in.close(); + 
} + }, "ReadTest").attach( + new ExceptionThread(() -> { + CompressTripleWriter writer = new CompressTripleWriter(out); + try { + for (TripleID triple : triples) { + writer.appendTriple(triple); + } + writer.writeCRC(); + // raw data to check if we didn't read too/not enough data + out.write(34); + out.write(12); + out.write(27); + } finally { + out.close(); + } + }, "WriteTest") + ).startAll().joinAndCrashIfRequired(); + } + + @Test + public void writeReadMergeTest() { + List triples1 = Arrays.asList( + new TripleID(2, 2, 2), + new TripleID(4, 4, 4), + new TripleID(5, 5, 5) + ); + List triples2 = Arrays.asList( + new TripleID(1, 1, 1), + new TripleID(3, 3, 3), + new TripleID(6, 6, 6) + ); + List triplesFinal = Arrays.asList( + new TripleID(1, 1, 1), + new TripleID(2, 2, 2), + new TripleID(3, 3, 3), + new TripleID(4, 4, 4), + new TripleID(5, 5, 5), + new TripleID(6, 6, 6) + ); + Iterator actual = new CompressTripleMergeIterator( + ExceptionIterator.of(triples1.iterator()), + ExceptionIterator.of(triples2.iterator()), + TripleComponentOrder.SPO + ).asIterator(); + Iterator expected = triplesFinal.iterator(); + + expected.forEachRemaining(tid -> { + Assert.assertTrue(actual.hasNext()); + Assert.assertEquals(tid, actual.next()); + }); + Assert.assertFalse(actual.hasNext()); + + } + +} diff --git a/hdt-java-core/src/test/resources/hdtGenDisk/unicode_disk_encode.nt b/hdt-java-core/src/test/resources/hdtGenDisk/unicode_disk_encode.nt new file mode 100644 index 00000000..b7498b13 --- /dev/null +++ b/hdt-java-core/src/test/resources/hdtGenDisk/unicode_disk_encode.nt @@ -0,0 +1,2 @@ + "d\u00A0normal"@nl . + "d\u00C2\u00A0normal"@dv .
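
For reference, the disk-based generation path exercised by these tests can be driven directly through HDTManager. Below is a minimal usage sketch assembled only from the calls and option keys that appear in the tests above; the input file name, base URI, output path and worker count are illustrative, and the package locations of HDTSpecification/HDTOptionsKeys are assumed from the library layout rather than shown in this patch.

    import org.rdfhdt.hdt.enums.RDFNotation;
    import org.rdfhdt.hdt.hdt.HDT;
    import org.rdfhdt.hdt.hdt.HDTManager;
    import org.rdfhdt.hdt.options.HDTOptionsKeys;
    import org.rdfhdt.hdt.options.HDTSpecification;

    public class GenerateDiskExample {
        public static void main(String[] args) throws Exception {
            HDTSpecification spec = new HDTSpecification();
            // working directory for the on-disk sort (same key as HDTManagerTestBase)
            spec.set("loader.disk.location", "/tmp/hdt-gen-dir");
            // compression mode and worker count, as toggled by DynamicTest
            spec.set("loader.disk.compressMode", HDTOptionsKeys.LOADER_DISK_COMPRESSION_MODE_VALUE_COMPLETE);
            spec.set(HDTOptionsKeys.LOADER_DISK_COMPRESSION_WORKER_KEY, "4");
            // write the result to this file instead of keeping the whole HDT in memory
            spec.set(HDTOptionsKeys.LOADER_DISK_FUTURE_HDT_LOCATION_KEY, "/tmp/out.hdt");

            try (HDT hdt = HDTManager.generateHDTDisk(
                    "dataset.nt",                    // RDF input file
                    "http://example.org/#",          // base URI
                    RDFNotation.NTRIPLES,
                    spec,
                    (level, message) -> System.out.println("[" + level + "] " + message))) {
                System.out.println(hdt.getTriples().getNumberOfElements() + " triples");
            }
        }
    }

The try-with-resources mirrors the tests, which treat the returned HDT as a closeable resource; the generateDisk*MapTest variants additionally point LOADER_DISK_FUTURE_HDT_LOCATION_KEY at a concrete .hdt file and delete it after the comparison.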